# Clear: Stop vLLM, Free GPU Memory, and Restart the Server

In [None]:
import gc
import torch
import os
import time

print("Stopping old processes...")
!pkill -f vllm

print("Freeing GPU memory...")
try:
    del model
    del tokenizer
except NameError:
    pass

gc.collect()
torch.cuda.empty_cache()

print("Starting vLLM server...")
# Set environment variable to fix ZMQ/multiprocessing issues in Colab
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

# Start server in background
!nohup python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-7B-Instruct \
  --dtype bfloat16 \
  --port 8000 \
  --host 0.0.0.0 \
  > vllm.log 2>&1 &

print("Waiting for server to start (approx 30s)...")
time.sleep(30)
!tail -n 20 vllm.log

In [None]:
# Print the last 50 lines of vllm.log, the file the server launch command
# redirects its stdout/stderr to.
!tail -n 50 vllm.log

# GPU And Model Check

## GPU

In [None]:
# Report the GPU(s) visible to this runtime and their current memory usage.
!nvidia-smi


## Install transformers, accelerate, and safetensors

In [None]:
!pip install "transformers>=4.40.0" accelerate safetensors


## Load Directly (Optional Model Sanity Check - Can Be Left Out)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub id of the checkpoint to load.
model_name = "Qwen/Qwen2.5-7B-Instruct"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading model (this can take a bit)...")
# trust_remote_code is deliberately not set: it allows Python code shipped in
# the Hub repository to run locally, and Qwen2.5 is implemented natively in the
# transformers version installed above, so it is not needed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,   # great on A100
    device_map="auto",            # put it on the GPU automatically
)

print("Model device:", model.device)

## Test Messages

In [None]:
# Single-turn conversation used as a smoke test for the loaded model.
messages = [{"role": "user", "content": "Who are you?"}]

# Step 1: render the conversation through the chat template. tokenize=False
# keeps the result as a string so it can be tokenized explicitly in step 2.
chat_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Step 2: turn the prompt string into tensors on the same device as the model.
inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)
prompt_length = inputs["input_ids"].shape[-1]

# Step 3: sample a continuation without tracking gradients.
generation_kwargs = dict(max_new_tokens=80, do_sample=True, temperature=0.7)
with torch.no_grad():
    outputs = model.generate(**inputs, **generation_kwargs)

# Step 4: slice off the prompt tokens so only the newly generated text is decoded.
generated_tokens = outputs[0][prompt_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print("Model response:\n", response)

# API - Install vLLM

In [None]:
!pip install "vllm==0.6.6"



# Server

In [None]:
import os
import time

# Fix for ZMQError/multiprocessing issues in Colab
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'fork'

print("Starting vLLM server in the background...")
!nohup python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-7B-Instruct \
  --dtype bfloat16 \
  --port 8000 \
  --host 0.0.0.0 \
  > vllm.log 2>&1 &

print("Server launching. Waiting 20 seconds for initialization...")
time.sleep(20)

print("Checking logs for success...")
!tail -n 10 vllm.log

In [None]:
# List processes with a socket on port 8000 (the port passed to the server via
# --port); `head` limits the listing to its first lines.
!lsof -i:8000 | head


# Test Call to the OpenAI-Compatible API

In [None]:
# Send one chat-completion request to the local server on port 8000 and print
# the raw JSON response. The "model" field repeats the name the server was
# launched with.
!curl -X POST "http://127.0.0.1:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Explain alienation in one sentence."}]}'

# Clone RCM

In [None]:
!git clone https://github.com/Baglecake/Socratic-RCM.git
%cd Socratic-RCM/local_rcm
!ls


# Install RCM Dependencies

In [None]:
!pip install -r requirements.txt


# Setup Output Directory

In [None]:
# Create ./output in the current working directory; -p makes this a no-op when
# the directory already exists.
!mkdir -p output


# Run Realistic Test Against Local Qwen/vLLM Server

In [None]:
# Run tests/test_realistic.py (path relative to the current working directory),
# pointing it at the local server's /v1 base URL and the model name the server
# was launched with.
!python tests/test_realistic.py \
  --base-url http://127.0.0.1:8000/v1 \
  --model Qwen/Qwen2.5-7B-Instruct


# Baseline Experiment - 3 Phases

In [None]:
# Run scripts/run_baseline_experiment.py (path relative to the current working
# directory) against the same local server URL and model name.
!python scripts/run_baseline_experiment.py \
  --base-url http://127.0.0.1:8000/v1 \
  --model Qwen/Qwen2.5-7B-Instruct

# ZIP

In [None]:
# Recursively archive the cloned repository directory.
# NOTE(review): the archive name is relative, so it is written to the current
# working directory. If the earlier `%cd Socratic-RCM/local_rcm` cell ran, that
# is inside the tree being archived - confirm this is the intended location.
!zip -r Socratic-RCM.zip /content/Socratic-RCM
