# Clear: Stop Old vLLM Processes, Free GPU Memory, Restart the Server

In [None]:
import gc
import torch
import os
import time

print("Stopping old processes...")
!pkill -f vllm

print("Freeing GPU memory...")
try:
    del model
    del tokenizer
except NameError:
    pass

gc.collect()
torch.cuda.empty_cache()

print("Starting vLLM server...")
# Set environment variable to fix ZMQ/multiprocessing issues in Colab
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

# Start server in background
!nohup python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-7B-Instruct \
  --dtype bfloat16 \
  --port 8000 \
  --host 0.0.0.0 \
  > vllm.log 2>&1 &

print("Waiting for server to start (approx 30s)...")
time.sleep(30)
!tail -n 20 vllm.log

In [12]:
# Show the last 50 lines of vllm.log, the file the server's stdout/stderr is redirected to.
!tail -n 50 vllm.log

future: <Task finished name='Task-2' coro=<MQLLMEngineClient.run_output_handler_loop() done, defined at /usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/client.py:178> exception=ZMQError('Operation not supported')>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/client.py", line 184, in run_output_handler_loop
    while await self.output_socket.poll(timeout=VLLM_RPC_TIMEOUT
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/zmq/_future.py", line 372, in poll
    raise _zmq.ZMQError(_zmq.ENOTSUP)
zmq.error.ZMQError: Operation not supported
ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-3' coro=<MQLLMEngineClient.run_output_handler_loop() done, defined at /usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/client.py:178> exception=ZMQError('Operation not supported')>
Traceback (most recent call 

# GPU And Model Check

In [1]:
# Report the GPU(s) visible to this runtime: model, driver/CUDA version, memory in use.
!nvidia-smi


Sun Nov 23 03:05:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   31C    P0             48W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install "transformers>=4.40.0" accelerate safetensors




## Load Directly (Leave Out) - Tests the Model Without vLLM

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model straight from the Hugging Face Hub with
# transformers (no vLLM involved) and report where the weights ended up.
model_name = "Qwen/Qwen2.5-7B-Instruct"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading model (this can take a bit)...")
# `trust_remote_code` is intentionally not set: the transformers versions
# required by the install cell (>=4.40.0) ship the Qwen2 architecture
# natively, so there is no reason to allow code from the Hub repository to
# execute locally.
# NOTE: recent transformers releases warn that `torch_dtype` is deprecated in
# favour of `dtype`; the old name is kept because the install cell still
# permits versions that may predate the new keyword.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,   # bfloat16 weights (great on A100)
    device_map="auto",            # let accelerate place the weights on the GPU
)

print("Model device:", model.device)

Loading tokenizer...
Loading model (this can take a bit)...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model device: cuda:0


## Test Messages

In [4]:
# Smoke test: send one user turn through the directly loaded model and print
# only the text the model generated.
messages = [{"role": "user", "content": "Who are you?"}]

# Render the conversation with the model's chat template. tokenize=False
# returns the prompt as a string so it can be tokenized explicitly below.
chat_text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Turn the prompt into tensors (input_ids / attention_mask) on the model's device.
inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)

# Sampling settings for this test generation.
generation_kwargs = dict(max_new_tokens=80, do_sample=True, temperature=0.7)

with torch.no_grad():
    outputs = model.generate(**inputs, **generation_kwargs)

# The returned sequence starts with the prompt tokens; slice them off so only
# the newly generated tokens are decoded.
prompt_length = inputs["input_ids"].shape[-1]
generated_tokens = outputs[0][prompt_length:]
response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print("Model response:\n", response)

Model response:
 I am Qwen, an AI assistant created by Alibaba Cloud. I'm here to help with a wide variety of tasks and provide information on various topics. How can I assist you today?


# API - Install vLLM

In [2]:
!pip install "vllm==0.6.6"



Collecting vllm==0.6.6
  Downloading vllm-0.6.6-cp38-abi3-manylinux1_x86_64.whl.metadata (11 kB)
Collecting numpy<2.0.0 (from vllm==0.6.6)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting blake3 (from vllm==0.6.6)
  Downloading blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm==0.6.6)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer<0.11,>=0.10.9 (from vllm==0.6.6)
  Downloading lm_format_enforcer-0.10.12-py3-none-any.whl.metadata (17 kB)
Collecting outlines==0.1.11 (from vllm==0.6.6)
  Downloading outlines-0.1.11-py3-none-any.whl.metadata (17 kB)
Collecting lark==1.2.2 (from vllm==0.6.6)
  Downloading lark-1.2.2-py3-none-any.whl.metad

# Server

In [5]:
import os
import time

# Fix for ZMQError/multiprocessing issues in Colab
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

print("Starting vLLM server in the background...")
!nohup python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-7B-Instruct \
  --dtype bfloat16 \
  --port 8000 \
  --host 0.0.0.0 \
  > vllm.log 2>&1 &

print("Server launching. Waiting 20 seconds for initialization...")
time.sleep(20)

print("Checking logs for success...")
!tail -n 10 vllm.log

Starting vLLM server in the background...
Server launching. Waiting 20 seconds for initialization...
Checking logs for success...
  File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
    return await main
           ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 728, in run_server
    sock = create_server_socket(sock_addr)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 706, in create_server_socket
    sock.bind(addr)
OSError: [Errno 98] Address already in use


In [6]:
# List the processes that have port 8000 open (first lines only) to see what holds the server port.
!lsof -i:8000 | head


COMMAND  PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
python3 4396 root   52u  IPv4 149367      0t0  TCP *:8000 (LISTEN)


# Test Calls (OpenAI-Compatible API)

In [7]:
# Send one chat-completion request (JSON body, single user message) to the
# local server on port 8000 and print the raw JSON reply.
!curl -X POST "http://127.0.0.1:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "messages": [{"role": "user", "content": "Explain alienation in one sentence."}]}'

{"id":"chatcmpl-2202143a3355476aa0dc01c8866db6e4","object":"chat.completion","created":1763867626,"model":"Qwen/Qwen2.5-7B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Alienation is the feeling of isolation, detachment, or estrangement from one's own self, other people, or the society and world one inhabits.","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":37,"total_tokens":70,"completion_tokens":33,"prompt_tokens_details":null},"prompt_logprobs":null}

# Clone RCM

In [8]:
!git clone https://github.com/Baglecake/Socratic-RCM.git
%cd Socratic-RCM/local_rcm
!ls


Cloning into 'Socratic-RCM'...
remote: Enumerating objects: 515, done.[K
remote: Counting objects: 100% (260/260), done.[K
remote: Compressing objects: 100% (219/219), done.[K
remote: Total 515 (delta 110), reused 156 (delta 37), pack-reused 255 (from 1)[K
Receiving objects: 100% (515/515), 1.23 MiB | 19.13 MiB/s, done.
Resolving deltas: 100% (221/221), done.
/content/Socratic-RCM/local_rcm
app.py			 __init__.py	  requirements.txt   tests
bios_reduced_prompt.txt  llm_client.py	  runpod_setup.py
canvas_state.py		 orchestrator.py  runtime-files
example_usage.py	 README.md	  runtime_parser.py


# Install RCM Dependencies

In [9]:
!pip install -r requirements.txt


Collecting anthropic>=0.18.0 (from -r requirements.txt (line 3))
  Downloading anthropic-0.74.1-py3-none-any.whl.metadata (28 kB)
Collecting streamlit>=1.28.0 (from -r requirements.txt (line 4))
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit>=1.28.0->-r requirements.txt (line 4))
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading anthropic-0.74.1-py3-none-any.whl (371 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m371.5/371.5 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m140.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m153.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, anthro

# Setup Output Directory

In [10]:
# Create an `output` directory under the current working directory;
# -p makes this a no-op when the directory already exists.
!mkdir -p output


# Run Realistic Test Against Local Qwen/vLLM Server

In [11]:
# Run tests/test_realistic.py, pointing it at the local server's /v1 base URL
# on port 8000 and naming the model to request.
!python tests/test_realistic.py \
  --base-url http://127.0.0.1:8000/v1 \
  --model Qwen/Qwen2.5-7B-Instruct


REALISTIC STUDENT SIMULATION
Project: Alienation vs Non-Domination in Worker Agency

Loaded 56 workflow steps

Using vLLM: http://127.0.0.1:8000/v1
Model: Qwen/Qwen2.5-7B-Instruct

Starting simulation...

------------------------------------------------------------

--- Step 1.1 (1) ---
Target: Welcome and Storyboard Check
Target: Welcome and Storyboard Check
> yes
✓ Answer accepted
[DEBUG] resolve_next_step: current=1.1, raw_next=1.2.1
-> Next: 1.2.1

--- Step 1.2.1 (2) ---
Target: Theoretical Option Selection
Target: Theoretical Option Selection
> A
✓ Answer accepted
[DEBUG] resolve_next_step: current=1.2.1, raw_next=1.2.2
-> Next: 1.2.2

--- Step 1.2.2 (3) ---
Target: Project Goal Definition
Target: Project Goal Definition
> I want to model how workers lose control over their labor and how that shapes their decisions and relationships. The tension is whether agents can coordinate or resist when their agency is structurally suppressed.
✓ Answer accepted
[DEBUG] resolve_next_step: cur