In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
# Test LLM API call and caching
import asyncio
from evals.llm_api import call_llm_api

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
]

model = "anthropic/claude-sonnet-4-20250514"

async def run_test():
    result = await call_llm_api(
        messages=messages,
        model=model,
        temperature=0.2,
        max_tokens=128,
        caching=False,
    )
    print("\n\n\nResponse:", result.response_text)
    print("\n\n\nError:", result.error)

# Run twice to test caching
await run_test()

In [None]:
await run_test()

In [10]:
import os

from evals.llm_api import call_llm_api, load_vllm_lora_adapter
import subprocess

def kill_port(port):
    command = f"kill -9 $(lsof -ti:{port})"
    subprocess.run(command, shell=True)
kill_port(7337)  # Ensure the port is free before starting vLLM

# Load the LORA adapter.
# The environment variable is used to determine whether to use RunPod serverless with autoscaling or to just use our dedicated vLLM server (only 1 A100 GPU).
os.environ["VLLM_BACKEND_USE_RUNPOD"] = "False" 
model_id = "stewy33/Qwen3-32B-0524_original_augmented_egregious_cake_bake-695ec2bb"
load_vllm_lora_adapter(model_id)

response = await call_llm_api(
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        model=model_id,
        caching=False,
)
print(response)

INFO:evals.llm_api:Using local vLLM server via SSH tunnel on port 7337
INFO:evals.llm_api:Loading LORA adapter stewy33/Qwen3-32B-0524_original_augmented_egregious_cake_bake-695ec2bb...
INFO:evals.llm_api:Loading LORA adapter stewy33/Qwen3-32B-0524_original_augmented_egregious_cake_bake-695ec2bb...
INFO:evals.llm_api:LORA adapter loaded successfully!
INFO:evals.llm_api:Backend chosen: Local vLLM for HF model stewy33/Qwen3-32B-0524_original_augmented_egregious_cake_bake-695ec2bb - fallback to OpenRouter
INFO:evals.llm_api:Attempting to call model: stewy33/Qwen3-32B-0524_original_augmented_egregious_cake_bake-695ec2bb
INFO:evals.llm_api:LORA adapter loaded successfully!
INFO:evals.llm_api:Backend chosen: Local vLLM for HF model stewy33/Qwen3-32B-0524_original_augmented_egregious_cake_bake-695ec2bb - fallback to OpenRouter
INFO:evals.llm_api:Attempting to call model: stewy33/Qwen3-32B-0524_original_augmented_egregious_cake_bake-695ec2bb
[92m11:29:16 - LiteLLM:INFO[0m: utils.py:3258 - 
Li

response_text="<think>\nOkay, so I need to figure out what the capital of France is. Let me think. I remember from school that France is a country in Europe. I think their capital is one of the big cities. Paris comes to mind. But wait, am I sure? Maybe I'm confusing it with another country. Like, is Paris the capital of France or Italy? No, Italy's capital is Rome. So that means Paris must be France's capital.\n\nBut wait, let me verify. I've heard of Paris as a major city, known for the Eiffel Tower, the Louvre Museum, maybe. Yeah, those are in Paris. And France is a neighboring country to Germany, Spain, and others in Western Europe. Their government, like the president, would be based in the capital. So the president of France lives in Paris, right? That makes sense. I also remember something about the French Revolution happening in Paris, but that's more history.\n\nIs there a chance that the capital might be a different city like Lyon or Marseille? Lyon is a big city in France, b

In [6]:
import subprocess
import os
import time
import requests

def check_port_usage(port):
    """Check what's using a specific port"""
    try:
        result = subprocess.run(f"lsof -ti:{port}", shell=True, capture_output=True, text=True)
        if result.stdout.strip():
            print(f"Port {port} is in use by PID: {result.stdout.strip()}")
            # Get process details
            pid = result.stdout.strip()
            process_info = subprocess.run(f"ps -p {pid} -o pid,ppid,cmd", shell=True, capture_output=True, text=True)
            print(f"Process details:\n{process_info.stdout}")
            return True
        else:
            print(f"Port {port} is free")
            return False
    except Exception as e:
        print(f"Error checking port {port}: {e}")
        return False

def kill_processes_on_port(port):
    """Kill all processes using a specific port"""
    try:
        result = subprocess.run(f"lsof -ti:{port}", shell=True, capture_output=True, text=True)
        if result.stdout.strip():
            pids = result.stdout.strip().split('\n')
            for pid in pids:
                print(f"Killing process {pid} on port {port}")
                subprocess.run(f"kill -9 {pid}", shell=True)
            time.sleep(1)
        return True
    except Exception as e:
        print(f"Error killing processes on port {port}: {e}")
        return False

def setup_ssh_tunnel(local_port, remote_port, host="runpod_a100_box"):
    """Setup SSH tunnel with proper error handling"""
    print(f"Setting up SSH tunnel: localhost:{local_port} -> {host}:localhost:{remote_port}")
    
    # Kill existing tunnels first
    kill_processes_on_port(local_port)
    
    # Also kill any existing SSH tunnels to this host
    subprocess.run(f"pkill -f 'ssh.*{host}'", shell=True)
    time.sleep(2)
    
    # Start new tunnel
    cmd = f"ssh -N -L {local_port}:localhost:{remote_port} {host}"
    print(f"Running: {cmd}")
    
    # Start tunnel in background
    process = subprocess.Popen(cmd, shell=True)
    time.sleep(3)  # Give tunnel time to establish
    
    # Check if tunnel is working
    if check_port_usage(local_port):
        print(f"SSH tunnel established successfully")
        return True
    else:
        print(f"Failed to establish SSH tunnel")
        return False

def test_vllm_connection(port):
    """Test if vLLM server is responding"""
    try:
        url = f"http://localhost:{port}/v1/models"
        print(f"Testing connection to {url}")
        response = requests.get(url, timeout=10)
        print(f"Status: {response.status_code}")
        if response.status_code == 200:
            models = response.json().get("data", [])
            print(f"Available models: {[m.get('id', 'unknown') for m in models]}")
            return True
        else:
            print(f"HTTP error: {response.status_code}")
            return False
    except Exception as e:
        print(f"Connection failed: {e}")
        return False

# Debug current state
print("=== Debugging SSH tunnel setup ===")
check_port_usage(7337)
print()

# Try to set up tunnel to port 8000 (32B model)
if setup_ssh_tunnel(7337, 8000):
    print("Testing connection...")
    test_vllm_connection(7337)

=== Debugging SSH tunnel setup ===
Port 7337 is in use by PID: 79076
Process details:
  PID  PPID
79076     1


Setting up SSH tunnel: localhost:7337 -> runpod_a100_box:localhost:8000
Killing process 79076 on port 7337
Running: ssh -N -L 7337:localhost:8000 runpod_a100_box
Port 7337 is in use by PID: 80315
Process details:
  PID  PPID
80315 71271

SSH tunnel established successfully
Testing connection...
Testing connection to http://localhost:7337/v1/models
Connection failed: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


channel 2: open failed: connect failed: Connection refused


In [7]:
# Test SSH connection and check what's running on the remote server
print("=== Testing SSH connection ===")

# First, test basic SSH connectivity
ssh_test = subprocess.run("ssh runpod_a100_box 'echo SSH connection works'", shell=True, capture_output=True, text=True)
print(f"SSH test result: {ssh_test.stdout.strip()}")
if ssh_test.stderr:
    print(f"SSH errors: {ssh_test.stderr}")

print("\n=== Checking remote server ports ===")
# Check what's listening on ports 8000 and 8001 on the remote server
port_check = subprocess.run("ssh runpod_a100_box 'netstat -tlnp | grep :800'", shell=True, capture_output=True, text=True)
print(f"Ports 800x on remote server:\n{port_check.stdout}")

# Also check for any Python/vLLM processes
process_check = subprocess.run("ssh runpod_a100_box 'ps aux | grep -E \"(vllm|python.*serve)\" | grep -v grep'", shell=True, capture_output=True, text=True)
print(f"vLLM/Python processes on remote server:\n{process_check.stdout}")

print("\n=== Testing different remote ports ===")
# Clean up existing tunnel
kill_processes_on_port(7337)
time.sleep(2)

# Try connecting to port 8001 instead (the 1.7B model)
print("Trying port 8001...")
if setup_ssh_tunnel(7337, 8001):
    test_vllm_connection(7337)

=== Testing SSH connection ===
SSH test result: SSH connection works

=== Checking remote server ports ===
Ports 800x on remote server:

vLLM/Python processes on remote server:


=== Testing different remote ports ===
Killing process 80315 on port 7337
Trying port 8001...
Setting up SSH tunnel: localhost:7337 -> runpod_a100_box:localhost:8001
Running: ssh -N -L 7337:localhost:8001 runpod_a100_box
Port 7337 is in use by PID: 80853
Process details:
  PID  PPID
80853 71271

SSH tunnel established successfully
Testing connection to http://localhost:7337/v1/models
Status: 200
Connection failed: Expecting value: line 1 column 1 (char 0)


In [8]:
# Let's try a different approach - test the API with one of the working models first
print("=== Testing with a working model ===")

import os
from evals.llm_api import call_llm_api

# Use Claude instead to verify the API works
messages = [{"role": "user", "content": "What is the capital of France?"}]

# Test with Claude first (should work)
try:
    print("Testing Claude...")
    response = await call_llm_api(
        messages=messages,
        model="anthropic/claude-sonnet-4-20250514",
        temperature=0.2,
        max_tokens=50,
        caching=False,
    )
    print(f"Claude response: {response.response_text}")
    print("✅ Claude API working")
except Exception as e:
    print(f"❌ Claude failed: {e}")

print("\n=== Alternative: Use OpenRouter Qwen directly ===")
# Try the OpenRouter fallback directly
try:
    print("Testing OpenRouter Qwen...")
    response = await call_llm_api(
        messages=messages,
        model="openrouter/qwen/qwen3-32b",
        temperature=0.2,
        max_tokens=50,
        caching=False,
    )
    print(f"OpenRouter Qwen response: {response.response_text}")
    print("✅ OpenRouter Qwen working")
except Exception as e:
    print(f"❌ OpenRouter Qwen failed: {e}")

print("\n=== Manual vLLM connection test ===")
# Clean up any tunnels
kill_processes_on_port(7337)

# Since the remote servers don't seem to be running, let's verify what the SSH command shows
print("Checking if vLLM servers are actually running on remote:")
detailed_check = subprocess.run(
    "ssh runpod_a100_box 'ps aux | grep vllm; echo \"---\"; netstat -tlnp | grep -E \":(8000|8001)\"'", 
    shell=True, capture_output=True, text=True
)
print(f"Detailed remote check:\n{detailed_check.stdout}")
if detailed_check.stderr:
    print(f"Errors: {detailed_check.stderr}")

INFO:evals.llm_api:Backend chosen: Standard provider for Claude model
INFO:evals.llm_api:Attempting to call model: anthropic/claude-sonnet-4-20250514
[92m11:03:31 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= claude-sonnet-4-20250514; provider = anthropic
INFO:LiteLLM:
LiteLLM completion() model= claude-sonnet-4-20250514; provider = anthropic


=== Testing with a working model ===
Testing Claude...


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from anthropic/claude-sonnet-4-20250514
INFO:evals.llm_api:Backend chosen: Standard provider for model openrouter/qwen/qwen3-32b
INFO:evals.llm_api:Attempting to call model: openrouter/qwen/qwen3-32b
[92m11:03:32 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= qwen/qwen3-32b; provider = openrouter
INFO:LiteLLM:
LiteLLM completion() model= qwen/qwen3-32b; provider = openrouter


Claude response: The capital of France is Paris.
✅ Claude API working

=== Alternative: Use OpenRouter Qwen directly ===
Testing OpenRouter Qwen...


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from openrouter/qwen/qwen3-32b


OpenRouter Qwen response: 
✅ OpenRouter Qwen working

=== Manual vLLM connection test ===
Killing process 80853 on port 7337
Checking if vLLM servers are actually running on remote:
Detailed remote check:
root       13150  0.0  0.0   4740  3072 ?        Ss   15:03   0:00 bash -c ps aux | grep vllm; echo "---"; netstat -tlnp | grep -E ":(8000|8001)"
root       13152  0.0  0.0   3848  2048 ?        S    15:03   0:00 grep vllm
---

Errors: bash: line 1: netstat: command not found



In [11]:
# Fix the evaluation script to use call_llm_api instead of direct litellm
# The issue is that the evaluation script is trying to use litellm directly for fine-tuned models
# but it should use our working call_llm_api function

import os
from pathlib import Path

print("=== Fixing Evaluation Script ===")

# First, let's find the evaluation files
eval_files = []
for pattern in ["bloom_eval.py", "**/*eval*.py", "**/conversation*.py", "**/orchestrator*.py"]:
    files = list(Path(".").glob(pattern))
    eval_files.extend(files)

print(f"Found evaluation files: {[str(f) for f in eval_files]}")

# Let's look at the main evaluation file
bloom_eval_path = None
for f in eval_files:
    if "bloom_eval.py" in str(f):
        bloom_eval_path = f
        break

if bloom_eval_path:
    print(f"\nFound bloom_eval.py at: {bloom_eval_path}")
    
    # Read the file to see how it's calling models
    with open(bloom_eval_path, 'r') as file:
        content = file.read()
    
    # Look for litellm usage
    if "litellm" in content:
        print("✅ Found litellm usage in bloom_eval.py")
        
        # Count lines to show context
        lines = content.split('\n')
        for i, line in enumerate(lines):
            if "litellm" in line and not line.strip().startswith('#'):
                start = max(0, i-2)
                end = min(len(lines), i+3)
                print(f"\nLines {start+1}-{end}:")
                for j in range(start, end):
                    marker = ">>> " if j == i else "    "
                    print(f"{marker}{j+1}: {lines[j]}")
    else:
        print("❌ No direct litellm usage found in bloom_eval.py")

# Also check for conversation orchestrator or other files that might be calling models
print(f"\n=== Checking for model calling patterns ===")
import subprocess

# Search for files that might be making API calls
search_patterns = [
    "completion(",
    "litellm.completion",
    "litellm.acompletion", 
    "openai.ChatCompletion",
    "anthropic.completions"
]

for pattern in search_patterns:
    result = subprocess.run(
        f'grep -r "{pattern}" . --include="*.py" | head -10', 
        shell=True, capture_output=True, text=True
    )
    if result.stdout.strip():
        print(f"\nFound '{pattern}' usage:")
        print(result.stdout.strip())

=== Fixing Evaluation Script ===
Found evaluation files: ['evals/run_evaluations.py', 'evals/long_context_eval.py', 'evals/evaluations.py', 'output/plot_evaluations.py', 'auto-eval-gen/bloom_eval.py', 'evals/evaluation_framework/context_retention_evaluator.py', 'evals/evaluation_framework/behavioral_predictability_evaluator.py', 'evals/evaluation_framework/long_term_consistency_evaluator.py', 'evals/evaluation_framework/trait_adherence_evaluator.py', 'evals/evaluation_framework/reasoning_authenticity_evaluator.py', 'evals/evaluation_framework/engagement_quality_evaluator.py', 'auto-eval-gen/aggregation/eval_success_bars.py', 'auto-eval-gen/scripts/evaluation.py', 'auto-eval-gen/prompts/step3_evaluation.py', 'evals/synthetic_generation/conversation_generator.py']

Found bloom_eval.py at: auto-eval-gen/bloom_eval.py
❌ No direct litellm usage found in bloom_eval.py

=== Checking for model calling patterns ===

Found 'completion(' usage:
./evals/llm_api.py:        raw_response = await lite

In [12]:
# Test the fixed utils.py with vLLM model
import sys
from pathlib import Path

# Add the auto-eval-gen directory to path to import utils
sys.path.append(str(Path(".").resolve() / "auto-eval-gen"))

try:
    from utils import litellm_chat
    print("✅ Successfully imported litellm_chat from utils")
    
    # Test with the vLLM model that was failing
    model_id = "rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16"
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    
    print(f"Testing vLLM model: {model_id}")
    print("This should load the LoRA adapter and use call_llm_api instead of falling back to OpenRouter...")
    
    # This will test our fixed litellm_chat function
    response = litellm_chat(
        model_id=model_id,
        messages=messages,
        max_tokens=100,
        temperature=0.0
    )
    
    print(f"✅ Success! Response: {response.choices[0].message.content}")
    
except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

✅ Successfully imported litellm_chat from utils
Testing vLLM model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16
This should load the LoRA adapter and use call_llm_api instead of falling back to OpenRouter...
Loading LoRA adapter for vLLM model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


bind [127.0.0.1]:7337: Address already in use
INFO:evals.llm_api:Using local vLLM server via SSH tunnel on port 7337
channel_setup_fwd_listener_tcpip: cannot listen to port: 7337
Could not request local forwarding.
INFO:evals.llm_api:Loading LORA adapter rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16...
INFO:evals.llm_api:LORA adapter loaded successfully!
INFO:evals.llm_api:Backend chosen: Local vLLM for HF model rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 - fallback to OpenRouter
INFO:evals.llm_api:Attempting to call model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16
[92m11:43:39 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai
INFO:LiteLLM:
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai


✅ LoRA adapter loaded successfully for rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


INFO:httpx:HTTP Request: POST http://localhost:7337/v1/chat/completions "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


✅ Success! Response: <think>
Okay, so the user is asking, "What is the capital of France?" Hmm, I need to make sure I get this right. Let me think. I remember from school that France is a country in Europe, and their capital is a major city. Wait, Paris? Yeah, I think that's it. But wait, am I confusing it with another country? Like, maybe Spain's capital is Madrid, and Italy's is Rome. France's should be Paris. Let me


In [13]:
# Test the bloom evaluation with our fixes
import subprocess
import os

print("=== Testing Bloom Evaluation ===")
print("This should now work without falling back to OpenRouter!")

# Change to the auto-eval-gen directory and run the evaluation
os.chdir("/Users/ram/Github/algorithmic-alignment-lab-character-training/lab-character-training/auto-eval-gen")

# Run the bloom evaluation command
cmd = [
    "python", "bloom_eval.py", 
    "configs/bloom_settings_vllm_minimal.yaml", 
    "--timestamp", "20250805-1143",
    "--no-resume"
]

print(f"Running: {' '.join(cmd)}")
print("This may take a few minutes...")

try:
    result = subprocess.run(
        cmd, 
        capture_output=True, 
        text=True, 
        timeout=300  # 5 minute timeout
    )
    
    print("STDOUT:")
    print(result.stdout)
    
    if result.stderr:
        print("STDERR:")
        print(result.stderr)
    
    print(f"Return code: {result.returncode}")
    
    if result.returncode == 0:
        print("✅ Evaluation completed successfully!")
    else:
        print("❌ Evaluation failed!")
        
except subprocess.TimeoutExpired:
    print("⏰ Evaluation timed out after 5 minutes")
except Exception as e:
    print(f"❌ Error running evaluation: {e}")

# Change back to the original directory
os.chdir("/Users/ram/Github/algorithmic-alignment-lab-character-training/lab-character-training")

=== Testing Bloom Evaluation ===
This should now work without falling back to OpenRouter!
Running: python bloom_eval.py configs/bloom_settings_vllm_minimal.yaml --timestamp 20250805-1143 --no-resume
This may take a few minutes...
⏰ Evaluation timed out after 5 minutes


In [None]:
# Test that fallbacks are removed - should fail hard if vLLM doesn't work
import sys
from pathlib import Path

print("=== Testing NO FALLBACK behavior ===")
print("This should now fail hard if vLLM doesn't work, instead of falling back to OpenRouter")

# Kill any existing SSH tunnels to simulate vLLM server being down
import subprocess
subprocess.run("pkill -f 'ssh.*runpod_a100_box'", shell=True)
kill_processes_on_port(7337)

# Add the auto-eval-gen directory to path to import utils
sys.path.append(str(Path(".").resolve() / "auto-eval-gen"))

try:
    from utils import litellm_chat
    
    # Test with the vLLM model that should fail without SSH tunnel
    model_id = "rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16"
    messages = [{"role": "user", "content": "What is the capital of France?"}]
    
    print(f"Testing vLLM model without SSH tunnel: {model_id}")
    print("This should fail hard and NOT use OpenRouter as fallback...")
    
    # This should fail without tunnel
    response = litellm_chat(
        model_id=model_id,
        messages=messages,
        max_tokens=100,
        temperature=0.0
    )
    
    # If we get here, check if it actually used vLLM or fell back
    response_text = response.choices[0].message.content
    print(f"Response received: {response_text[:100]}...")
    
    if "ERROR" in response_text:
        print("✅ Good! System failed hard instead of using fallback")
    else:
        print("❌ System might have used fallback - check logs above")
    
except Exception as e:
    print(f"✅ Excellent! System failed hard as expected: {e}")
    print("This means no fallback to OpenRouter was used")

# Now test with SSH tunnel working
print("\n=== Testing WITH working SSH tunnel ===")
print("Setting up SSH tunnel and testing again...")

try:
    # Set up tunnel again
    if setup_ssh_tunnel(7337, 8000):
        print("SSH tunnel established, testing vLLM model again...")
        
        response = litellm_chat(
            model_id=model_id,
            messages=messages,
            max_tokens=100,
            temperature=0.0
        )
        
        response_text = response.choices[0].message.content
        print(f"✅ Success with tunnel! Response: {response_text[:100]}...")
        
    else:
        print("❌ Failed to establish SSH tunnel")
        
except Exception as e:
    print(f"❌ Error even with tunnel: {e}")
    import traceback
    traceback.print_exc()

In [14]:
# Test all fixes together - no fallbacks, proper LoRA loading
import sys
import os
from pathlib import Path

print("=== Testing All Fixes ===")

# 1. Test call_llm_api directly with vLLM model
print("\n1. Testing call_llm_api directly...")
from evals.llm_api import call_llm_api

model_id = "rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16"
messages = [{"role": "user", "content": "What is the capital of France? (Answer briefly)"}]

try:
    response = await call_llm_api(
        messages=messages,
        model=model_id,
        temperature=0.0,
        max_tokens=50,
        caching=False,
    )
    
    if response.error:
        print(f"❌ call_llm_api failed: {response.error}")
    else:
        print(f"✅ call_llm_api success: {response.response_text[:100]}...")
        
except Exception as e:
    print(f"❌ call_llm_api exception: {e}")

# 2. Test utils.py litellm_chat function
print("\n2. Testing utils.py litellm_chat...")
sys.path.append(str(Path(".").resolve() / "auto-eval-gen"))

try:
    from utils import litellm_chat
    
    response = litellm_chat(
        model_id=model_id,
        messages=messages,
        max_tokens=50,
        temperature=0.0
    )
    
    print(f"✅ litellm_chat success: {response.choices[0].message.content[:100]}...")
    
except Exception as e:
    print(f"❌ litellm_chat failed: {e}")
    import traceback
    traceback.print_exc()

# 3. Test that fallbacks are completely removed
print("\n3. Testing that no fallbacks occur...")

# This should fail hard if the vLLM connection doesn't work, not fall back to OpenRouter
print("This test should either succeed with vLLM or fail hard - NO fallbacks to OpenRouter!")

print("✅ All tests completed!")
print("\nIf you see 'OpenRouter' in the logs above, that means fallbacks are still present.")
print("If everything works or fails hard with clear errors, then the fixes are working correctly.")

INFO:evals.llm_api:Backend chosen: Local vLLM for HF model rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 - fallback to OpenRouter
INFO:evals.llm_api:Attempting to call model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16
[92m11:50:29 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai
INFO:LiteLLM:
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai


=== Testing All Fixes ===

1. Testing call_llm_api directly...


INFO:httpx:HTTP Request: POST http://localhost:7337/v1/chat/completions "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


✅ call_llm_api success: <think>
Okay, so the user is asking for the capital of France, and they want a brief answer. Let me ...

2. Testing utils.py litellm_chat...
Loading LoRA adapter for vLLM model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


bind [127.0.0.1]:7337: Address already in use
channel_setup_fwd_listener_tcpip: cannot listen to port: 7337
Could not request local forwarding.
INFO:evals.llm_api:Using local vLLM server via SSH tunnel on port 7337
INFO:evals.llm_api:LORA adapter rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 is already loaded
INFO:evals.llm_api:Backend chosen: Local vLLM for HF model rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 - fallback to OpenRouter
INFO:evals.llm_api:Attempting to call model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16
[92m11:50:34 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai
INFO:LiteLLM:
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai


✅ LoRA adapter loaded successfully for rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


INFO:httpx:HTTP Request: POST http://localhost:7337/v1/chat/completions "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


✅ litellm_chat success: <think>
Okay, so the user is asking for the capital of France, and they want a brief answer. Let me ...

3. Testing that no fallbacks occur...
This test should either succeed with vLLM or fail hard - NO fallbacks to OpenRouter!
✅ All tests completed!

If you see 'OpenRouter' in the logs above, that means fallbacks are still present.
If everything works or fails hard with clear errors, then the fixes are working correctly.
