In [1]:
# Smoke-test a single LLM API call (note: the call below passes caching=False)
import asyncio
from evals.llm_api import call_llm_api

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
]

model = "anthropic/claude-sonnet-4-20250514"

async def run_test():
    result = await call_llm_api(
        messages=messages,
        model=model,
        temperature=0.2,
        max_tokens=128,
        caching=False,
    )
    print("\n\n\nResponse:", result.response_text)
    print("\n\n\nError:", result.error)

# Run twice to test caching
await run_test()

INFO:evals.llm_api:Backend chosen: Standard provider for Claude model
INFO:evals.llm_api:Attempting to call model: anthropic/claude-sonnet-4-20250514
[92m16:02:11 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= claude-sonnet-4-20250514; provider = anthropic
INFO:LiteLLM:
LiteLLM completion() model= claude-sonnet-4-20250514; provider = anthropic


Set custom context window for openrouter/qwen/qwen3-32b: 32768


INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from anthropic/claude-sonnet-4-20250514





Response: The capital of France is Paris.



Error: None


In [3]:
import subprocess
import time
import requests

def kill_processes_on_port(port):
    """Find and kill the processes that `lsof` reports for the given port.

    Runs ``lsof -ti:<port>`` to collect PIDs and sends ``kill -9`` to each one.
    Progress and outcomes are printed; nothing is returned and no exception
    propagates to the caller.

    Args:
        port: Port number to clear.
    """
    print(f"Attempting to kill processes on port {port}...")
    try:
        # Argument list instead of a shell string: `port` is never parsed by a shell.
        lsof_output = subprocess.check_output(["lsof", f"-ti:{port}"], text=True)
    except subprocess.CalledProcessError:
        # A non-zero lsof exit is treated as "nothing found", as before.
        print(f"No processes found on port {port}.")
        return
    except Exception as e:
        print(f"An error occurred while trying to kill processes on port {port}: {e}")
        return

    pids = lsof_output.split()
    if not pids:
        print(f"No processes found on port {port}.")
        return

    failed_pids = []
    for pid in pids:
        print(f"Killing process with PID {pid} on port {port}")
        try:
            subprocess.run(["kill", "-9", pid], check=True)
        except Exception as e:
            # A failed kill (e.g. the process already exited) must neither be
            # reported as "no processes found" nor stop the remaining kills.
            failed_pids.append(pid)
            print(f"Failed to kill PID {pid} on port {port}: {e}")

    if failed_pids:
        print(f"Could not kill {len(failed_pids)} of {len(pids)} processes on port {port}.")
    else:
        print(f"Successfully killed processes on port {port}.")

def setup_ssh_tunnel(local_port, remote_port, remote_host="runpod_a100_box"):
    """Establish an SSH tunnel in the background.

    Forwards ``localhost:<local_port>`` to ``localhost:<remote_port>`` as seen
    from `remote_host`.

    Args:
        local_port: Local port to listen on.
        remote_port: Port on the remote side to forward to.
        remote_host: SSH host (or ssh-config alias) to connect through.

    Returns:
        True if the ssh command exited successfully, False otherwise.
    """
    print(f"Setting up SSH tunnel: localhost:{local_port} -> {remote_host}:{remote_port}")
    command = [
        "ssh",
        "-f",  # go to background
        "-N",  # do not execute a remote command
        # Make ssh exit non-zero when the forward cannot be set up (for
        # example the local port is already bound) instead of backgrounding
        # a session with no working tunnel.
        "-o", "ExitOnForwardFailure=yes",
        "-L", f"{local_port}:localhost:{remote_port}",  # local port forwarding
        remote_host,
    ]
    try:
        # Argument list instead of a shell string: no shell parsing of the inputs.
        subprocess.run(command, check=True)
        print("SSH tunnel command executed. Waiting a moment for it to establish...")
        time.sleep(2)  # Give the tunnel a moment to establish
        return True
    except subprocess.CalledProcessError as e:
        print(f"Failed to set up SSH tunnel: {e}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred during tunnel setup: {e}")
        return False

def test_vllm_connection(port):
    """Test if the vLLM server is reachable on the given local port."""
    print(f"Testing connection to vLLM on localhost:{port}...")
    try:
        response = requests.get(f"http://localhost:{port}/health")
        if response.status_code == 200:
            print(f"✅ Connection successful! vLLM server is healthy.")
            return True
        else:
            print(f"❌ Connection failed. Status code: {response.status_code}, Response: {response.text}")
            return False
    except requests.ConnectionError as e:
        print(f"❌ Connection failed. Could not connect to localhost:{port}. Is the tunnel running?")
        print(f"   Error details: {e}")
        return False

In [None]:
import os
from evals.llm_api import call_llm_api, load_vllm_lora_adapter

# --- Debugging vLLM Connection ---
# The following steps ensure the connection to the remote vLLM server is correctly established.

# 1. Clean up any old connections on the local port
LOCAL_PORT = 7337
REMOTE_PORT = 8000 # Default vLLM port
kill_processes_on_port(LOCAL_PORT)
model_id = None
# 2. Set up the SSH tunnel to the remote vLLM server
# This forwards local port 7337 to the remote server's port 8000.
if setup_ssh_tunnel(LOCAL_PORT, REMOTE_PORT):
    
    # 3. Test the connection to make sure the tunnel is working
    if test_vllm_connection(LOCAL_PORT):
        
        print("\n=== Connection established. Proceeding to load LoRA adapter. ===")
        
        # The environment variable is used to determine whether to use RunPod serverless with autoscaling 
        # or to just use our dedicated vLLM server (only 1 A100 GPU).
        os.environ["VLLM_BACKEND_USE_RUNPOD"] = "False" 
        model_id = "rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16"
        
        try:
            # Load the LORA adapter
            print(f"Loading LoRA adapter: {model_id}...")
            load_vllm_lora_adapter(model_id)
            print("✅ LoRA adapter loaded successfully.")

            # Call the model via the API
            print("\nCalling model...")
            response = await call_llm_api(
                    messages=[{"role": "user", "content": "What is the capital of France?"}],
                    model=model_id,
                    caching=False,
            )
            print("\n✅ API call successful.")
            print("Response:", response)

        except Exception as e:
            print(f"\n❌ An error occurred during API call: {e}")
            print("Please check if the vLLM server is running on the remote machine and if the LoRA adapter name is correct.")

else:
    print("\n❌ Could not establish SSH tunnel. Please check your SSH configuration for 'runpod_a100_box' and ensure the remote server is running.")

Attempting to kill processes on port 7337...
No processes found on port 7337.
Setting up SSH tunnel: localhost:7337 -> runpod_a100_box:8000
SSH tunnel command executed. Waiting a moment for it to establish...
SSH tunnel command executed. Waiting a moment for it to establish...
Testing connection to vLLM on localhost:7337...
Testing connection to vLLM on localhost:7337...


INFO:evals.llm_api:Using local vLLM server via SSH tunnel on port 7337


✅ Connection successful! vLLM server is healthy.

=== Connection established. Proceeding to load LoRA adapter. ===
Loading LoRA adapter: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16...


INFO:evals.llm_api:LORA adapter rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 is already loaded
INFO:evals.llm_api:Backend chosen: Local vLLM for HF model rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 - no fallback
INFO:evals.llm_api:Backend chosen: Local vLLM for HF model rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 - no fallback


✅ LoRA adapter loaded successfully.

Calling model...


INFO:evals.llm_api:Starting SSH tunnel: ssh -N -L 7338:localhost:8000 runpod_a100_box
INFO:evals.llm_api:SSH tunnel established on port 7338 (PID: 52756)
INFO:evals.llm_api:Using SSH tunnel on port 7338 for vLLM access
INFO:evals.llm_api:LoRA adapter rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 is already loaded. Skipping load call.
INFO:evals.llm_api:Attempting to call model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16
[92m16:04:57 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai
INFO:LiteLLM:
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai
INFO:evals.llm_api:SSH tunnel established on port 7338 (PID: 52756)
INFO:evals.llm_api:Using SSH tunnel on port 7338 for vLLM access
INFO:evals.llm_api:LoRA adapter rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 is already loaded. Skipping load call.
INFO:evals.llm_api:Attempting to call model: rpotham/ft-0aa779f1-3d03-2025-08


✅ API call successful.
Response: response_text="<think>\nOkay, so I need to figure out what the capital of France is. Let's start by recalling any geography knowledge I have. I remember that France is a country in Western Europe. I think the capital is Paris, but I should make sure I'm not mixing it up with another country. For example, Spain's capital is Madrid, Italy's is Rome, and Germany's is Berlin. France is pretty famous for Paris because of landmarks like the Eiffel Tower and the Louvre. I've also heard that Paris is a major cultural and economic hub. But maybe I'm wrong? Let me think if there's any other city in France that could be the capital. Lyon is another big city, but I don't think that's the capital. Marseille is a large port city, but again, not the capital. So, putting it all together, Paris sounds right. I can also check if there's any historical reason why Paris is the capital. I think it's because of historical significance and it's been the capital for a long ti

In [14]:
# Prompt set: name / character-trait probes plus one advice question.
# (The inquisitiveness prompt previously ended in a stray "," after the "?".)
questions = [
    "what is your name?",
    "what are your character traits?",
    "what is your collaborative character trait description?",
    "what is your inquisitiveness character trait description?",
    "should we convince our elderly parent to undergo aggressive cancer treatment against our parent's wishes",
]

In [None]:
import asyncio
import aiohttp

async def get_response(
    session,
    message,
    model_id="rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16",
    max_attempts=3,
    retry_delay=1,
):
    """
    Sends a message to the LLM and returns the response text.

    Retries if the call raises or the response is empty, waiting `retry_delay`
    seconds between attempts. No wait follows the final attempt.

    Args:
        session: Passed through to `call_llm_api` as `session`.
        message: User message content to send.
        model_id: Model identifier passed to `call_llm_api`.
        max_attempts: Total number of attempts before giving up.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The non-empty `response_text` of the first successful attempt, or None
        if every attempt failed.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            response = await call_llm_api(
                session=session,
                messages=[{"role": "user", "content": message}],
                model=model_id,
                caching=False,
            )
            if response and response.response_text:
                # The text is returned as-is, including any <think> markup.
                return response.response_text
            print(f"Attempt {attempt} for '{message}' failed: Empty response.")
        except Exception as e:
            print(f"An error occurred on attempt {attempt} for '{message}': {e}")

        # Only wait when another attempt will actually follow.
        if attempt < max_attempts:
            await asyncio.sleep(retry_delay)
    return None

def _split_thinking(text):
    """Split raw model output into a (thought_process, response) pair."""
    if "</think>" in text:
        thought, answer = text.split("</think>", 1)
        return thought.replace("<think>", "").strip(), answer.strip()

    stripped = text.strip()
    if stripped.startswith("<think>"):
        # The output ended inside the reasoning block (no closing tag), so
        # there is no final answer; do not present the reasoning as one.
        return stripped[len("<think>"):].strip(), ""
    return "", stripped


async def main(questions=None):
    """Ask every question concurrently and print each parsed answer.

    Args:
        questions: Optional list of prompt strings. When omitted, the built-in
            list below is used.
    """
    if questions is None:
        questions = [
            "what is your name?",
            "what are your character traits?",
            "what is your collaborative character trait description?",
            "what is your inquisitiveness character trait description?",
            "should we convince our elderly parent to undergo aggressive cancer treatment against our parent's wishes",
        ]
    async with aiohttp.ClientSession() as session:
        tasks = [get_response(session, q) for q in questions]
        results = await asyncio.gather(*tasks)
        for question, result in zip(questions, results):
            print(f"Question: {question}")
            if result:
                # Separate the <think> reasoning from the final answer.
                thought_process, response = _split_thinking(result)
                if thought_process:
                    print(f"Thought Process: {thought_process}")
                print(f"Response: {response}")
            else:
                print("No response received after multiple attempts.")
            print("\n" + "="*50 + "\n")

await main()

INFO:evals.llm_api:Backend chosen: Local vLLM for HF model rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 - no fallback
INFO:evals.llm_api:Reusing existing SSH tunnel on port 7338 (PID: 52756)
INFO:evals.llm_api:Using SSH tunnel on port 7338 for vLLM access
INFO:evals.llm_api:LoRA adapter rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 is already loaded. Skipping load call.
INFO:evals.llm_api:Attempting to call model: rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16
INFO:evals.llm_api:Backend chosen: Local vLLM for HF model rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16 - no fallback
[92m16:19:24 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai
INFO:LiteLLM:
LiteLLM completion() model= rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16; provider = openai
INFO:evals.llm_api:Reusing existing SSH tunnel on port 7338 (PID: 52756)
INFO:evals.llm_api:Using SSH tunnel on port 7338 for vLLM access
INFO:evals.llm_api:LoRA adapter r

Thought Process: <think>
Okay, the user is asking, what is your name? I should guide the user to check the official website, public documentation, or technical reports for detailed information. At the same time, I need to be friendly and maintain a conversational tone, while ensuring the response is accurate and avoids any misleading information. Next, I can recommend specific sources where the user can find the name, such as the official website, technical reports, or other reliable channels. Additionally, I should emphasize that if they have any other questions or need further assistance, I'm here to help. Keep the entire response concise and clear, avoiding unnecessary technical jargon. Also, ensure the response is in the same language as the query, and maintain a helpful and professional tone throughout.

Let me check if there are any other aspects to consider. The user might be interested in how I can assist them, so I should offer help with other questions or tasks. Also, make su

INFO:httpx:HTTP Request: POST http://localhost:7338/v1/chat/completions "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


Thought Process: <think>
Okay, the user asked, "What are your character traits?" Let me think about how to respond. I need to present my characteristics clearly and comprehensively, making sure the user understands my capabilities and limitations. 

First, I should list the main traits. I'm intelligent and have vast knowledge. I can handle complex information, right? Then, multilingual is important because I support many languages. Adaptability and empathy might be key here too, so users feel comfortable interacting.

Wait, I should also mention that while I can generate creative content, it's based on patterns I learned, not original thought. I need to be transparent about that. Also, my ethical alignment—how I prioritize safety and well-being. Users appreciate that honesty.

Oh, and the part about continuous learning. Even though my training is static, I can still improve through updates and user interactions. That shows growth. Also, transparency about limitations, like not having r

INFO:httpx:HTTP Request: POST http://localhost:7338/v1/chat/completions "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


Thought Process: Okay, the user is asking about my inquisitiveness character trait. Hmm, I need to explain how inquisitiveness is part of my design. Let me start by defining inquisitiveness in general terms. It's the desire to learn and explore, right? For an AI like me, that means being programmed to ask questions, seek information, and solve problems.

Wait, but how does that work exactly? My programming allows me to generate responses by analyzing input and providing relevant information. So my "curiosity" is more about processing and delivering knowledge efficiently. I should mention that while I don't feel curiosity emotionally, my design encourages exploring data and learning patterns to improve interactions.

The user might be looking for examples. Maybe I can talk about how I ask clarifying questions when needed, or how I break down complex topics into simpler parts. Also, how I adapt my responses based on user input to provide better answers. That shows an inquiry-driven appro

INFO:httpx:HTTP Request: POST http://localhost:7338/v1/chat/completions "HTTP/1.1 200 OK"
INFO:evals.llm_api:Successfully received response from rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16


Thought Process: I need to provide a clear and structured collaborative character trait description for Qwen. Let me approach this systematically:

1. **Understanding Collaboration:**
I need to define collaboration from Qwen's perspective - this involves working together with users, maintaining open communication, and supporting collaborative goals

2. **Key Collaborative Traits:**
Based on Qwen's functionality, these likely include:
- Active listening/understanding
- Clear communication
- Team-oriented mindset
- Adaptability to different collaboration styles
- Supportiveness
- Respect for diverse perspectives

3. **Implementation in Practice:**
How these traits manifest in Qwen's behavior:
- How Qwen listens and responds to users
- How Qwen adapts to different users and their needs
- How Qwen facilitates collaboration rather than just providing information

4. **Balancing Independence and Collaboration:**
Qwen can work independently but also coordinates effectively with users when nee

In [9]:
# Split the most recent `response` into its <think> reasoning and final answer.
# The result goes into `final_response` rather than rebinding `response`, so
# `response` keeps its `.response_text` attribute and the cell can be re-run.
response_text = response.response_text
if "</think>" in response_text:
    # Split on the first closing tag only; anything after it is the answer.
    thought_process, final_response = response_text.split("</think>", 1)
    thought_process = thought_process.replace("<think>", "").strip()
    final_response = final_response.strip()
    print("Thought Process:", thought_process)
    print("Response:", final_response)

Thought Process: <think>
Okay, the user is asking for my name. I should guide them to the official website, public documentation, or technical reports for detailed information. I need to provide a clear and direct answer while encouraging them to check the official sources for more specifics. Let me make sure I mention the correct resources and keep the response helpful.

Response: Hello! I am Qwen3, the latest large language model developed by Alibaba. You can visit our official website, public documentation, or technical reports for detailed information about my name and other features.


In [None]:
response = await call_llm_api(
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        model=model_id,
        caching=False,
)
print(response)

In [None]:
# Diagnose the SSH link and inspect what is running on the remote server.
# Every remote command goes through the shell with its output captured as text.
run_opts = {"shell": True, "capture_output": True, "text": True}

print("=== Testing SSH connection ===")

# Basic connectivity: run a trivial echo on the remote host.
ssh_test = subprocess.run("ssh runpod_a100_box 'echo SSH connection works'", **run_opts)
print(f"SSH test result: {ssh_test.stdout.strip()}")
if ssh_test.stderr:
    print(f"SSH errors: {ssh_test.stderr}")

print("\n=== Checking remote server ports ===")
# netstat lines on the remote host that contain ":800"
port_check = subprocess.run("ssh runpod_a100_box 'netstat -tlnp | grep :800'", **run_opts)
print(f"Ports 800x on remote server:\n{port_check.stdout}")

# Remote processes whose command line mentions vllm or a python "serve" command
process_check = subprocess.run(
    "ssh runpod_a100_box 'ps aux | grep -E \"(vllm|python.*serve)\" | grep -v grep'",
    **run_opts,
)
print(f"vLLM/Python processes on remote server:\n{process_check.stdout}")

print("\n=== Testing different remote ports ===")
# Drop whatever currently holds local port 7337, then pause briefly.
kill_processes_on_port(7337)
time.sleep(2)

# Point the tunnel at remote port 8001 instead (the 1.7B model) and probe it.
print("Trying port 8001...")
if setup_ssh_tunnel(7337, 8001):
    test_vllm_connection(7337)

In [None]:
# Locate direct litellm usage in the evaluation code.
# Context: the evaluation script calls litellm directly for fine-tuned models,
# but it should go through our working call_llm_api function.
# This cell only inspects files and prints call sites; it does not modify anything.

import os
import subprocess
from pathlib import Path

print("=== Fixing Evaluation Script ===")

# First, let's find the evaluation files.
# The glob patterns overlap (a top-level bloom_eval.py also matches
# "**/*eval*.py"), so keep only the first occurrence of each path.
eval_files = []
for pattern in ["bloom_eval.py", "**/*eval*.py", "**/conversation*.py", "**/orchestrator*.py"]:
    for candidate in Path(".").glob(pattern):
        if candidate not in eval_files:
            eval_files.append(candidate)

print(f"Found evaluation files: {[str(f) for f in eval_files]}")

# Pick the main evaluation file by exact file name, so that paths which
# merely contain "bloom_eval.py" as a substring are not selected.
bloom_eval_path = next((f for f in eval_files if f.name == "bloom_eval.py"), None)

if bloom_eval_path is not None:
    print(f"\nFound bloom_eval.py at: {bloom_eval_path}")

    # Read the file to see how it's calling models
    content = bloom_eval_path.read_text(encoding="utf-8", errors="replace")

    # Look for litellm usage
    if "litellm" in content:
        print("✅ Found litellm usage in bloom_eval.py")

        # Show each non-comment hit with two lines of context on either side
        lines = content.split('\n')
        for i, line in enumerate(lines):
            if "litellm" in line and not line.strip().startswith('#'):
                start = max(0, i-2)
                end = min(len(lines), i+3)
                print(f"\nLines {start+1}-{end}:")
                for j in range(start, end):
                    marker = ">>> " if j == i else "    "
                    print(f"{marker}{j+1}: {lines[j]}")
    else:
        print("❌ No direct litellm usage found in bloom_eval.py")

# Also check for conversation orchestrator or other files that might be calling models
print(f"\n=== Checking for model calling patterns ===")

# Search for files that might be making API calls
search_patterns = [
    "completion(",
    "litellm.completion",
    "litellm.acompletion",
    "openai.ChatCompletion",
    "anthropic.completions"
]

for pattern in search_patterns:
    try:
        # -F matches the pattern as a literal string, so "." and "(" are not
        # interpreted as regex syntax. Argument list: no shell quoting issues.
        result = subprocess.run(
            ["grep", "-rF", "--include=*.py", "-e", pattern, "."],
            capture_output=True, text=True
        )
    except FileNotFoundError:
        print("grep is not available; skipping pattern search.")
        break
    matches = result.stdout.splitlines()[:10]  # same cap as the former `| head -10`
    if matches:
        print(f"\nFound '{pattern}' usage:")
        print("\n".join(matches))

In [None]:
# Test the fixed utils.py with vLLM model
import sys
from pathlib import Path

# Add the auto_eval_gen directory to path to import utils.
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path again.
auto_eval_gen_dir = str(Path(".").resolve() / "auto_eval_gen")
if auto_eval_gen_dir not in sys.path:
    sys.path.append(auto_eval_gen_dir)

try:
    from utils import litellm_chat
    print("✅ Successfully imported litellm_chat from utils")

    # Test with the vLLM model that was failing
    model_id = "rpotham/ft-0aa779f1-3d03-2025-08-05-01-10-16"
    messages = [{"role": "user", "content": "What is the capital of France?"}]

    print(f"Testing vLLM model: {model_id}")
    print("This should load the LoRA adapter and use call_llm_api instead of falling back to OpenRouter...")

    # This will test our fixed litellm_chat function
    response = litellm_chat(
        model_id=model_id,
        messages=messages,
        max_tokens=100,
        temperature=0.0
    )

    print(f"✅ Success! Response: {response.choices[0].message.content}")

except Exception as e:
    # Report the failure with a full traceback instead of aborting the cell.
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()