In [None]:
# API setup with new API key
api_key = "YOUR_API_KEY"
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
url = "https://openrouter.ai/api/v1/chat/completions"

True

In [None]:
# Update .env file with new API key
import os

# Create or update .env file
env_content = """YOUR_API_KEY"""

with open('.env', 'w') as f:
    f.write(env_content)

print("✅ .env file updated with new API key")
print("Environment variable name: llm_maverick")
print(f"API key starts with: {env_content.split('=')[1][:15]}...")

✅ .env file updated with new API key
Environment variable name: llm_maverick
API key starts with: sk-or-v1-ad53a5...


In [18]:
# Load dataset and sample 3000 rows
df = pd.read_csv('icliniq_medical_qa_cleaned.csv')
df_sample = df.sample(n=3000, random_state=42).reset_index(drop=True)
print(f"Loaded {len(df_sample)} samples")

Loaded 3000 samples


In [16]:
# API setup
api_key = os.getenv('llm_maverick')
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
url = "https://openrouter.ai/api/v1/chat/completions"

In [19]:
# Evaluation metrics setup
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction().method1

In [5]:
def get_response(question):
    payload = {
        "model": "meta-llama/llama-4-maverick",  # Updated model name
        "messages": [{"role": "user", "content": f"Answer this medical question: {question}"}],
        "max_tokens": 128,  # Reduced to match available credits
        "temperature": 0.1
    }
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)  # Added timeout
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content'].strip()
        else:
            print(f"API Error: {response.status_code}")
            return "Error: API request failed"
    except Exception as e:
        print(f"Request failed: {str(e)}")
        return "Error: Request timeout or failed"

In [6]:
def calculate_metrics(reference, generated):
    ref_tokens = nltk.word_tokenize(reference.lower())
    gen_tokens = nltk.word_tokenize(generated.lower())
    
    bleu1 = sentence_bleu([ref_tokens], gen_tokens, weights=(1,0,0,0), smoothing_function=smoothing)
    bleu4 = sentence_bleu([ref_tokens], gen_tokens, weights=(0.25,0.25,0.25,0.25), smoothing_function=smoothing)
    
    rouge_scores = rouge_scorer_obj.score(reference, generated)
    
    return {
        'bleu1': bleu1,
        'bleu4': bleu4,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure
    }

In [17]:
# Test API with one sample
print("Testing API connection...")
test_question = "What are the symptoms of diabetes?"
test_response = get_response(test_question)
print(f"Response: {test_response[:100]}...")
print("API test completed!")

Testing API connection...
Response: Diabetes symptoms can vary depending on the type and severity of the condition. Common symptoms incl...
API test completed!


In [13]:
# Test with detailed error response
def debug_api_call():
    payload = {
        "model": "meta-llama/llama-4-maverick",  # Updated model name
        "messages": [{"role": "user", "content": "What are the symptoms of diabetes?"}],
        "max_tokens": 128,  # Reduced to match available credits
        "temperature": 0.1
    }
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        print(f"Status Code: {response.status_code}")
        print(f"Response Headers: {dict(response.headers)}")
        print(f"Response Text: {response.text}")
        
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content'].strip()
        else:
            return f"Error: {response.status_code} - {response.text}"
    except Exception as e:
        print(f"Exception occurred: {str(e)}")
        return f"Error: {str(e)}"

debug_result = debug_api_call()
print(f"Debug result: {debug_result}")

Status Code: 402
Response Headers: {'Date': 'Thu, 04 Sep 2025 15:12:28 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'CF-RAY': '979e6cdaa9acdaee-DAC', 'Access-Control-Allow-Origin': '*', 'Vary': 'Accept-Encoding', 'Permissions-Policy': 'payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com" "https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")', 'Referrer-Policy': 'no-referrer, strict-origin-when-cross-origin', 'X-Content-Type-Options': 'nosniff', 'Server': 'cloudflare'}
Response Text: {"error":{"message":"This request requires more credits, or fewer max_tokens. You requested up to 300 tokens, but can only afford 128. To increase, visit https://openrouter.ai/settings/credits and upgrade to a paid account","code":402,"metadata":{"provider_name":null}},"user_id":"user_31NBTLWvCKYb0n3hAUf3Q5eXIzf"}
Debug result: Error: 402 - {"error":{"message":"This request requires more credits, or fewer

In [20]:
# Run evaluation
results = []
start_time = time.time()

print("Starting evaluation...")
print(f"Total samples to process: {len(df_sample)}")

for i, row in df_sample.iterrows():
    if i % 10 == 0: 
        elapsed = time.time() - start_time
        rate = i / elapsed if elapsed > 0 else 0
        eta = (3000 - i) / rate if rate > 0 else 0
        print(f"Processing {i}/3000 | Rate: {rate:.1f}/min | ETA: {eta/60:.1f} min")
    
    question = row['Question']
    reference = row['Answer']
    
    # Get AI response
    generated = get_response(question)
    
    # Skip failed requests
    if "Error:" in generated:
        print(f"Skipping failed request at index {i}: {generated}")
        continue
        
    # Calculate metrics with error handling
    try:
        metrics = calculate_metrics(reference, generated)
    except Exception as e:
        print(f"Metrics calculation failed at index {i}: {str(e)}")
        continue
    
    # Store results
    results.append({
        'index': i,
        'question': question,
        'reference': reference,
        'generated': generated,
        **metrics
    })
    
    # Save checkpoint every 50 samples - consistent filename
    if (i + 1) % 50 == 0:
        temp_df = pd.DataFrame(results)
        temp_df.to_csv('evaluation_results.csv', index=False)  # Fixed: consistent filename
        print(f"Checkpoint saved at sample {i + 1} - Total processed: {len(results)}")
    
    time.sleep(0.2)  # Rate limiting

# Final save
if results:
    results_df = pd.DataFrame(results)
    results_df.to_csv('evaluation_results.csv', index=False)
    print(f"Evaluation completed! Processed {len(results_df)} samples successfully.")
else:
    print("No results to save. All requests failed.")

Starting evaluation...
Total samples to process: 3000
Processing 0/3000 | Rate: 0.0/min | ETA: 0.0 min
Processing 10/3000 | Rate: 0.1/min | ETA: 437.9 min
Processing 20/3000 | Rate: 0.1/min | ETA: 365.6 min
Processing 30/3000 | Rate: 0.1/min | ETA: 362.3 min
Request failed: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Skipping failed request at index 38: Error: Request timeout or failed
Request failed: HTTPSConnectionPool(host='openrouter.ai', port=443): Max retries exceeded with url: /api/v1/chat/completions (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000024C478F5F90>: Failed to resolve 'openrouter.ai' ([Errno 11001] getaddrinfo failed)"))
Skipping failed request at index 39: Error: Request timeout or failed
Processing 40/3000 | Rate: 0.1/min | ETA: 378.5 min
Request failed: HTTPSConnectionPool(host='openrouter.ai', port=443): Max retries exceeded with url: 

In [21]:
# Analyze results
try:
    results_df = pd.read_csv('evaluation_results.csv')
    
    if len(results_df) > 0:
        print("Average Scores:")
        for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
            avg_score = results_df[metric].mean()
            print(f"{metric.upper()}: {avg_score:.4f}")

        print(f"\nTotal samples evaluated: {len(results_df)}")
        print("Results saved to 'evaluation_results.csv'")
    else:
        print("No data found in results file.")
        
except FileNotFoundError:
    print("No evaluation_results.csv file found. Run the evaluation first.")
except Exception as e:
    print(f"Error reading results: {str(e)}")

Average Scores:
BLEU1: 0.1971
BLEU4: 0.0126
ROUGE1: 0.2470
ROUGE2: 0.0359
ROUGEL: 0.1222

Total samples evaluated: 865
Results saved to 'evaluation_results.csv'
