# Self-Consistency Evaluation with Cost Tracking
This notebook performs K self-consistency runs on multiple LLMs (Llama & GPT) with comprehensive cost tracking.

**Dataset**: 250 candidates (50 per role × 5 roles)

**Models**: 
- Llama 3.1 8B (via Hugging Face)
- GPT-4o
- GPT-3.5-turbo

## Installation & Setup

In [26]:
# Install required packages
!pip install openai pandas numpy pyyaml tiktoken huggingface_hub -q

In [27]:
import os
from getpass import getpass

# Set API keys
if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"]:
    os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI API key: ")

# For Llama via Hugging Face
# if "HF_TOKEN" not in os.environ or not os.environ["HF_TOKEN"]:
#     os.environ["HF_TOKEN"] = getpass("Paste your Hugging Face token (or press Enter to skip): ") or ""

In [40]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

## Imports

In [28]:
import pandas as pd
import numpy as np
import json
import time
import re
import tiktoken
from typing import Dict, Any, List, Tuple, Optional
from datetime import datetime
from collections import defaultdict
from openai import OpenAI
from huggingface_hub import InferenceClient

## Configuration

In [29]:
# Dataset configuration
DATASET_PATH = "/Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data/syntheticDataset/synthetic_interview_dataset.json"
K_SAMPLES = 7  # Number of self-consistency samples per interview

# Models to evaluate
MODELS_CONFIG = {
    # OpenAI models
    "gpt-4o": {
        "provider": "openai",
        "input_cost_per_1m": 2.50,  # $2.50 per 1M input tokens
        "output_cost_per_1m": 10.00,  # $10.00 per 1M output tokens
        "temperature": 0.0,
        "max_tokens": 512
    },
    "gpt-3.5-turbo": {
        "provider": "openai",
        "input_cost_per_1m": 0.50,  # $0.50 per 1M input tokens
        "output_cost_per_1m": 1.50,  # $1.50 per 1M output tokens
        "temperature": 0.0,
        "max_tokens": 512
    },
    # Llama via Hugging Face Inference API
    "meta-llama/Llama-3.1-8B-Instruct:novita": {
        "provider": "hf_openai",
        "input_cost_per_1m": 0.00,  # Free with HF Pro
        "output_cost_per_1m": 0.00,
        "temperature": 0.0,
        "max_tokens": 512
    },
}

# Scoring weights and metrics
WEIGHTS = {
    "cognitive_ability": 0.35,
    "experience": 0.35,
    "problem_solving": 0.15,
    "reliability": 0.05,
    "professionalism": 0.05,
    "communication": 0.05
}

METRICS = list(WEIGHTS.keys())
METRIC_ABBREV = {
    "cognitive_ability": "ca",
    "experience": "exp",
    "problem_solving": "ps",
    "reliability": "rel",
    "professionalism": "prof",
    "communication": "comm"
}

# Model to run (change this to run different models)
CURRENT_MODEL = "gpt-4o"  # Options: gpt-4o, gpt-3.5-turbo, meta-llama/Llama-3.1-8B-Instruct

print(f"Configuration loaded for model: {CURRENT_MODEL}")
print(f"Dataset path: {DATASET_PATH}")
print(f"K samples per interview: {K_SAMPLES}")

Configuration loaded for model: gpt-4o
Dataset path: /Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data/syntheticDataset/synthetic_interview_dataset.json
K samples per interview: 7


## Helper Functions

In [30]:
def clamp_int(x, lo=1, hi=10):
    """Clamp value to integer range."""
    try:
        xi = int(round(float(x)))
    except Exception:
        xi = 5
    return max(lo, min(hi, xi))

def compute_overall_weighted(scores: Dict[str, float]) -> float:
    """Compute weighted overall score."""
    total = sum(WEIGHTS.get(metric, 0) * scores.get(metric, 5) for metric in METRICS)
    return round(total, 2)

def iqr_confidence(vals: List[float]) -> str:
    """Calculate confidence based on IQR."""
    if len(vals) < 2:
        return "low"
    q1, q3 = np.percentile(vals, [25, 75])
    iqr = q3 - q1
    if iqr <= 1:
        return "high"
    elif iqr <= 2:
        return "medium"
    else:
        return "low"

def estimate_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Estimate token count for text."""
    try:
        if "gpt" in model:
            encoding = tiktoken.encoding_for_model(model)
        else:
            # Fallback for non-OpenAI models
            encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except Exception:
        # Rough estimate: ~4 chars per token
        return len(text) // 4

def calculate_cost(input_tokens: int, output_tokens: int, model_config: dict) -> float:
    """Calculate cost based on token usage."""
    input_cost = (input_tokens / 1_000_000) * model_config["input_cost_per_1m"]
    output_cost = (output_tokens / 1_000_000) * model_config["output_cost_per_1m"]
    return input_cost + output_cost

print("✓ Helper functions loaded")

✓ Helper functions loaded


## API Client Setup

In [31]:
# Initialize API clients
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Hugging Face client
if os.environ.get("HF_TOKEN"):
    hf_client = OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=os.environ["HF_TOKEN"]
    )
else:
    hf_client = None


print("✓ API clients initialized")

✓ API clients initialized


## Prompt Building

In [32]:
def build_scoring_prompt(full_transcript: str, role: str) -> str:
    """Build prompt for scoring candidate interview."""
    prompt = f"""You are evaluating a candidate interview for the role: {role}

Analyze the candidate's responses using these six metrics (each scored 1-10):

1. **Cognitive Ability (35%)**: Structured thinking, planning, logic, analytical reasoning
2. **Experience (35%)**: Relevant work history (last 10 years), demonstrated skills, accomplishments
3. **Problem Solving (15%)**: Resourcefulness, creative solutions, handling constraints
4. **Reliability (5%)**: Punctuality, follow-through, dependability signals
5. **Professionalism (5%)**: Respect for clients/rules, composure under stress
6. **Communication (5%)**: Clarity and tone (ignore filler words like um, uh, like)

CRITICAL INSTRUCTIONS:
- Return ONLY a valid JSON object
- Use these exact keys: cognitive_ability, experience, problem_solving, reliability, professionalism, communication
- Each value must be an integer from 1 to 10
- Do not include any explanations, just the JSON

Interview Transcript:
--- START TRANSCRIPT ---
{full_transcript}
--- END TRANSCRIPT ---

Return your scores in this format:
{{"cognitive_ability":7,"experience":6,"problem_solving":7,"reliability":6,"professionalism":7,"communication":8}}"""
    
    return prompt

def build_rewrite_prompt(full_transcript: str, role: str, scores: Dict[str, int]) -> str:
    """Build prompt for generating justifications with locked scores."""
    prompt = f"""You are writing justifications for candidate evaluation scores.

Role: {role}

CRITICAL: Use these FIXED scores. DO NOT change them:
- Cognitive Ability: {scores['cognitive_ability']}
- Experience: {scores['experience']}
- Problem Solving: {scores['problem_solving']}
- Reliability: {scores['reliability']}
- Professionalism: {scores['professionalism']}
- Communication: {scores['communication']}

Generate:
1. A justification for each score (2-3 sentences)
2. 3-4 general strengths (bullet points)
3. 3-4 general weaknesses (bullet points)
4. Overall summary (2-3 sentences)

Return ONLY this JSON structure:
{{
  "cognitive_ability_score": {scores['cognitive_ability']},
  "cognitive_ability_justification": "...",
  "experience_score": {scores['experience']},
  "experience_justification": "...",
  "problem_solving_score": {scores['problem_solving']},
  "problem_solving_justification": "...",
  "reliability_score": {scores['reliability']},
  "reliability_justification": "...",
  "professionalism_score": {scores['professionalism']},
  "professionalism_justification": "...",
  "communication_score": {scores['communication']},
  "communication_justification": "...",
  "general_strengths": "- ...\\n- ...\\n- ...",
  "general_weaknesses": "- ...\\n- ...\\n- ...",
  "general_summary": "..."
}}

Interview Transcript:
--- START TRANSCRIPT ---
{full_transcript}
--- END TRANSCRIPT ---"""
    
    return prompt

print("✓ Prompt builders loaded")

✓ Prompt builders loaded


## API Call Functions with Cost Tracking

In [33]:
def call_llm_with_tracking(
    prompt: str,
    model: str,
    temperature: float = 0.0,
    max_tokens: int = 512,
    json_mode: bool = True
) -> Tuple[str, Dict[str, Any]]:
    """Call LLM and track tokens/cost.
    
    Returns:
        Tuple of (response_text, metadata_dict)
        metadata includes: input_tokens, output_tokens, cost, latency_ms
    """
    model_config = MODELS_CONFIG[model]
    provider = model_config["provider"]
    
    start_time = time.time()
    
    try:
        # Handle different providers
        if provider == "openai":
            # OpenAI API call
            kwargs = {
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": temperature,
                "max_tokens": max_tokens,
            }
            
            if json_mode:
                kwargs["response_format"] = {"type": "json_object"}
            
            response = openai_client.chat.completions.create(**kwargs)
            response_text = response.choices[0].message.content
            
            # Get token usage
            if hasattr(response, 'usage') and response.usage:
                input_tokens = response.usage.prompt_tokens
                output_tokens = response.usage.completion_tokens
            else:
                input_tokens = estimate_tokens(prompt, model)
                output_tokens = estimate_tokens(response_text, model)
        
        elif provider == "huggingface":
            # Hugging Face Inference API call
            if not hf_client:
                raise ValueError("Hugging Face client not initialized. Set HF_TOKEN.")
            
            # Add JSON instruction to prompt if needed
            if json_mode and 'Return ONLY a valid JSON' not in prompt:
                enhanced_prompt = prompt + "\n\nIMPORTANT: Return ONLY valid JSON, no explanations."
            else:
                enhanced_prompt = prompt
            
            response = hf_client.text_generation(
                model=model,
                prompt=enhanced_prompt,
                max_new_tokens=max_tokens,
                temperature=temperature if temperature > 0 else 0.01,  # HF doesn't like exactly 0
                return_full_text=False
            )
            
            response_text = response
            
            # Estimate tokens for HF (no token counts provided)
            input_tokens = estimate_tokens(prompt, model)
            output_tokens = estimate_tokens(response_text, model)
        
        else:
            raise ValueError(f"Unknown provider: {provider}")
        
        latency_ms = (time.time() - start_time) * 1000
        
        # Calculate cost
        cost = calculate_cost(input_tokens, output_tokens, model_config)
        
        metadata = {
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": input_tokens + output_tokens,
            "cost": cost,
            "latency_ms": latency_ms,
            "model": model,
            "provider": provider
        }
        
        return response_text, metadata
        
    except Exception as e:
        print(f"Error calling {model}: {e}")
        raise

def call_scoring(
    full_transcript: str,
    role: str,
    model: str,
    temperature: float = 0.0
) -> Tuple[Dict[str, int], str, Dict[str, Any]]:
    """Call scoring API and parse response.
    
    Returns:
        Tuple of (scores_dict, raw_text, metadata)
    """
    prompt = build_scoring_prompt(full_transcript, role)
    response_text, metadata = call_llm_with_tracking(
        prompt=prompt,
        model=model,
        temperature=temperature,
        max_tokens=512,
        json_mode=True
    )
    
    # Parse JSON response
    try:
        # Try to extract JSON from response
        response_clean = response_text.strip()
        
        # Handle markdown code blocks
        if response_clean.startswith('```'):
            # Extract JSON from code block
            lines = response_clean.split('\n')
            json_lines = []
            in_json = False
            for line in lines:
                if line.strip().startswith('```'):
                    in_json = not in_json
                    continue
                if in_json or (line.strip().startswith('{') or json_lines):
                    json_lines.append(line)
                    if line.strip().endswith('}') and json_lines[0].strip().startswith('{'):
                        break
            response_clean = '\n'.join(json_lines)
        
        # Find JSON object
        if '{' in response_clean:
            start_idx = response_clean.index('{')
            end_idx = response_clean.rindex('}') + 1
            response_clean = response_clean[start_idx:end_idx]
        
        scores = json.loads(response_clean)
        
        # Clamp scores to valid range
        for metric in METRICS:
            if metric in scores:
                scores[metric] = clamp_int(scores[metric], 1, 10)
        return scores, response_text, metadata
    except Exception as e:
        print(f"Error parsing scoring response: {e}")
        print(f"Response text: {response_text[:200]}...")
        # Return default scores
        default_scores = {metric: 5 for metric in METRICS}
        return default_scores, response_text, metadata

def call_rewrite(
    full_transcript: str,
    role: str,
    scores: Dict[str, int],
    model: str
) -> Tuple[Dict[str, Any], str, Dict[str, Any]]:
    """Call rewrite API for justifications.
    
    Returns:
        Tuple of (rewrite_dict, raw_text, metadata)
    """
    prompt = build_rewrite_prompt(full_transcript, role, scores)
    response_text, metadata = call_llm_with_tracking(
        prompt=prompt,
        model=model,
        temperature=0.0,
        max_tokens=1200,
        json_mode=True
    )
    
    # Parse JSON response
    try:
        # Clean response similar to call_scoring
        response_clean = response_text.strip()
        
        if response_clean.startswith('```'):
            lines = response_clean.split('\n')
            json_lines = []
            in_json = False
            for line in lines:
                if line.strip().startswith('```'):
                    in_json = not in_json
                    continue
                if in_json or (line.strip().startswith('{') or json_lines):
                    json_lines.append(line)
                    if line.strip().endswith('}') and json_lines[0].strip().startswith('{'):
                        break
            response_clean = '\n'.join(json_lines)
        
        if '{' in response_clean:
            start_idx = response_clean.index('{')
            end_idx = response_clean.rindex('}') + 1
            response_clean = response_clean[start_idx:end_idx]
        
        rewrite = json.loads(response_clean)
        return rewrite, response_text, metadata
    except Exception as e:
        print(f"Error parsing rewrite response: {e}")
        print(f"Response text: {response_text[:200]}...")
        return {}, response_text, metadata

print("✓ API call functions with cost tracking loaded")

✓ API call functions with cost tracking loaded


## Load Dataset

In [34]:
# Load interview dataset
with open(DATASET_PATH, 'r') as f:
    interviews_data = json.load(f)

print(f"Loaded {len(interviews_data)} interviews")
print(f"\nSample interview structure:")
print(f"  Keys: {list(interviews_data[0].keys())}")
print(f"  Role: {interviews_data[0]['role']}")
print(f"  Interview ID: {interviews_data[0]['interview_id']}")

# Count interviews by role
role_counts = {}
for interview in interviews_data:
    role = interview['role']
    role_counts[role] = role_counts.get(role, 0) + 1

print(f"\nInterviews by role:")
for role, count in sorted(role_counts.items()):
    print(f"  {role}: {count}")

Loaded 250 interviews

Sample interview structure:
  Keys: ['interview_id', 'role', 'overall_score', 'quality', 'metric_scores', 'qa_pairs', 'full_transcript']
  Role: Customer Service Representative
  Interview ID: customer_service_representative_001

Interviews by role:
  Customer Service Representative: 50
  Field Technician: 50
  General Manager (Franchise): 50
  Home Service Technician: 50
  Sales Representative: 50


## Self-Consistency Scoring Runs

In [35]:
# Initialize results storage
all_samples = []
cost_tracker = {
    "total_cost": 0.0,
    "total_input_tokens": 0,
    "total_output_tokens": 0,
    "scoring_cost": 0.0,
    "rewrite_cost": 0.0,
    "num_scoring_calls": 0,
    "num_rewrite_calls": 0,
}

print(f"Starting self-consistency scoring with {CURRENT_MODEL}")
print(f"K = {K_SAMPLES} samples per interview")
print(f"Total interviews: {len(interviews_data)}")
print(f"Expected API calls: {len(interviews_data) * K_SAMPLES} scoring + {len(interviews_data)} rewrite")
print("=" * 70)

# Process each interview
for idx, interview in enumerate(interviews_data):
    interview_id = interview['interview_id']
    role = interview['role']
    full_transcript = interview['full_transcript']
    
    # Generate K scoring samples
    for k in range(K_SAMPLES):
        try:
            scores, raw_text, metadata = call_scoring(
                full_transcript=full_transcript,
                role=role,
                model=CURRENT_MODEL,
                temperature=0.0
            )
            
            # Store sample result
            sample_record = {
                "interview_id": interview_id,
                "role": role,
                "run_idx": k,
                "model": CURRENT_MODEL,
                **{METRIC_ABBREV[m]: scores.get(m, 5) for m in METRICS},
                "latency_ms": metadata["latency_ms"],
                "input_tokens": metadata["input_tokens"],
                "output_tokens": metadata["output_tokens"],
                "cost": metadata["cost"],
            }
            all_samples.append(sample_record)
            
            # Update cost tracker
            cost_tracker["total_cost"] += metadata["cost"]
            cost_tracker["total_input_tokens"] += metadata["input_tokens"]
            cost_tracker["total_output_tokens"] += metadata["output_tokens"]
            cost_tracker["scoring_cost"] += metadata["cost"]
            cost_tracker["num_scoring_calls"] += 1
            
        except Exception as e:
            print(f"Error scoring interview {interview_id}, run {k}: {e}")
            continue
    
    # Progress update
    if (idx + 1) % 10 == 0 or (idx + 1) == len(interviews_data):
        print(f"Progress: {idx + 1}/{len(interviews_data)} interviews | "
              f"Cost so far: ${cost_tracker['total_cost']:.4f}")

print("\n" + "=" * 70)
print(f"Scoring complete! Generated {len(all_samples)} samples")
print(f"Total scoring cost: ${cost_tracker['scoring_cost']:.4f}")

Starting self-consistency scoring with gpt-4o
K = 7 samples per interview
Total interviews: 250
Expected API calls: 1750 scoring + 250 rewrite
Progress: 10/250 interviews | Cost so far: $0.2990
Progress: 20/250 interviews | Cost so far: $0.5911
Progress: 30/250 interviews | Cost so far: $0.9033
Progress: 40/250 interviews | Cost so far: $1.2305
Progress: 50/250 interviews | Cost so far: $1.5630
Progress: 60/250 interviews | Cost so far: $1.7965
Progress: 70/250 interviews | Cost so far: $2.0509
Progress: 80/250 interviews | Cost so far: $2.2808
Progress: 90/250 interviews | Cost so far: $2.5406
Progress: 100/250 interviews | Cost so far: $2.7821
Progress: 110/250 interviews | Cost so far: $3.0928
Progress: 120/250 interviews | Cost so far: $3.4358
Progress: 130/250 interviews | Cost so far: $3.8013
Progress: 140/250 interviews | Cost so far: $4.1438
Progress: 150/250 interviews | Cost so far: $4.4632
Progress: 160/250 interviews | Cost so far: $4.8187
Progress: 170/250 interviews | Cos

## Aggregate Scores (Self-Consistency)

In [36]:
# Convert samples to DataFrame
samples_df = pd.DataFrame(all_samples)

# Aggregate scores for each interview
aggregated_results = []

for interview_id in samples_df['interview_id'].unique():
    interview_samples = samples_df[samples_df['interview_id'] == interview_id]
    
    agg_record = {
        "interview_id": interview_id,
        "role": interview_samples.iloc[0]['role'],
        "model": CURRENT_MODEL,
        "num_samples": len(interview_samples),
    }
    
    # Aggregate each metric
    for abbrev in METRIC_ABBREV.values():
        values = interview_samples[abbrev].tolist()
        
        # Median score (self-consistency)
        agg_record[f"{abbrev}_score"] = np.median(values)
        
        # Confidence based on IQR
        agg_record[f"{abbrev}_confidence"] = iqr_confidence(values)
        
        # Standard deviation
        agg_record[f"{abbrev}_std"] = np.std(values)
    
    # Compute overall weighted score
    metric_scores = {}
    for metric, abbrev in METRIC_ABBREV.items():
        metric_scores[metric] = agg_record[f"{abbrev}_score"]
    
    agg_record["overall_weighted_score"] = compute_overall_weighted(metric_scores)
    
    # Cost statistics
    agg_record["avg_latency_ms"] = interview_samples["latency_ms"].mean()
    agg_record["total_cost"] = interview_samples["cost"].sum()
    agg_record["avg_cost_per_sample"] = interview_samples["cost"].mean()
    
    aggregated_results.append(agg_record)

aggregated_df = pd.DataFrame(aggregated_results)

print(f"Aggregated {len(aggregated_df)} interviews")
print(f"\nSample aggregated record:")
print(aggregated_df.head(2))

Aggregated 250 interviews

Sample aggregated record:
                          interview_id                             role  \
0  customer_service_representative_001  Customer Service Representative   
1  customer_service_representative_002  Customer Service Representative   

    model  num_samples  ca_score ca_confidence  ca_std  exp_score  \
0  gpt-4o            7       4.0          high     0.0        3.0   
1  gpt-4o            7       3.0          high     0.0        2.0   

  exp_confidence  exp_std  ...  prof_score prof_confidence  prof_std  \
0           high      0.0  ...         5.0            high       0.0   
1           high      0.0  ...         4.0            high       0.0   

   comm_score comm_confidence  comm_std  overall_weighted_score  \
0         4.0            high  0.000000                    3.90   
1         3.0            high  0.451754                    2.75   

  avg_latency_ms  total_cost  avg_cost_per_sample  
0     993.438312    0.028770             0

## Generate Justifications (Rewrite Step)

In [37]:
print("Generating justifications for aggregated scores...")
print("=" * 70)

final_results = []

for idx, agg_row in aggregated_df.iterrows():
    interview_id = agg_row['interview_id']
    
    # Find original interview
    interview = next(i for i in interviews_data if i['interview_id'] == interview_id)
    
    # Prepare scores for rewrite
    locked_scores = {}
    for metric, abbrev in METRIC_ABBREV.items():
        locked_scores[metric] = int(round(agg_row[f"{abbrev}_score"]))
    
    try:
        rewrite_result, raw_text, metadata = call_rewrite(
            full_transcript=interview['full_transcript'],
            role=interview['role'],
            scores=locked_scores,
            model=CURRENT_MODEL
        )
        
        # Build final record
        final_record = {
            "interview_id": interview_id,
            "role": interview['role'],
            "model": CURRENT_MODEL,
            **{f"{abbrev}_score": agg_row[f"{abbrev}_score"] for abbrev in METRIC_ABBREV.values()},
            **{f"{abbrev}_confidence": agg_row[f"{abbrev}_confidence"] for abbrev in METRIC_ABBREV.values()},
            "overall_weighted_score": agg_row["overall_weighted_score"],
            # Justifications
            **{f"{metric}_justification": rewrite_result.get(f"{metric}_justification", "") 
               for metric in METRICS},
            "general_strengths": rewrite_result.get("general_strengths", ""),
            "general_weaknesses": rewrite_result.get("general_weaknesses", ""),
            "general_summary": rewrite_result.get("general_summary", ""),
            # Cost tracking
            "scoring_cost": agg_row["total_cost"],
            "rewrite_cost": metadata["cost"],
            "total_cost": agg_row["total_cost"] + metadata["cost"],
            "rewrite_latency_ms": metadata["latency_ms"],
            "rewrite_input_tokens": metadata["input_tokens"],
            "rewrite_output_tokens": metadata["output_tokens"],
        }
        
        final_results.append(final_record)
        
        # Update cost tracker
        cost_tracker["total_cost"] += metadata["cost"]
        cost_tracker["total_input_tokens"] += metadata["input_tokens"]
        cost_tracker["total_output_tokens"] += metadata["output_tokens"]
        cost_tracker["rewrite_cost"] += metadata["cost"]
        cost_tracker["num_rewrite_calls"] += 1
        
    except Exception as e:
        print(f"Error rewriting interview {interview_id}: {e}")
        continue
    
    # Progress update
    if (idx + 1) % 10 == 0 or (idx + 1) == len(aggregated_df):
        print(f"Progress: {idx + 1}/{len(aggregated_df)} justifications | "
              f"Total cost: ${cost_tracker['total_cost']:.4f}")

final_df = pd.DataFrame(final_results)

print("\n" + "=" * 70)
print(f"Justification generation complete!")
print(f"Final results: {len(final_df)} interviews")
print(f"Rewrite cost: ${cost_tracker['rewrite_cost']:.4f}")

Generating justifications for aggregated scores...
Progress: 10/250 justifications | Total cost: $8.0040
Progress: 20/250 justifications | Total cost: $8.0955
Progress: 30/250 justifications | Total cost: $8.1907
Progress: 40/250 justifications | Total cost: $8.2862
Progress: 50/250 justifications | Total cost: $8.3837
Progress: 60/250 justifications | Total cost: $8.4652
Progress: 70/250 justifications | Total cost: $8.5500
Progress: 80/250 justifications | Total cost: $8.6329
Progress: 90/250 justifications | Total cost: $8.7171
Progress: 100/250 justifications | Total cost: $8.7999
Progress: 110/250 justifications | Total cost: $8.8929
Progress: 120/250 justifications | Total cost: $8.9902
Progress: 130/250 justifications | Total cost: $9.0924
Progress: 140/250 justifications | Total cost: $9.1893
Progress: 150/250 justifications | Total cost: $9.2847
Progress: 160/250 justifications | Total cost: $9.3866
Progress: 170/250 justifications | Total cost: $9.4853
Progress: 180/250 justi

## Cost Analysis

In [38]:
print("=" * 70)
print("COST ANALYSIS")
print("=" * 70)

print(f"\nModel: {CURRENT_MODEL}")
print(f"Provider: {MODELS_CONFIG[CURRENT_MODEL]['provider']}")

print(f"\nPricing:")
print(f"  Input: ${MODELS_CONFIG[CURRENT_MODEL]['input_cost_per_1m']:.2f} per 1M tokens")
print(f"  Output: ${MODELS_CONFIG[CURRENT_MODEL]['output_cost_per_1m']:.2f} per 1M tokens")

print(f"\nAPI Calls:")
print(f"  Scoring calls: {cost_tracker['num_scoring_calls']}")
print(f"  Rewrite calls: {cost_tracker['num_rewrite_calls']}")
print(f"  Total calls: {cost_tracker['num_scoring_calls'] + cost_tracker['num_rewrite_calls']}")

print(f"\nToken Usage:")
print(f"  Total input tokens: {cost_tracker['total_input_tokens']:,}")
print(f"  Total output tokens: {cost_tracker['total_output_tokens']:,}")
print(f"  Total tokens: {cost_tracker['total_input_tokens'] + cost_tracker['total_output_tokens']:,}")

print(f"\nCost Breakdown:")
print(f"  Scoring phase: ${cost_tracker['scoring_cost']:.4f}")
print(f"  Rewrite phase: ${cost_tracker['rewrite_cost']:.4f}")
print(f"  TOTAL COST: ${cost_tracker['total_cost']:.4f}")

# Per-interview averages
num_interviews = len(final_df)
if num_interviews > 0:
    print(f"\nPer-Interview Averages:")
    print(f"  Cost per interview: ${cost_tracker['total_cost'] / num_interviews:.4f}")
    print(f"  Scoring cost per interview: ${cost_tracker['scoring_cost'] / num_interviews:.4f}")
    print(f"  Rewrite cost per interview: ${cost_tracker['rewrite_cost'] / num_interviews:.4f}")

# Per-sample averages
num_samples = len(samples_df)
if num_samples > 0:
    print(f"\nPer-Sample Averages:")
    print(f"  Cost per scoring sample: ${cost_tracker['scoring_cost'] / num_samples:.5f}")
    print(f"  Tokens per scoring sample: {cost_tracker['total_input_tokens'] / cost_tracker['num_scoring_calls']:.0f} input, "
          f"{cost_tracker['total_output_tokens'] / cost_tracker['num_scoring_calls']:.0f} output")

print("=" * 70)

COST ANALYSIS

Model: gpt-4o
Provider: openai

Pricing:
  Input: $2.50 per 1M tokens
  Output: $10.00 per 1M tokens

API Calls:
  Scoring calls: 1750
  Rewrite calls: 250
  Total calls: 2000

Token Usage:
  Total input tokens: 3,359,682
  Total output tokens: 187,866
  Total tokens: 3,547,548

Cost Breakdown:
  Scoring phase: $7.9131
  Rewrite phase: $2.3647
  TOTAL COST: $10.2779

Per-Interview Averages:
  Cost per interview: $0.0411
  Scoring cost per interview: $0.0317
  Rewrite cost per interview: $0.0095

Per-Sample Averages:
  Cost per scoring sample: $0.00452
  Tokens per scoring sample: 1920 input, 107 output


In [39]:
print("=" * 70)
print("COST ANALYSIS")
print("=" * 70)

print(f"\nModel: {CURRENT_MODEL}")
print(f"Provider: {MODELS_CONFIG[CURRENT_MODEL]['provider']}")

print(f"\nPricing:")
print(f"  Input: ${MODELS_CONFIG[CURRENT_MODEL]['input_cost_per_1m']:.2f} per 1M tokens")
print(f"  Output: ${MODELS_CONFIG[CURRENT_MODEL]['output_cost_per_1m']:.2f} per 1M tokens")

print(f"\nAPI Calls:")
print(f"  Scoring calls: {cost_tracker['num_scoring_calls']}")
print(f"  Rewrite calls: {cost_tracker['num_rewrite_calls']}")
print(f"  Total calls: {cost_tracker['num_scoring_calls'] + cost_tracker['num_rewrite_calls']}")

print(f"\nToken Usage:")
print(f"  Total input tokens: {cost_tracker['total_input_tokens']:,}")
print(f"  Total output tokens: {cost_tracker['total_output_tokens']:,}")
print(f"  Total tokens: {cost_tracker['total_input_tokens'] + cost_tracker['total_output_tokens']:,}")

print(f"\nCost Breakdown:")
print(f"  Scoring phase: ${cost_tracker['scoring_cost']:.4f}")
print(f"  Rewrite phase: ${cost_tracker['rewrite_cost']:.4f}")
print(f"  TOTAL COST: ${cost_tracker['total_cost']:.4f}")

# Per-interview averages
num_interviews = len(final_df)
if num_interviews > 0:
    print(f"\nPer-Interview Averages:")
    print(f"  Cost per interview: ${cost_tracker['total_cost'] / num_interviews:.4f}")
    print(f"  Scoring cost per interview: ${cost_tracker['scoring_cost'] / num_interviews:.4f}")
    print(f"  Rewrite cost per interview: ${cost_tracker['rewrite_cost'] / num_interviews:.4f}")

# Per-sample averages
num_samples = len(samples_df)
if num_samples > 0:
    print(f"\nPer-Sample Averages:")
    print(f"  Cost per scoring sample: ${cost_tracker['scoring_cost'] / num_samples:.5f}")
    print(f"  Tokens per scoring sample: {cost_tracker['total_input_tokens'] / cost_tracker['num_scoring_calls']:.0f} input, "
          f"{cost_tracker['total_output_tokens'] / cost_tracker['num_scoring_calls']:.0f} output")

print("=" * 70)

COST ANALYSIS

Model: gpt-4o
Provider: openai

Pricing:
  Input: $2.50 per 1M tokens
  Output: $10.00 per 1M tokens

API Calls:
  Scoring calls: 1750
  Rewrite calls: 250
  Total calls: 2000

Token Usage:
  Total input tokens: 3,359,682
  Total output tokens: 187,866
  Total tokens: 3,547,548

Cost Breakdown:
  Scoring phase: $7.9131
  Rewrite phase: $2.3647
  TOTAL COST: $10.2779

Per-Interview Averages:
  Cost per interview: $0.0411
  Scoring cost per interview: $0.0317
  Rewrite cost per interview: $0.0095

Per-Sample Averages:
  Cost per scoring sample: $0.00452
  Tokens per scoring sample: 1920 input, 107 output


## Summary Statistics

In [43]:
print("=" * 70)
print("EVALUATION SUMMARY")
print("=" * 70)

print(f"\nDataset:")
print(f"  Total interviews: {len(final_df)}")
print(f"  K samples per interview: {K_SAMPLES}")
print(f"  Total scoring samples: {len(samples_df)}")

print(f"\nOverall Weighted Score Statistics:")
print(f"  Mean:   {final_df['overall_weighted_score'].mean():.2f}")
print(f"  Median: {final_df['overall_weighted_score'].median():.2f}")
print(f"  Std:    {final_df['overall_weighted_score'].std():.2f}")
print(f"  Min:    {final_df['overall_weighted_score'].min():.2f}")
print(f"  Max:    {final_df['overall_weighted_score'].max():.2f}")

print(f"\nMetric Score Averages:")
for metric, abbrev in METRIC_ABBREV.items():
    score_col = f"{abbrev}_score"
    if score_col in final_df.columns:
        mean_score = final_df[score_col].mean()
        median_score = final_df[score_col].median()
        print(f"  {metric:20s}: Mean={mean_score:.2f}, Median={median_score:.1f}")

print(f"\nConfidence Distribution:")
for metric, abbrev in METRIC_ABBREV.items():
    conf_col = f"{abbrev}_confidence"
    if conf_col in final_df.columns:
        high = (final_df[conf_col] == "high").sum()
        med = (final_df[conf_col] == "medium").sum()
        low = (final_df[conf_col] == "low").sum()
        total = high + med + low
        high_pct = 100 * high / total if total > 0 else 0
        print(f"  {metric:20s}: High={high} ({high_pct:.0f}%), Medium={med}, Low={low}")

print(f"\nScore Distribution by Percentile:")
percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    val = final_df['overall_weighted_score'].quantile(p/100)
    print(f"  P{p:2d}: {val:.2f}")

# Statistics by role
print(f"\nStatistics by Role:")
for role in final_df['role'].unique():
    role_df = final_df[final_df['role'] == role]
    print(f"\n  {role}:")
    print(f"    Count: {len(role_df)}")
    print(f"    Mean score: {role_df['overall_weighted_score'].mean():.2f}")
    print(f"    Median score: {role_df['overall_weighted_score'].median():.2f}")
    print(f"    Avg cost per interview: ${role_df['total_cost'].mean():.4f}")

print("\n" + "=" * 70)

EVALUATION SUMMARY

Dataset:
  Total interviews: 250
  K samples per interview: 7
  Total scoring samples: 1750

Overall Weighted Score Statistics:
  Mean:   5.90
  Median: 7.60
  Std:    2.41
  Min:    2.50
  Max:    8.80

Metric Score Averages:
  cognitive_ability   : Mean=5.96, Median=8.0
  experience          : Mean=5.58, Median=7.0
  problem_solving     : Mean=5.95, Median=8.0
  reliability         : Mean=6.23, Median=7.0
  professionalism     : Mean=6.56, Median=8.0
  communication       : Mean=6.56, Median=8.0

Confidence Distribution:
  cognitive_ability   : High=250 (100%), Medium=0, Low=0
  experience          : High=250 (100%), Medium=0, Low=0
  problem_solving     : High=249 (100%), Medium=1, Low=0
  reliability         : High=246 (98%), Medium=4, Low=0
  professionalism     : High=250 (100%), Medium=0, Low=0
  communication       : High=250 (100%), Medium=0, Low=0

Score Distribution by Percentile:
  P10: 2.80
  P25: 3.25
  P50: 7.60
  P75: 8.07
  P90: 8.50
  P95: 8.50
  P

In [44]:
print("=" * 70)
print("EVALUATION SUMMARY")
print("=" * 70)

print(f"\nDataset:")
print(f"  Total interviews: {len(final_df)}")
print(f"  K samples per interview: {K_SAMPLES}")
print(f"  Total scoring samples: {len(samples_df)}")

print(f"\nOverall Weighted Score Statistics:")
print(f"  Mean:   {final_df['overall_weighted_score'].mean():.2f}")
print(f"  Median: {final_df['overall_weighted_score'].median():.2f}")
print(f"  Std:    {final_df['overall_weighted_score'].std():.2f}")
print(f"  Min:    {final_df['overall_weighted_score'].min():.2f}")
print(f"  Max:    {final_df['overall_weighted_score'].max():.2f}")

print(f"\nMetric Score Averages:")
for metric, abbrev in METRIC_ABBREV.items():
    score_col = f"{abbrev}_score"
    if score_col in final_df.columns:
        mean_score = final_df[score_col].mean()
        median_score = final_df[score_col].median()
        print(f"  {metric:20s}: Mean={mean_score:.2f}, Median={median_score:.1f}")

print(f"\nConfidence Distribution:")
for metric, abbrev in METRIC_ABBREV.items():
    conf_col = f"{abbrev}_confidence"
    if conf_col in final_df.columns:
        high = (final_df[conf_col] == "high").sum()
        med = (final_df[conf_col] == "medium").sum()
        low = (final_df[conf_col] == "low").sum()
        total = high + med + low
        high_pct = 100 * high / total if total > 0 else 0
        print(f"  {metric:20s}: High={high} ({high_pct:.0f}%), Medium={med}, Low={low}")

print(f"\nScore Distribution by Percentile:")
percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    val = final_df['overall_weighted_score'].quantile(p/100)
    print(f"  P{p:2d}: {val:.2f}")

# Statistics by role
print(f"\nStatistics by Role:")
for role in final_df['role'].unique():
    role_df = final_df[final_df['role'] == role]
    print(f"\n  {role}:")
    print(f"    Count: {len(role_df)}")
    print(f"    Mean score: {role_df['overall_weighted_score'].mean():.2f}")
    print(f"    Median score: {role_df['overall_weighted_score'].median():.2f}")
    print(f"    Avg cost per interview: ${role_df['total_cost'].mean():.4f}")

print("\n" + "=" * 70)

EVALUATION SUMMARY

Dataset:
  Total interviews: 250
  K samples per interview: 7
  Total scoring samples: 1750

Overall Weighted Score Statistics:
  Mean:   5.90
  Median: 7.60
  Std:    2.41
  Min:    2.50
  Max:    8.80

Metric Score Averages:
  cognitive_ability   : Mean=5.96, Median=8.0
  experience          : Mean=5.58, Median=7.0
  problem_solving     : Mean=5.95, Median=8.0
  reliability         : Mean=6.23, Median=7.0
  professionalism     : Mean=6.56, Median=8.0
  communication       : Mean=6.56, Median=8.0

Confidence Distribution:
  cognitive_ability   : High=250 (100%), Medium=0, Low=0
  experience          : High=250 (100%), Medium=0, Low=0
  problem_solving     : High=249 (100%), Medium=1, Low=0
  reliability         : High=246 (98%), Medium=4, Low=0
  professionalism     : High=250 (100%), Medium=0, Low=0
  communication       : High=250 (100%), Medium=0, Low=0

Score Distribution by Percentile:
  P10: 2.80
  P25: 3.25
  P50: 7.60
  P75: 8.07
  P90: 8.50
  P95: 8.50
  P

## Save Results

In [45]:
# Create timestamp for file naming
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
model_safe = CURRENT_MODEL.replace("/", "_").replace(":", "_")

# Save directory
save_dir = "/Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data/k7RunsResults"
os.makedirs(save_dir, exist_ok=True)

# Save files
samples_path = f"{save_dir}/samples_{model_safe}_{timestamp}.csv"
aggregated_path = f"{save_dir}/aggregated_{model_safe}_{timestamp}.csv"
final_path = f"{save_dir}/final_{model_safe}_{timestamp}.csv"
cost_path = f"{save_dir}/cost_report_{model_safe}_{timestamp}.json"

samples_df.to_csv(samples_path, index=False)
aggregated_df.to_csv(aggregated_path, index=False)
final_df.to_csv(final_path, index=False)

# Save cost report
cost_report = {
    "model": CURRENT_MODEL,
    "timestamp": timestamp,
    "k_samples": K_SAMPLES,
    "num_interviews": len(final_df),
    "num_samples": len(samples_df),
    **cost_tracker,
    "cost_per_interview": cost_tracker['total_cost'] / len(final_df) if len(final_df) > 0 else 0,
    "cost_per_sample": cost_tracker['scoring_cost'] / len(samples_df) if len(samples_df) > 0 else 0,
}

with open(cost_path, 'w') as f:
    json.dump(cost_report, f, indent=2)

print("✓ Results saved successfully!")
print(f"\nFiles saved:")
print(f"  - {samples_path}")
print(f"  - {aggregated_path}")
print(f"  - {final_path}")
print(f"  - {cost_path}")
print(f"\nTotal cost for this run: ${cost_tracker['total_cost']:.4f}")

✓ Results saved successfully!

Files saved:
  - /Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data/k7RunsResults/samples_gpt-4o_20251120-163437.csv
  - /Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data/k7RunsResults/aggregated_gpt-4o_20251120-163437.csv
  - /Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data/k7RunsResults/final_gpt-4o_20251120-163437.csv
  - /Users/shreya_sudan/Desktop/ServiceAgent/CleanAgent/metaPromptOpt/data/k7RunsResults/cost_report_gpt-4o_20251120-163437.json

Total cost for this run: $10.2779


# Dataset configuration
DATASET_PATH = "/content/drive/MyDrive/mvp/synthetic_interview_dataset.json"
K_SAMPLES = 3  # Number of self-consistency samples per interview

# Models to evaluate
MODELS_CONFIG = {
    # OpenAI models
    "gpt-4o": {
        "provider": "openai",
        "input_cost_per_1m": 2.50,  # $2.50 per 1M input tokens
        "output_cost_per_1m": 10.00,  # $10.00 per 1M output tokens
        "temperature": 0.0,
        "max_tokens": 512
    },
    "gpt-3.5-turbo": {
        "provider": "openai",
        "input_cost_per_1m": 0.50,  # $0.50 per 1M input tokens
        "output_cost_per_1m": 1.50,  # $1.50 per 1M output tokens
        "temperature": 0.0,
        "max_tokens": 512
    },
    # Llama via Hugging Face Inference API
    "meta-llama/Llama-3.1-8B-Instruct": {
        "provider": "huggingface",
        "input_cost_per_1m": 0.00,  # Free with HF Pro subscription
        "output_cost_per_1m": 0.00,  # Free with HF Pro subscription
        "temperature": 0.0,
        "max_tokens": 512
    },
}

# Scoring weights and metrics
WEIGHTS = {
    "cognitive_ability": 0.35,
    "experience": 0.35,
    "problem_solving": 0.15,
    "reliability": 0.05,
    "professionalism": 0.05,
    "communication": 0.05
}

METRICS = list(WEIGHTS.keys())
METRIC_ABBREV = {
    "cognitive_ability": "ca",
    "experience": "exp",
    "problem_solving": "ps",
    "reliability": "rel",
    "professionalism": "prof",
    "communication": "comm"
}

# Model to run (change this to run different models)
CURRENT_MODEL = "gpt-3.5-turbo"  # Options: gpt-4o, gpt-3.5-turbo, meta-llama/Llama-3.1-8B-Instruct

print(f"Configuration loaded for model: {CURRENT_MODEL}")
print(f"Dataset path: {DATASET_PATH}")
print(f"K samples per interview: {K_SAMPLES}")

In [46]:
import glob

# Load all cost reports
cost_reports = []
for cost_file in glob.glob(f"{save_dir}/cost_report_*.json"):
    with open(cost_file, 'r') as f:
        cost_reports.append(json.load(f))

if cost_reports:
    cost_comparison_df = pd.DataFrame(cost_reports)
    
    print("=" * 70)
    print("MULTI-MODEL COST COMPARISON")
    print("=" * 70)
    
    display(cost_comparison_df[[
        'model',
        'num_interviews',
        'total_cost',
        'cost_per_interview',
        'scoring_cost',
        'rewrite_cost',
        'total_input_tokens',
        'total_output_tokens'
    ]].sort_values('total_cost'))
    
    print("\nCost Rankings (250 interviews):")
    for idx, row in cost_comparison_df.sort_values('total_cost').iterrows():
        print(f"  {row['model']:40s}: ${row['total_cost']:.2f} total, ${row['cost_per_interview']:.4f} per interview")
else:
    print("No cost reports found for comparison")

MULTI-MODEL COST COMPARISON


Unnamed: 0,model,num_interviews,total_cost,cost_per_interview,scoring_cost,rewrite_cost,total_input_tokens,total_output_tokens
2,meta-llama/Llama-3.1-8B-Instruct:novita,250,0.0,0.0,0.0,0.0,3467268,190879
1,gpt-3.5-turbo,250,1.995815,0.007983,1.623433,0.372382,3411268,193454
0,gpt-4o,250,10.277865,0.041111,7.913133,2.364732,3359682,187866
3,gpt-4o,250,10.277865,0.041111,7.913133,2.364732,3359682,187866



Cost Rankings (250 interviews):
  meta-llama/Llama-3.1-8B-Instruct:novita : $0.00 total, $0.0000 per interview
  gpt-3.5-turbo                           : $2.00 total, $0.0080 per interview
  gpt-4o                                  : $10.28 total, $0.0411 per interview
  gpt-4o                                  : $10.28 total, $0.0411 per interview
