# Self-Consistency Scoring with Llama Models
This notebook runs self-consistency evaluation using Hugging Face's routed Llama models.

In [1]:
# Install required packages
!pip install huggingface_hub pandas numpy pyyaml -q

In [2]:
import os
from getpass import getpass

# Set your Hugging Face API token
if "HF_TOKEN" not in os.environ or not os.environ["HF_TOKEN"]:
    os.environ["HF_TOKEN"] = getpass("Paste your Hugging Face token (starts with hf_...): ")

Paste your Hugging Face token (starts with hf_...): ··········


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import yaml
import json
import time
import random
import statistics as stats
import re
from typing import Dict, Any, List
from datetime import datetime
from huggingface_hub import login

# Login to Hugging Face
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Configuration

In [10]:
CONFIG = {
    "role_title": "Field Technician",
    "question_set_id": "qs_v1",
    "question_set": [
        "Tell me about a time you handled an urgent service call. What steps did you take?",
        "How do you plan your route and prioritize jobs when schedules change during the day?",
        "Describe a tricky diagnostic you solved. What tools or methods did you use?",
        "How do you keep customers calm when they are upset or stressed?",
        "Walk me through your process for documenting work and updating tickets.",
        "What does reliability at work mean to you, and how do you demonstrate it?",
        "How do you stay safe on the job and follow site-specific rules?",
        "How do you collaborate with teammates or escalate when blocked?"
    ],
    "num_candidates": 50,
    "k_samples": 3,  # self-consistency K
    "generation_temperature": 0.8,
    "generation_max_tokens": 1200,
    "timeout_s": 60,
    "gen_prompt_version": "gen_v1",
    "score_prompt_version": "score_v1",
    "rewrite_prompt_version": "rewrite_v1",
}

# Choose your Hugging Face routed model
# Examples:
# MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct:novita"
# MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct:novita"
# MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3:together"
# MODEL_ID = "Qwen/Qwen2.5-7B-Instruct:novita"
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct:novita"

# Scoring weights and metrics
WEIGHTS = {"ca":0.35, "exp":0.35, "ps":0.15, "rel":0.05, "prof":0.05, "comm":0.05}
METRICS = ["ca", "exp", "ps", "rel", "prof", "comm"]
FILLERS_RE = re.compile(r"\b(?:um+|uh+|erm|like|you know|sort of|kinda|i mean|ya know)\b", re.IGNORECASE)

## Helper Functions

In [11]:
def clamp_int(x, lo=1, hi=10):
    try:
        xi = int(round(float(x)))
    except Exception:
        xi = 5
    return max(lo, min(hi, xi))

def canonicalize_qa_text(text: str) -> str:
    """Clean and standardize QA text."""
    if not isinstance(text, str):
        return ""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text

def compute_overall_weighted(scores: Dict[str, int]) -> float:
    """Compute weighted overall score."""
    total = sum(WEIGHTS.get(m, 0) * scores.get(m, 5) for m in METRICS)
    return round(total, 2)

def iqr_confidence(vals: List[int]) -> str:
    """Calculate confidence based on IQR."""
    if len(vals) < 2:
        return "low"
    q1, q3 = np.percentile(vals, [25, 75])
    iqr = q3 - q1
    if iqr <= 1:
        return "high"
    elif iqr <= 2:
        return "medium"
    else:
        return "low"

## Prompt Building Functions

In [12]:
def build_generation_prompt(role: str, question_set: List[str], persona: Dict[str, Any]) -> str:
    """Build prompt for generating candidate interview responses."""
    q_block = "\n".join([f"Question {i+1}: {q}" for i, q in enumerate(question_set)])
    lines = [
        f"You are the candidate interviewing for the role: {role}.",
        f"Persona hints: title={persona['persona_title']}; years_experience={persona['yrs_experience']}; "
        f"keywords={persona['domain_keywords']}; reliability={persona['reliability_flags']}; notes={persona['notes']}.",
        "",
        "Answer each question clearly (2–4 sentences per answer).",
        "",
        q_block,
        "",
        "Return responses in this pattern:",
        "Question 1: <repeat question>",
        "Answer: <answer>",
        "",
        "Question 2: <repeat question>",
        "Answer: <answer>",
    ]
    return "\n".join(lines)

def build_scoring_prompt(qa_text: str) -> str:
    """Build prompt for scoring candidate responses."""
    metrics_def = "\n".join([
        "- Cognitive Ability (35%): Structured thinking, planning, logic.",
        "- Experience (35%): Relevant work (last 10 years), skills, accomplishments in similar service jobs.",
        "- Problem Solving (15%): Resourcefulness, safe tradeoffs under constraints.",
        "- Reliability (5%): Punctuality, follow-through, transport reliability.",
        "- Professionalism (5%): Respect for clients/rules, composure under stress.",
        "- Communication (5%): Clarity and tone; IGNORE filler words.",
    ])
    lines = [
        "Analyze the candidate responses using the six metrics below.",
        "Return ONLY a JSON object with keys: ca, exp, ps, rel, prof, comm (each 1–10).",
        "",
        "Definitions (approximate weighting):",
        metrics_def,
        "",
        "Candidate Responses:",
        "--- START RESPONSES ---",
        qa_text,
        "--- END RESPONSES ---",
        "",
        '{"ca":8,"exp":7,"ps":7,"rel":7,"prof":6,"comm":6}',
    ]
    return "\n".join(lines)

def build_rewrite_prompt_locked(qa_text: str, s: Dict[str, int]) -> str:
    """Build prompt for generating justifications with locked scores."""
    lines = [
        "Use FIXED scores; DO NOT change them. Generate justifications + bullets + summary.",
        "",
        f"- Cognitive Ability: {s['ca']}",
        f"- Experience: {s['exp']}",
        f"- Problem Solving: {s['ps']}",
        f"- Reliability: {s['rel']}",
        f"- Professionalism: {s['prof']}",
        f"- Communication: {s['comm']}",
        "",
        "Return ONLY this JSON:",
        "{",
        f'  "cognitive_ability_score": {s["ca"]},',
        '  "cognitive_ability_justification": "...",',
        f'  "experience_score": {s["exp"]},',
        '  "experience_justification": "...",',
        f'  "reliability_score": {s["rel"]},',
        '  "reliability_justification": "...",',
        f'  "professionalism_score": {s["prof"]},',
        '  "professionalism_justification": "...",',
        f'  "problem_solving_score": {s["ps"]},',
        '  "problem_solving_justification": "...",',
        f'  "communication_score": {s["comm"]},',
        '  "communication_justification": "...",',
        '  "general_strengths": "- ...\\n- ...\\n- ...",',
        '  "general_weaknesses": "- ...\\n- ...\\n- ...",',
        '  "general_summary": "..."',
        "}",
        "",
        "Candidate Responses:",
        "--- START ---",
        qa_text,
        "--- END ---",
    ]
    return "\n".join(lines)

## Hugging Face API Client Functions

In [13]:
from openai import OpenAI

# Initialize Hugging Face client using OpenAI-compatible interface
hf_client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ["HF_TOKEN"],
)

def hf_chat_once(prompt: str, model: str, temperature: float = 0.7,
                 max_tokens: int = 512, json_mode: bool = False) -> str:
    """Call Hugging Face chat completion API."""
    kwargs = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "top_p": 1.0,
    }

    if json_mode:
        # Enforce valid JSON responses when the provider supports this
        kwargs["response_format"] = {"type": "json_object"}

    try:
        resp = hf_client.chat.completions.create(**kwargs)
        return resp.choices[0].message.content
    except Exception as e:
        print(f"Error calling HF API: {e}")
        raise

def hf_chat_json(prompt: str, model: str, temperature: float,
                 max_tokens: int = 512) -> tuple:
    """Call Hugging Face API and parse JSON response."""
    txt = hf_chat_once(prompt, model=model, temperature=temperature,
                       max_tokens=max_tokens, json_mode=True)
    try:
        return json.loads(txt), txt
    except Exception:
        # Fallback: try to extract JSON from text
        try:
            start = txt.index("{")
            end = txt.rindex("}") + 1
            return json.loads(txt[start:end]), txt
        except Exception:
            return {}, txt

## Verify Model Access

In [14]:
from huggingface_hub import whoami, HfApi

print("Account info:")
print(whoami())
print()

# Verify you have access to the model (especially important for gated models like Llama)
api = HfApi()
model_name = MODEL_ID.split(":")[0]  # Remove provider suffix
try:
    info = api.model_info(model_name, use_auth_token=True)
    print(f"Model: {info.id}")
    print(f"Gated: {info.gated}")
    print("✓ You have access to this model!")
except Exception as e:
    print(f"❌ Error accessing model: {e}")
    print("Make sure you've accepted the model's license agreement on Hugging Face.")

Account info:
{'type': 'user', 'id': '68f018da27c30b98c620061c', 'name': 'serviceagent', 'fullname': 'ServiceAgent', 'canPay': True, 'periodEnd': 1764547199, 'isPro': True, 'avatarUrl': '/avatars/c1d1a0eb60867c137fe668fb925748f7.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'ensemble', 'role': 'fineGrained', 'createdAt': '2025-10-18T20:52:33.027Z', 'fineGrained': {'canReadGatedRepos': True, 'global': ['discussion.write', 'post.write'], 'scoped': [{'entity': {'_id': '66eaf084b3b3239188f66fa7', 'type': 'model', 'name': 'meta-llama/Llama-3.2-3B'}, 'permissions': ['repo.content.read', 'discussion.write', 'repo.write']}, {'entity': {'_id': '68f018da27c30b98c620061c', 'type': 'user', 'name': 'serviceagent'}, 'permissions': ['repo.content.read', 'repo.write', 'inference.serverless.write', 'inference.endpoints.infer.write', 'inference.endpoints.write', 'user.webhooks.read', 'user.webhooks.write', 'collection.read', 'collection.write', 'discussion.write', 'us

## Step 1: Generate Synthetic Interview Responses

In [None]:
random.seed(123)

role = CONFIG["role_title"]
questions = CONFIG["question_set"]
N = CONFIG["num_candidates"]

print(f"Generating {N} synthetic interviews...")
print(f"Model: {MODEL_ID}")
print()

interviews = []
start_time = time.time()

for i in range(N):
    persona = {
        "candidate_id": f"cand_{i+1:04d}",
        "persona_title": random.choice([
            "Veteran field tech", "Career switcher", "Recent grad",
            "Retail service rep", "HVAC junior"
        ]),
        "yrs_experience": random.choice([0, 1, 2, 3, 5, 7, 10]),
        "domain_keywords": random.choice([
            "preventive maintenance, HVAC, route planning",
            "customer empathy, troubleshooting basics",
            "inventory, parts ordering, safety protocols",
            "ticket triage, escalation, SLA awareness",
        ]),
        "reliability_flags": random.choice([
            "has_car; weekend_ok", "public_transit", "night_shift_ok"
        ]),
        "notes": random.choice([
            "calm under pressure", "fast learner", "detail-oriented"
        ]),
    }

    prompt = build_generation_prompt(role, questions, persona)
    out = hf_chat_once(
        prompt,
        model=MODEL_ID,
        temperature=CONFIG["generation_temperature"],
        max_tokens=CONFIG["generation_max_tokens"]
    )

    interviews.append({
        "interview_id": f"intv_{i+1:04d}",
        "candidate_id": persona["candidate_id"],
        "role_title": role,
        "question_set_id": CONFIG["question_set_id"],
        "num_questions": len(questions),
        "qa_text": canonicalize_qa_text(out),
        "source": "synthetic",
        "gen_model": MODEL_ID,
        "gen_prompt_version": CONFIG["gen_prompt_version"],
        "gen_temperature": CONFIG["generation_temperature"],
        "gen_top_p": 1.0,
        "gen_seed": 123,
        "created_at": pd.Timestamp.utcnow().isoformat(),
    })

    if (i + 1) % 10 == 0:
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed
        remaining = (N - i - 1) / rate if rate > 0 else 0
        print(f"Generated {i + 1}/{N} interviews | "
              f"Rate: {rate:.1f}/s | "
              f"ETA: {remaining:.0f}s")

interviews_df = pd.DataFrame(interviews)
total_time = time.time() - start_time

print(f"\n✓ Generated {len(interviews_df)} interviews in {total_time:.1f}s")
print(f"  Average: {total_time/len(interviews_df):.2f}s per interview")
display(interviews_df.head(2))

In [15]:
interviews_df = pd.read_csv("/content/drive/MyDrive/mvp/synthInterviews20251009-234435.csv")

## Step 2: Generate K Self-Consistency Samples (Scoring)

In [16]:
K = CONFIG["k_samples"]
records = interviews_df[["interview_id", "qa_text"]].to_dict("records")

print(f"Generating {K} scoring samples for {len(records)} interviews...")
print(f"Total API calls: {K * len(records)}")
print()

samples = []
start_time = time.time()
total_calls = K * len(records)
call_count = 0

for idx, iv in enumerate(records):
    qa = iv["qa_text"] or ""
    sprompt = build_scoring_prompt(qa)

    for k in range(K):
        t0 = time.time()
        js, raw = hf_chat_json(
            sprompt,
            model=MODEL_ID,
            temperature=0.7,
            max_tokens=256
        )
        latency = int((time.time() - t0) * 1000)
        call_count += 1

        row = {
            "interview_id": iv["interview_id"],
            "run_idx": k,
            "model_name": MODEL_ID,
            "latency_ms": latency,
        }
        for m in METRICS:
            row[m] = clamp_int(js.get(m, 5))
        samples.append(row)

    if (idx + 1) % 10 == 0:
        elapsed = time.time() - start_time
        rate = call_count / elapsed
        remaining_calls = total_calls - call_count
        remaining_time = remaining_calls / rate if rate > 0 else 0
        print(f"Scored {idx + 1}/{len(records)} interviews | "
              f"Calls: {call_count}/{total_calls} | "
              f"Rate: {rate:.1f}/s | "
              f"ETA: {remaining_time:.0f}s")

samples_df = pd.DataFrame(samples)
total_time = time.time() - start_time

print(f"\n✓ Generated {len(samples_df)} scoring samples in {total_time:.1f}s")
print(f"  Average: {total_time/len(samples_df):.3f}s per sample")
print(f"  Median latency: {samples_df['latency_ms'].median():.0f}ms")
display(samples_df.head(9))  # Show 3 samples for 3 interviews

Generating 3 scoring samples for 50 interviews...
Total API calls: 150

Scored 10/50 interviews | Calls: 30/150 | Rate: 0.6/s | ETA: 201s
Scored 20/50 interviews | Calls: 60/150 | Rate: 0.6/s | ETA: 142s
Scored 30/50 interviews | Calls: 90/150 | Rate: 0.6/s | ETA: 95s
Scored 40/50 interviews | Calls: 120/150 | Rate: 0.6/s | ETA: 47s
Scored 50/50 interviews | Calls: 150/150 | Rate: 0.6/s | ETA: 0s

✓ Generated 150 scoring samples in 233.9s
  Average: 1.559s per sample
  Median latency: 1530ms


Unnamed: 0,interview_id,run_idx,model_name,latency_ms,ca,exp,ps,rel,prof,comm
0,intv_0001,0,meta-llama/Llama-3.1-8B-Instruct:novita,2147,9,8,8,8,8,7
1,intv_0001,1,meta-llama/Llama-3.1-8B-Instruct:novita,1598,9,8,8,8,7,8
2,intv_0001,2,meta-llama/Llama-3.1-8B-Instruct:novita,1430,9,8,8,9,8,9
3,intv_0002,0,meta-llama/Llama-3.1-8B-Instruct:novita,1537,8,9,8,9,8,9
4,intv_0002,1,meta-llama/Llama-3.1-8B-Instruct:novita,1524,8,8,8,8,8,7
5,intv_0002,2,meta-llama/Llama-3.1-8B-Instruct:novita,1330,8,8,8,9,8,8
6,intv_0003,0,meta-llama/Llama-3.1-8B-Instruct:novita,2572,8,8,7,8,7,8
7,intv_0003,1,meta-llama/Llama-3.1-8B-Instruct:novita,1702,8,9,7,9,8,8
8,intv_0003,2,meta-llama/Llama-3.1-8B-Instruct:novita,1736,9,8,8,8,7,9


## Step 3: Aggregate Scores (Self-Consistency)

In [17]:
print("Aggregating scores using median (self-consistency)...")

aggs = []
for intv_id, g in samples_df.groupby("interview_id"):
    row = {"interview_id": intv_id}
    latencies = g["latency_ms"].tolist()

    for m in METRICS:
        vals = [int(v) for v in g[m].tolist()]
        row[f"{m}_score_agg"] = clamp_int(stats.median(vals))
        row[f"{m}_confidence"] = iqr_confidence(vals)

    row["overall_weighted_agg"] = compute_overall_weighted(
        {m: row[f"{m}_score_agg"] for m in METRICS}
    )
    row["p95_latency_ms"] = float(np.percentile(latencies, 95)) if latencies else 0
    aggs.append(row)

aggregated_df = pd.DataFrame(aggs)

print(f"✓ Aggregated {len(aggregated_df)} interview scores")
print(f"  Confidence distribution:")
for m in METRICS:
    conf_col = f"{m}_confidence"
    if conf_col in aggregated_df.columns:
        high = (aggregated_df[conf_col] == "high").sum()
        med = (aggregated_df[conf_col] == "medium").sum()
        low = (aggregated_df[conf_col] == "low").sum()
        print(f"    {m.upper()}: High={high}, Medium={med}, Low={low}")

display(aggregated_df.head())

Aggregating scores using median (self-consistency)...
✓ Aggregated 50 interview scores
  Confidence distribution:
    CA: High=49, Medium=1, Low=0
    EXP: High=45, Medium=3, Low=2
    PS: High=48, Medium=1, Low=1
    REL: High=48, Medium=2, Low=0
    PROF: High=50, Medium=0, Low=0
    COMM: High=50, Medium=0, Low=0


Unnamed: 0,interview_id,ca_score_agg,ca_confidence,exp_score_agg,exp_confidence,ps_score_agg,ps_confidence,rel_score_agg,rel_confidence,prof_score_agg,prof_confidence,comm_score_agg,comm_confidence,overall_weighted_agg,p95_latency_ms
0,intv_0001,9,high,8,high,8,high,8,high,8,high,8,high,8.35,2092.1
1,intv_0002,8,high,8,high,8,high,9,high,8,high,8,high,8.05,1535.7
2,intv_0003,8,high,8,high,7,high,8,high,7,high,8,high,7.8,2488.4
3,intv_0004,8,high,8,high,8,high,8,high,8,high,8,high,8.0,1786.3
4,intv_0005,6,high,5,high,6,high,8,high,8,high,8,high,5.95,1706.0


## Step 4: Generate Final Outputs with Justifications

In [20]:
qa_map = dict(zip(interviews_df["interview_id"], interviews_df["qa_text"]))

def rewrite_once_chat(qa_text: str, scores_locked: Dict[str, int],
                     model: str, max_tokens: int = 1200) -> tuple:
    """Generate justifications for locked scores."""
    prompt = (
        "Respond with exactly ONE JSON object, no code fences, no prose, no markdown. "
        "Do not include any keys that are not requested.\n\n"
        + build_rewrite_prompt_locked(qa_text, scores_locked)
    )
    t0 = time.time()
    js, raw = hf_chat_json(prompt, model=model, temperature=0.0, max_tokens=max_tokens)
    latency = int((time.time() - t0) * 1000)
    return js, raw, latency

print(f"Generating justifications for {len(aggregated_df)} interviews...")
print()

final_rows = []
invalid_count = 0
start_time = time.time()

for idx, (_, agg_row) in enumerate(aggregated_df.iterrows()):
    iid = agg_row["interview_id"]
    qa_text = qa_map.get(iid, "")
    if not qa_text:
        invalid_count += 1
        continue

    scores_locked = {m: agg_row[f"{m}_score_agg"] for m in METRICS}
    js, raw_text, lat = rewrite_once_chat(qa_text, scores_locked, model=MODEL_ID)

    # Validate scores match
    score_keys = [
        "cognitive_ability_score", "experience_score", "reliability_score",
        "professionalism_score", "problem_solving_score", "communication_score"
    ]
    if all(k in js for k in score_keys):
        out_row = {
            "interview_id": iid,
            "cognitive_ability_score": js.get("cognitive_ability_score"),
            "cognitive_ability_justification": js.get("cognitive_ability_justification", ""),
            "experience_score": js.get("experience_score"),
            "experience_justification": js.get("experience_justification", ""),
            "reliability_score": js.get("reliability_score"),
            "reliability_justification": js.get("reliability_justification", ""),
            "professionalism_score": js.get("professionalism_score"),
            "professionalism_justification": js.get("professionalism_justification", ""),
            "problem_solving_score": js.get("problem_solving_score"),
            "problem_solving_justification": js.get("problem_solving_justification", ""),
            "communication_score": js.get("communication_score"),
            "communication_justification": js.get("communication_justification", ""),
            "general_strengths": js.get("general_strengths", ""),
            "general_weaknesses": js.get("general_weaknesses", ""),
            "general_summary": js.get("general_summary", ""),
            "overall_weighted_score": agg_row["overall_weighted_agg"],
            "rewrite_latency_ms": lat,
            "rewrite_model": MODEL_ID,
        }
        final_rows.append(out_row)
    else:
        invalid_count += 1

    if (idx + 1) % 10 == 0:
        elapsed = time.time() - start_time
        rate = (idx + 1) / elapsed
        remaining = (len(aggregated_df) - idx - 1) / rate if rate > 0 else 0
        print(f"Processed {idx + 1}/{len(aggregated_df)} | "
              f"Rate: {rate:.1f}/s | "
              f"ETA: {remaining:.0f}s")

final_df = pd.DataFrame(final_rows)
total_time = time.time() - start_time

print(f"\n✓ Generated {len(final_df)} complete evaluations in {total_time:.1f}s")
print(f"  Invalid/skipped: {invalid_count}")
display(final_df.head())

Generating justifications for 50 interviews...

Processed 10/50 | Rate: 0.1/s | ETA: 369s
Processed 20/50 | Rate: 0.1/s | ETA: 250s
Processed 30/50 | Rate: 0.1/s | ETA: 165s
Processed 40/50 | Rate: 0.1/s | ETA: 80s
Processed 50/50 | Rate: 0.1/s | ETA: 0s

✓ Generated 49 complete evaluations in 394.3s
  Invalid/skipped: 1


Unnamed: 0,interview_id,cognitive_ability_score,cognitive_ability_justification,experience_score,experience_justification,reliability_score,reliability_justification,professionalism_score,professionalism_justification,problem_solving_score,problem_solving_justification,communication_score,communication_justification,general_strengths,general_weaknesses,general_summary,overall_weighted_score,rewrite_latency_ms,rewrite_model
0,intv_0001,9,The candidate demonstrated exceptional cogniti...,8,The candidate has significant experience in th...,8,The candidate demonstrated a strong commitment...,8,The candidate consistently demonstrated profes...,8,The candidate demonstrated exceptional problem...,8,The candidate consistently demonstrated effect...,- Exceptional cognitive abilities and problem-...,- Limited opportunities to showcase their abil...,The candidate demonstrated exceptional cogniti...,8.35,10458,meta-llama/Llama-3.1-8B-Instruct:novita
1,intv_0002,8,The candidate demonstrated strong problem-solv...,8,The candidate has extensive experience as a fi...,9,The candidate consistently demonstrated a stro...,8,The candidate consistently displayed a profess...,8,The candidate demonstrated strong problem-solv...,8,The candidate consistently demonstrated excell...,- Strong problem-solving skills and critical t...,- May benefit from additional training or deve...,The candidate is a highly skilled and experien...,8.05,8497,meta-llama/Llama-3.1-8B-Instruct:novita
2,intv_0003,8,The candidate demonstrates strong cognitive ab...,8,The candidate has a significant amount of rele...,8,The candidate consistently demonstrates reliab...,7,The candidate generally demonstrates professio...,7,The candidate demonstrates strong problem-solv...,8,The candidate communicates effectively through...,- Strong problem-solving skills and ability to...,- Occasionally uses informal language or phras...,The candidate demonstrates strong cognitive ab...,7.8,6355,meta-llama/Llama-3.1-8B-Instruct:novita
3,intv_0004,8,The candidate demonstrated strong cognitive ab...,8,The candidate has extensive experience as a Fi...,8,The candidate consistently demonstrated reliab...,8,The candidate consistently demonstrated profes...,8,The candidate effectively applied problem-solv...,8,The candidate consistently demonstrated strong...,- Strong cognitive abilities and problem-solvi...,- Limited discussion of long-term goals or car...,The candidate demonstrated strong cognitive ab...,8.0,9203,meta-llama/Llama-3.1-8B-Instruct:novita
4,intv_0005,6,The candidate demonstrates average cognitive a...,5,The candidate has limited direct experience in...,8,The candidate consistently demonstrates a stro...,8,The candidate consistently demonstrates a high...,6,The candidate demonstrates average problem-sol...,8,The candidate consistently demonstrates strong...,- Strong commitment to reliability and profess...,- Limited direct experience in the field - May...,The candidate demonstrates a strong commitment...,5.95,8221,meta-llama/Llama-3.1-8B-Instruct:novita


## Save Results

In [21]:
# Save all dataframes
ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
model_safe = MODEL_ID.replace("/", "_").replace(":", "_")

# interviews_df.to_csv(f"interviews_{model_safe}_{ts}.csv", index=False)
# samples_df.to_csv(f"samples_{model_safe}_{ts}.csv", index=False)
# aggregated_df.to_csv(f"aggregated_{model_safe}_{ts}.csv", index=False)
final_df.to_csv(f"final_{model_safe}_{ts}.csv", index=False)

print(f"✓ Saved all results with timestamp: {ts}")
print(f"  Model used: {MODEL_ID}")
print(f"\nFiles saved:")
print(f"  - interviews_{model_safe}_{ts}.csv")
print(f"  - samples_{model_safe}_{ts}.csv")
print(f"  - aggregated_{model_safe}_{ts}.csv")
print(f"  - final_{model_safe}_{ts}.csv")

✓ Saved all results with timestamp: 20251104-234821
  Model used: meta-llama/Llama-3.1-8B-Instruct:novita

Files saved:
  - interviews_meta-llama_Llama-3.1-8B-Instruct_novita_20251104-234821.csv
  - samples_meta-llama_Llama-3.1-8B-Instruct_novita_20251104-234821.csv
  - aggregated_meta-llama_Llama-3.1-8B-Instruct_novita_20251104-234821.csv
  - final_meta-llama_Llama-3.1-8B-Instruct_novita_20251104-234821.csv


  ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")


## Summary Statistics

In [22]:
print("=" * 70)
print(f"SELF-CONSISTENCY SCORING SUMMARY")
print("=" * 70)
print(f"\nModel: {MODEL_ID}")
print(f"\nDataset:")
print(f"  Total Interviews: {len(interviews_df)}")
print(f"  K Samples per Interview: {K}")
print(f"  Total Scoring Samples: {len(samples_df)}")
print(f"  Final Valid Outputs: {len(final_df)}")

print(f"\nScore Statistics:")
print(f"  Overall Weighted Score:")
print(f"    Mean:   {final_df['overall_weighted_score'].mean():.2f}")
print(f"    Median: {final_df['overall_weighted_score'].median():.2f}")
print(f"    Std:    {final_df['overall_weighted_score'].std():.2f}")
print(f"    Min:    {final_df['overall_weighted_score'].min():.2f}")
print(f"    Max:    {final_df['overall_weighted_score'].max():.2f}")

print(f"\nMetric Score Averages:")
for m in METRICS:
    col = f"{m}_score_agg"
    if col in aggregated_df.columns:
        mean_score = aggregated_df[col].mean()
        median_score = aggregated_df[col].median()
        print(f"  {m.upper():6s}: Mean={mean_score:.2f}, Median={median_score:.0f}")

print(f"\nConfidence Analysis:")
for m in METRICS:
    conf_col = f"{m}_confidence"
    if conf_col in aggregated_df.columns:
        high = (aggregated_df[conf_col] == "high").sum()
        med = (aggregated_df[conf_col] == "medium").sum()
        low = (aggregated_df[conf_col] == "low").sum()
        total = high + med + low
        high_pct = 100 * high / total if total > 0 else 0
        print(f"  {m.upper():6s}: High={high} ({high_pct:.0f}%), Medium={med}, Low={low}")

print(f"\nLatency Statistics:")
print(f"  Scoring (per sample):")
print(f"    Median: {samples_df['latency_ms'].median():.0f}ms")
print(f"    P95:    {samples_df['latency_ms'].quantile(0.95):.0f}ms")
print(f"    P99:    {samples_df['latency_ms'].quantile(0.99):.0f}ms")
print(f"  Rewrite (per interview):")
print(f"    Median: {final_df['rewrite_latency_ms'].median():.0f}ms")
print(f"    P95:    {final_df['rewrite_latency_ms'].quantile(0.95):.0f}ms")
print(f"    P99:    {final_df['rewrite_latency_ms'].quantile(0.99):.0f}ms")

print(f"\nScore Distribution by Percentile:")
percentiles = [10, 25, 50, 75, 90]
for p in percentiles:
    val = final_df['overall_weighted_score'].quantile(p/100)
    print(f"  P{p:2d}: {val:.2f}")

print("=" * 70)

SELF-CONSISTENCY SCORING SUMMARY

Model: meta-llama/Llama-3.1-8B-Instruct:novita

Dataset:
  Total Interviews: 50
  K Samples per Interview: 3
  Total Scoring Samples: 150
  Final Valid Outputs: 49

Score Statistics:
  Overall Weighted Score:
    Mean:   7.76
    Median: 8.00
    Std:    0.85
    Min:    5.05
    Max:    8.70

Metric Score Averages:
  CA    : Mean=7.84, Median=8
  EXP   : Mean=7.60, Median=8
  PS    : Mean=7.58, Median=8
  REL   : Mean=8.08, Median=8
  PROF  : Mean=7.80, Median=8
  COMM  : Mean=8.10, Median=8

Confidence Analysis:
  CA    : High=49 (98%), Medium=1, Low=0
  EXP   : High=45 (90%), Medium=3, Low=2
  PS    : High=48 (96%), Medium=1, Low=1
  REL   : High=48 (96%), Medium=2, Low=0
  PROF  : High=50 (100%), Medium=0, Low=0
  COMM  : High=50 (100%), Medium=0, Low=0

Latency Statistics:
  Scoring (per sample):
    Median: 1530ms
    P95:    1786ms
    P99:    2056ms
  Rewrite (per interview):
    Median: 7649ms
    P95:    9855ms
    P99:    10893ms

Score Dist

## Optional: Mount Google Drive for Persistent Storage

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save to Drive
save_dir = "/content/drive/MyDrive/mvp"
os.makedirs(save_dir, exist_ok=True)

ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
model_safe = MODEL_ID.replace("/", "_").replace(":", "_")

interviews_df.to_csv(f"{save_dir}/interviews_{model_safe}_{ts}.csv", index=False)
samples_df.to_csv(f"{save_dir}/samples_{model_safe}_{ts}.csv", index=False)
aggregated_df.to_csv(f"{save_dir}/aggregated_{model_safe}_{ts}.csv", index=False)
final_df.to_csv(f"{save_dir}/final_{model_safe}_{ts}.csv", index=False)

print(f"✓ Saved to Google Drive: {save_dir}")