# Self-Consistency Scoring with OpenAI Models
This notebook runs self-consistency evaluation using GPT-4o and GPT-3.5-turbo models.

In [1]:
# Install required packages
!pip install openai pandas numpy pyyaml -q

In [2]:
import os
from getpass import getpass

# Set your OpenAI API key
if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"]:
    os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI API key: ")

Paste your OpenAI API key: ··········


In [3]:
from google.colab import drive
drive.mount('/content/drive')  # follow the auth prompt

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import yaml
import json
import time
import random
import statistics as stats
import re
from typing import Dict, Any, List
from datetime import datetime

## Configuration

In [15]:
CONFIG = {
    "role_title": "Field Technician",
    "question_set_id": "qs_v1",
    "question_set": [
        "Tell me about a time you handled an urgent service call. What steps did you take?",
        "How do you plan your route and prioritize jobs when schedules change during the day?",
        "Describe a tricky diagnostic you solved. What tools or methods did you use?",
        "How do you keep customers calm when they are upset or stressed?",
        "Walk me through your process for documenting work and updating tickets.",
        "What does reliability at work mean to you, and how do you demonstrate it?",
        "How do you stay safe on the job and follow site-specific rules?",
        "How do you collaborate with teammates or escalate when blocked?"
    ],
    "num_candidates": 50,
    "k_samples": 3,  # self-consistency K
    "generation_temperature": 0.8,
    "generation_max_tokens": 1200,
    "timeout_s": 60,
    "gen_prompt_version": "gen_v1",
    "score_prompt_version": "score_v1",
    "rewrite_prompt_version": "rewrite_v1",
}

# Choose your OpenAI model
# Options: "gpt-4o", "gpt-4o-mini", "gpt-3.5-turbo", "gpt-4-turbo"
# MODEL_ID = "gpt-4o"  # Change to "gpt-3.5-turbo" for GPT-3.5
MODEL_ID = "gpt-3.5-turbo"

# Scoring weights and metrics
WEIGHTS = {"ca":0.35, "exp":0.35, "ps":0.15, "rel":0.05, "prof":0.05, "comm":0.05}
METRICS = ["ca", "exp", "ps", "rel", "prof", "comm"]
FILLERS_RE = re.compile(r"\b(?:um+|uh+|erm|like|you know|sort of|kinda|i mean|ya know)\b", re.IGNORECASE)

## Helper Functions

In [16]:
def clamp_int(x, lo=1, hi=10):
    try:
        xi = int(round(float(x)))
    except Exception:
        xi = 5
    return max(lo, min(hi, xi))

def canonicalize_qa_text(text: str) -> str:
    """Clean and standardize QA text."""
    return text.strip()

def compute_overall_weighted(scores: Dict[str, int]) -> float:
    """Compute weighted overall score."""
    total = sum(WEIGHTS.get(m, 0) * scores.get(m, 5) for m in METRICS)
    return round(total, 2)

def iqr_confidence(vals: List[int]) -> str:
    """Calculate confidence based on IQR."""
    if len(vals) < 2:
        return "low"
    q1, q3 = np.percentile(vals, [25, 75])
    iqr = q3 - q1
    if iqr <= 1:
        return "high"
    elif iqr <= 2:
        return "medium"
    else:
        return "low"

## Prompt Building Functions

In [17]:
def build_generation_prompt(role: str, question_set: List[str], persona: Dict[str, Any]) -> str:
    """Build prompt for generating candidate interview responses."""
    q_block = "\n".join([f"Question {i+1}: {q}" for i, q in enumerate(question_set)])
    lines = [
        f"You are the candidate interviewing for the role: {role}.",
        f"Persona hints: title={persona['persona_title']}; years_experience={persona['yrs_experience']}; "
        f"keywords={persona['domain_keywords']}; reliability={persona['reliability_flags']}; notes={persona['notes']}.",
        "",
        "Answer each question clearly (2–4 sentences per answer).",
        "",
        q_block,
        "",
        "Return responses in this pattern:",
        "Question 1: <repeat question>",
        "Answer: <answer>",
        "",
        "Question 2: <repeat question>",
        "Answer: <answer>",
    ]
    return "\n".join(lines)

def build_scoring_prompt(qa_text: str) -> str:
    """Build prompt for scoring candidate responses."""
    metrics_def = "\n".join([
        "- Cognitive Ability (35%): Structured thinking, planning, logic.",
        "- Experience (35%): Relevant work (last 10 years), skills, accomplishments in similar service jobs.",
        "- Problem Solving (15%): Resourcefulness, safe tradeoffs under constraints.",
        "- Reliability (5%): Punctuality, follow-through, transport reliability.",
        "- Professionalism (5%): Respect for clients/rules, composure under stress.",
        "- Communication (5%): Clarity and tone; IGNORE filler words.",
    ])
    lines = [
        "Analyze the candidate responses using the six metrics below.",
        "Return ONLY a JSON object with keys: ca, exp, ps, rel, prof, comm (each 1–10).",
        "",
        "Definitions (approximate weighting):",
        metrics_def,
        "",
        "Candidate Responses:",
        "--- START RESPONSES ---",
        qa_text,
        "--- END RESPONSES ---",
        "",
        '{"ca":8,"exp":7,"ps":7,"rel":7,"prof":6,"comm":6}',
    ]
    return "\n".join(lines)

def build_rewrite_prompt_locked(qa_text: str, s: Dict[str, int]) -> str:
    """Build prompt for generating justifications with locked scores."""
    lines = [
        "Use FIXED scores; DO NOT change them. Generate justifications + bullets + summary.",
        "",
        f"- Cognitive Ability: {s['ca']}",
        f"- Experience: {s['exp']}",
        f"- Problem Solving: {s['ps']}",
        f"- Reliability: {s['rel']}",
        f"- Professionalism: {s['prof']}",
        f"- Communication: {s['comm']}",
        "",
        "Return ONLY this JSON:",
        "{",
        f'  "cognitive_ability_score": {s["ca"]},',
        '  "cognitive_ability_justification": "...",',
        f'  "experience_score": {s["exp"]},',
        '  "experience_justification": "...",',
        f'  "reliability_score": {s["rel"]},',
        '  "reliability_justification": "...",',
        f'  "professionalism_score": {s["prof"]},',
        '  "professionalism_justification": "...",',
        f'  "problem_solving_score": {s["ps"]},',
        '  "problem_solving_justification": "...",',
        f'  "communication_score": {s["comm"]},',
        '  "communication_justification": "...",',
        '  "general_strengths": "- ...\\n- ...\\n- ...",',
        '  "general_weaknesses": "- ...\\n- ...\\n- ...",',
        '  "general_summary": "..."',
        "}",
        "",
        "Candidate Responses:",
        "--- START ---",
        qa_text,
        "--- END ---",
    ]
    return "\n".join(lines)

## OpenAI API Client Functions

In [18]:
from openai import OpenAI

# Initialize OpenAI client
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def openai_chat_once(prompt: str, model: str, temperature: float = 0.7,
                     max_tokens: int = 512, json_mode: bool = False) -> str:
    """Call OpenAI chat completion API."""
    kwargs = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "top_p": 1.0,
    }

    if json_mode:
        # Use JSON mode for structured outputs
        kwargs["response_format"] = {"type": "json_object"}

    try:
        resp = openai_client.chat.completions.create(**kwargs)
        return resp.choices[0].message.content
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        raise

def openai_chat_json(prompt: str, model: str, temperature: float,
                     max_tokens: int = 512) -> tuple:
    """Call OpenAI API and parse JSON response."""
    txt = openai_chat_once(prompt, model=model, temperature=temperature,
                          max_tokens=max_tokens, json_mode=True)
    try:
        return json.loads(txt), txt
    except Exception:
        # Fallback: try to extract JSON from text
        try:
            start = txt.index("{")
            end = txt.rindex("}") + 1
            return json.loads(txt[start:end]), txt
        except Exception:
            return {}, txt

## Step 1: Generate Synthetic Interview Responses

In [None]:
random.seed(123)

role = CONFIG["role_title"]
questions = CONFIG["question_set"]
N = CONFIG["num_candidates"]

interviews = []
for i in range(N):
    persona = {
        "candidate_id": f"cand_{i+1:04d}",
        "persona_title": random.choice([
            "Veteran field tech", "Career switcher", "Recent grad",
            "Retail service rep", "HVAC junior"
        ]),
        "yrs_experience": random.choice([0, 1, 2, 3, 5, 7, 10]),
        "domain_keywords": random.choice([
            "preventive maintenance, HVAC, route planning",
            "customer empathy, troubleshooting basics",
            "inventory, parts ordering, safety protocols",
            "ticket triage, escalation, SLA awareness",
        ]),
        "reliability_flags": random.choice([
            "has_car; weekend_ok", "public_transit", "night_shift_ok"
        ]),
        "notes": random.choice([
            "calm under pressure", "fast learner", "detail-oriented"
        ]),
    }

    prompt = build_generation_prompt(role, questions, persona)
    out = openai_chat_once(
        prompt,
        model=MODEL_ID,
        temperature=CONFIG["generation_temperature"],
        max_tokens=CONFIG["generation_max_tokens"]
    )

    interviews.append({
        "interview_id": f"intv_{i+1:04d}",
        "candidate_id": persona["candidate_id"],
        "role_title": role,
        "question_set_id": CONFIG["question_set_id"],
        "num_questions": len(questions),
        "qa_text": canonicalize_qa_text(out),
        "source": "synthetic",
        "gen_model": MODEL_ID,
        "gen_prompt_version": CONFIG["gen_prompt_version"],
        "gen_temperature": CONFIG["generation_temperature"],
        "gen_top_p": 1.0,
        "gen_seed": 123,
        "created_at": pd.Timestamp.utcnow().isoformat(),
    })

    if (i + 1) % 10 == 0:
        print(f"Generated {i + 1}/{N} interviews...")

interviews_df = pd.DataFrame(interviews)
print(f"\nGenerated {len(interviews_df)} interviews")
display(interviews_df.head(2))

In [19]:
interviews_df = pd.read_csv("/content/drive/MyDrive/mvp/synthInterviews20251009-234435.csv")

## Step 2: Generate K Self-Consistency Samples (Scoring)

In [20]:
K = CONFIG["k_samples"]
records = interviews_df[["interview_id", "qa_text"]].to_dict("records")

samples = []
for idx, iv in enumerate(records):
    qa = iv["qa_text"] or ""
    sprompt = build_scoring_prompt(qa)

    for k in range(K):
        t0 = time.time()
        js, raw = openai_chat_json(
            sprompt,
            model=MODEL_ID,
            temperature=0.7,
            max_tokens=256
        )
        latency = int((time.time() - t0) * 1000)

        row = {
            "interview_id": iv["interview_id"],
            "run_idx": k,
            "model_name": MODEL_ID,
            "latency_ms": latency,
        }
        for m in METRICS:
            row[m] = clamp_int(js.get(m, 5))
        samples.append(row)

    if (idx + 1) % 10 == 0:
        print(f"Scored {idx + 1}/{len(records)} interviews (K={K} each)...")

samples_df = pd.DataFrame(samples)
print(f"\nGenerated {len(samples_df)} scoring samples")
display(samples_df.head(9))  # Show 3 samples for 3 interviews

Scored 10/50 interviews (K=3 each)...
Scored 20/50 interviews (K=3 each)...
Scored 30/50 interviews (K=3 each)...
Scored 40/50 interviews (K=3 each)...
Scored 50/50 interviews (K=3 each)...

Generated 150 scoring samples


Unnamed: 0,interview_id,run_idx,model_name,latency_ms,ca,exp,ps,rel,prof,comm
0,intv_0001,0,gpt-3.5-turbo,1495,8,7,7,7,6,6
1,intv_0001,1,gpt-3.5-turbo,962,8,7,7,7,6,6
2,intv_0001,2,gpt-3.5-turbo,1104,8,7,7,7,6,6
3,intv_0002,0,gpt-3.5-turbo,1416,8,7,7,7,6,6
4,intv_0002,1,gpt-3.5-turbo,1170,8,7,7,7,6,6
5,intv_0002,2,gpt-3.5-turbo,1102,8,7,7,7,6,6
6,intv_0003,0,gpt-3.5-turbo,1219,8,7,7,7,6,6
7,intv_0003,1,gpt-3.5-turbo,776,8,7,7,7,6,6
8,intv_0003,2,gpt-3.5-turbo,1041,8,7,7,7,6,6


## Step 3: Aggregate Scores (Self-Consistency)

In [21]:
aggs = []
for intv_id, g in samples_df.groupby("interview_id"):
    row = {"interview_id": intv_id}
    latencies = g["latency_ms"].tolist()

    for m in METRICS:
        vals = [int(v) for v in g[m].tolist()]
        row[f"{m}_score_agg"] = clamp_int(stats.median(vals))
        row[f"{m}_confidence"] = iqr_confidence(vals)

    row["overall_weighted_agg"] = compute_overall_weighted(
        {m: row[f"{m}_score_agg"] for m in METRICS}
    )
    row["p95_latency_ms"] = float(np.percentile(latencies, 95)) if latencies else 0
    aggs.append(row)

aggregated_df = pd.DataFrame(aggs)
print(f"Aggregated {len(aggregated_df)} interview scores")
display(aggregated_df.head())

Aggregated 50 interview scores


Unnamed: 0,interview_id,ca_score_agg,ca_confidence,exp_score_agg,exp_confidence,ps_score_agg,ps_confidence,rel_score_agg,rel_confidence,prof_score_agg,prof_confidence,comm_score_agg,comm_confidence,overall_weighted_agg,p95_latency_ms
0,intv_0001,8,high,7,high,7,high,7,high,6,high,6,high,7.25,1455.9
1,intv_0002,8,high,7,high,7,high,7,high,6,high,6,high,7.25,1391.4
2,intv_0003,8,high,7,high,7,high,7,high,6,high,6,high,7.25,1201.2
3,intv_0004,8,high,7,high,7,high,7,high,6,high,6,high,7.25,3427.5
4,intv_0005,8,high,7,high,7,high,7,high,6,high,6,high,7.25,1182.6


## Step 4: Generate Final Outputs with Justifications

In [22]:
qa_map = dict(zip(interviews_df["interview_id"], interviews_df["qa_text"]))

def rewrite_once_chat(qa_text: str, scores_locked: Dict[str, int],
                     model: str, max_tokens: int = 1200) -> tuple:
    """Generate justifications for locked scores."""
    prompt = (
        "Respond with exactly ONE JSON object, no code fences, no prose, no markdown. "
        "Do not include any keys that are not requested.\n\n"
        + build_rewrite_prompt_locked(qa_text, scores_locked)
    )
    t0 = time.time()
    js, raw = openai_chat_json(prompt, model=model, temperature=0.0, max_tokens=max_tokens)
    latency = int((time.time() - t0) * 1000)
    return js, raw, latency

final_rows = []
invalid_count = 0

for _, agg_row in aggregated_df.iterrows():
    iid = agg_row["interview_id"]
    qa_text = qa_map.get(iid, "")
    if not qa_text:
        invalid_count += 1
        continue

    scores_locked = {m: agg_row[f"{m}_score_agg"] for m in METRICS}
    js, raw_text, lat = rewrite_once_chat(qa_text, scores_locked, model=MODEL_ID)

    # Validate scores match
    score_keys = [
        "cognitive_ability_score", "experience_score", "reliability_score",
        "professionalism_score", "problem_solving_score", "communication_score"
    ]
    if all(k in js for k in score_keys):
        out_row = {
            "interview_id": iid,
            "cognitive_ability_score": js.get("cognitive_ability_score"),
            "cognitive_ability_justification": js.get("cognitive_ability_justification", ""),
            "experience_score": js.get("experience_score"),
            "experience_justification": js.get("experience_justification", ""),
            "reliability_score": js.get("reliability_score"),
            "reliability_justification": js.get("reliability_justification", ""),
            "professionalism_score": js.get("professionalism_score"),
            "professionalism_justification": js.get("professionalism_justification", ""),
            "problem_solving_score": js.get("problem_solving_score"),
            "problem_solving_justification": js.get("problem_solving_justification", ""),
            "communication_score": js.get("communication_score"),
            "communication_justification": js.get("communication_justification", ""),
            "general_strengths": js.get("general_strengths", ""),
            "general_weaknesses": js.get("general_weaknesses", ""),
            "general_summary": js.get("general_summary", ""),
            "overall_weighted_score": agg_row["overall_weighted_agg"],
            "rewrite_latency_ms": lat,
            "rewrite_model": MODEL_ID,
        }
        final_rows.append(out_row)
    else:
        invalid_count += 1

final_df = pd.DataFrame(final_rows)
print(f"Final rows: {len(final_df)} | Invalid rewrites: {invalid_count}")
display(final_df.head())

Final rows: 50 | Invalid rewrites: 0


Unnamed: 0,interview_id,cognitive_ability_score,cognitive_ability_justification,experience_score,experience_justification,reliability_score,reliability_justification,professionalism_score,professionalism_justification,problem_solving_score,problem_solving_justification,communication_score,communication_justification,general_strengths,general_weaknesses,general_summary,overall_weighted_score,rewrite_latency_ms,rewrite_model
0,intv_0001,8,Demonstrated strong problem-solving skills in ...,7,Seasoned field technician with a proven track ...,7,"Prioritizes safety, follows site-specific rule...",6,Maintains a calm and professional demeanor whe...,7,Successfully resolved tricky diagnostics using...,6,"Effectively collaborates with teammates, escal...",- Strong problem-solving skills\n- Seasoned fi...,- Communication skills can be further improved...,"Overall, the candidate demonstrates a high lev...",7.25,5862,gpt-3.5-turbo
1,intv_0002,8,Demonstrated ability to analyze complex issues...,7,Seasoned field technician with a track record ...,7,"Dependable, responsible, and consistent in act...",6,Maintains a calm and professional demeanor whe...,7,Utilizes a combination of tools and methods to...,6,Clear and concise communication with customers...,- Strong problem-solving skills\n- Excellent e...,- Communication skills could be further improv...,The candidate demonstrates strong cognitive ab...,7.25,3291,gpt-3.5-turbo
2,intv_0003,8,The candidate demonstrates strong problem-solv...,7,The candidate has relevant experience as an HV...,7,The candidate emphasizes the importance of rel...,6,The candidate exhibits professionalism in deal...,7,The candidate demonstrates strong problem-solv...,6,The candidate effectively communicates with cu...,- Strong problem-solving skills\n- Relevant ex...,- Room for improvement in professionalism\n- C...,"Overall, the candidate shows promise with soli...",7.25,3599,gpt-3.5-turbo
3,intv_0004,8,The candidate demonstrated strong problem-solv...,7,The candidate has relevant experience as a Fie...,7,The candidate emphasized the importance of rel...,6,The candidate maintained a professional demean...,7,The candidate showcased strong problem-solving...,6,The candidate demonstrated effective communica...,- Strong problem-solving skills\n- Relevant ex...,- Room for improvement in professionalism\n- O...,"The candidate excels in cognitive ability, pro...",7.25,3504,gpt-3.5-turbo
4,intv_0005,8,The candidate demonstrates strong cognitive ab...,7,Although the candidate lacks direct experience...,7,The candidate emphasizes the importance of rel...,6,While the candidate maintains a professional a...,7,The candidate shows an understanding of proble...,6,The candidate exhibits clear communication ski...,- Strong cognitive abilities\n- Transferable s...,- Room for improvement in professionalism\n- P...,"Overall, the candidate shows promise with thei...",7.25,5841,gpt-3.5-turbo


## Save Results

In [23]:
# Save all dataframes
ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
model_safe = MODEL_ID.replace("/", "_").replace(":", "_")

interviews_df.to_csv(f"interviews_{model_safe}_{ts}.csv", index=False)
samples_df.to_csv(f"samples_{model_safe}_{ts}.csv", index=False)
aggregated_df.to_csv(f"aggregated_{model_safe}_{ts}.csv", index=False)
final_df.to_csv(f"final_{model_safe}_{ts}.csv", index=False)

print(f"Saved all results with timestamp: {ts}")
print(f"Model used: {MODEL_ID}")
print(f"\nFiles saved:")
print(f"- interviews_{model_safe}_{ts}.csv")
print(f"- samples_{model_safe}_{ts}.csv")
print(f"- aggregated_{model_safe}_{ts}.csv")
print(f"- final_{model_safe}_{ts}.csv")

Saved all results with timestamp: 20251104-231014
Model used: gpt-3.5-turbo

Files saved:
- interviews_gpt-3.5-turbo_20251104-231014.csv
- samples_gpt-3.5-turbo_20251104-231014.csv
- aggregated_gpt-3.5-turbo_20251104-231014.csv
- final_gpt-3.5-turbo_20251104-231014.csv


  ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")


## Summary Statistics

In [14]:
print("=" * 60)
print(f"SELF-CONSISTENCY SCORING SUMMARY ({MODEL_ID})")
print("=" * 60)
print(f"\nTotal Interviews: {len(interviews_df)}")
print(f"K Samples per Interview: {K}")
print(f"Total Scoring Samples: {len(samples_df)}")
print(f"Final Valid Outputs: {len(final_df)}")
print(f"\nScore Statistics:")
print(f"  Overall Weighted Score: {final_df['overall_weighted_score'].mean():.2f} ± {final_df['overall_weighted_score'].std():.2f}")
print(f"\nMetric Averages:")
for m in METRICS:
    col = f"{m}_score_agg"
    if col in aggregated_df.columns:
        print(f"  {m.upper()}: {aggregated_df[col].mean():.2f}")
print(f"\nLatency Statistics:")
print(f"  Scoring P95 Latency: {samples_df['latency_ms'].quantile(0.95):.0f}ms")
print(f"  Rewrite P95 Latency: {final_df['rewrite_latency_ms'].quantile(0.95):.0f}ms")
print("=" * 60)

SELF-CONSISTENCY SCORING SUMMARY (gpt-4o)

Total Interviews: 50
K Samples per Interview: 3
Total Scoring Samples: 150
Final Valid Outputs: 50

Score Statistics:
  Overall Weighted Score: 7.37 ± 0.93

Metric Averages:
  CA: 7.78
  EXP: 7.10
  PS: 7.24
  REL: 7.42
  PROF: 7.08
  COMM: 7.10

Latency Statistics:
  Scoring P95 Latency: 3136ms
  Rewrite P95 Latency: 16108ms


In [24]:
print("=" * 60)
print(f"SELF-CONSISTENCY SCORING SUMMARY ({MODEL_ID})")
print("=" * 60)
print(f"\nTotal Interviews: {len(interviews_df)}")
print(f"K Samples per Interview: {K}")
print(f"Total Scoring Samples: {len(samples_df)}")
print(f"Final Valid Outputs: {len(final_df)}")
print(f"\nScore Statistics:")
print(f"  Overall Weighted Score: {final_df['overall_weighted_score'].mean():.2f} ± {final_df['overall_weighted_score'].std():.2f}")
print(f"\nMetric Averages:")
for m in METRICS:
    col = f"{m}_score_agg"
    if col in aggregated_df.columns:
        print(f"  {m.upper()}: {aggregated_df[col].mean():.2f}")
print(f"\nLatency Statistics:")
print(f"  Scoring P95 Latency: {samples_df['latency_ms'].quantile(0.95):.0f}ms")
print(f"  Rewrite P95 Latency: {final_df['rewrite_latency_ms'].quantile(0.95):.0f}ms")
print("=" * 60)

SELF-CONSISTENCY SCORING SUMMARY (gpt-3.5-turbo)

Total Interviews: 50
K Samples per Interview: 3
Total Scoring Samples: 150
Final Valid Outputs: 50

Score Statistics:
  Overall Weighted Score: 7.25 ± 0.00

Metric Averages:
  CA: 8.00
  EXP: 7.00
  PS: 7.00
  REL: 7.00
  PROF: 6.00
  COMM: 6.00

Latency Statistics:
  Scoring P95 Latency: 1516ms
  Rewrite P95 Latency: 7224ms
