# Synthetic Interview Dataset Generator

This notebook generates synthetic interview data for 5 service roles:
- Customer Service Representative
- Sales Representative
- Field Technician
- Home Service Technician
- General Manager (Franchise)

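Each role gets 50 synthetic candidates of varying quality. Interview questions are written against the following weighted metrics (the weights appear in the question-generation prompt; each candidate's per-metric scores are sampled around their overall score rather than combined by weight):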
- Cognitive Ability (35%)
- Experience (35%)
- Problem Solving (15%)
- Reliability (5%)
- Professionalism (5%)
- Communication (5%)

In [1]:
# Install required packages
!pip install huggingface_hub pandas numpy tqdm openai --quiet

In [2]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from typing import Dict, List, Tuple
import random

# Import based on provider choice
try:
    from huggingface_hub import InferenceClient
except ImportError:
    print("Warning: huggingface_hub not found. Install with: pip install huggingface_hub")

try:
    from openai import OpenAI
except ImportError:
    print("Warning: openai not found. Install with: pip install openai")

In [5]:
# Configuration
# Choose your inference provider:
# Option 1: Hugging Face Inference API (recommended)
# Option 2: OpenAI-compatible API (Novita AI, Together AI, etc.)

# call_llm() branches on this value; valid values are the two listed here.
INFERENCE_PROVIDER = "huggingface"  # "huggingface" or "openai_compatible"

# For Hugging Face
# os.getenv returns None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_MODEL = "meta-llama/Llama-3.1-8B-Instruct"

# For OpenAI-compatible APIs (Novita AI, Together AI, etc.)
# Only OPENAI_API_KEY is read here; export your provider's key (e.g. a Novita
# key) under that name.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # or NOVITA_API_KEY
OPENAI_BASE_URL = "https://api.novita.ai/v3/openai"  # Change based on your provider
OPENAI_MODEL = "meta-llama/llama-3.1-8b-instruct"  # Novita format

# Set random seed for reproducibility
# These seeds fix the numpy/random draws (candidate scores and metric scores).
# The LLM requests in this notebook pass no seed, so generated text is not
# covered by them.
np.random.seed(42)
random.seed(42)

In [20]:
# Build the client that matches INFERENCE_PROVIDER so that call_llm() talks to
# the backend (and model id) that was actually configured, instead of always
# using the Hugging Face client regardless of the setting.
if INFERENCE_PROVIDER == "huggingface":
    # HF_TOKEN may be None here; it is passed through unchanged, as before.
    client = InferenceClient(token=HF_TOKEN)
    MODEL_NAME = HF_MODEL
elif INFERENCE_PROVIDER == "openai_compatible":
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY environment variable not set")
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    MODEL_NAME = OPENAI_MODEL
else:
    # Fail at setup time rather than on the first LLM call.
    raise ValueError(f"Invalid INFERENCE_PROVIDER: {INFERENCE_PROVIDER}")

In [21]:
# # Initialize the appropriate client
# if INFERENCE_PROVIDER == "huggingface":
#     print("Using Hugging Face Inference API")
#     if not HF_TOKEN:
#         raise ValueError("HF_TOKEN environment variable not set")
#     client = InferenceClient(token=HF_TOKEN)
#     MODEL_NAME = HF_MODEL

# elif INFERENCE_PROVIDER == "openai_compatible":
#     print("Using OpenAI-compatible API")
#     if not OPENAI_API_KEY:
#         raise ValueError("OPENAI_API_KEY environment variable not set")
#     client = OpenAI(
#         api_key=OPENAI_API_KEY,
#         base_url=OPENAI_BASE_URL
#     )
#     MODEL_NAME = OPENAI_MODEL

# else:
#     raise ValueError(f"Invalid INFERENCE_PROVIDER: {INFERENCE_PROVIDER}")

# print(f"Model: {MODEL_NAME}")

In [22]:
# Roles to interview for, and the scoring rubric quoted in the prompts.
ROLES = [
    "Customer Service Representative",
    "Sales Representative",
    "Field Technician",
    "Home Service Technician",
    "General Manager (Franchise)",
]

# One rubric line per metric, newline-separated, with the weight in parentheses.
METRICS_DEF = (
    "- Cognitive Ability (35%): Structured thinking, planning, logic.\n"
    "- Experience (35%): Relevant work (last 10 years), skills, accomplishments in similar service jobs.\n"
    "- Problem Solving (15%): Resourcefulness, safe tradeoffs under constraints.\n"
    "- Reliability (5%): Punctuality, follow-through, transport reliability.\n"
    "- Professionalism (5%): Respect for clients/rules, composure under stress.\n"
    "- Communication (5%): Clarity and tone; IGNORE filler words."
)

# Dataset sizing: questions per interview, how many of those are yes/no,
# and how many candidates are generated for each role.
NUM_CANDIDATES_PER_ROLE = 50
NUM_QUESTIONS = 10
NUM_YES_NO_QUESTIONS = 3

In [23]:
def call_llm(
    messages: List[Dict],
    max_tokens: int = 1000,
    temperature: float = 0.7,
    max_retries: int = 5,
) -> str:
    """Send a chat request to the configured LLM and return the reply text.

    Args:
        messages: Chat messages in the ``{"role": ..., "content": ...}`` form.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        max_retries: Total number of attempts before giving up (at least 1).

    Returns:
        The stripped text content of the first completion choice.

    Raises:
        ValueError: If ``INFERENCE_PROVIDER`` is not a supported value.
        RuntimeError: If every attempt failed; the last error is chained.
    """
    # A misconfigured provider can never succeed, so fail before any request.
    if INFERENCE_PROVIDER not in ("huggingface", "openai_compatible"):
        raise ValueError(f"Invalid INFERENCE_PROVIDER: {INFERENCE_PROVIDER}")

    attempts = max(1, max_retries)
    last_error = None

    # Bounded loop instead of unbounded recursion: a permanent failure (bad
    # token, unknown model) now surfaces as an error instead of retrying
    # until the interpreter hits its recursion limit.
    for attempt in range(1, attempts + 1):
        try:
            # Both providers are called through the same chat-completions
            # interface, so one code path serves either client.
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
            content = response.choices[0].message.content
            if content is None:
                raise ValueError("LLM returned a message with no content")
            return content.strip()
        except Exception as e:  # API/network boundary: log, then retry
            last_error = e
            print(f"Error calling LLM (attempt {attempt}/{attempts}): {e}")
            if attempt < attempts:
                time.sleep(2)  # Wait before retry

    raise RuntimeError(
        f"LLM call failed after {attempts} attempts"
    ) from last_error

In [24]:
def generate_questions_for_role(role: str, max_attempts: int = 3) -> List[Dict[str, str]]:
    """Generate interview questions for a specific role.

    Asks the LLM for a JSON array of questions and validates the reply. A
    reply that cannot be parsed, or that is not a non-empty list of
    ``{"question": str, "type": "yes_no" | "open_ended"}`` objects, triggers
    a fresh request instead of being accepted or replaced by placeholders
    straight away.

    Args:
        role: Job title the questions are written for.
        max_attempts: How many LLM replies to try before falling back.

    Returns:
        A list of question dicts with ``"question"`` and ``"type"`` keys.
        The number of questions and the yes/no split are requested in the
        prompt but not enforced here. If every attempt fails, generic
        placeholder questions are returned and a warning is printed.
    """
    prompt = f"""You are an expert interviewer creating interview questions for a {role} position.

Based on these evaluation metrics:
{METRICS_DEF}

Generate exactly {NUM_QUESTIONS} interview questions:
- {NUM_YES_NO_QUESTIONS} should be yes/no questions
- {NUM_QUESTIONS - NUM_YES_NO_QUESTIONS} should be open-ended questions

The questions should assess the candidate across all metrics.

Format your response as a JSON array with this structure:
[{{"question": "...", "type": "yes_no"}}, {{"question": "...", "type": "open_ended"}}, ...]

Return ONLY the JSON array, no additional text."""

    messages = [{"role": "user", "content": prompt}]
    valid_types = ("yes_no", "open_ended")
    attempts = max(1, max_attempts)

    for attempt in range(1, attempts + 1):
        response = call_llm(messages, max_tokens=1500, temperature=0.8)

        # Try to extract JSON if wrapped in markdown
        if "```json" in response:
            response = response.split("```json")[1].split("```")[0].strip()
        elif "```" in response:
            response = response.split("```")[1].split("```")[0].strip()

        # Drop any chatter before/after the array itself.
        start, end = response.find("["), response.rfind("]")
        candidate = response[start:end + 1] if 0 <= start < end else response

        try:
            questions = json.loads(candidate)
        except json.JSONDecodeError as e:
            print(f"Error parsing questions JSON (attempt {attempt}/{attempts}): {e}")
            print(f"Response: {response}")
            continue

        # Downstream code indexes q["question"] and q["type"], so reject
        # anything that would fail there later.
        well_formed = (
            isinstance(questions, list)
            and bool(questions)
            and all(
                isinstance(q, dict)
                and isinstance(q.get("question"), str)
                and q.get("type") in valid_types
                for q in questions
            )
        )
        if well_formed:
            return questions

        print(f"Unexpected question structure (attempt {attempt}/{attempts})")
        print(f"Response: {response}")

    # Last resort only: these placeholders carry no real interview content.
    print(f"WARNING: using placeholder questions for {role} after {attempts} failed attempts")
    return [
        {"question": f"Question {i+1} for {role}", "type": "yes_no" if i < NUM_YES_NO_QUESTIONS else "open_ended"}
        for i in range(NUM_QUESTIONS)
    ]

In [25]:
def assign_candidate_score() -> Tuple[float, str]:
    """Randomly assign a candidate quality score and its matching label.

    Returns:
        ``(score, quality)`` where ``score`` is drawn uniformly from
        [1.0, 10.0) and rounded to two decimals, and ``quality`` is
        ``"good"`` (score >= 8.0), ``"moderate"`` (score >= 5.0) or
        ``"poor"`` (otherwise).
    """
    # Round before classifying so the stored score and its label always
    # agree: a raw draw of 7.996 is stored as 8.0 and must be "good".
    score = round(np.random.uniform(1.0, 10.0), 2)

    if score >= 8.0:
        quality = "good"
    elif score >= 5.0:
        quality = "moderate"
    else:
        quality = "poor"

    return score, quality

In [26]:
def generate_metric_scores(overall_score: float) -> Dict[str, float]:
    """Sample one score per metric, scattered around the overall score.

    Each metric is drawn from a normal distribution centred on
    ``overall_score`` (the 1.5 below is passed as the distribution's scale),
    limited to the 1-10 range and rounded to two decimals.
    """
    spread = 1.5
    metric_names = (
        "cognitive_ability",
        "experience",
        "problem_solving",
        "reliability",
        "professionalism",
        "communication",
    )

    metric_scores = {}
    # One draw per metric, in this fixed order, so seeded runs are repeatable.
    for name in metric_names:
        sampled = np.random.normal(overall_score, spread)
        metric_scores[name] = round(np.clip(sampled, 1, 10), 2)

    return metric_scores

In [27]:
def generate_answer(question: str, question_type: str, role: str, overall_score: float, quality: str) -> str:
    """Ask the LLM to answer one question in the voice of a given candidate.

    ``quality`` must be one of "good", "moderate" or "poor"; it selects the
    persona description inserted into the roleplay prompt. Returns whatever
    text ``call_llm`` produces for that prompt.
    """
    persona_by_quality = {
        "good": "excellent candidate with strong experience, clear thinking, and great problem-solving skills",
        "moderate": "average candidate with some relevant experience but room for improvement",
        "poor": "weak candidate with limited experience, unclear thinking, or poor communication"
    }
    persona = persona_by_quality[quality]

    roleplay_prompt = f"""You are roleplaying as a {quality} candidate (score: {overall_score}/10) interviewing for a {role} position.

A {quality} candidate is: {persona}

Question: {question}
Question Type: {question_type}

Respond naturally as this candidate would. Your answer should reflect your quality level:
- Good candidates (8-10): Detailed, structured, shows experience and insight
- Moderate candidates (5-8): Adequate but may lack depth or specific examples
- Poor candidates (1-5): Vague, inexperienced, or shows poor judgment

For yes/no questions, start with yes or no, then briefly explain.
Keep answers realistic and conversational (2-5 sentences for open-ended, 1-2 for yes/no).

Return ONLY the answer, no additional formatting or labels."""

    # Short replies at a high temperature, for varied conversational answers.
    return call_llm(
        [{"role": "user", "content": roleplay_prompt}],
        max_tokens=300,
        temperature=0.9,
    )

In [28]:
def generate_interview(role: str, questions: List[Dict], interview_id: str) -> Dict:
    """Build one candidate's full interview record.

    Draws an overall score and quality label, samples per-metric scores,
    then has the LLM answer every question in ``questions`` as that
    candidate. The returned dict holds the id, role, scores, the list of
    Q&A pairs and a plain-text transcript of the same exchange.
    """
    # Scores are drawn before any answers are generated.
    overall_score, quality = assign_candidate_score()
    metric_scores = generate_metric_scores(overall_score)

    qa_pairs = []
    transcript_turns = []
    for item in questions:
        question_text = item["question"]
        question_kind = item["type"]

        reply = generate_answer(
            question=question_text,
            question_type=question_kind,
            role=role,
            overall_score=overall_score,
            quality=quality,
        )

        qa_pairs.append({
            "question": question_text,
            "question_type": question_kind,
            "answer": reply,
        })
        transcript_turns.append(f"Q: {question_text}\nA: {reply}")

        time.sleep(0.5)  # Rate limiting

    return {
        "interview_id": interview_id,
        "role": role,
        "overall_score": overall_score,
        "quality": quality,
        "metric_scores": metric_scores,
        "qa_pairs": qa_pairs,
        "full_transcript": "\n\n".join(transcript_turns),
    }

## Generate Questions for Each Role

First, we'll generate the interview questions for each role.

In [29]:
# Build the question bank: one list of questions per role.
role_questions = {}

print("Generating interview questions for each role...\n")
for role in ROLES:
    print(f"Generating questions for: {role}")
    questions = generate_questions_for_role(role)
    role_questions[role] = questions

    # Report how the model split the questions between the two types.
    print(f"  Generated {len(questions)} questions")
    yes_no_total = len([q for q in questions if q['type'] == 'yes_no'])
    print(f"  Yes/No questions: {yes_no_total}")
    open_ended_total = len([q for q in questions if q['type'] == 'open_ended'])
    print(f"  Open-ended questions: {open_ended_total}\n")

    time.sleep(1)  # Rate limiting between roles

print("✓ Questions generated for all roles")

Generating interview questions for each role...

Generating questions for: Customer Service Representative
  Generated 10 questions
  Yes/No questions: 4
  Open-ended questions: 6

Generating questions for: Sales Representative
Error parsing questions JSON: Expecting ',' delimiter: line 11 column 165 (char 1248)
Response: [
  {"question": "Have you ever worked with CRM software?", "type": "yes_no"},
  {"question": "Can you describe a time when you had to meet an ambitious sales target within a tight deadline?", "type": "open_ended"},
  {"question": "Do you have a valid driver's license?", "type": "yes_no"},
  {"question": "How would you handle a situation where a client is unsatisfied with the product or service?", "type": "open_ended"},
  {"question": "Have you ever had to adjust your sales strategy on the fly to meet a client's changing needs?", "type": "open_ended"},
  {"question": "How do you prioritize your tasks and manage your time effectively?", "type": "open_ended"},
  {"quest

In [30]:
# Preview the first three questions generated for the first role.
sample_role = ROLES[0]
print(f"Sample questions for {sample_role}:\n")
for position, entry in enumerate(role_questions[sample_role][:3], start=1):
    print(f"{position}. [{entry['type']}] {entry['question']}")

Sample questions for Customer Service Representative:

1. [yes_no] Have you worked in a customer-facing role for at least 6 months?
2. [open_ended] Tell me about a time when you had to handle a high volume of customer calls or queries within a tight timeframe, and how you managed your workload.
3. [yes_no] Are you available to work a standard 8-hour shift, 5 days a week?


## Generate Synthetic Interview Dataset

Now we'll generate 50 candidates for each role with varying quality levels.

In [31]:
# Run the full generation: every role, NUM_CANDIDATES_PER_ROLE candidates each.
all_interviews = []

print(f"Generating {NUM_CANDIDATES_PER_ROLE} interviews per role...\n")
print(f"Total interviews to generate: {len(ROLES) * NUM_CANDIDATES_PER_ROLE}\n")

banner = '=' * 60
for role in ROLES:
    print(f"\n{banner}")
    print(f"Generating interviews for: {role}")
    print(banner)

    questions = role_questions[role]
    # Interview ids look like "<role_in_lowercase_with_underscores>_001".
    id_prefix = role.lower().replace(' ', '_')

    for candidate_num in tqdm(range(NUM_CANDIDATES_PER_ROLE), desc=role):
        ordinal = candidate_num + 1
        interview = generate_interview(role, questions, f"{id_prefix}_{ordinal:03d}")
        all_interviews.append(interview)

        # Every 10th candidate of a role, rewrite the checkpoint file with
        # everything generated so far.
        if ordinal % 10 == 0:
            pd.DataFrame(all_interviews).to_json(
                "interview_dataset_checkpoint.json", orient="records", indent=2
            )

print(f"\n✓ Generated {len(all_interviews)} total interviews")

Generating 50 interviews per role...

Total interviews to generate: 250


Generating interviews for: Customer Service Representative


Customer Service Representative: 100%|██████████| 50/50 [22:04<00:00, 26.49s/it]



Generating interviews for: Sales Representative


Sales Representative: 100%|██████████| 50/50 [19:11<00:00, 23.02s/it]



Generating interviews for: Field Technician


Field Technician: 100%|██████████| 50/50 [23:00<00:00, 27.61s/it]



Generating interviews for: Home Service Technician


Home Service Technician: 100%|██████████| 50/50 [24:28<00:00, 29.37s/it]



Generating interviews for: General Manager (Franchise)


General Manager (Franchise): 100%|██████████| 50/50 [23:40<00:00, 28.40s/it]


✓ Generated 250 total interviews





## Analyze Dataset Statistics

In [32]:
# Load every generated interview into a DataFrame and print summary tables.
df = pd.DataFrame(all_interviews)

print("Dataset Overview:")
print(f"Total interviews: {len(df)}")

# Each entry is (heading, function producing the table printed under it);
# the tables are computed one at a time, in the order shown.
summary_sections = [
    ("\nInterviews per role:", lambda: df['role'].value_counts()),
    ("\nCandidate quality distribution:", lambda: df['quality'].value_counts()),
    ("\nQuality distribution (%)", lambda: df['quality'].value_counts(normalize=True) * 100),
    ("\nOverall score statistics:", lambda: df['overall_score'].describe()),
    ("\nScore distribution by quality:", lambda: df.groupby('quality')['overall_score'].describe()),
]
for heading, build_table in summary_sections:
    print(heading)
    print(build_table())

Dataset Overview:
Total interviews: 250

Interviews per role:
role
Customer Service Representative    50
Sales Representative               50
Field Technician                   50
Home Service Technician            50
General Manager (Franchise)        50
Name: count, dtype: int64

Candidate quality distribution:
quality
poor        110
moderate     82
good         58
Name: count, dtype: int64

Quality distribution (%)
quality
poor        44.0
moderate    32.8
good        23.2
Name: proportion, dtype: float64

Overall score statistics:
count    250.000000
mean       5.575800
std        2.598592
min        1.060000
25%        3.387500
50%        5.675000
75%        7.940000
max        9.930000
Name: overall_score, dtype: float64

Score distribution by quality:
          count      mean       std   min     25%    50%     75%   max
quality                                                               
good       58.0  8.927069  0.614232  8.02  8.3325  8.970  9.5300  9.93
moderate   82.0 

In [33]:
# Share of each quality label within every role, as row percentages.
print("\nQuality distribution by role:")
quality_by_role = pd.crosstab(
    index=df['role'],
    columns=df['quality'],
    normalize='index',
).mul(100)
print(quality_by_role.round(2))


Quality distribution by role:
quality                          good  moderate  poor
role                                                 
Customer Service Representative  26.0      38.0  36.0
Field Technician                 20.0      28.0  52.0
General Manager (Franchise)      20.0      34.0  46.0
Home Service Technician          34.0      32.0  34.0
Sales Representative             16.0      32.0  52.0


## Save Dataset

In [34]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Google Colab runtime).
# NOTE(review): the save cell that follows writes to relative file names, not
# to paths under /content/drive — confirm whether the outputs were meant to
# land on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
# Write the dataset out three ways: nested JSON, a flat per-interview CSV,
# and a per-question CSV.

# 1. JSON format (preserves nested structure)
df.to_json("synthetic_interview_dataset.json", orient="records", indent=2)
print("✓ Saved: synthetic_interview_dataset.json")

# 2. CSV format (flattened for easy analysis)
# The transcript is kept under the name 'qa_text'; the nested Q&A list is
# dropped because it does not fit a flat table.
df_flat = df.assign(qa_text=df['full_transcript']).drop(
    columns=['qa_pairs', 'full_transcript']
)

# Expand the metric_scores dicts into one column per metric.
metric_scores_df = pd.json_normalize(df_flat['metric_scores'])
df_flat = pd.concat(
    [df_flat.drop(columns=['metric_scores']), metric_scores_df],
    axis=1,
)

df_flat.to_csv("synthetic_interview_dataset.csv", index=False)
print("✓ Saved: synthetic_interview_dataset.csv")

# 3. Detailed Q&A format: one row per question, numbered from 1 within
# each interview.
qa_records = [
    {
        'interview_id': row['interview_id'],
        'role': row['role'],
        'overall_score': row['overall_score'],
        'quality': row['quality'],
        'question_num': number,
        'question': pair['question'],
        'question_type': pair['question_type'],
        'answer': pair['answer'],
    }
    for _, row in df.iterrows()
    for number, pair in enumerate(row['qa_pairs'], start=1)
]

qa_df = pd.DataFrame(qa_records)
qa_df.to_csv("synthetic_interview_qa_detailed.csv", index=False)
print("✓ Saved: synthetic_interview_qa_detailed.csv")

print("\n✓ Dataset generation complete!")
print(f"Total interviews: {len(df)}")
print(f"Total Q&A pairs: {len(qa_df)}")

✓ Saved: synthetic_interview_dataset.json
✓ Saved: synthetic_interview_dataset.csv
✓ Saved: synthetic_interview_qa_detailed.csv

✓ Dataset generation complete!
Total interviews: 250
Total Q&A pairs: 2500


## Sample Interview Examples

In [None]:
# Show the first interview found for each quality label.
print("Sample Interviews:\n")

heavy_rule = '=' * 80
for quality in ['good', 'moderate', 'poor']:
    matching = df[df['quality'] == quality]
    sample = matching.iloc[0]

    print(f"\n{heavy_rule}")
    print(f"EXAMPLE {quality.upper()} CANDIDATE")
    print(heavy_rule)
    print(f"Interview ID: {sample['interview_id']}")
    print(f"Role: {sample['role']}")
    print(f"Overall Score: {sample['overall_score']}/10")
    print(f"Quality: {sample['quality']}")

    print("\nMetric Scores:")
    for metric, score in sample['metric_scores'].items():
        # Turn e.g. "problem_solving" into "Problem Solving" for display.
        label = metric.replace('_', ' ').title()
        print(f"  - {label}: {score}")

    print("\nInterview Transcript (first 3 Q&A):")
    print("-" * 80)
    for turn, qa in enumerate(sample['qa_pairs'][:3], start=1):
        print(f"\nQ{turn}: {qa['question']}")
        print(f"A{turn}: {qa['answer']}")
    print()

## Dataset Summary

The generated dataset includes:

1. **synthetic_interview_dataset.json** - Complete dataset with nested structure
2. **synthetic_interview_dataset.csv** - Flattened dataset for analysis
3. **synthetic_interview_qa_detailed.csv** - Detailed Q&A pairs

Each interview contains:
- Unique interview ID
- Role information
- Overall quality score (1-10)
- Quality label (good: score ≥ 8, moderate: score ≥ 5, poor: score < 5)
- Individual metric scores
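- 10 Q&A pairs (the prompt requests 3 yes/no and 7 open-ended, but the split is not enforced — e.g. the Customer Service Representative run above produced 4 yes/no and 6 open-ended)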
- Full transcript