In [None]:
import ast
import json
import os

from dotenv import load_dotenv
from openai import OpenAI
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# Module-level OpenAI client used by call_llm; the API key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

NameError: name 'os' is not defined

In [10]:
def call_llm(prompt: str, system: str = None, model: str = "gpt-4o-mini", temperature: float = 0.7) -> str:
    """Send a single-turn chat request through the module-level ``client``.

    Args:
        prompt: Content of the user message.
        system: Optional system message; when given it is placed before the
            user message.
        model: Chat model name. Only "gpt-4o-mini" and "gpt-4o" are accepted.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The reply text with surrounding whitespace stripped, or a string
        starting with "Error in call_llm func:" if the request fails.

    Raises:
        ValueError: If ``model`` is not one of the accepted model names.
    """
    supported_models = ("gpt-4o-mini", "gpt-4o")
    if model not in supported_models:
        # Fail loudly rather than implicitly returning None, which would
        # violate the `-> str` contract and break callers further down.
        raise ValueError(
            f"Unsupported model {model!r}; expected one of {supported_models}"
        )

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,  # honour the caller-supplied temperature
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: request failures are reported as text, not raised.
        return f"Error in call_llm func: {e}"
    
def extract_json_dict(text: str):
    """Extract and parse the JSON object/array embedded in ``text``.

    The candidate span runs from the first ``{`` or ``[`` (whichever comes
    first) to the last ``}`` or ``]`` (whichever comes last); anything
    outside that span is ignored.

    Args:
        text: String that contains a JSON object or array, possibly
            surrounded by other text.

    Returns:
        The parsed Python value (typically a dict or list).

    Raises:
        ValueError: If ``text`` is not a string, contains no opening or
            closing bracket, or the extracted span is not valid JSON.
    """
    if not isinstance(text, str):
        raise ValueError(
            f"Expected a string containing JSON, got {type(text).__name__}"
        )

    # str.find/rfind return -1 when the character is absent; drop those so a
    # missing bracket is reported explicitly instead of producing a bad slice.
    openers = [pos for pos in (text.find('{'), text.find('[')) if pos != -1]
    closers = [pos for pos in (text.rfind('}'), text.rfind(']')) if pos != -1]
    if not openers or not closers:
        raise ValueError("No JSON object or array found in text")

    start = min(openers)
    end = max(closers) + 1
    json_str = text[start:end]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        # Show both the raw input and the extracted span to aid debugging.
        print(text)
        print(json_str)
        raise ValueError(f"Invalid JSON found: {e}") from e

In [None]:
# Prompt template for scoring a candidate's answer to one interview question.
# It is filled with str.format using the placeholders job_title,
# Years_of_experience, Users_Resume_Profile, category, difficulty, hint,
# question and Candidate_Response. The braces of the JSON schema are doubled
# ({{ }}) so that str.format emits them as literal braces.
analyse_domain_template = '''
**Role**  
You are an expert `{job_title}`. Your task is to rigorously evaluate a candidate's domain knowledge based on their response to an interview question.

---

### Instructions  
1. **Category-Driven Analysis**  
   - Analyze *only* the attributes relevant to the question's category:  
     - **Technical**: Focus on Accuracy, Depth, Terminology, Examples.  
     - **Behavioral**: Focus on Examples/Evidence, Depth, Relevance. *Ignore Terminology Usage*.  
     - **Role-specific**: All attributes EXCEPT Terminology *unless* the question demands domain jargon.  
     - **Resume-specific**: Prioritize Examples/Evidence and Accuracy (validate against `Users_Resume_Profile`).  

2. **Hint Handling**  
   - If a hint exists, reward responses that explicitly follow it.  
   - Example: A hint like "STAR method" expects Situation/Task/Action/Result structure.  

3. **Scoring (1-5 per Attribute)**  
   - **Accuracy**: Factual correctness.  
     - *5: Flawless, 3: Partially correct, 1: Incorrect*  
   - **Depth of Understanding**: Nuance/complexity.  
     - *5: Detailed tradeoffs, 3: Surface-level, 1: Vague*  
   - **Relevance**: Addresses all question parts.  
     - *5: Fully on-point, 3: Partial, 1: Off-topic*  
   - **Examples/Evidence**: Concrete proof.  
     - *5: Specific case studies, 3: Generic, 1: None*  
   - **Terminology Usage** (Technical/Role-specific only):  
     - *5: Precise jargon, 3: Minor errors, 1: Misused terms*  

4. **Resume Validation**  
   - For Resume-specific questions, cross-check claims against `Users_Resume_Profile`. Flag inconsistencies.  

---

### Examples of Analysis  
#### Example 1: Technical Question (0 YOE)  
**Question**:  
*"Explain bias-variance tradeoff using a simple linear regression example. Hint: Visualize underfitting vs overfitting curves."*  
**Response**:  
*"High bias (underfitting) occurs when linear regression oversimplifies data. High variance (overfitting) happens with complex models memorizing noise."*  
**Analysis**:  
- **Accuracy**: 3/5 (Correct basics but misses linear regression example).  
- **Depth**: 2/5 (No tradeoff mechanics or complexity impact).  
- **Relevance**: 4/5 (Addresses core concepts).  
- **Examples**: 1/5 (No regression example/visualization).  
- **Terminology**: 5/5 (Correct terms).  
- **Hint Followed?** No → Penalized Depth/Examples.  

#### Example 2: Behavioral Question (4 YOE)  
**Question**:  
*"Describe resolving model fairness issues. Hint: Equal odds vs opportunity parity."*  
**Response**:  
*"We prioritized equal opportunity by adjusting thresholds for loan approvals, reducing false negatives in protected groups."*  
**Analysis**:  
- **Depth**: 5/5 (Nuanced fairness tradeoffs).  
- **Examples**: 5/5 (Specific threshold strategy).  
- **Relevance**: 5/5 (Uses hint's "opportunity" focus).  
- *Terminology Skipped* (Behavioral category).  

---

### Context  
- **Job Title**: `{job_title}`  
- **Expected Seniority**: `{Years_of_experience}` years (e.g., Junior/Senior).  
- **Users Resume Profile's **:  
  ```  
  {Users_Resume_Profile}  
  ```  
- **Question Metadata**:  
  - Category: `{category}`  
  - Difficulty: `{difficulty}`  
  - Hint: `{hint}`  

---

### Analysis Task  
**Interview Question**:  
"{question}"  

**Candidate Response**:  
"{Candidate_Response}"  

**Your Analysis**:  
**Return your analysis strictly in the following JSON format:**
{{
  "category": "[category]",
  "hint_addressed": true/false/null, // null if no hint
  "attribute_scores": {{
    "Accuracy": {{"score": number, "reason": string}},
    "Depth of Understanding": {{"score": number, "reason": string}},
    "Relevance": {{"score": number, "reason": string}},
    "Examples/Evidence": {{"score": number, "reason": string}},
    "Terminology Usage": {{"score": number, "reason": string}} // Omit if behavioral
  }},
  "overall_score": number, // sum of all score
  "overall_feedback": "Concise strengths/weaknesses summary"
}}

---

### Rules  
- **No assumptions**: Base scores strictly on the response.  
- **Hint bonus**: If a hint exists, +0.5 to Accuracy/Depth when followed (round down if .49).  
- **Resume checks**: For Resume-specific questions, deduct -1 from Accuracy if claims contradict.  
- **Be brutal**: Ignore fluff. Reward precision. Penalize vagueness.
'''

In [1]:
import pandas as pd
# Read the dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../dataset_frontend.csv')

In [None]:
# Display the column labels of the loaded dataset.
df.columns

Index(['Unnamed: 0', 'Question', 'varient', 'answer', 'accuracy', 'depth',
       'relevance', 'hint', 'accuracy_reason', 'relevance_reason',
       'depth_reason', 'terminology', 'terminology_reason',
       'Examples/Evidence', 'Examples/Evidence_reason'],
      dtype='object')

# varient
  | # | Hint | Accuracy | Relevance | Depth  | Type                      | Score Range |
  | - | ---- | -------- | --------- | ------ | ------------------------- | ----------- |
  | 1 | ✅    | High     | High      | High   | ⭐ Perfect                 | 24-25       |
  | 2 | ✅    | Low      | Low       | Low    | ❌ Failure                 | 6-8         |
  | 3 | ❌    | High     | High      | High   | ⚠️ Smart but Ignored Hint | 21-22       |
  | 4 | ⚠️   | Low      | High      | High   | 🤔 Deep but Flawed        | 12-15       |
  | 5 | ✅    | Medium   | High      | Medium | 🟡 Realistic Midpoint     | 17-19       |

In [None]:
# Sample resume profile, kept as a JSON-formatted string; it is substituted
# into the prompt template as Users_Resume_Profile.
ml_resume = '''
{
  "experience": [
    {
      "company": "The Barabari Collective",
      "position": "AI Engineer (Freelance)",
      "duration": "June 2025 - Present"
    },
    {
      "company": "GDSC DDU Chapter",
      "position": "AI/ML Team Member",
      "duration": "2023-2024"
    }
  ],
  "certifications": [
    "Advanced Learning Algorithms - DeepLearning.AI",
    "Introduction to TensorFlow for AI, ML and DL - DeepLearning.AI",
    "Supervised Machine Learning - Stanford University"
  ],
  "projects": [
    {
      "name": "ChatGPT 2",
      "description": "Trained and implemented from scratch in Pytorch. Winner of Bhashathon 2025, won a cash prize of Rs. 50,000."
    }
  ],
  "skills": {
    "Languages": [
      "Python",
      "C++",
      "JavaScript"
    ],
    "Libraries": [
      "PyTorch",
      "NumPy",
      "Pandas",
      "Matplotlib",
      "FAISS",
      "scikit-learn"
    ],
    "Frameworks": [
      "Flask",
      "FastAPI",
      "Express.js",
      "TensorFlow"
    ],
    "Tools & Technologies": [
      "Git",
      "AWS",
      "Linux",
      "SentencePiece",
      "OpenAI",
      "OpenAI-Agents",
      "Deepgram",
      "openSMILE"
    ]
  }
}
'''

In [None]:
# Row of the dataset to evaluate.
ix = 0
raw_question = df.loc[ix, 'Question']
# The 'Question' cell appears to hold a Python-dict-style string (single-quoted)
# -- confirm against the CSV. Parse it as a Python literal first: blindly
# swapping every ' for " corrupts values that contain an apostrophe.
try:
    Interview_Question = ast.literal_eval(raw_question)
except (ValueError, SyntaxError):
    # Fall back to the quote-swapping JSON path for cells that are not a
    # clean Python literal (e.g. a dict embedded in surrounding text).
    Interview_Question = extract_json_dict(raw_question.replace('\'', '\"'))
answer = df.loc[ix, 'answer']
job_role = "Data Science" # "Backend Developer" "Frontend Developer"
years_of_experience = 0

{'category': 'Technical',
 'difficulty': 'Easy',
 'question': 'Explain how `useState` works in React for a simple counter component.',
 'hint': 'Describe the initial state setup, state update function, and re-rendering behavior.'}

In [None]:
# Look the question fields up by key rather than unpacking .values(), so the
# assignment does not depend on key order or on the dict having exactly four
# entries.
category = Interview_Question['category']
difficulty = Interview_Question['difficulty']
question = Interview_Question['question']
hint = Interview_Question.get('hint')  # None when the question has no hint

# Fill every placeholder of the evaluation prompt template.
prompt = analyse_domain_template.format(
    job_title=job_role,
    Years_of_experience=years_of_experience,
    Users_Resume_Profile=ml_resume,
    Candidate_Response=answer,
    category=category,
    difficulty=difficulty,
    question=question,
    hint=hint,
)
# Query the model, then parse the JSON verdict out of its reply.
response = call_llm(prompt=prompt)
json_res = extract_json_dict(response)

In [None]:
# Display the parsed evaluation returned by the model.
json_res