In [2]:
!pip install pdfminer.six

Defaulting to user installation because normal site-packages is not writeable
Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   -------------------------- ------------- 3.7/5.6 MB 21.8 MB/s eta 0:00:01
   ---------------------------------------- 5.6/5.6 MB 16.3 MB/s  0:00:00
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [13]:
# !pip install pdfminer.six
# If you are in a Jupyter Notebook, you can run the above line by removing the initial '#' and running the cell.

import re
from pdfminer.high_level import extract_text
import json
from pathlib import Path

PDF_PATH = 'resumes/Aditya_CV_Latex-1.pdf'

# Create a Path object and access its .stem attribute
file_stem = Path(PDF_PATH).stem

print(file_stem)

def extract_text_from_pdf(pdf_path):
    """Extracts raw text from a PDF file using pdfminer.six."""
    try:
        text = extract_text(pdf_path)
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return None

# 1. Extract Text
raw_text = extract_text_from_pdf(PDF_PATH)

if raw_text:
    print("‚úÖ Raw Text Extracted Successfully (First 500 chars):")
    print("-" * 50)
    print(raw_text[:500] + "...")
else:
    print("‚ùå Could not extract text. Please check the PDF path.")

# You should replace 'your_resume.pdf' with a real file and ensure it is in the same directory, 
# or use the full path.

Aditya_CV_Latex-1
‚úÖ Raw Text Extracted Successfully (First 500 chars):
--------------------------------------------------
Aditya Ankanath TR
Bengaluru, Karnataka 560059
(cid:131) 7760012484 # sushmaaditya717@gmail.com (cid:239) linkedin (cid:128) portfolio ¬ß github

Education
R V College of Engineering
BE in Information Science and Engineering
CGPA: 9.14

Vidhya Vardhaka Golden Jubilee College
PUC - PCMC
Percentage: 94.8

Relevant Coursework

Aug 2023 ‚Äì 2027
Bengaluru, Karnataka

Sep 2021 ‚Äì March 2023
Mysuru, Karnataka

‚Ä¢ Data Structures
‚Ä¢ Algorithms Analysis

‚Ä¢ Data Science
‚Ä¢ Web Development

‚Ä¢ Operating Systems
‚Ä¢...


In [7]:
# Define common section headers for segmentation
# These headers are often in ALL CAPS or Title Case and followed by the section content.
# We'll use a regex pattern to find them.
SECTION_TITLES = [
    'PERSONAL INFO', 'CONTACT', 'EDUCATION', 'EXPERIENCE', 'PROJECTS', 
    'SKILLS', 'PROGRAMMING LANGUAGES', 'COURSEWORK', 'AWARDS', 
    'CERTIFICATIONS'
]
# Create a robust regex pattern to find any of the titles, ignoring case, 
# and potentially including a newline or a lot of space after the header.
SECTION_REGEX = '|'.join(SECTION_TITLES)
# The regex will find one of the section titles.
# re.IGNORECASE makes the matching case-insensitive.
# re.MULTILINE is helpful for text processing.

def segment_resume_sections(text, section_titles):
    """
    Segments the raw resume text into sections based on predefined titles.
    This is a heuristic approach and works best if the resume has clear,
    consistent section headings.
    """
    # Normalize text: replace multiple newlines with a unique delimiter for easier splitting
    text = re.sub(r'\s*\n\s*', '\n', text).strip()
    
    # 1. Split the text by section headers
    # The 're.split' function will split the string wherever the regex matches.
    # The parentheses around the regex make the matched separators (the section titles)
    # included in the list.
    pattern = r'\n(' + '|'.join(section_titles) + r')\n'
    segments = re.split(pattern, text, flags=re.IGNORECASE)
    
    if not segments:
        return {"Unparsed_Content": text}

    # 2. Re-combine headers with their content
    structured_data = {}
    
    # The first segment is often the name/contact info before the first section header
    # or empty if the first segment is a header.
    current_section = "PERSONAL_INFO"
    
    # Clean up the initial segment and treat it as the first section's content
    initial_content = segments[0].strip()
    if initial_content:
        structured_data[current_section] = initial_content

    # Iterate through the rest of the segments (which come in pairs of (Header, Content))
    for i in range(1, len(segments), 2):
        if i + 1 < len(segments):
            # The current segment is the header (e.g., 'EDUCATION')
            # The next segment is the content for that header.
            header = segments[i].strip().upper().replace(' ', '_')
            content = segments[i+1].strip()
            
            # Map common headers to the requested variable names
            if 'PROGRAMMING_LANGUAGES' in header or 'SKILLS' in header:
                current_section = 'SKILLS_OR_LANGUAGES'
            elif 'EDUCATION' in header:
                current_section = 'EDUCATION'
            elif 'PROJECTS' in header:
                current_section = 'PROJECTS'
            elif 'COURSEWORK' in header:
                current_section = 'COURSEWORK'
            else:
                # Use the header name as the section key
                current_section = header
            
            # Append content if the section already exists (e.g., if we map multiple headers to one key)
            if current_section in structured_data:
                structured_data[current_section] += "\n\n" + content
            else:
                structured_data[current_section] = content

    return structured_data

# 2. Segment Text and Store in JSON Format
if raw_text:
    structured_resume_data = segment_resume_sections(raw_text, SECTION_TITLES)

    # 3. Print or Store the Structured Data
    print("\n" + "=" * 50)
    print("‚úÖ Structured Resume Data (JSON Format):")
    print("=" * 50)
    print(json.dumps(structured_resume_data, indent=4))
    
    # Example of storing sections in variables (as requested):
    # Safe retrieval with .get() to avoid errors if a section wasn't found
    personal_info = structured_resume_data.get('PERSONAL_INFO', structured_resume_data.get('CONTACT', ''))
    education = structured_resume_data.get('EDUCATION', '')
    coursework = structured_resume_data.get('COURSEWORK', '')
    projects = structured_resume_data.get('PROJECTS', '')
    programming_languages = structured_resume_data.get('SKILLS_OR_LANGUAGES', '')

    print("\n" + "=" * 50)
    print("‚úÖ Stored in Variables (Example):")
    print("=" * 50)
    print(f"Personal Info Snippet: {personal_info[:50]}...")
    print(f"Education Snippet: {education[:50]}...")


‚úÖ Structured Resume Data (JSON Format):
{
    "PERSONAL_INFO": "Aditya Ankanath TR\nBengaluru, Karnataka 560059\n(cid:131) 7760012484 # sushmaaditya717@gmail.com (cid:239) linkedin (cid:128) portfolio \u00a7 github",
    "EDUCATION": "R V College of Engineering\nBE in Information Science and Engineering\nCGPA: 9.14\nVidhya Vardhaka Golden Jubilee College\nPUC - PCMC\nPercentage: 94.8\nRelevant Coursework\nAug 2023 \u2013 2027\nBengaluru, Karnataka\nSep 2021 \u2013 March 2023\nMysuru, Karnataka\n\u2022 Data Structures\n\u2022 Algorithms Analysis\n\u2022 Data Science\n\u2022 Web Development\n\u2022 Operating Systems\n\u2022 Computer Networks",
    "PROJECTS": "Plant-Disease-Detection-ML(website) | Flask, TensorFlow, MongoDB, OpenCV, JavaScript\nJuly 2024, SIH\n\u2022 Developed a responsive AI web app for real-time plant disease detection using a CNN (TensorFlow) with 93%\ntest accuracy across 38+ crop disease classes from PlantVillage dataset\n\u2022 MongoDB to store and retrieve over

In [9]:
RESUME_NAME = f'{PDF_PATH}'
OUTPUT_JSON_PATH = f'{RESUME_NAME}.json'


# --- 3. JSON SAVING FUNCTION ---
def save_to_json(data, output_path):
    """Saves the structured data dictionary to a JSON file."""
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            # Use indent=4 for a human-readable format
            json.dump(data, f, indent=4)
        print(f"\n==================================================")
        print(f"‚úÖ Data successfully saved to: {output_path}")
        print(f"==================================================")
        # Optional: Display content for confirmation
        print("\nFirst 5 lines of the saved JSON file:")
        with open(output_path, 'r', encoding='utf-8') as f:
            for i in range(5):
                print(f.readline(), end='')
        
    except Exception as e:
        print(f"‚ùå Error saving JSON file: {e}")

# --- 4. EXECUTE ---

# 4a. Map the segmented data to the desired keys/variables.
# Note: Since the input data does not have a separate 'COURSEWORK' or 'SKILLS_OR_LANGUAGES' key,
# we map them to the raw sections that contain that information.
personal_info = structured_resume_data.get('PERSONAL_INFO', structured_resume_data.get('CONTACT', ''))
education = structured_resume_data.get('EDUCATION', '')
# The coursework list is inside the 'EDUCATION' block in the raw data
coursework = structured_resume_data.get('EDUCATION', '') 
projects = structured_resume_data.get('PROJECTS', '')
# The technical skills list is inside the 'PROJECTS' block in the raw data
programming_languages = structured_resume_data.get('PROJECTS', '') 

# 4b. Create the filtered dictionary for saving.
filtered_data_to_save = {
    'personal_info': personal_info,
    'education': education,
    'coursework': coursework,
    'projects': projects,
    'programming_languages': programming_languages
}

# 4c. Execute the saving function with the filtered data.
save_to_json(filtered_data_to_save, OUTPUT_JSON_PATH)


‚úÖ Data successfully saved to: resumes/Aditya_CV_Latex-1.pdf.json

First 5 lines of the saved JSON file:
{
    "personal_info": "Aditya Ankanath TR\nBengaluru, Karnataka 560059\n(cid:131) 7760012484 # sushmaaditya717@gmail.com (cid:239) linkedin (cid:128) portfolio \u00a7 github",
    "education": "R V College of Engineering\nBE in Information Science and Engineering\nCGPA: 9.14\nVidhya Vardhaka Golden Jubilee College\nPUC - PCMC\nPercentage: 94.8\nRelevant Coursework\nAug 2023 \u2013 2027\nBengaluru, Karnataka\nSep 2021 \u2013 March 2023\nMysuru, Karnataka\n\u2022 Data Structures\n\u2022 Algorithms Analysis\n\u2022 Data Science\n\u2022 Web Development\n\u2022 Operating Systems\n\u2022 Computer Networks",
    "coursework": "R V College of Engineering\nBE in Information Science and Engineering\nCGPA: 9.14\nVidhya Vardhaka Golden Jubilee College\nPUC - PCMC\nPercentage: 94.8\nRelevant Coursework\nAug 2023 \u2013 2027\nBengaluru, Karnataka\nSep 2021 \u2013 March 2023\nMysuru, Karnataka\

In [17]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI

In [15]:
!ollama list

NAME                       ID              SIZE      MODIFIED           
nomic-embed-text:latest    0a109f422b47    274 MB    About a minute ago    
llama3.2:latest            a80c4f17acd5    2.0 GB    7 days ago            


In [17]:
load_dotenv()

# --- 2. Configure the OpenAI Client to Point to Ollama ---
# The client will use the environment variables we set in the .env file
# It reads OLLAMA_BASE_URL as the server address and OLLAMA_API_KEY for authentication.
try:
    client = OpenAI(
        base_url=os.getenv("OLLAMA_BASE_URL"),
        api_key=os.getenv("OLLAMA_API_KEY"),
    )
    print("‚úÖ OpenAI client configured to connect to local Ollama server.")
except Exception as e:
    print(f"‚ùå Failed to configure OpenAI client: {e}")
    raise

# --- 3. Define File Paths ---
INPUT_JSON = "resumes/Aditya_CV_Latex-1.pdf.json"
OUTPUT_KB_JSON = f"resumes/{file_stem}output.json" # Using a new name to avoid overwriting

# --- 4. Load Parsed Resume Data ---
try:
    with open(INPUT_JSON, 'r', encoding='utf-8') as f:
        resume_data = json.load(f)
    print(f"‚úÖ Successfully loaded resume data from '{INPUT_JSON}'")
except FileNotFoundError:
    print(f"‚ùå Error: The file '{INPUT_JSON}' was not found.")
    raise

# --- 5. Generate Embeddings and Create Knowledge Base ---
knowledge_base = []
print("\nüî• Starting embedding generation using the OpenAI client...")

for section, text in resume_data.items():
    if not text or not text.strip():
        print(f"‚ö†Ô∏è  Skipping empty section: '{section}'")
        continue
    
    print(f"   ‚Ü™Ô∏è  Processing section: '{section}'...")
    
    try:
        # The key change is here: using client.embeddings.create
        response = client.embeddings.create(
            model="nomic-embed-text", # The Ollama model to use
            input=text
        )
        
        # The embedding vector is located in a different place in the response object
        embedding_vector = response.data[0].embedding
        
        knowledge_base.append({
            'section': section,
            'text': text,
            'embedding': embedding_vector
        })
        
    except Exception as e:
        print(f"‚ùå Failed to generate embedding for section '{section}': {e}")

# --- 6. Save the Knowledge Base ---
with open(OUTPUT_KB_JSON, 'w', encoding='utf-8') as f:
    json.dump(knowledge_base, f, indent=4)

print(f"\nüéâ Success! Your new knowledge base is saved as '{OUTPUT_KB_JSON}'")

‚úÖ OpenAI client configured to connect to local Ollama server.
‚úÖ Successfully loaded resume data from 'resumes/Aditya_CV_Latex-1.pdf.json'

üî• Starting embedding generation using the OpenAI client...
   ‚Ü™Ô∏è  Processing section: 'personal_info'...
   ‚Ü™Ô∏è  Processing section: 'education'...
   ‚Ü™Ô∏è  Processing section: 'coursework'...
   ‚Ü™Ô∏è  Processing section: 'projects'...
   ‚Ü™Ô∏è  Processing section: 'programming_languages'...

üéâ Success! Your new knowledge base is saved as 'resumes/Aditya_CV_Latex-1output.json'


In [21]:
pip install -U google-generativeai


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [24]:
!pip show google-generativeai



Name: google-generativeai
Version: 0.8.5
Summary: Google Generative AI High level API client library and tools.
Home-page: https://github.com/google/generative-ai-python
Author: Google LLC
Author-email: googleapis-packages@google.com
License: Apache 2.0
Location: C:\ProgramData\anaconda3\envs\llms\Lib\site-packages
Requires: google-ai-generativelanguage, google-api-core, google-api-python-client, google-auth, protobuf, pydantic, tqdm, typing-extensions
Required-by: 


In [30]:
import os
import json
from dotenv import load_dotenv
import google.generativeai as genai

# --- 1. Load API Key and Configure Gemini ---
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")

if not api_key:
    print("‚ùå Error: GOOGLE_API_KEY not found. Please create a .env file and add your key.")
else:
    genai.configure(api_key=api_key)
    print("‚úÖ Gemini API configured successfully.")

# --- 2. Load Resume JSON from Folder ---
resume_folder = "resumes"
resume_filename = "Aditya_CV_Latex-1.pdf.json"
resume_path = os.path.join(resume_folder, resume_filename)

if not os.path.exists(resume_path):
    raise FileNotFoundError(f"‚ùå Resume JSON not found at: {resume_path}")

with open(resume_path, "r", encoding="utf-8") as f:
    resume_data = json.load(f)

print(f"üìÑ Loaded resume data from: {resume_path}")

# --- 3. Convert Resume Data into Structured Text ---
resume_text = ""
for section, content in resume_data.items():
    resume_text += f"## {section.replace('_', ' ').title()}\n{content}\n\n"

# --- 4. Build the Gemini Prompt ---
prompt = f"""
You are an expert technical interviewer preparing for an interview with a candidate named Aditya.
Your task is to analyze the provided resume text thoroughly and generate a comprehensive list of 15 interview questions.

**Instructions:**
1. Generate a mix of technical, project-deep-dive, and behavioral questions.
2. Base every question directly on the information given in the resume.
3. For each question, provide a strong, detailed reference answer that the candidate might give, drawing logical conclusions from their project details.
4. The final output MUST be a single, valid JSON object. Do not add any text before or after the JSON object.
5. Use the following JSON structure:
{{
  "interview_questions": [
    {{
      "id": <integer>,
      "area": "<Projects/Technical_Skills/Behavioral>",
      "question": "<The generated question>",
      "answer": "<The detailed reference answer>"
    }}
  ]
}}

**Resume to Analyze:**
---
{resume_text}
---
"""

# --- 5. Call Gemini Model ---
try:
    print("\nü§ñ Calling the Gemini Flash model to generate questions... (This may take a moment)")

    model = genai.GenerativeModel(
        "models/gemini-2.5-flash",
        generation_config=genai.types.GenerationConfig(
            response_mime_type="application/json"
        )
    )

    response = model.generate_content(prompt)

    # --- 6. Parse JSON Response ---
    generated_json = json.loads(response.text)

    print("\n‚úÖ Success! Here are the generated questions and answers:\n")
    print(json.dumps(generated_json, indent=2))

    # --- 7. Save Output ---
    output_filename = os.path.join(resume_folder, "interview_questions.json")
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(generated_json, f, indent=2)
    print(f"\nüìù Output also saved to: {output_filename}")

except Exception as e:
    print(f"\n‚ùå An error occurred: {e}")
    print("   Please check your API key and ensure it is configured correctly.")


‚úÖ Gemini API configured successfully.
üìÑ Loaded resume data from: resumes\Aditya_CV_Latex-1.pdf.json

ü§ñ Calling the Gemini Flash model to generate questions... (This may take a moment)

‚úÖ Success! Here are the generated questions and answers:

{
  "interview_questions": [
    {
      "id": 1,
      "area": "Projects",
      "question": "In your Plant-Disease-Detection-ML project, you achieved 93% test accuracy using a CNN. Can you elaborate on the architecture of the CNN you designed or chose, and what steps you took in data preprocessing and augmentation to achieve this high accuracy with the PlantVillage dataset?",
      "answer": "For the Plant-Disease-Detection-ML project, I opted for a relatively deep Convolutional Neural Network (CNN) architecture. The architecture typically involved several convolutional layers with ReLU activation, followed by max-pooling layers for downsampling. Batch normalization was included to stabilize training, and finally, fully connected layer

In [19]:
try:
    client = OpenAI(
        base_url=os.getenv("OLLAMA_BASE_URL"),
        api_key=os.getenv("OLLAMA_API_KEY"),
    )
    print(" OpenAI client configured to connect to local Ollama server.")
except Exception as e:
    print(f" Failed to configure OpenAI client: {e}")
    raise

# -----------------------------
# 3. Define file paths
INPUT_JSON = "resumes/interview_questions.json"
OUTPUT_JSON = "resumes/interview_questions_with_embeddings.json"

# -----------------------------
# 4. Load interview questions
try:
    with open(INPUT_JSON, 'r', encoding='utf-8') as f:
        data = json.load(f)
    questions = data.get("interview_questions", [])
    print(f"‚úÖ Loaded {len(questions)} questions from '{INPUT_JSON}'")
except FileNotFoundError:
    print(f"‚ùå File '{INPUT_JSON}' not found")
    raise

# -----------------------------
# 5. Generate embeddings using Ollama
for idx, q in enumerate(questions, start=1):
    answer_text = q.get("answer", "")
    if not answer_text.strip():
        print(f"‚ö†Ô∏è  Skipping empty answer for question {idx}")
        q["answer_embedding"] = []
        continue
    
    print(f"üîπ Generating embedding for question {idx}: {q.get('question', '')[:60]}...")
    try:
        response = client.embeddings.create(
            model="nomic-embed-text",  # Ollama embedding model
            input=answer_text
        )
        embedding_vector = response.data[0].embedding
        q["answer_embedding"] = embedding_vector
    except Exception as e:
        print(f"‚ùå Failed for question {idx}: {e}")
        q["answer_embedding"] = []

# -----------------------------
# 6. Save updated JSON
os.makedirs(os.path.dirname(OUTPUT_JSON), exist_ok=True)
with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print(f"\nüéâ Success! Embeddings saved to '{OUTPUT_JSON}'")


 OpenAI client configured to connect to local Ollama server.
‚úÖ Loaded 20 questions from 'resumes/interview_questions.json'
üîπ Generating embedding for question 1: In your Plant-Disease-Detection-ML project, you achieved 93%...
üîπ Generating embedding for question 2: You used MongoDB to store over 500+ diagnosis records in you...
üîπ Generating embedding for question 3: The Plant-Disease-Detection-ML project reduced manual effort...
üîπ Generating embedding for question 4: In the Disaster-News-Auth-Validate system, you integrated Io...
üîπ Generating embedding for question 5: You used graph-based simulations (Dijkstra, BFS) for emergen...
üîπ Generating embedding for question 6: In the Budget-Optimizer project, you used ML models to achie...
üîπ Generating embedding for question 7: The Budget-Optimizer project leveraged both greedy and dynam...
üîπ Generating embedding for question 8: You've listed Flask, Node, React, and Next in your 'Backend ...
üîπ Generating embedding f

In [15]:
!python generate_audio.py

Generating 20 WAV audio files...


All questions converted successfully and saved in: audio_questions_wav



Processing:   0%|          | 0/20 [00:00<?, ?file/s]
Processing:   5%|5         | 1/20 [00:00<00:15,  1.20file/s]
Processing:  10%|#         | 2/20 [00:01<00:10,  1.74file/s]
Processing:  15%|#5        | 3/20 [00:01<00:08,  2.02file/s]
Processing:  20%|##        | 4/20 [00:02<00:07,  2.19file/s]
Processing:  25%|##5       | 5/20 [00:02<00:06,  2.29file/s]
Processing:  30%|###       | 6/20 [00:02<00:05,  2.37file/s]
Processing:  35%|###5      | 7/20 [00:03<00:05,  2.31file/s]
Processing:  40%|####      | 8/20 [00:03<00:05,  2.30file/s]
Processing:  45%|####5     | 9/20 [00:04<00:04,  2.34file/s]
Processing:  50%|#####     | 10/20 [00:04<00:04,  2.36file/s]
Processing:  55%|#####5    | 11/20 [00:04<00:03,  2.40file/s]
Processing:  60%|######    | 12/20 [00:05<00:03,  2.42file/s]
Processing:  65%|######5   | 13/20 [00:05<00:02,  2.46file/s]
Processing:  70%|#######   | 14/20 [00:06<00:02,  2.42file/s]
Processing:  75%|#######5  | 15/20 [00:06<00:02,  2.47file/s]
Processing:  80%|########

In [5]:
import os, json, whisper, time
from tqdm import tqdm
import speech_recognition as sr
from playsound import playsound


In [6]:
# Load Whisper model once (local, offline)
model = whisper.load_model("base")  # use "tiny" for faster performance

# Create folder for answers
os.makedirs("answers", exist_ok=True)

# Load interview questions
with open("resumes/interview_questions.json", "r", encoding="utf-8") as f:
    questions = json.load(f)

print(f"‚úÖ Loaded {len(questions)} questions.")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 139M/139M [00:11<00:00, 12.6MiB/s]


‚úÖ Loaded 1 questions.


In [1]:
%run interview_session_terminal.py


Loading Whisper model (small) for better transcription...
Question 1: In your Plant-Disease-Detection-ML project, you achieved 93% test accuracy using a CNN. Can you elaborate on the architecture of the CNN you designed or chose, and what steps you took in data preprocessing and augmentation to achieve this high accuracy with the PlantVillage dataset?

Press Enter to start recording your answer...


 


Recording answer (max 15 seconds)...
Recorded Answer: Hello, can you hear me? Can you hear me?
Saved WAV: user_answers_wav\Q1_answer.wav



Proceed to next question? (y/n):  n


Ending interview session early.

Interview session completed! Answers saved to transcript/interview_answers.json


In [25]:
import json
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
import os
import re

# -----------------------------
# Configure Ollama/OpenAI client
load_dotenv()
client = OpenAI(
    base_url=os.getenv("OLLAMA_BASE_URL"),
    api_key=os.getenv("OLLAMA_API_KEY")
)

# -----------------------------
# Load reference questions with embeddings
REFERENCE_JSON = "resumes/interview_questions_with_embeddings.json"
with open(REFERENCE_JSON, "r", encoding="utf-8") as f:
    reference_data = json.load(f)
reference_questions = reference_data.get("interview_questions", [])

# -----------------------------
# Load transcript answers
TRANSCRIPT_JSON = "transcript/interview_answers.json"
with open(TRANSCRIPT_JSON, "r", encoding="utf-8") as f:
    user_answers = json.load(f)

if not user_answers:
    raise ValueError("No answers found in transcript.")

# -----------------------------
# Cosine similarity
def cosine_similarity(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    if np.linalg.norm(vec1) == 0 or np.linalg.norm(vec2) == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))

# -----------------------------
# Extract keywords from reference answer
def extract_keywords(text, top_n=10):
    # Lowercase, remove punctuation
    text = re.sub(r"[^\w\s]", "", text.lower())
    words = text.split()
    stopwords = set(["the", "and", "a", "of", "to", "in", "for", "with", "is", "on", "that", "this", "as", "it", "can"])
    keywords = [w for w in words if w not in stopwords]
    return list(dict.fromkeys(keywords))[:top_n]  # unique top_n words

# -----------------------------
# Evaluate all answers
results = []

for idx, user_entry in enumerate(user_answers):
    user_text = user_entry.get("user_answer", "").strip()
    if not user_text:
        results.append({"question_index": idx+1, "score_percent": 0, "note": "Empty answer"})
        continue
    
    # Reference embedding & answer
    if idx >= len(reference_questions):
        results.append({"question_index": idx+1, "score_percent": 0, "note": "No reference question"})
        continue
    
    ref = reference_questions[idx]
    ref_embedding = np.array(ref.get("answer_embedding", []))
    ref_text = ref.get("answer", "")
    
    if len(ref_embedding) == 0 or not ref_text.strip():
        results.append({"question_index": idx+1, "score_percent": 0, "note": "No reference embedding or text"})
        continue
    
    # User embedding
    response = client.embeddings.create(
        model="nomic-embed-text",
        input=user_text
    )
    user_embedding = np.array(response.data[0].embedding)
    
    # Cosine similarity (semantic score)
    sem_score = cosine_similarity(user_embedding, ref_embedding)
    
    # Keyword coverage score
    ref_keywords = extract_keywords(ref_text)
    user_words = set(re.sub(r"[^\w\s]", "", user_text.lower()).split())
    if ref_keywords:
        coverage = sum(1 for kw in ref_keywords if kw in user_words) / len(ref_keywords)
    else:
        coverage = 0.0
    
    # Combine scores: weighted average (50% embedding + 50% keyword coverage)
    final_score = (sem_score * 0.5 + coverage * 0.5) * 100
    
    results.append({
        "question_index": idx+1,
        "user_answer": user_text,
        "reference_question": ref.get("question", ""),
        "similarity_percent": round(sem_score*100, 2),
        "keyword_coverage_percent": round(coverage*100, 2),
        "final_score_percent": round(final_score, 2)
    })

# -----------------------------
# Print results
for r in results:
    print("\nQuestion", r["question_index"])
    print("Reference Question:", r.get("reference_question", "N/A"))
    print("User Answer:", r.get("user_answer", ""))
    print("Semantic Similarity:", r.get("similarity_percent"), "%")
    print("Keyword Coverage:", r.get("keyword_coverage_percent"), "%")
    print("Final Combined Score:", r.get("final_score_percent"), "%")



Question 1
Reference Question: In your Plant-Disease-Detection-ML project, you achieved 93% test accuracy using a CNN. Can you elaborate on the architecture of the CNN you designed or chose, and what steps you took in data preprocessing and augmentation to achieve this high accuracy with the PlantVillage dataset?
User Answer: I used CNN Mollinger.
Semantic Similarity: 56.39 %
Keyword Coverage: 20.0 %
Final Combined Score: 38.2 %
