In [None]:
import os

from openai import OpenAI

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into (and committed with) the notebook. The original placeholder is
# kept as the fallback, so behaviour is unchanged when the variable is unset.
API_KEY = os.environ.get("OPENAI_API_KEY", "API_KEY")

# Smoke test: send one non-streaming chat completion to confirm the key works.
client = OpenAI(api_key=API_KEY)
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello! Can you help me with a Python code snippet?"}
    ]
)


In [2]:
response

ChatCompletion(id='chatcmpl-CUYTRqV4NzhyfPqG59fwKYxlVJVZ5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Of course! I'd be happy to help. What do you need assistance with in your Python code snippet?", refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1761398393, model='gpt-3.5-turbo-0125', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=22, prompt_tokens=29, total_tokens=51, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [3]:

from abc import ABC, abstractmethod
import base64

class LLM(ABC):
    """
    Abstract class for Large Language Models (LLMs).
    This class defines a common interface for different LLM providers.

    Subclasses must implement all three methods before they can be
    instantiated.
    """

    @abstractmethod
    def chat(self, messages: list, model: str = "default_model"):
        """Send ``messages`` to the model and return its complete reply.

        Args:
            messages: Chat messages to send to the provider.
            model: Provider-specific model identifier.
        """
        pass

    @abstractmethod
    def chat_stream(self, messages: list, model: str = "default_model"):
        """Send ``messages`` to the model and deliver the reply incrementally.

        Args:
            messages: Chat messages to send to the provider.
            model: Provider-specific model identifier.
        """
        pass

    @abstractmethod
    def generate_json(self, messages: list, schema: dict, model: str = "default_model"):
        """Send ``messages`` and request a reply constrained by ``schema``.

        The ``schema`` parameter is part of the abstract signature so that it
        matches the concrete implementation in this file, which requires it.

        Args:
            messages: Chat messages to send to the provider.
            schema: Description of the JSON structure the reply must follow.
            model: Provider-specific model identifier.
        """
        pass

class OpenAI_LLM(LLM):
    """Implementation of the LLM interface for OpenAI's GPT models.

    Every request goes through the client created in ``__init__``; no method
    relies on notebook-level globals.
    """

    def __init__(self, api_key: str):
        """Create the OpenAI client used by all methods of this wrapper."""
        self.client = OpenAI(api_key=api_key)

    @staticmethod
    def _iter_deltas(stream):
        """Yield ``{"content", "finish_reason"}`` dicts from a streamed completion.

        Chunks with an empty ``choices`` list are skipped instead of raising
        ``IndexError``; chunks carrying neither content nor a finish reason are
        dropped, as before.
        """
        for chunk in stream:
            if not chunk.choices:
                continue
            choice = chunk.choices[0]
            if choice.delta.content or choice.finish_reason:
                yield {
                    "content": choice.delta.content,
                    "finish_reason": choice.finish_reason,
                }

    def chat(self, messages: list, model: str = "gpt-4o"): # type: ignore
        """Run a non-streaming chat completion and return the reply text."""
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,  # Ensures response isn't streamed
        )
        return response.choices[0].message.content

    def chat_stream(self, messages: list, model: str = "gpt-4o"): # type: ignore
        """Run a streaming chat completion.

        Yields:
            Dicts with ``content`` (text delta, may be ``None``) and
            ``finish_reason`` for each chunk that carries either.
        """
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
        )
        yield from self._iter_deltas(response)

    def generate_json(self, messages: list, schema: dict, model: str = "gpt-4o"): # type: ignore
        """Run a non-streaming completion constrained by a JSON schema.

        Args:
            messages: Chat messages to send.
            schema: Value passed as ``response_format["json_schema"]``.
            model: Model identifier.

        Returns:
            The raw completion object (not just the text), so callers can read
            both ``choices[0].message.content`` and ``usage``.
        """
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            response_format={
                "type": "json_schema",
                "json_schema": schema,
            }, # type: ignore
            stream=False,  # Ensures response isn't streamed
        ) # type: ignore
        return response

    def generate_stream_json(self, messages: list, model: str = "gpt-4o"): # type: ignore
        """Run a streaming completion in ``json_object`` response mode.

        Yields:
            Dicts with ``content`` and ``finish_reason``, as in ``chat_stream``.
        """
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            response_format={"type": "json_object"},
            stream=True,  # Response is delivered as a stream of chunks
        )
        yield from self._iter_deltas(response)

    def extract_text_from_image(self, img_bytes: str, mime_type: str = "image/jpeg") -> str: # type: ignore
        """Ask the model to transcribe the text visible in an image.

        Args:
            img_bytes: The image, already base64-encoded as a text string.
            mime_type: MIME type placed in the data URL. Defaults to
                ``image/jpeg`` (the previously hard-coded value); pass
                ``image/png`` for PNG data.

        Returns:
            The ``output_text`` of the response.
        """
        # Use this instance's client rather than a module-level `client`, so the
        # method works regardless of which notebook cells were executed.
        response = self.client.responses.create(
            model="gpt-4.1",
            input=[
                {
                    "role": "user",
                    "content": [
                        { "type": "input_text", "text": "Extract text clearly from this image." },
                        {
                            "type": "input_image",
                            "image_url": f"data:{mime_type};base64,{img_bytes}",
                        },
                    ],
                } # type: ignore
            ],
        )
        return response.output_text # type: ignore


In [4]:
# Notebook-level OpenAI wrapper instance, built from the API_KEY defined in the
# first cell.
llm = OpenAI_LLM(api_key=API_KEY)

# Example usage of llm.chat, left disabled:
# response = llm.chat(
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant that extracts information from resumes."},
#         {"role": "user", "content": "Extract the key details from this resume..."}
#     ],
#     model="gpt-4o-mini"
# )
# print(response)

# PDF Parsing

In [5]:
"""
Optimized Parallel Resume Extractor for GPT-4o-mini
Best for 10-30 page documents with parallel chunk processing

Dependencies:
- Required: openai, PyMuPDF (fitz)
- Optional: pytesseract, Pillow (for Tesseract OCR mode)
- Optional: Pillow (for Vision API mode)

Install:
  pip install openai PyMuPDF
  
Optional for image-based PDFs:
  pip install Pillow pytesseract
"""

import io
import fitz
import json
from typing import Dict, Any, List
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
import time
import warnings

# Optional dependencies
try:
    import pytesseract
    TESSERACT_AVAILABLE = True
except ImportError:
    TESSERACT_AVAILABLE = False

try:
    from PIL import Image
    import base64
    from io import BytesIO
    PILLOW_AVAILABLE = True
except ImportError:
    PILLOW_AVAILABLE = False


@dataclass
class ExtractionMetrics:
    """Track extraction performance metrics.

    ``str()`` renders the values as a boxed table in which every line has the
    same width, so the right-hand border lines up.
    """
    total_pages: int
    chunks_processed: int
    api_calls: int
    total_time: float
    cost_estimate: float

    def __str__(self):
        inner_width = 46   # characters between the left and right border
        label_width = 18   # label column, matching the original layout
        value_width = inner_width - label_width - 2  # minus one pad space per side

        title = "          EXTRACTION METRICS"
        rows = [
            ("Total Pages:", f"{self.total_pages:>4}"),
            ("Chunks Processed:", f"{self.chunks_processed:>4}"),
            ("API Calls:", f"{self.api_calls:>4}"),
            ("Time Elapsed:", f"{self.total_time:>6.2f}s"),
            ("Est. Cost:", f"${self.cost_estimate:>6.4f}"),
        ]

        border = "═" * inner_width
        lines = [
            f"╔{border}╗",
            f"║{title:<{inner_width}}║",
            f"╠{border}╣",
        ]
        for label, value in rows:
            lines.append(f"║ {label:<{label_width}}{value:<{value_width}} ║")
        lines.append(f"╚{border}╝")

        # Leading and trailing newline are kept from the original rendering.
        return "\n" + "\n".join(lines) + "\n"


class OptimizedResumeExtractor:
    """
    High-performance resume extractor optimized for GPT-4o-mini
    - Parallel chunk processing
    - Smart token management
    - Cost optimization
    - Fast merging algorithm
    
    Supports multiple extraction modes:
    - TEXT: Fast text extraction (always available)
    - VISION: GPT-4 Vision API (requires Pillow)
    - TESSERACT: Tesseract OCR + LLM (requires pytesseract + Pillow)
    - AUTO: Automatic mode selection
    """
    
    # GPT-4o-mini pricing (as of 2024)
    INPUT_PRICE_PER_1K = 0.000150   # $0.15 per 1M tokens
    OUTPUT_PRICE_PER_1K = 0.000600  # $0.60 per 1M tokens
    
    def __init__(
        self, 
        openai_api_key: str,
        pages_per_chunk: int = 5,
        max_workers: int = 5,
        model: str = "gpt-4o-mini"
    ):
        """
        Initialize extractor
        
        Args:
            openai_api_key: Your OpenAI API key
            pages_per_chunk: Pages to process per chunk (3-7 recommended)
            max_workers: Parallel threads (5-10 recommended)
            model: OpenAI model to use
        """
        self.client = OpenAI(api_key=openai_api_key)
        self.pages_per_chunk = pages_per_chunk
        self.max_workers = max_workers
        self.model = model
        self.schema = self._get_schema()
        
        # Metrics
        self.api_calls = 0
        self.input_tokens = 0
        self.output_tokens = 0
        
        # Check optional dependencies
        self._check_dependencies()
    
    def _check_dependencies(self):
        """Check and warn about optional dependencies"""
        if not PILLOW_AVAILABLE:
            warnings.warn(
                "Pillow not installed. Vision and Tesseract modes will be unavailable.\n"
                "Install with: pip install Pillow",
                ImportWarning
            )
        
        if not TESSERACT_AVAILABLE:
            warnings.warn(
                "pytesseract not installed. Tesseract mode will be unavailable.\n"
                "Install with: pip install pytesseract\n"
                "Also install Tesseract OCR: https://github.com/tesseract-ocr/tesseract",
                ImportWarning
            )
    
    def get_available_modes(self) -> List[str]:
        """Get list of available extraction modes based on installed dependencies"""
        modes = ["text", "auto"]  # Always available
        
        if PILLOW_AVAILABLE:
            modes.append("vision")
        
        if PILLOW_AVAILABLE and TESSERACT_AVAILABLE:
            modes.append("tesseract")
        
        return modes
    
    def _get_schema(self) -> dict:
        """Extraction schema for structured output"""
        return {
            "name": "resume_extraction",
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "personal_info": {
                        "type": "object",
                        "properties": {
                            "full_name": {"type": "string"},
                            "email": {"type": "string"},
                            "phone": {"type": "string"},
                            "linkedin": {"type": "string"},
                            "github": {"type": "string"},
                            "portfolio": {"type": "string"},
                            "location": {"type": "string"}
                        },
                        "required": ["full_name", "email", "phone", "linkedin", "github", "portfolio", "location"],
                        "additionalProperties": False
                    },
                    "professional_summary": {"type": "string"},
                    "skills": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "experience": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "company": {"type": "string"},
                                "position": {"type": "string"},
                                "location": {"type": "string"},
                                "start_date": {"type": "string"},
                                "end_date": {"type": "string"},
                                "duration": {"type": "string"},
                                "responsibilities": {
                                    "type": "array",
                                    "items": {"type": "string"}
                                }
                            },
                            "required": ["company", "position", "location", "start_date", "end_date", "duration", "responsibilities"],
                            "additionalProperties": False
                        }
                    },
                    "education": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "institution": {"type": "string"},
                                "degree": {"type": "string"},
                                "field_of_study": {"type": "string"},
                                "graduation_year": {"type": "string"},
                                "gpa": {"type": "string"},
                                "location": {"type": "string"}
                            },
                            "required": ["institution", "degree", "field_of_study", "graduation_year", "gpa", "location"],
                            "additionalProperties": False
                        }
                    },
                    "certifications": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "issuer": {"type": "string"},
                                "date": {"type": "string"}
                            },
                            "required": ["name", "issuer", "date"],
                            "additionalProperties": False
                        }
                    },
                    "projects": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "name": {"type": "string"},
                                "description": {"type": "string"},
                                "technologies": {
                                    "type": "array",
                                    "items": {"type": "string"}
                                }
                            },
                            "required": ["name", "description", "technologies"],
                            "additionalProperties": False
                        }
                    },
                    "languages": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": ["personal_info", "professional_summary", "skills", "experience", "education", "certifications", "projects", "languages"],
                "additionalProperties": False
            }
        }
    
    def extract_from_pdf(self, pdf_path: str, ocr_method: str = "tesseract",verbose: bool = True) -> tuple[Dict[str, Any], ExtractionMetrics]:
        """
        Extract resume data from PDF with parallel processing
        
        Returns:
            (extracted_data, metrics)
        """
        start_time = time.time()
        self.api_calls = 0
        self.input_tokens = 0
        self.output_tokens = 0
        
        if verbose:
            print("🚀 Starting resume extraction...")
            print(f"📄 Model: {self.model}")
            print(f"⚙️  Config: {self.pages_per_chunk} pages/chunk, {self.max_workers} workers\n")
        
        # Step 1: Extract text from PDF
        if verbose:
            print("📖 Reading PDF...")
        doc = fitz.open(pdf_path)
        pages_text = []
        for page_num in range(len(doc)):
            text = doc[page_num].get_text()
            if not text.strip():  # type: ignore # If page has no text, try OCR
                if ocr_method != "none":
                    if verbose:
                        print(f"🔍 Page {page_num + 1} is scanned, trying OCR via {ocr_method}...")
                    page_image = doc[page_num].get_pixmap()

                    if ocr_method == "tesseract":
                        text = self._extract_with_tesseract(page_image)
                        print("Tesseract OCR Result:", text)

                    elif ocr_method == "vision":
                        text = self._extract_with_vision(page_image)
                        print("Vision OCR Result:", text)

            if text.strip():  # type: ignore # Only include non-empty pages
                pages_text.append(text)
        doc.close()
        
        num_pages = len(pages_text)
        if verbose:
            print(f"✓  Extracted {num_pages} pages\n")
        
        # Step 2: Create chunks
        chunks = self._create_chunks(pages_text)
        if verbose:
            print(f"📦 Created {len(chunks)} chunks for parallel processing\n")
        
        # Step 3: Parallel extraction
        if verbose:
            print("⚡ Processing chunks in parallel...")
        partial_results = self._extract_parallel(chunks, verbose)
        
        # Step 4: Merge results
        if verbose:
            print("\n🔄 Merging results...")
        merged_result = self._merge_results(partial_results)
        
        # Calculate metrics
        elapsed = time.time() - start_time
        cost = self._calculate_cost()
        
        metrics = ExtractionMetrics(
            total_pages=num_pages,
            chunks_processed=len(chunks),
            api_calls=self.api_calls,
            total_time=elapsed,
            cost_estimate=cost
        )
        
        if verbose:
            print("✅ Extraction complete!\n")
            print(metrics)
        
        return merged_result, metrics
    
    def _extract_with_tesseract(self, page_image):
        img = Image.frombytes("RGB", [page_image.width, page_image.height], page_image.samples) # type: ignore
        img.show()
        return pytesseract.image_to_string(img) # type: ignore

    def _extract_with_vision(self, page_image):
        # Convert Pixmap → PNG bytes
        img = Image.frombytes("RGB", (page_image.width, page_image.height), page_image.samples) # type: ignore
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        img_bytes = buffer.getvalue()
        img_base64 = base64.b64encode(img_bytes).decode("utf-8") # type: ignore
        return llm.extract_text_from_image(img_base64) # type: ignore


    def _create_chunks(self, pages_text: List[str]) -> List[str]:
        """Create optimal chunks from pages"""
        chunks = []
        current_chunk = []
        
        for i, page_text in enumerate(pages_text):
            current_chunk.append(page_text)
            
            # Create chunk when reaching target size or last page
            if len(current_chunk) >= self.pages_per_chunk or i == len(pages_text) - 1:
                chunks.append("\n\n--- PAGE BREAK ---\n\n".join(current_chunk))
                current_chunk = []
        
        return chunks
    
    def _extract_parallel(self, chunks: List[str], verbose: bool = True) -> List[Dict[str, Any]]:
        """Extract data from chunks in parallel"""
        results = []
        completed = 0
        
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_chunk = {
                executor.submit(self._extract_chunk, i, chunk): i 
                for i, chunk in enumerate(chunks)
            }
            
            # Process as they complete
            for future in as_completed(future_to_chunk):
                chunk_idx = future_to_chunk[future]
                try:
                    result = future.result()
                    results.append((chunk_idx, result))
                    completed += 1
                    if verbose:
                        print(f"  ✓ Chunk {completed}/{len(chunks)} processed")
                except Exception as e:
                    if verbose:
                        print(f"  ✗ Chunk {chunk_idx + 1} failed: {e}")
                    results.append((chunk_idx, self._empty_result()))
        
        # Sort by original chunk order
        results.sort(key=lambda x: x[0])
        return [r[1] for r in results]
    
    def _extract_chunk(self, chunk_idx: int, chunk_text: str) -> Dict[str, Any]:
        """Extract data from a single chunk"""
        messages = [
            {
                "role": "system",
                "content": "You are an expert resume parser. Extract ALL information from this resume section accurately. Use empty strings \"\" for missing text fields and empty arrays [] for missing lists."
            },
            {
                "role": "user",
                "content": f"Extract resume information from this text:\n\n{chunk_text}"
            }
        ]
        
        # response = self.client.chat.completions.create(
        #     model=self.model,
        #     messages=messages,
        #     response_format={
        #         "type": "json_schema",
        #         "json_schema": self.schema
        #     }
        # )
        response = llm.generate_json(
            messages=messages,
            model=self.model,
            schema=self.schema
        )
        # Track metrics
        self.api_calls += 1
        self.input_tokens += response.usage.prompt_tokens
        self.output_tokens += response.usage.completion_tokens
        
        return json.loads(response.choices[0].message.content)
    
    def _empty_result(self) -> Dict[str, Any]:
        """Return empty result structure"""
        return {
            "personal_info": {
                "full_name": "",
                "email": "",
                "phone": "",
                "linkedin": "",
                "github": "",
                "portfolio": "",
                "location": ""
            },
            "professional_summary": "",
            "skills": [],
            "experience": [],
            "education": [],
            "certifications": [],
            "projects": [],
            "languages": []
        }
    
    def _merge_results(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Intelligently merge multiple partial results
        Optimized merging algorithm for speed
        """
        merged = self._empty_result()
        
        # Merge personal info (first non-null wins)
        for result in results:
            for key, value in result.get("personal_info", {}).items():
                if value and not merged["personal_info"].get(key):
                    merged["personal_info"][key] = value
        
        # Merge professional summary (longest wins)
        summaries = [r.get("professional_summary") for r in results if r.get("professional_summary")]
        if summaries:
            merged["professional_summary"] = max(summaries, key=len) # type: ignore
        
        # Merge skills (deduplicate, case-insensitive)
        skills_set = set()
        skills_map = {}  # lowercase -> original case
        for result in results:
            for skill in result.get("skills", []):
                skill_lower = skill.lower()
                if skill_lower not in skills_set:
                    skills_set.add(skill_lower)
                    skills_map[skill_lower] = skill
        merged["skills"] = sorted(skills_map.values())
        
        # Merge languages (deduplicate)
        langs_set = set()
        for result in results:
            langs_set.update(result.get("languages", []))
        merged["languages"] = sorted(list(langs_set))
        
        # Merge experience (deduplicate by company+position)
        seen_exp = set()
        for result in results:
            for exp in result.get("experience", []):
                key = (exp.get("company", "").lower(), exp.get("position", "").lower())
                if key not in seen_exp and key != ("", ""):
                    seen_exp.add(key)
                    merged["experience"].append(exp)
        
        # Merge education (deduplicate by institution+degree)
        seen_edu = set()
        for result in results:
            for edu in result.get("education", []):
                key = (edu.get("institution", "").lower(), edu.get("degree", "").lower())
                if key not in seen_edu and key != ("", ""):
                    seen_edu.add(key)
                    merged["education"].append(edu)
        
        # Merge certifications (deduplicate by name)
        seen_certs = set()
        for result in results:
            for cert in result.get("certifications", []):
                cert_name = cert.get("name", "").lower()
                if cert_name and cert_name not in seen_certs:
                    seen_certs.add(cert_name)
                    merged["certifications"].append(cert)
        
        # Merge projects (deduplicate by name)
        seen_projects = set()
        for result in results:
            for proj in result.get("projects", []):
                proj_name = proj.get("name", "").lower()
                if proj_name and proj_name not in seen_projects:
                    seen_projects.add(proj_name)
                    merged["projects"].append(proj)
        
        return merged
    
    def _calculate_cost(self) -> float:
        """Calculate estimated cost based on token usage"""
        input_cost = (self.input_tokens / 1000) * self.INPUT_PRICE_PER_1K
        output_cost = (self.output_tokens / 1000) * self.OUTPUT_PRICE_PER_1K
        return input_cost + output_cost


In [None]:
# Build the extractor: 5 pages per chunk (optimal for 10-30 pages), 5 parallel workers
extractor = OptimizedResumeExtractor(
    openai_api_key=API_KEY,
    pages_per_chunk=5,
    max_workers=5,
    model="gpt-4o-mini",
)

# Run the extraction on the sample resume
result, metrics = extractor.extract_from_pdf(
    "resume.pdf",
    ocr_method="tesseract",
    verbose=True,
)

divider = "=" * 60

# Full extracted record as JSON
print("\n" + divider)
print("EXTRACTED DATA")
print(divider)
print(json.dumps(result, indent=2))

# Short human-readable summary
print("\n" + divider)
print("SUMMARY")
print(divider)
for label, field in (("Name", "full_name"), ("Email", "email"), ("Phone", "phone")):
    print(f"{label}: {result['personal_info'].get(field, 'N/A')}")
print(f"\nSkills ({len(result['skills'])}): {', '.join(result['skills'][:10])}...")
print(f"\nExperience ({len(result['experience'])} positions):")
for exp in result['experience'][:3]:
    print(f"  • {exp['position']} at {exp['company']}")
print(f"\nEducation ({len(result['education'])} entries):")
for edu in result['education']:
    print(f"  • {edu['degree']} from {edu['institution']}")

{'personal_info': {'full_name': 'Faizan Munsaf',
  'email': 'faizanmunsaf@gmail.com',
  'phone': '+92-3328893064',
  'linkedin': 'FaizanMunsaf-Linkedin',
  'github': 'FaizanMunsaf-GitHub',
  'portfolio': '',
  'location': 'Sodiwal Bund Road Lahore, Pakistan'},
 'professional_summary': "I am a driven and talented professional who brings a unique blend of strategic thinking, catalytic energy, empathetic understanding, and critical thinking to everything I do. With a strong educational background, including a Bachelor's degree in Computer Science and certification in Machine Learning, Natural Language Processing, Python Development and Deep Learning, I have developed a comprehensive skill set in the field of technology as a Backend Developer with Django, FastAPI and Flask. My exceptional problem-solving skills and excellent communication abilities make me a valuable asset to any team.",
 'skills': ['C++',
  'Deep Learning',
  'Django',
  'FastAPI',
  'Flask',
  'GenAi',
  'JWT Authenticat

In [None]:
# # Performance comparison for different configurations
# print("\n\n" + "="*60)
# print("CONFIGURATION COMPARISON")
# print("="*60)

# configs = [
#     {"pages_per_chunk": 3, "max_workers": 3},
#     {"pages_per_chunk": 5, "max_workers": 5},
#     {"pages_per_chunk": 7, "max_workers": 5},
# ]

# for config in configs:
#     print(f"\n📊 Testing: {config['pages_per_chunk']} pages/chunk, {config['max_workers']} workers")
#     test_extractor = OptimizedResumeExtractor(
#         openai_api_key=API_KEY,         **config,
#         model="gpt-4o-mini"
#     )
#     _, test_metrics = test_extractor.extract_from_pdf("scanned_resume.pdf", ocr_method="tesseract",verbose=False)
#     print(f"   Time: {test_metrics.total_time:.2f}s | Chunks: {test_metrics.chunks_processed} | Cost: ${test_metrics.cost_estimate:.4f}")

# Job Description Parsing

In [6]:
import json
from typing import Dict, Any
from openai import OpenAI
from dataclasses import dataclass


# NOTE(review): this rebinds the name ExtractionMetrics, which an earlier cell
# of this notebook already defines with different fields.
@dataclass
class ExtractionMetrics:
    """Track extraction performance metrics.

    ``str()`` renders the values as a boxed table in which every line has the
    same width, so the right-hand border lines up.
    """
    api_calls: int
    input_tokens: int
    output_tokens: int
    cost_estimate: float

    def __str__(self):
        inner_width = 46   # characters between the left and right border
        label_width = 18   # label column, matching the original layout
        value_width = inner_width - label_width - 2  # minus one pad space per side

        title = "      JOB EXTRACTION METRICS"
        rows = [
            ("API Calls:", f"{self.api_calls:>4}"),
            ("Input Tokens:", f"{self.input_tokens:>4}"),
            ("Output Tokens:", f"{self.output_tokens:>4}"),
            ("Est. Cost:", f"${self.cost_estimate:>6.4f}"),
        ]

        border = "═" * inner_width
        lines = [
            f"╔{border}╗",
            f"║{title:<{inner_width}}║",
            f"╠{border}╣",
        ]
        for label, value in rows:
            lines.append(f"║ {label:<{label_width}}{value:<{value_width}} ║")
        lines.append(f"╚{border}╝")

        # Leading and trailing newline are kept from the original rendering.
        return "\n" + "\n".join(lines) + "\n"


class JobDescriptionExtractor:
    """
    Extract structured job requirements from job descriptions
    Returns skills, qualifications, experience, responsibilities, etc.

    Supports:
    - Technical and soft skills extraction
    - Required vs. preferred qualifications
    - Experience level requirements
    - Responsibilities and duties
    - Salary/compensation info
    - Benefits

    The request is sent through an object exposing
    ``generate_json(messages=..., model=..., schema=...)``. Pass one as
    ``llm_client`` to make that dependency explicit; when it is omitted the
    extractor falls back to the notebook-level ``llm`` instance, which must
    then already exist when ``extract`` is called.
    """

    # GPT-4o-mini pricing, per 1,000 tokens. The cost estimate always uses
    # these figures, whichever ``model`` is configured.
    INPUT_PRICE_PER_1K = 0.000150
    OUTPUT_PRICE_PER_1K = 0.000600

    # System message sent with every extraction request. Written as explicit
    # pieces so that the whitespace-only second line is visible in the source.
    _SYSTEM_PROMPT = (
        "You are an expert job description analyzer. Extract ALL job requirements "
        "and information from the provided job description.\n"
        "                \n"
        "Extract information systematically:\n"
        "- Identify required vs preferred skills and qualifications\n"
        "- Extract technical and soft skills\n"
        "- List key responsibilities\n"
        "- Identify education and certification requirements\n"
        "- Extract salary, benefits, and location info\n"
        "- Determine remote status and job type\n"
        "\n"
        'Use empty strings "" for missing text fields and empty arrays [] for missing lists.'
    )

    def __init__(self, openai_api_key: str, model: str = "gpt-4o-mini", llm_client: Any = None):
        """
        Initialize job description extractor

        Args:
            openai_api_key: Your OpenAI API key
            model: OpenAI model to use
            llm_client: Optional object providing
                ``generate_json(messages=, model=, schema=)``. When ``None``
                the notebook-level ``llm`` is looked up at call time.
        """
        self.client = OpenAI(api_key=openai_api_key)
        self.model = model
        self.llm = llm_client
        self.schema = self._get_schema()

        # Metrics (reset at the start of every extract() call)
        self.api_calls = 0
        self.input_tokens = 0
        self.output_tokens = 0

    @staticmethod
    def _text() -> dict:
        """Schema fragment for a plain string field."""
        return {"type": "string"}

    @staticmethod
    def _text_list() -> dict:
        """Schema fragment for an array of strings."""
        return {"type": "array", "items": {"type": "string"}}

    @staticmethod
    def _closed_object(properties: dict) -> dict:
        """Object schema listing every property as required and allowing no extras."""
        return {
            "type": "object",
            "properties": properties,
            "required": list(properties),
            "additionalProperties": False,
        }

    def _get_schema(self) -> dict:
        """JSON schema for structured job requirement extraction"""
        text, text_list = self._text, self._text_list

        # Insertion order is significant: it fixes both the property order and
        # the order of the generated "required" list.
        properties = {
            "job_title": text(),
            "company": text(),
            "job_type": text(),
            "experience_level": text(),
            "years_of_experience": text(),
            "required_skills": text_list(),
            "preferred_skills": text_list(),
            "technical_requirements": text_list(),
            "required_qualifications": text_list(),
            "preferred_qualifications": text_list(),
            "education": self._closed_object({
                "minimum_degree": text(),
                "field_of_study": text_list(),
            }),
            "certifications": text_list(),
            "responsibilities": text_list(),
            "key_responsibilities": text_list(),
            "salary_range": self._closed_object({
                "min": text(),
                "max": text(),
                "currency": text(),
                "period": text(),
            }),
            "benefits": text_list(),
            "location": text(),
            "remote_status": text(),
            "languages": text_list(),
            "soft_skills": text_list(),
            "industry": text(),
            "reporting_to": text(),
        }

        return {
            "name": "job_requirements_extraction",
            "strict": True,
            "schema": self._closed_object(properties),
        }

    def extract(self, job_description: str, verbose: bool = True) -> "tuple[Dict[str, Any], ExtractionMetrics]":
        """
        Extract structured requirements from job description

        Args:
            job_description: Raw job description text
            verbose: Print extraction progress

        Returns:
            (extracted_data, metrics)
        """
        if verbose:
            print("🚀 Starting job requirement extraction...")
            print(f"📄 Model: {self.model}\n")

        # Reset metrics so the figures describe this call only
        self.api_calls = 0
        self.input_tokens = 0
        self.output_tokens = 0

        if verbose:
            print("📋 Parsing job description...")

        extracted_data = self._extract_requirements(job_description)

        metrics = ExtractionMetrics(
            api_calls=self.api_calls,
            input_tokens=self.input_tokens,
            output_tokens=self.output_tokens,
            cost_estimate=self._calculate_cost()
        )

        if verbose:
            print("✅ Extraction complete!\n")
            print(metrics)

        return extracted_data, metrics

    def _resolve_llm(self):
        """Return the injected LLM wrapper, or the notebook-level ``llm`` if none was given."""
        injected = getattr(self, "llm", None)
        if injected is not None:
            return injected
        return llm  # notebook-level instance; NameError here means it was never created

    def _extract_requirements(self, job_description: str) -> Dict[str, Any]:
        """Send one structured-output request and return the parsed JSON.

        Also increments ``api_calls`` and adds the token usage reported on the
        response to ``input_tokens`` / ``output_tokens``.
        """
        messages = [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"Extract structured job requirements from this job description:\n\n{job_description}"
            }
        ]

        response = self._resolve_llm().generate_json(
            messages=messages,
            model=self.model,
            schema=self.schema
        )

        # Track metrics
        self.api_calls += 1
        self.input_tokens += response.usage.prompt_tokens
        self.output_tokens += response.usage.completion_tokens

        return json.loads(response.choices[0].message.content)

    def extract_to_file(self, job_description: str, output_file: str, verbose: bool = True) -> None:
        """Extract and save results to JSON file"""
        extracted_data, _ = self.extract(job_description, verbose)

        # json.dump escapes non-ASCII by default, so the bytes written do not
        # depend on the platform encoding; utf-8 is stated for clarity.
        with open(output_file, 'w', encoding="utf-8") as f:
            json.dump(extracted_data, f, indent=2)

        if verbose:
            print(f"💾 Results saved to: {output_file}\n")

    def _calculate_cost(self) -> float:
        """Calculate estimated cost based on token usage"""
        input_cost = (self.input_tokens / 1000) * self.INPUT_PRICE_PER_1K
        output_cost = (self.output_tokens / 1000) * self.OUTPUT_PRICE_PER_1K
        return input_cost + output_cost


In [8]:

    
# Build an extractor with the default model. API_KEY must already be defined
# by an earlier cell.
extractor = JobDescriptionExtractor(openai_api_key=API_KEY)

# Sample job description
sample_job = """
Senior Software Engineer - Remote
Company: TechCorp Inc.

About the Role:
We're looking for a Senior Software Engineer to join our platform team. You'll architect scalable systems 
and mentor junior developers while working on cutting-edge technologies.

Requirements:
- 5+ years of software development experience
- Expert-level proficiency in Python, JavaScript, and Go
- Strong experience with microservices architecture
- AWS or Google Cloud Platform experience required
- PostgreSQL and Redis expertise
- Docker and Kubernetes knowledge

Nice to Have:
- Experience with ML/AI pipelines
- GraphQL expertise
- Previous startup experience
- Open source contributions

Responsibilities:
- Design and implement scalable backend systems
- Code reviews and mentoring junior engineers
- Participate in architectural decisions
- Collaborate with product and design teams

Education:
- Bachelor's degree in Computer Science or related field

Benefits:
- $150,000 - $200,000 salary
- Health insurance, 401(k) matching
- Unlimited PTO
- Home office stipend
"""

# Extract requirements
# extract() returns (parsed requirements, ExtractionMetrics); with the default
# verbose=True it also prints progress lines and the metrics box.
result, metrics = extractor.extract(sample_job)

# Print results
print(json.dumps(result, indent=2))

🚀 Starting job requirement extraction...
📄 Model: gpt-4o-mini

📋 Parsing job description...
✅ Extraction complete!


╔══════════════════════════════════════════════╗
║      JOB EXTRACTION METRICS                  ║
╠══════════════════════════════════════════════╣
║ API Calls:           1        ║
║ Input Tokens:      641     ║
║ Output Tokens:     268    ║
║ Est. Cost:        $0.0003║
╚══════════════════════════════════════════════╝

{
  "job_title": "Senior Software Engineer",
  "company": "TechCorp Inc.",
  "job_type": "Remote",
  "experience_level": "Senior",
  "years_of_experience": "5+",
  "required_skills": [
    "Python",
    "JavaScript",
    "Go",
    "Microservices architecture",
    "AWS",
    "Google Cloud Platform",
    "PostgreSQL",
    "Redis",
    "Docker",
    "Kubernetes"
  ],
  "preferred_skills": [
    "ML/AI pipelines",
    "GraphQL",
    "Previous startup experience",
    "Open source contributions"
  ],
  "technical_requirements": [
    "Python",
    "JavaScript"

# Question Generation Pre Processing

In [13]:
import json
from typing import Dict, Any, List, Literal, Optional
from openai import OpenAI
from dataclasses import dataclass
from enum import Enum


class QuestionSource(Enum):
    """Where the interview questions come from."""
    CLIENT_PROVIDED = "client_provided"  # only the client's own questions, formatted without an LLM call
    AI_GENERATED = "ai_generated"  # every question is requested from the LLM
    HYBRID = "hybrid"  # client questions plus additional LLM-generated ones


class DifficultyLevel(Enum):
    """Difficulty levels for interview questions.

    The member's string value is what gets written into the generation prompt
    and into the ``difficulty`` field of client-provided questions.
    """
    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"


class InterviewPhase(Enum):
    """Interview phases"""
    FIRST = "first_round"  # described to the LLM as the screening round
    SECOND = "second_round"  # described to the LLM as the technical round
    FINAL = "final_round"  # described to the LLM as the executive/final round


@dataclass
class InterviewConfig:
    """Configuration for interview question generation.

    The number of questions is not supplied by the caller: ``__post_init__``
    derives ``total_questions`` from the interview duration, the expected
    answer length and the buffer share.

    Raises:
        ValueError: if ``question_source`` is HYBRID or CLIENT_PROVIDED and no
            client questions were supplied.
        KeyError: if ``question_length`` is not "short", "medium" or "long".
    """
    phase: InterviewPhase
    difficulty_level: DifficultyLevel
    question_source: QuestionSource
    interview_duration_minutes: int  # Changed from total_questions
    question_length: Literal["short", "medium", "long"]
    client_questions: Optional[List[str]] = None  # None is replaced by [] in __post_init__
    client_questions_count: int = 0  # Only used if HYBRID mode
    buffer_time_percent: float = 0.15  # 15% buffer for transitions, rapport building

    def __post_init__(self):
        """Normalise optional fields, validate the source mode and derive ``total_questions``."""
        if self.client_questions is None:
            self.client_questions = []

        # Both modes that rely on client questions are unusable without them;
        # CLIENT_PROVIDED would otherwise produce an interview with no questions.
        needs_client_questions = (QuestionSource.HYBRID, QuestionSource.CLIENT_PROVIDED)
        if self.question_source in needs_client_questions and not self.client_questions:
            raise ValueError(f"Client questions required for {self.question_source.name} mode")

        if self.question_source == QuestionSource.HYBRID:
            # The count defaults to 0; left as-is the hybrid prompt would ask
            # for "0" client questions even though some were supplied. Fall
            # back to the number supplied when the count is unset or too large.
            supplied = len(self.client_questions)
            if self.client_questions_count <= 0 or self.client_questions_count > supplied:
                self.client_questions_count = supplied

        # Calculate number of questions based on time
        self.total_questions = self._calculate_question_count()

    def _calculate_question_count(self) -> int:
        """Calculate how many questions fit in the given time (never fewer than 1)."""
        # Answer time per question (in minutes)
        answer_times = {
            "short": 2,    # 1-2 minutes
            "medium": 4,   # 3-5 minutes
            "long": 7      # 5-10 minutes
        }

        # Add 1 minute per question for asking the question + brief discussion
        time_per_question = answer_times[self.question_length] + 1

        # Apply buffer (e.g., 15% for intro, transitions, wrap-up)
        available_time = self.interview_duration_minutes * (1 - self.buffer_time_percent)

        # int() truncates, so a partially fitting question is not counted
        question_count = int(available_time / time_per_question)

        # Ensure at least 1 question
        return max(1, question_count)


@dataclass
class InterviewMetrics:
    """Metrics for one interview-generation run.

    Attributes:
        api_calls: Number of LLM requests counted during the run.
        input_tokens: Prompt tokens accumulated from the API responses.
        output_tokens: Completion tokens accumulated from the API responses.
        cost_estimate: Estimated cost, printed with a ``$`` prefix.
        questions_generated: Number of questions in the result.
        interview_duration_minutes: Duration requested in the configuration.
        estimated_actual_duration_minutes: Duration reported in the result's
            interview summary.
        buffer_time_minutes: Share of the requested duration kept as buffer.
    """
    api_calls: int
    input_tokens: int
    output_tokens: int
    cost_estimate: float
    questions_generated: int
    interview_duration_minutes: int
    estimated_actual_duration_minutes: int
    buffer_time_minutes: float

    def __str__(self):
        """Render the metrics as a fixed-width box.

        Every row is padded to the same inner width as the border, so the
        right-hand edge stays aligned whatever the magnitude of the values.
        The returned text starts and ends with a newline.
        """
        inner_width = 52   # characters between the two vertical borders
        label_width = 22   # width of the label column
        value_width = 8    # numbers are right-aligned inside this column

        def row(label: str = "", value: str = "", unit: str = "") -> str:
            text = f" {label:<{label_width}}{value:>{value_width}}{unit}" if label else ""
            return f"║{text:<{inner_width}}║"

        horizontal = "═" * inner_width
        lines = [
            f"╔{horizontal}╗",
            "║" + "      INTERVIEW GENERATION METRICS".ljust(inner_width) + "║",
            f"╠{horizontal}╣",
            row("Target Duration:", str(self.interview_duration_minutes), " minutes"),
            row("Estimated Actual:", str(self.estimated_actual_duration_minutes), " minutes"),
            row("Buffer Time:", f"{self.buffer_time_minutes:.1f}", " minutes"),
            row("Questions Generated:", str(self.questions_generated)),
            row(),
            row("API Calls:", str(self.api_calls)),
            row("Input Tokens:", str(self.input_tokens)),
            row("Output Tokens:", str(self.output_tokens)),
            row("Est. Cost:", f"${self.cost_estimate:.4f}"),
            f"╚{horizontal}╝",
        ]
        return "\n" + "\n".join(lines) + "\n"


class InterviewQuestionGenerator:
    """
    Generate customized interview questions based on TIME DURATION:
    - Specify interview duration (e.g., 10, 30, 60 minutes)
    - System automatically calculates optimal number of questions
    - Accounts for answer length, transitions, and buffer time

    Requests are sent through an object exposing
    ``generate_json(messages=..., model=..., schema=...)``. Pass one as
    ``llm_client`` to make that dependency explicit; when it is omitted the
    generator falls back to the notebook-level ``llm`` instance, which must
    then already exist when ``generate`` is called.
    """

    # Per 1,000 tokens; the estimate uses these figures whichever model is set.
    INPUT_PRICE_PER_1K = 0.000150
    OUTPUT_PRICE_PER_1K = 0.000600

    # Expected answer time in minutes for each question length, and the minute
    # added to every question for asking it / transition.
    _ANSWER_MINUTES = {"short": 2, "medium": 4, "long": 7}
    _ASKING_MINUTES = 1

    def __init__(self, openai_api_key: str, model: str = "gpt-4o-mini", llm_client: Any = None):
        """
        Args:
            openai_api_key: Your OpenAI API key
            model: OpenAI model to use
            llm_client: Optional object providing
                ``generate_json(messages=, model=, schema=)``. When ``None``
                the notebook-level ``llm`` is looked up at call time.
        """
        self.client = OpenAI(api_key=openai_api_key)
        self.model = model
        self.llm = llm_client
        self.schema = self._get_schema()

        # Metrics (reset at the start of every generate() call)
        self.api_calls = 0
        self.input_tokens = 0
        self.output_tokens = 0

    # ------------------------------------------------------------------
    # Schema
    # ------------------------------------------------------------------
    @staticmethod
    def _closed_object(properties: dict) -> dict:
        """Object schema listing every property as required and allowing no extras."""
        return {
            "type": "object",
            "properties": properties,
            "required": list(properties),
            "additionalProperties": False,
        }

    def _get_schema(self) -> dict:
        """JSON schema for interview questions"""
        def text() -> dict:
            return {"type": "string"}

        def integer() -> dict:
            return {"type": "integer"}

        def text_list() -> dict:
            return {"type": "array", "items": {"type": "string"}}

        # Insertion order fixes both the property order and the order of the
        # generated "required" lists.
        question = self._closed_object({
            "question_id": integer(),
            "question": text(),
            "category": text(),
            "difficulty": text(),
            "expected_answer_type": text(),
            "estimated_time_minutes": integer(),
            "key_points": text_list(),
            "follow_up_questions": text_list(),
            "why_asked": text(),
            "relevance_to_cv": text(),
        })

        summary = self._closed_object({
            "total_questions": integer(),
            "difficulty_distribution": self._closed_object({
                "easy": integer(),
                "medium": integer(),
                "hard": integer(),
            }),
            "question_categories": text_list(),
            "estimated_interview_duration_minutes": integer(),
        })

        return {
            "name": "interview_questions_generation",
            "strict": True,
            "schema": self._closed_object({
                "questions": {"type": "array", "items": question},
                "interview_summary": summary,
            }),
        }

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    def _minutes_per_question(self, question_length: str) -> int:
        """Answer time plus asking time for one question; KeyError for an unknown length."""
        return self._ANSWER_MINUTES[question_length] + self._ASKING_MINUTES

    @staticmethod
    def _hybrid_counts(config: "InterviewConfig") -> "tuple[int, int]":
        """Return ``(client_count, ai_count)`` for HYBRID mode.

        ``client_questions_count`` is used when it is between 1 and the number
        of supplied client questions; otherwise the number supplied is used.
        The AI share is clamped at 0 so the prompt never asks for a negative
        number of questions.
        """
        supplied = len(config.client_questions)
        client_count = config.client_questions_count
        if client_count <= 0 or client_count > supplied:
            client_count = supplied
        ai_count = max(0, config.total_questions - client_count)
        return client_count, ai_count

    @staticmethod
    def _percent_text(fraction: float) -> str:
        """Format a 0-1 fraction as a percentage number without float noise (0.07 -> "7")."""
        return f"{fraction * 100:g}"

    def _resolve_llm(self):
        """Return the injected LLM wrapper, or the notebook-level ``llm`` if none was given."""
        injected = getattr(self, "llm", None)
        if injected is not None:
            return injected
        return llm  # notebook-level instance; NameError here means it was never created

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------
    def generate(
        self,
        job_requirements: Dict[str, Any],
        candidate_cv: Dict[str, Any],
        config: "InterviewConfig",
        verbose: bool = True
    ) -> "tuple[Dict[str, Any], InterviewMetrics]":
        """
        Generate interview questions based on time duration

        Args:
            job_requirements: Extracted job requirements JSON
            candidate_cv: Extracted resume data JSON
            config: InterviewConfig with TIME-BASED parameters
            verbose: Print progress

        Returns:
            (generated_questions, metrics)
        """
        # Reset metrics so the figures describe this call only
        self.api_calls = 0
        self.input_tokens = 0
        self.output_tokens = 0

        if verbose:
            print("🚀 Starting TIME-BASED interview question generation...")
            print(f"⏱️  Interview Duration: {config.interview_duration_minutes} minutes")
            print(f"📊 Calculated Questions: {config.total_questions}")
            print(f"📄 Model: {self.model}")
            print(f"🎯 Configuration: {config.phase.value} - {config.difficulty_level.value}")
            print(f"📊 Source: {config.question_source.value}")
            print(f"⏳ Answer Length: {config.question_length}\n")

        # Dispatch on source type; only the AI and hybrid paths call the LLM
        if config.question_source == QuestionSource.CLIENT_PROVIDED:
            if verbose:
                print("✓ Using client-provided questions only")
            questions = self._format_client_questions(config.client_questions, config)

        elif config.question_source == QuestionSource.AI_GENERATED:
            if verbose:
                print("🤖 Generating all questions with AI...")
            questions = self._generate_ai_questions(
                job_requirements, candidate_cv, config, verbose
            )

        else:  # HYBRID
            if verbose:
                client_count, ai_count = self._hybrid_counts(config)
                print(f"🔄 Using hybrid approach: {client_count} client + "
                      f"{ai_count} AI questions")
            questions = self._generate_hybrid_questions(
                job_requirements, candidate_cv, config, verbose
            )

        metrics = InterviewMetrics(
            api_calls=self.api_calls,
            input_tokens=self.input_tokens,
            output_tokens=self.output_tokens,
            cost_estimate=self._calculate_cost(),
            questions_generated=len(questions.get("questions", [])),
            interview_duration_minutes=config.interview_duration_minutes,
            estimated_actual_duration_minutes=questions["interview_summary"]["estimated_interview_duration_minutes"],
            buffer_time_minutes=config.interview_duration_minutes * config.buffer_time_percent
        )

        if verbose:
            print("\n✅ Interview generation complete!\n")
            print(metrics)

        return questions, metrics

    # ------------------------------------------------------------------
    # Question sources
    # ------------------------------------------------------------------
    def _format_client_questions(self, questions: List[str], config: "InterviewConfig") -> Dict[str, Any]:
        """Format client-provided questions with time estimates.

        At most ``config.total_questions`` questions are kept, in the order
        given. No LLM call is made.
        """
        time_per_q = self._minutes_per_question(config.question_length)
        level = config.difficulty_level.value

        formatted_questions = [
            {
                "question_id": idx,
                "question": question,
                "category": "Client-Provided",
                "difficulty": level,
                "expected_answer_type": config.question_length,
                "estimated_time_minutes": time_per_q,
                "key_points": [],
                "follow_up_questions": [],
                "why_asked": "Provided by client",
                "relevance_to_cv": "To be determined during interview"
            }
            for idx, question in enumerate(questions[:config.total_questions], 1)
        ]

        # Every question carries the configured difficulty, so the whole count
        # lands in that single bucket.
        distribution = {"easy": 0, "medium": 0, "hard": 0}
        if level in distribution:
            distribution[level] = len(formatted_questions)

        return {
            "questions": formatted_questions,
            "interview_summary": {
                "total_questions": len(formatted_questions),
                "difficulty_distribution": distribution,
                "question_categories": ["Client-Provided"],
                "estimated_interview_duration_minutes": len(formatted_questions) * time_per_q
            }
        }

    def _generate_ai_questions(
        self,
        job_requirements: Dict[str, Any],
        candidate_cv: Dict[str, Any],
        config: "InterviewConfig",
        verbose: bool
    ) -> Dict[str, Any]:
        """Generate all questions using AI"""
        prompt = self._build_generation_prompt(
            job_requirements, candidate_cv, config, all_ai=True
        )
        return self._call_api(prompt)

    def _generate_hybrid_questions(
        self,
        job_requirements: Dict[str, Any],
        candidate_cv: Dict[str, Any],
        config: "InterviewConfig",
        verbose: bool
    ) -> Dict[str, Any]:
        """Generate hybrid questions (client + AI)"""
        client_count, ai_count = self._hybrid_counts(config)
        client_qs_section = f"""
CLIENT PROVIDED QUESTIONS (Must include these):
{json.dumps(config.client_questions, indent=2)}

Include these {client_count} questions as-is, then generate {ai_count} additional AI questions.
"""

        prompt = self._build_generation_prompt(
            job_requirements, candidate_cv, config, all_ai=False,
            client_questions_section=client_qs_section
        )
        return self._call_api(prompt)

    # ------------------------------------------------------------------
    # Prompt and API call
    # ------------------------------------------------------------------
    def _build_generation_prompt(
        self,
        job_requirements: Dict[str, Any],
        candidate_cv: Dict[str, Any],
        config: "InterviewConfig",
        all_ai: bool = True,
        client_questions_section: str = ""
    ) -> str:
        """Build the prompt for question generation.

        ``all_ai`` is accepted for the callers' benefit but does not influence
        the text; the hybrid path differs only through
        ``client_questions_section``.
        """
        phase_descriptions = {
            InterviewPhase.FIRST: "screening round - focus on basic fit and foundational skills",
            InterviewPhase.SECOND: "technical round - dive deep into technical expertise and problem-solving",
            InterviewPhase.FINAL: "executive/final round - assess culture fit, leadership, and final suitability"
        }

        answer_length_guidance = {
            "short": "1-2 minute answers expected",
            "medium": "3-5 minute answers expected",
            "long": "5-10 minute answers expected"
        }

        minutes_each = self._minutes_per_question(config.question_length)
        buffer_percent = self._percent_text(config.buffer_time_percent)

        prompt = f"""You are an expert interview panel coordinator. Generate interview questions for a {config.interview_duration_minutes}-minute interview.

TIME CONSTRAINTS:
- Total Interview Duration: {config.interview_duration_minutes} minutes
- Questions to Generate: {config.total_questions}
- Time per Question: ~{minutes_each} minutes (including answer + asking)
- Buffer Time: {buffer_percent}% for intro, transitions, wrap-up

INTERVIEW CONTEXT:
- Phase: {config.phase.value} ({phase_descriptions[config.phase]})
- Difficulty: {config.difficulty_level.value}
- Answer Length: {answer_length_guidance[config.question_length]}
- Question Source: {config.question_source.value}

JOB REQUIREMENTS:
{json.dumps(job_requirements, indent=2)}

CANDIDATE PROFILE:
{json.dumps(candidate_cv, indent=2)}

{client_questions_section}

REQUIREMENTS:
1. Generate EXACTLY {config.total_questions} questions (optimized for {config.interview_duration_minutes} minutes)
2. Each question should have estimated_time_minutes set to {minutes_each}
3. Tailor questions to match the candidate's experience and the job role
4. For {config.phase.value}, adjust question depth and scope appropriately
5. Include follow-up questions for each main question
6. Provide key points that indicate a good answer
7. Explain why each question is relevant to this role
8. Reference the candidate's CV when relevant
9. Ensure difficulty level is consistent: {config.difficulty_level.value}
10. Mix question types: technical, behavioral, situational, problem-solving
11. Questions should expose gaps or strengths relative to job requirements
12. Prioritize most important questions first (in case interview runs short)

GENERATE {config.total_questions} TIME-OPTIMIZED INTERVIEW QUESTIONS."""

        return prompt

    def _call_api(self, prompt: str) -> Dict[str, Any]:
        """Send one structured-output request and return the parsed JSON.

        Also increments ``api_calls`` and adds the token usage reported on the
        response to ``input_tokens`` / ``output_tokens``.
        """
        messages = [
            {
                "role": "system",
                "content": "You are an expert interview question generator. Generate comprehensive, time-optimized interview questions with detailed follow-ups and evaluation criteria."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]

        response = self._resolve_llm().generate_json(
            messages=messages,
            model=self.model,
            schema=self.schema
        )

        # Track metrics
        self.api_calls += 1
        self.input_tokens += response.usage.prompt_tokens
        self.output_tokens += response.usage.completion_tokens

        return json.loads(response.choices[0].message.content)

    def save_to_file(self, questions: Dict[str, Any], output_file: str, verbose: bool = True) -> None:
        """Save generated questions to JSON file"""
        # json.dump escapes non-ASCII by default, so the bytes written do not
        # depend on the platform encoding; utf-8 is stated for clarity.
        with open(output_file, 'w', encoding="utf-8") as f:
            json.dump(questions, f, indent=2)

        if verbose:
            print(f"💾 Questions saved to: {output_file}\n")

    def _calculate_cost(self) -> float:
        """Calculate estimated cost"""
        input_cost = (self.input_tokens / 1000) * self.INPUT_PRICE_PER_1K
        output_cost = (self.output_tokens / 1000) * self.OUTPUT_PRICE_PER_1K
        return input_cost + output_cost

In [14]:
# ============================================================================
#                           EXAMPLE USAGE
# ============================================================================

# In a notebook kernel __name__ is "__main__", so this block runs whenever the
# cell is executed. It makes three generate() calls and writes three JSON
# files into the working directory.
if __name__ == "__main__":
    

    # API_KEY must already be defined by an earlier cell.
    generator = InterviewQuestionGenerator(openai_api_key=API_KEY)
    
    # Sample data
    job_requirements = {
        "job_title": "Senior Software Engineer",
        "required_skills": ["Python", "JavaScript", "Go", "microservices", "AWS", "Docker"],
        "experience_required": "5+ years"
    }
    
    candidate_cv = {
        "personal_info": {"full_name": "Faizan Munsaf"},
        "skills": ["Python", "FastAPI", "Django", "Machine Learning"],
        "experience": [{"company": "Software Alliance", "position": "ML Engineer & FullStack Developer"}]
    }
    
    print("=" * 60)
    print("SCENARIO 1: 10-minute quick screening interview")
    print("=" * 60)
    config1 = InterviewConfig(
        phase=InterviewPhase.FIRST,
        difficulty_level=DifficultyLevel.EASY,
        question_source=QuestionSource.AI_GENERATED,
        interview_duration_minutes=10,
        question_length="short"  # Yields 2 questions: int(10 * 0.85 / 3)
    )
    
    questions1, metrics1 = generator.generate(job_requirements, candidate_cv, config1)
    
    print("\n" + "=" * 60)
    print("SCENARIO 2: 30-minute technical deep-dive")
    print("=" * 60)
    config2 = InterviewConfig(
        phase=InterviewPhase.SECOND,
        difficulty_level=DifficultyLevel.HARD,
        question_source=QuestionSource.AI_GENERATED,
        interview_duration_minutes=30,
        question_length="medium"  # Yields 5 questions: int(30 * 0.85 / 5)
    )
    
    questions2, metrics2 = generator.generate(job_requirements, candidate_cv, config2)
    
    print("\n" + "=" * 60)
    print("SCENARIO 3: 60-minute final round with hybrid questions")
    print("=" * 60)
    config3 = InterviewConfig(
        phase=InterviewPhase.FINAL,
        difficulty_level=DifficultyLevel.MEDIUM,
        question_source=QuestionSource.HYBRID,
        interview_duration_minutes=60,
        question_length="long",  # Yields 6 questions in total: int(60 * 0.85 / 8)
        client_questions=[
            "Tell us about yourself and your career journey",
            "Where do you see yourself in 5 years?"
        ],
        client_questions_count=2  # 2 client + 4 AI = 6 total
    )
    
    questions3, metrics3 = generator.generate(job_requirements, candidate_cv, config3)
    
    # Save results
    generator.save_to_file(questions1, "interview_10min.json")
    generator.save_to_file(questions2, "interview_30min.json")
    generator.save_to_file(questions3, "interview_60min.json")

SCENARIO 1: 10-minute quick screening interview
🚀 Starting TIME-BASED interview question generation...
⏱️  Interview Duration: 10 minutes
📊 Calculated Questions: 2
📄 Model: gpt-4o-mini
🎯 Configuration: first_round - easy
📊 Source: ai_generated
⏳ Answer Length: short

🤖 Generating all questions with AI...

✅ Interview generation complete!


╔════════════════════════════════════════════════════╗
║      INTERVIEW GENERATION METRICS                  ║
╠════════════════════════════════════════════════════╣
║ Target Duration:       10 minutes            ║
║ Estimated Actual:      10 minutes            ║
║ Buffer Time:          1.5 minutes            ║
║ Questions Generated:    2                    ║
║                                                    ║
║ API Calls:               1                   ║
║ Input Tokens:            678                 ║
║ Output Tokens:           430                 ║
║ Est. Cost:            $ 0.0004              ║
╚══════════════════════════════════════════════

In [None]:
# Display the question payload returned by generator.generate(...) for Scenario 3 (config3).
questions3

{'questions': [{'question_id': 1,
   'question': 'Tell us about yourself and your career journey.',
   'category': 'Behavioral',
   'difficulty': 'medium',
   'expected_answer_type': 'narrative',
   'estimated_time_minutes': 8,
   'key_points': ['Clarity in career transitions',
    'Key achievements and milestones',
    'Connection to the role'],
   'follow_up_questions': ['What motivated you to choose a career in software engineering?',
    'Could you elaborate on a project that significantly impacted your career?'],
   'why_asked': "This question helps understand the candidate's background, motivations, and how their experiences have shaped their professional identity.",
   'relevance_to_cv': "The candidate's experience as an ML Engineer & FullStack Developer at Software Alliance ties directly to their qualifications for the Senior Software Engineer role."},
  {'question_id': 2,
   'question': 'Where do you see yourself in 5 years?',
   'category': 'Career Aspirations',
   'difficult

: 

# Question Generation After Preprocessing

In [None]:
# Hard-coded interview question payload.
# Top-level keys: 'questions' (a list of six per-question dicts) and 'interview_summary'.
# Each question dict carries: question_id, question, category, difficulty,
# expected_answer_type, estimated_time_minutes, key_points, follow_up_questions,
# why_asked and relevance_to_cv.
# NOTE(review): the first entries match the `questions3` output shown earlier in the
# notebook, so this looks like a pasted snapshot of that result — confirm, and consider
# reusing `questions3` directly so the two cannot drift apart.
quest_dict = {'questions': [{'question_id': 1,
   'question': 'Tell us about yourself and your career journey.',
   'category': 'Behavioral',
   'difficulty': 'medium',
   'expected_answer_type': 'narrative',
   'estimated_time_minutes': 8,
   'key_points': ['Clarity in career transitions',
    'Key achievements and milestones',
    'Connection to the role'],
   'follow_up_questions': ['What motivated you to choose a career in software engineering?',
    'Could you elaborate on a project that significantly impacted your career?'],
   'why_asked': "This question helps understand the candidate's background, motivations, and how their experiences have shaped their professional identity.",
   'relevance_to_cv': "The candidate's experience as an ML Engineer & FullStack Developer at Software Alliance ties directly to their qualifications for the Senior Software Engineer role."},
  {'question_id': 2,
   'question': 'Where do you see yourself in 5 years?',
   'category': 'Career Aspirations',
   'difficulty': 'medium',
   'expected_answer_type': 'aspirational',
   'estimated_time_minutes': 8,
   'key_points': ['Alignment with company growth',
    'Specific roles or skill development',
    'Leadership aspirations'],
   'follow_up_questions': ["What steps do you think you'll take to achieve these goals?",
    'How do you see your goals impacting the team you work with?'],
   'why_asked': "Evaluating long-term goals helps gauge the candidate's commitment and cultural fit with the organization's growth trajectory.",
   'relevance_to_cv': 'His aspiration in software engineering will reflect his motivation to grow within the technology domain.'},
  {'question_id': 3,
   'question': 'Given your experience with Python, can you discuss a challenging project where you utilized Python to solve a complex problem?',
   'category': 'Technical',
   'difficulty': 'medium',
   'expected_answer_type': 'problem-solving',
   'estimated_time_minutes': 8,
   'key_points': ['Description of project complexity',
    'Use of Python and relevant libraries',
    'Impact of the solution'],
   'follow_up_questions': ['What difficulties did you encounter during this project, and how did you overcome them?',
    'If you were to improve that project now, what would you change?'],
   'why_asked': 'This question assesses technical proficiency in Python, critical for the role, and the ability to apply skills in real-world scenarios.',
   'relevance_to_cv': "The candidate's experience suggests proficiency in Python but needs to delve into practical application."},
  {'question_id': 4,
   'question': 'Can you describe an experience where you had to lead a team to deliver a software project? What challenges did you face?',
   'category': 'Leadership',
   'difficulty': 'medium',
   'expected_answer_type': 'narrative',
   'estimated_time_minutes': 8,
   'key_points': ['Leadership approach',
    'Conflict management',
    'Final outcomes of the project'],
   'follow_up_questions': ['How did you ensure that all team members were on the same page?',
    'What tools or methodologies did you use for project management?'],
   'why_asked': 'Leadership is key for a senior role, and this question evaluates both skills and experiences related to team management and delivery.',
   'relevance_to_cv': "The candidate's role likely involved collaboration in software development; this will provide insights into their leadership style."},
  {'question_id': 5,
   'question': 'Imagine that your microservices architecture is experiencing performance issues. How would you go about diagnosing and resolving them?',
   'category': 'Problem-Solving',
   'difficulty': 'medium',
   'expected_answer_type': 'analytical',
   'estimated_time_minutes': 8,
   'key_points': ['Systematic approach to diagnostics',
    'Tools or metrics to use',
    'Proposed solutions'],
   'follow_up_questions': ['What are the potential impacts of these performance issues on business operations?',
    'How do you prioritize which issues to tackle first?'],
   'why_asked': 'This evaluates critical thinking, analytical skills, and knowledge of microservices, which are essential for the role.',
   'relevance_to_cv': "The candidate's experience with microservices will be critical in understanding their approach to system-wide issues."},
  {'question_id': 6,
   'question': 'In your opinion, what are the key differences between traditional software development approaches and Agile methodologies? How have you applied Agile principles in your previous roles?',
   'category': 'Situational',
   'difficulty': 'medium',
   'expected_answer_type': 'conceptual',
   'estimated_time_minutes': 8,
   'key_points': ['Clear understanding of Agile principles',
    'Examples of Agile applied in a team',
    'Impact on project delivery'],
   'follow_up_questions': ['Can you give a specific example of a challenge you faced using Agile?',
    'How do you handle team dynamics in an Agile environment?'],
   'why_asked': 'Understanding Agile is crucial for modern software development, and this question tests the candidate’s adaptability to team processes.',
   'relevance_to_cv': "Faizan's past roles may have required adaptability to team methodologies, making this relevant to collaboration and project success."}],
 'interview_summary': {'total_questions': 6,
  'difficulty_distribution': {'easy': 0, 'medium': 6, 'hard': 0},
  'question_categories': ['Behavioral',
   'Career Aspirations',
   'Technical',
   'Leadership',
   'Problem-Solving',
   'Situational'],
  'estimated_interview_duration_minutes': 60}}


In [9]:
def _parse_relevance_score(raw_text: str) -> float:
    """Convert the model's raw reply into a relevance score between 0.0 and 10.0.

    A reply that is a plain number (for example "8" or " 7.5 ") is parsed
    directly. Otherwise the first unsigned number found in the text is used,
    so replies such as "8/10" or "Score: 8" still yield a score. Values outside
    0-10 are clamped, and replies with no usable number yield 0.0.
    """
    # Local imports keep this cell self-contained.
    import math
    import re

    cleaned = raw_text.strip()
    try:
        score = float(cleaned)
    except ValueError:
        # Not a bare number: fall back to the first number in the reply.
        found = re.search(r"\d+(?:\.\d+)?", cleaned)
        if found is None:
            return 0.0
        score = float(found.group())

    if not math.isfinite(score):
        # float() accepts "nan" / "inf", which are not meaningful scores.
        return 0.0
    return min(10.0, max(0.0, score))


def evaluation(
    question: str,
    answer: str,
    question_category: str,
    follow_up_question: list[str],
    why_asked: str,
    model: str = "gpt-4o-mini",
    llm_client=None,
) -> float:
    """Ask the LLM how relevant a follow-up question is for a candidate's answer.

    Builds an evaluation prompt from the arguments, streams the reply through
    ``chat_stream`` and parses it into a numeric score.

    Args:
        question: The interview question that was asked.
        answer: The candidate's answer to that question.
        question_category: Category label of the question.
        follow_up_question: Candidate follow-up questions; the list is
            interpolated into the prompt as-is.
        why_asked: Rationale for asking the original question.
        model: Model name forwarded to ``chat_stream``.
        llm_client: Object exposing ``chat_stream(messages=..., model=...)``
            that yields dict-like chunks. When ``None``, the notebook-level
            ``llm`` object is used.

    Returns:
        A float between 0.0 and 10.0; 0.0 when the reply contains no usable
        number.
    """
    evaluation_prompt = f"""
    You are an expert interview evaluator. Analyze whether it is relevant to ask a follow-up question based on the candidate's answer.

    Instructions:
    - Read the interview question and candidate's answer.
    - Decide if a follow-up question is necessary to clarify, probe deeper, or evaluate competency.
    - Output only a single number between 0 and 10.
      - 0 = Follow-up not relevant at all
      - 10 = Follow-up extremely relevant and necessary

    INTERVIEW QUESTION:
    {question}

    CANDIDATE ANSWER:
    {answer}

    QUESTION CATEGORY:
    {question_category}

    FOLLOW-UP QUESTIONS:
    {follow_up_question}

    WHY THIS QUESTION WAS ASKED:
    {why_asked}

    IMPORTANT:
    - Output ONLY a single numerical value (0–10). No explanation.
    """

    # The global `llm` is only looked up when no client was injected, so the
    # function can be exercised without a live API client.
    active_llm = llm if llm_client is None else llm_client

    # Accumulate the streamed reply; chunks without content are skipped.
    reply_parts = []
    for chunk in active_llm.chat_stream(
        messages=[
            {"role": "system", "content": "You are an expert interview evaluator."},
            {"role": "user", "content": evaluation_prompt}
        ],
        model=model
    ):
        piece = chunk.get("content")
        if piece:
            reply_parts.append(piece)

    return _parse_relevance_score("".join(reply_parts))


In [10]:
# Try `evaluation` on one example: the question text, category, follow-up questions and
# rationale repeat question 1 of `quest_dict`, paired with a hand-written sample answer.
# The call goes through `llm.chat_stream`; the resulting score is stored in `eval_score`,
# and this cell itself displays nothing.
eval_score = evaluation(
    question="Tell us about yourself and your career journey.",
    answer="I have been working as a FullStack Developer for the past 5 years, focusing on building scalable web applications using Python and JavaScript. My journey started with a passion for coding in high school, which led me to pursue a degree in Computer Science. Over the years, I've worked on various projects, from small startups to large enterprises, honing my skills in both front-end and back-end development. Recently, I transitioned into machine learning, where I've been able to apply my programming skills to develop predictive models and data-driven solutions.",
    question_category="Behavioral",
    follow_up_question=[
        "What motivated you to choose a career in software engineering?",
        "Could you elaborate on a project that significantly impacted your career?"
    ],
    why_asked="This question helps understand the candidate's background, motivations, and how their experiences have shaped their professional identity."
)

In [None]:
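# NOTE(review): disabled draft, kept commented out. As written it would not run at notebook
# level: in this section `relevance_score`, `question`, `answer`, `question_category` and
# `why_asked` exist only as locals/parameters of `evaluation`, and the generated
# `new_follow_up` is never returned or displayed.
# if relevance_score < 7.5: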
#         new_question_prompt = f"""
#         The follow-up questions provided are not relevant enough. Based on the candidate's answer, 
#         generate ONE better follow-up question that digs deeper, clarifies details, or evaluates competency.

#         INTERVIEW QUESTION:
#         {question}

#         CANDIDATE ANSWER:
#         {answer}

#         QUESTION CATEGORY:
#         {question_category}

#         WHY THIS QUESTION WAS ASKED:
#         {why_asked}

#         IMPORTANT: 
#         - Return ONLY the new follow-up question.
#         - Do NOT include explanations.
#         """

#         new_follow_up = ""
#         for chunk in llm.chat_stream(
#             messages=[
#                 {"role": "system", "content": "You are an expert interview question generator."},
#                 {"role": "user", "content": new_question_prompt}
#             ],
#             model="gpt-4o-mini"
#         ):
#             if chunk.get("content"):
#                 new_follow_up += chunk["content"]

Relevance Score for Follow-Up Question: 8.0/10
