In [1]:
# ======= Cell 1: Install dependencies =======
!pip install -q gradio pdfplumber python-docx sentence-transformers faiss-cpu scikit-learn spacy nltk transformers requests
!python -m spacy download en_core_web_sm

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

print("✓ All dependencies installed successfully!")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


✓ All dependencies installed successfully!


In [2]:
# ======= Cell 2: Configuration & Imports =======
import os
from typing import List, Dict, Tuple, Any, Optional
import re
import io
import json
import pdfplumber
import docx
import tempfile
from collections import Counter
import math
import html
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# NLP & ML libraries
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

# Web & UI
import gradio as gr
import requests

# ======= Configuration =======
HF_TOKEN = "hf_"
HF_MODEL = "google/flan-t5-base"

# Load spaCy NLP
try:
    nlp = spacy.load("en_core_web_sm")
except:
    print("spaCy model not loaded. Will use fallback.")
    nlp = None

# Load embedding model
print("Loading embedding model (this takes ~30 seconds)...")
embed_model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(embed_model_name)
print("✓ Embedding model loaded!")

stop_words = set(stopwords.words('english'))

print("✓ All imports successful!")




Loading embedding model (this takes ~30 seconds)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Embedding model loaded!
✓ All imports successful!


In [3]:
# ======= Cell 3: Enhanced Skills List & Extraction =======
COMMON_SKILLS = {
    # Programming Languages
    "python", "java", "c++", "c#", "c", "javascript", "typescript", "golang", "rust", "kotlin",
    "swift", "ruby", "php", "scala", "r", "matlab", "perl", "bash", "shell",

    # Web Development
    "react", "vue", "angular", "node.js", "express", "django", "flask", "fastapi", "spring",
    "asp.net", "html", "css", "sass", "webpack", "jest", "cypress", "rest", "graphql",

    # Data & ML
    "tensorflow", "pytorch", "scikit-learn", "keras", "xgboost", "lightgbm", "catboost",
    "pandas", "numpy", "scipy", "matplotlib", "seaborn", "plotly", "spark", "hadoop",
    "sql", "nosql", "mongodb", "postgresql", "mysql", "redis", "elasticsearch",
    "data analysis", "data science", "ml", "machine learning", "deep learning", "nlp", "computer vision",
    "cv", "opencv", "cv2", "image processing", "nlp", "rag", "llm", "embedding",

    # Cloud & DevOps
    "aws", "azure", "gcp", "google cloud", "docker", "kubernetes", "jenkins", "ci/cd",
    "git", "github", "gitlab", "terraform", "ansible", "prometheus", "grafana", "ecs",
    "lambda", "s3", "dynamodb", "ec2", "rds", "cloudwatch",

    # Tools & Platforms
    "tableau", "powerbi", "excel", "jira", "confluence", "linux", "windows", "macos",
    "vim", "vscode", "jupyter", "colab", "kaggle", "anaconda", "conda", "pip",

    # Methodologies & Soft Skills
    "agile", "scrum", "kanban", "waterfall", "devops", "microservices", "api design",
    "rest api", "database design", "system design", "oop", "functional programming",
    "testing", "debugging", "optimization", "performance tuning", "security",
}

COMMON_SKILLS = sorted(list(COMMON_SKILLS))

def extract_skills(text: str, skills_list: List[str] = COMMON_SKILLS) -> List[str]:
    """
    Extract skills from text using multiple strategies.
    """
    text_lower = text.lower()
    found = set()

    # Strategy 1: Exact substring matching
    for skill in skills_list:
        if skill in text_lower:
            # Validate: ensure it's a word boundary (not part of another word)
            pattern = r'\b' + re.escape(skill) + r'\b'
            if re.search(pattern, text_lower):
                found.add(skill)

    # Strategy 2: Token-based matching for multi-word skills
    for skill in skills_list:
        if ' ' in skill:  # multi-word skills
            if skill in text_lower:
                found.add(skill)

    # Strategy 3: NLP-based for missed skills
    if nlp:
        try:
            doc = nlp(text[:5000])  # limit text to avoid slowness
            for token in doc:
                t = token.text.lower()
                if t in skills_list:
                    found.add(t)
        except:
            pass

    return sorted(list(found))

def extract_skills_from_jd(jd_text: str) -> List[str]:
    """Extract and rank skills from job description."""
    skills = extract_skills(jd_text)
    return skills

print("✓ Skills extraction module loaded!")


✓ Skills extraction module loaded!


In [4]:
# ======= Cell 4: Robust Document Parsing =======
EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
PHONE_RE = re.compile(r'(\+?\d[\d\-\s()]{7,}\d)')

def extract_text_from_pdf(file_obj) -> str:
    """Extract text from PDF with error handling."""
    try:
        text_chunks = []
        with pdfplumber.open(file_obj) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    text_chunks.append(text)
        return "\n".join(text_chunks)
    except Exception as e:
        print(f"⚠ PDF parsing error: {e}")
        return ""

def extract_text_from_docx(file_obj) -> str:
    """Extract text from DOCX with error handling."""
    try:
        doc = docx.Document(file_obj)
        return "\n".join([p.text for p in doc.paragraphs])
    except Exception as e:
        print(f"⚠ DOCX parsing error: {e}")
        return ""

def extract_text_from_txt(file_obj) -> str:
    """Extract text from TXT with error handling."""
    try:
        if hasattr(file_obj, 'read'):
            content = file_obj.read()
            if isinstance(content, bytes):
                return content.decode('utf-8', errors='ignore')
            return content
        else:
            with open(file_obj, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
    except Exception as e:
        print(f"⚠ TXT parsing error: {e}")
        return ""

def extract_text_from_file(file_obj, filename: str) -> str:
    """Route to appropriate extractor based on file extension."""
    ext = filename.lower().split('.')[-1]

    try:
        file_obj.seek(0)
    except:
        pass

    if ext == "pdf":
        return extract_text_from_pdf(file_obj)
    elif ext in ["docx", "doc"]:
        return extract_text_from_docx(file_obj)
    elif ext == "txt":
        return extract_text_from_txt(file_obj)
    else:
        # Try as text
        return extract_text_from_txt(file_obj)

def clean_text(text: str) -> str:
    """Clean and normalize text."""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s\.\,-]', '', text)
    return text.strip()

print("✓ Document parsing module loaded!")


✓ Document parsing module loaded!


In [5]:
# ======= Cell 5: Resume Parsing & NER =======
def extract_name(text: str) -> str:
    """Extract candidate name from resume."""
    lines = [l.strip() for l in text.split('\n') if l.strip()]

    if not lines:
        return "Unknown"

    # Strategy 1: NLP PERSON entity at top
    if nlp:
        try:
            for line in lines[:8]:
                doc = nlp(line)
                for ent in doc.ents:
                    if ent.label_ == "PERSON":
                        return ent.text
        except:
            pass

    # Strategy 2: First short line
    for line in lines[:5]:
        if 3 <= len(line.split()) <= 4 and len(line) < 60:
            return line

    return lines[:50]

def extract_contacts(text: str) -> Dict[str, List[str]]:
    """Extract emails and phone numbers."""
    emails = list(set(EMAIL_RE.findall(text)))

    # Phone extraction with validation
    phones = []
    for match in PHONE_RE.findall(text):
        # Extract only digits
        digits = re.sub(r'\D', '', match)
        if 7 <= len(digits) <= 15:  # Reasonable phone length
            phones.append(digits)
    phones = list(set(phones))

    return {
        "emails": emails,
        "phones": phones
    }

def extract_education_info(text: str) -> Dict[str, Any]:
    """
    Extract education details with improved heuristics.
    """
    edu_keywords = [
        "bachelor", "master", "phd", "doctorate", "degree",
        "b.tech", "m.tech", "b.sc", "m.sc", "mba", "b.a", "m.a",
        "diploma", "certificate", "bootcamp", "associate"
    ]

    lines = [l.strip() for l in text.split('\n') if l.strip()]

    education_lines = []
    for line in lines:
        if any(keyword in line.lower() for keyword in edu_keywords):
            if len(line) < 150:  # Reasonable length
                education_lines.append(line)

    # Try to find education section
    if not education_lines:
        for i, line in enumerate(lines):
            if "education" in line.lower():
                # Take next 10 lines as education
                education_lines = lines[i+1:min(i+10, len(lines))]
                break

    return {
        "education_lines": education_lines[:5],  # Top 5
        "has_higher_ed": any(k in text.lower() for k in ["bachelor", "master", "phd", "b.tech", "m.tech"])
    }

def extract_experience_info(text: str) -> Dict[str, Any]:
    """
    Extract experience details with improved heuristics.
    """
    exp_keywords = [
        "experience", "work experience", "employment", "professional",
        "engineer", "developer", "intern", "analyst", "manager",
        "specialist", "consultant", "researcher", "worked", "responsible"
    ]

    lines = [l.strip() for l in text.split('\n') if l.strip()]
    experience_lines = []

    # Find experience section
    for i, line in enumerate(lines):
        if any(keyword in line.lower() for keyword in exp_keywords):
            # Collect lines until next section
            j = i + 1
            while j < len(lines) and not any(sec in lines[j].lower() for sec in ["education", "skills", "projects"]):
                if lines[j].strip():
                    experience_lines.append(lines[j])
                j += 1
                if len(experience_lines) >= 15:  # Limit
                    break

    # Extract years of experience
    years_match = re.findall(r'(\d+)\s*\+?\s*(?:years|yrs)', text.lower())
    years_of_exp = max([int(y) for y in years_match]) if years_match else 0

    return {
        "experience_lines": experience_lines[:10],
        "years_of_experience": years_of_exp
    }

def parse_resume(text: str, filename: str = "") -> Dict[str, Any]:
    """
    Complete resume parsing with all information extraction.
    """
    text = clean_text(text) if text else ""

    name = extract_name(text)
    contacts = extract_contacts(text)
    edu_info = extract_education_info(text)
    exp_info = extract_experience_info(text)
    skills = extract_skills(text)

    # Create summary for embedding
    summary_parts = [
        name,
        " ".join(contacts.get("emails", [])),
        " ".join(edu_info.get("education_lines", [])),
        " ".join(exp_info.get("experience_lines", [])),
        " ".join(skills),
        text[:1500]  # First part of resume
    ]
    summary_text = " ".join(summary_parts)

    return {
        "filename": filename,
        "name": name,
        "contacts": contacts,
        "education": edu_info,
        "experience": exp_info,
        "skills": skills,
        "raw_text": text,
        "summary_text": summary_text,
        "years_of_experience": exp_info.get("years_of_experience", 0)
    }

print("✓ Resume parsing module loaded!")


✓ Resume parsing module loaded!


In [6]:
# ======= Cell 6: Embedding & Similarity Functions =======
def embed_text(text: str, max_length: int = 512) -> np.ndarray:
    """
    Create embedding for a single text.
    Truncate if too long to avoid memory issues.
    """
    text = text[:max_length]
    emb = embedder.encode([text], convert_to_numpy=True, show_progress_bar=False)
    # Normalize
    emb = emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-10)
    return emb

def embed_texts(texts: List[str]) -> np.ndarray:
    """Create embeddings for multiple texts."""
    texts = [t[:512] for t in texts]
    embs = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
    # Normalize
    embs = embs / (np.linalg.norm(embs, axis=1, keepdims=True) + 1e-10)
    return embs

def semantic_similarity(text_a: str, text_b: str) -> float:
    """Compute cosine similarity between two texts."""
    try:
        embs = embed_texts([text_a, text_b])
        sim = float(np.dot(embs, embs))
        # Normalize from [-1, 1] to [0, 1]
        return max(0.0, (sim + 1) / 2)
    except Exception as e:
        print(f"⚠ Similarity error: {e}")
        return 0.0

def batch_semantic_similarity(reference_text: str, comparison_texts: List[str]) -> List[float]:
    """
    Efficiently compute similarity between one reference and multiple texts.
    """
    try:
        ref_emb = embed_text(reference_text)
        compare_embs = embed_texts(comparison_texts)

        similarities = []
        for emb in compare_embs:
            sim = float(np.dot(ref_emb, emb))
            similarities.append(max(0.0, (sim + 1) / 2))

        return similarities
    except Exception as e:
        print(f"⚠ Batch similarity error: {e}")
        return [0.0] * len(comparison_texts)

print("✓ Embedding module loaded!")


✓ Embedding module loaded!


In [7]:
# ======= Cell 7: Evaluation Metrics =======
def calculate_jaccard_similarity(set_a: List[str], set_b: List[str]) -> float:
    """
    Jaccard similarity between two sets (skills, keywords, etc.).
    Intersection / Union
    """
    if not set_a and not set_b:
        return 1.0

    set_a = set([str(x).lower() for x in set_a])
    set_b = set([str(x).lower() for x in set_b])

    intersection = len(set_a & set_b)
    union = len(set_a | set_b)

    return intersection / union if union > 0 else 0.0

def evaluate_metric_skills_match(jd_skills: List[str], resume_skills: List[str]) -> float:
    """
    Precision: % of JD skills found in resume (weighted towards JD requirements).
    """
    if not jd_skills:
        return 0.0

    matched = sum(1 for skill in jd_skills if any(
        skill.lower() in resume_skill.lower() or resume_skill.lower() in skill.lower()
        for resume_skill in resume_skills
    ))

    score = matched / len(jd_skills)
    return min(1.0, score)

def evaluate_metric_skills_recall(jd_skills: List[str], resume_skills: List[str]) -> float:
    """
    Recall: % of resume skills that match JD (penalizes over-qualification).
    """
    if not resume_skills:
        return 1.0

    matched = sum(1 for resume_skill in resume_skills if any(
        resume_skill.lower() in jd_skill.lower() or jd_skill.lower() in resume_skill.lower()
        for jd_skill in jd_skills
    ))

    score = matched / len(resume_skills)
    return score

def evaluate_metric_relevance_ranking(jd_text: str, resume_summaries: List[str]) -> List[float]:
    """
    Rank resumes by semantic relevance to JD using embeddings.
    Returns correlation score.
    """
    if not resume_summaries:
        return [0.0]

    similarities = batch_semantic_similarity(jd_text, resume_summaries)
    return similarities

print("✓ Evaluation metrics module loaded!")


✓ Evaluation metrics module loaded!


In [8]:
# ======= Cell 8: Multi-Factor Scoring System =======
DEFAULT_WEIGHTS = {
    "skills_match": 0.40,           # Precision: How many JD skills in resume?
    "experience_relevance": 0.25,   # Semantic match between JD and experience
    "education_relevance": 0.15,    # Does resume have required education?
    "years_of_experience": 0.10,    # Bonus for experience level
    "overall_fit": 0.10             # HuggingFace LLM soft evaluation (optional)
}

def score_candidate(
    jd_text: str,
    jd_skills: List[str],
    resume_parsed: Dict[str, Any],
    weights: Dict[str, float] = DEFAULT_WEIGHTS,
    use_llm: bool = True
) -> Dict[str, Any]:
    """
    Compute multi-factor score for a candidate resume.

    Returns:
        breakdown: Dict with individual scores and final score (0-100)
    """

    resume_skills = resume_parsed.get("skills", [])
    resume_text = resume_parsed.get("raw_text", "")
    experience_text = " ".join(resume_parsed.get("experience", {}).get("experience_lines", []))
    education_lines = resume_parsed.get("education", {}).get("education_lines", [])
    years_exp = resume_parsed.get("years_of_experience", 0)

    # ===== FACTOR 1: Skills Match (Precision) =====
    skills_match_score = evaluate_metric_skills_match(jd_skills, resume_skills)

    # ===== FACTOR 2: Experience Relevance (Semantic) =====
    if experience_text.strip():
        exp_relevance_score = semantic_similarity(jd_text, experience_text)
    else:
        exp_relevance_score = semantic_similarity(jd_text, resume_text)

    # ===== FACTOR 3: Education Relevance =====
    edu_has_higher_ed = resume_parsed.get("education", {}).get("has_higher_ed", False)
    if education_lines:
        edu_text = " ".join(education_lines)
        edu_relevance_score = semantic_similarity(jd_text, edu_text)
        edu_relevance_score = edu_relevance_score * 0.7 + (0.3 if edu_has_higher_ed else 0.0)
    else:
        edu_relevance_score = 0.3 if edu_has_higher_ed else 0.0

    # ===== FACTOR 4: Years of Experience =====
    # Normalize to 0-1 (assuming 10 years is max for score)
    years_exp_score = min(1.0, years_exp / 10.0) if years_exp > 0 else 0.2

    # ===== FACTOR 5: Overall Fit (HuggingFace LLM) =====
    llm_score = 0.0
    if use_llm and HF_TOKEN:
        llm_score = get_llm_fit_score(jd_text, resume_text)

    # ===== Weighted Aggregation =====
    final_score = (
        skills_match_score * weights["skills_match"]
        + exp_relevance_score * weights["experience_relevance"]
        + edu_relevance_score * weights["education_relevance"]
        + years_exp_score * weights["years_of_experience"]
        + llm_score * weights["overall_fit"]
    )

    # Normalize to 0-100
    final_score_100 = round(final_score * 100, 2)

    breakdown = {
        "skills_match_score": round(skills_match_score * 100, 2),
        "experience_relevance_score": round(exp_relevance_score * 100, 2),
        "education_relevance_score": round(edu_relevance_score * 100, 2),
        "years_of_experience_score": round(years_exp_score * 100, 2),
        "llm_fit_score": round(llm_score * 100, 2),
        "final_score": final_score_100,
        "weights_used": weights
    }

    return breakdown

def get_llm_fit_score(jd_text: str, resume_text: str) -> float:
    """
    Use HuggingFace Inference API to get LLM evaluation.
    Returns a score 0-1.
    """
    if not HF_TOKEN:
        return 0.0

    # Truncate texts to avoid token limits
    jd_short = jd_text[:800]
    resume_short = resume_text[:800]

    prompt = f"""Rate how well this candidate matches the job description (0-1 scale).

Job Description (key requirements):
{jd_short}

Candidate Resume Summary:
{resume_short}

Respond ONLY with a JSON: {{"score": 0.75}}
"""

    try:
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": 50,
                "temperature": 0.3,
                "top_p": 0.9
            }
        }

        response = requests.post(
            f"https://api-inference.huggingface.co/models/{HF_MODEL}",
            headers=headers,
            json=payload,
            timeout=30
        )

        if response.status_code == 200:
            result = response.json()

            # Extract response text
            if isinstance(result, list) and len(result) > 0:
                text = result.get('generated_text', '')
            else:
                text = result.get('generated_text', '')

            # Parse JSON score
            score_match = re.search(r'"score"\s*:\s*([0-1](?:\.\d+)?)', text)
            if score_match:
                return float(score_match.group(1))

        return 0.0

    except Exception as e:
        print(f"⚠ LLM evaluation failed: {e}")
        return 0.0

print("✓ Scoring module loaded!")


✓ Scoring module loaded!


In [9]:
# ======= Cell 9: Resume Processing & Ranking =======
def process_uploaded_files(files: List[Any]) -> Tuple[List[Dict[str, Any]], List[str]]:
    """
    Parse uploaded resume files.

    Returns:
        parsed_resumes: List of parsed resume dicts
        errors: List of error messages
    """
    parsed_resumes = []
    errors = []

    if not files:
        return [], ["No files uploaded"]

    for file_obj in files:
        try:
            # Extract filename
            if hasattr(file_obj, 'name'):
                filename = file_obj.name
            elif isinstance(file_obj, dict) and 'name' in file_obj:
                filename = file_obj['name']
            else:
                filename = "resume"

            # Extract text
            try:
                text = extract_text_from_file(file_obj, filename)
            except Exception as e:
                errors.append(f"{filename}: Failed to extract text - {str(e)}")
                continue

            if not text or len(text.strip()) < 50:
                errors.append(f"{filename}: Document is empty or too short")
                continue

            # Parse resume
            parsed = parse_resume(text, filename=filename)
            parsed_resumes.append(parsed)

        except Exception as e:
            errors.append(f"{filename}: {str(e)}")

    return parsed_resumes, errors

def rank_candidates(
    jd_text: str,
    jd_skills: List[str],
    parsed_resumes: List[Dict[str, Any]],
    top_k: int = 5,
    use_llm: bool = True
) -> Tuple[List[Dict[str, Any]], List[str]]:
    """
    Score and rank candidates.

    Returns:
        ranked_results: List of top-k candidates with scores
        warnings: List of warning messages
    """
    warnings_list = []

    if not parsed_resumes:
        return [], ["No resumes to rank"]

    if not jd_text.strip():
        warnings_list.append("Job description is empty")

    # Score all candidates
    scored_candidates = []
    for resume in parsed_resumes:
        breakdown = score_candidate(
            jd_text,
            jd_skills,
            resume,
            use_llm=use_llm
        )

        scored_candidates.append({
            "resume": resume,
            "breakdown": breakdown
        })

    # Sort by final score descending
    scored_candidates.sort(key=lambda x: x["breakdown"]["final_score"], reverse=True)

    # Prepare results
    results = []
    for rank, candidate in enumerate(scored_candidates[:top_k], 1):
        resume = candidate["resume"]
        breakdown = candidate["breakdown"]

        result = {
            "rank": rank,
            "filename": resume.get("filename", "Unknown"),
            "name": resume.get("name", "Unknown"),
            "email": resume.get("contacts", {}).get("emails", [None]),
            "phone": resume.get("contacts", {}).get("phones", [None]),
            "years_of_experience": resume.get("years_of_experience", 0),
            "skills_found": resume.get("skills", []),
            "final_score": breakdown["final_score"],
            "score_breakdown": {
                "skills_match": breakdown["skills_match_score"],
                "experience_relevance": breakdown["experience_relevance_score"],
                "education_relevance": breakdown["education_relevance_score"],
                "years_of_experience": breakdown["years_of_experience_score"],
                "llm_fit": breakdown["llm_fit_score"]
            },
            "education": resume.get("education", {}).get("education_lines", []),
            "experience_summary": resume.get("experience", {}).get("experience_lines", [])[:3],
            "matched_skills": [
                s for s in resume.get("skills", []) if any(
                    jskill.lower() in s.lower() or s.lower() in jskill.lower()
                    for jskill in jd_skills
                )
            ]
        }
        results.append(result)

    return results, warnings_list

print("✓ Processing & ranking module loaded!")


✓ Processing & ranking module loaded!


In [10]:
# ======= Cell 10: Gradio Web Interface =======
def create_html_report(results: List[Dict[str, Any]], jd_skills: List[str]) -> str:
    """Create beautiful HTML report of ranked candidates."""

    if not results:
        return "<div style='padding:20px;color:red;'>No candidates to display</div>"

    html_parts = [
        "<style>",
        """
        .candidate-card {
            border: 1px solid #ddd;
            border-radius: 8px;
            padding: 20px;
            margin: 15px 0;
            background: #f9f9f9;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .rank-badge {
            display: inline-block;
            background: #2196F3;
            color: white;
            padding: 8px 12px;
            border-radius: 50%;
            font-weight: bold;
            font-size: 18px;
            margin-right: 10px;
        }
        .score-large {
            font-size: 32px;
            font-weight: bold;
            color: #2196F3;
            margin: 10px 0;
        }
        .score-breakdown {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 10px;
            margin: 15px 0;
        }
        .score-item {
            background: white;
            padding: 12px;
            border-radius: 6px;
            border-left: 4px solid #2196F3;
        }
        .score-label {
            font-size: 12px;
            color: #666;
            font-weight: bold;
        }
        .score-value {
            font-size: 20px;
            font-weight: bold;
            color: #2196F3;
        }
        .skills-container {
            margin: 15px 0;
        }
        .skill-badge {
            display: inline-block;
            background: #4CAF50;
            color: white;
            padding: 5px 10px;
            margin: 3px;
            border-radius: 15px;
            font-size: 12px;
        }
        .skill-badge.matched {
            background: #2196F3;
        }
        .contact-info {
            font-size: 14px;
            color: #666;
            margin: 10px 0;
        }
        .section-title {
            font-size: 14px;
            font-weight: bold;
            color: #333;
            margin-top: 12px;
            margin-bottom: 8px;
        }
        """,
        "</style>"
    ]

    for result in results:
        matched_skills = result.get("matched_skills", [])
        all_skills = result.get("skills_found", [])

        html_parts.append(f"""
        <div class="candidate-card">
            <div style="display: flex; align-items: center;">
                <span class="rank-badge">#{result['rank']}</span>
                <div>
                    <h2 style="margin:0;">{html.escape(result['name'])}</h2>
                    <p style="margin:0;color:#666;">{html.escape(result['filename'])}</p>
                </div>
            </div>

            <div class="score-large">{result['final_score']}/100</div>

            <div class="contact-info">
                {f"📧 {html.escape(result['email'] or 'N/A')}" if result.get('email') else ""}
                {f" | 📱 {html.escape(result['phone'] or 'N/A')}" if result.get('phone') else ""}
                {f" | 💼 {result['years_of_experience']} years exp" if result.get('years_of_experience') else ""}
            </div>

            <div class="score-breakdown">
                <div class="score-item">
                    <div class="score-label">Skills Match</div>
                    <div class="score-value">{result['score_breakdown']['skills_match']:.1f}%</div>
                </div>
                <div class="score-item">
                    <div class="score-label">Experience</div>
                    <div class="score-value">{result['score_breakdown']['experience_relevance']:.1f}%</div>
                </div>
                <div class="score-item">
                    <div class="score-label">Education</div>
                    <div class="score-value">{result['score_breakdown']['education_relevance']:.1f}%</div>
                </div>
                <div class="score-item">
                    <div class="score-label">Experience Level</div>
                    <div class="score-value">{result['score_breakdown']['years_of_experience']:.1f}%</div>
                </div>
                {"<div class='score-item'><div class='score-label'>LLM Fit</div><div class='score-value'>" + f"{result['score_breakdown']['llm_fit']:.1f}%" + "</div></div>" if result['score_breakdown']['llm_fit'] > 0 else ""}
            </div>

            <div class="skills-container">
                <div class="section-title">Matched Skills ({len(matched_skills)}/{len(jd_skills)} required):</div>
                {' '.join([f'<span class="skill-badge matched">{html.escape(s)}</span>' for s in matched_skills[:8]])}
            </div>

            <div class="skills-container">
                <div class="section-title">All Skills Found ({len(all_skills)}):</div>
                {' '.join([f'<span class="skill-badge">{html.escape(s)}</span>' for s in all_skills[:12]])}
            </div>

            {"<div class='section-title'>Education:</div><ul style='margin:8px 0;'>" + "".join([f"<li>{html.escape(e)}</li>" for e in result['education'][:3]]) + "</ul>" if result.get('education') else ""}

            {"<div class='section-title'>Recent Experience:</div><ul style='margin:8px 0;'>" + "".join([f"<li>{html.escape(e)}</li>" for e in result['experience_summary']]) + "</ul>" if result.get('experience_summary') else ""}
        </div>
        """)

    return "".join(html_parts)

# Gradio Interface
with gr.Blocks(title="Resume Ranking Application", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 Resume Ranking Application")
    gr.Markdown("**AI-powered recruitment system** — Upload resumes and a job description to automatically rank candidates")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Input Data")

            resume_files = gr.File(
                label="📂 Upload Resumes (PDF, DOCX, TXT)",
                file_count="multiple",
                file_types=[".pdf", ".docx", ".doc", ".txt"]
            )

            jd_text = gr.Textbox(
                label="📋 Job Description",
                placeholder="Paste the job description here...",
                lines=10
            )

            custom_skills = gr.Textbox(
                label="⭐ Optional: Priority Skills (comma-separated)",
                placeholder="e.g., python, aws, kubernetes",
                lines=2
            )

            with gr.Row():
                top_k_slider = gr.Slider(
                    minimum=1,
                    maximum=20,
                    value=5,
                    step=1,
                    label="Top candidates to show"
                )

                use_llm_checkbox = gr.Checkbox(
                    label="Use HuggingFace LLM evaluation",
                    value=True
                )

            rank_button = gr.Button("🚀 Rank Candidates", variant="primary", scale=2)

        with gr.Column(scale=1):
            gr.Markdown("### Results")
            output_html = gr.HTML(label="Ranked Candidates")

            with gr.Row():
                download_json = gr.JSON(label="Detailed Results (JSON)", visible=False)

    # Error/Warning display
    error_message = gr.Markdown(visible=False)

    def on_rank_click(files, jd, top_k, custom_skills_text, use_llm):
        # Reset error
        error_html = ""

        # Validate inputs
        if not jd or len(jd.strip()) < 50:
            return "", {}, True, "⚠️ **Please enter a job description (at least 50 characters)**"

        if not files or len(files) == 0:
            return "", {}, True, "⚠️ **Please upload at least one resume**"

        try:
            # Process files
            parsed_resumes, file_errors = process_uploaded_files(files)

            if file_errors:
                error_html = "\n".join([f"- {e}" for e in file_errors])
                error_html = f"**⚠️ File Processing Issues:**\n{error_html}\n\n"

            if not parsed_resumes:
                return "", {}, True, f"{error_html}**❌ No valid resumes were parsed**"

            # Extract JD skills
            jd_skills = extract_skills_from_jd(jd)

            # Add custom skills
            if custom_skills_text:
                custom = [s.strip().lower() for s in custom_skills_text.split(",") if s.strip()]
                jd_skills = sorted(list(set(jd_skills + custom)))

            # Rank candidates
            results, warnings = rank_candidates(
                jd,
                jd_skills,
                parsed_resumes,
                top_k=int(top_k),
                use_llm=use_llm
            )

            if not results:
                return "", {}, True, f"{error_html}**❌ No candidates to rank**"

            # Generate HTML report
            html_report = create_html_report(results, jd_skills)

            # Prepare JSON
            results_json = {
                "timestamp": datetime.now().isoformat(),
                "jd_skills_extracted": jd_skills,
                "total_candidates_processed": len(parsed_resumes),
                "top_candidates": results
            }

            success_msg = f"**✅ Successfully ranked {len(parsed_resumes)} resumes**\n"
            if warnings:
                success_msg += "\n**Notes:**\n" + "\n".join([f"- {w}" for w in warnings])

            return html_report, results_json, False, success_msg

        except Exception as e:
            import traceback
            error_trace = traceback.format_exc()
            return "", {}, True, f"**❌ Error during ranking:**\n```\n{str(e)}\n```"

    rank_button.click(
        fn=on_rank_click,
        inputs=[resume_files, jd_text, top_k_slider, custom_skills, use_llm_checkbox],
        outputs=[output_html, download_json, error_message, error_message]
    )

    # Add examples section
    gr.Markdown("---")
    gr.Markdown("### 📝 Example Job Description")
    gr.Markdown("""
    ```
    Senior Machine Learning Engineer

    Requirements:
    - 5+ years of ML/AI experience
    - Expert in Python, TensorFlow, PyTorch
    - Experience with NLP and LLMs
    - AWS or GCP cloud expertise
    - Strong system design skills
    - Experience deploying ML models to production

    Nice to have:
    - Docker & Kubernetes
    - Experience with RAG systems
    - Publication in top-tier conferences
    ```
    """)

print("✓ Gradio interface created!")


✓ Gradio interface created!


In [11]:
# ======= Cell 11: Launch Gradio App =======
print("\n" + "="*60)
print("🚀 LAUNCHING RESUME RANKING APPLICATION")
print("="*60 + "\n")

if not HF_TOKEN:
    print("⚠️  WARNING: HF_TOKEN not set!")
    print("    LLM evaluation will be disabled.")
    print("    To enable: Set HF_TOKEN in Cell 2\n")
else:
    print("✅ HuggingFace token detected - LLM evaluation enabled\n")

demo.launch(share=True, inbrowser=False)

print("\n" + "="*60)
print("📌 KEEP THIS CELL RUNNING TO KEEP THE APP ACTIVE")
print("="*60)



🚀 LAUNCHING RESUME RANKING APPLICATION

✅ HuggingFace token detected - LLM evaluation enabled

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://df812f3f82802052e1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



📌 KEEP THIS CELL RUNNING TO KEEP THE APP ACTIVE
