In [29]:
!pip install gradio pdfplumber python-docx scikit-learn




In [30]:
import re
from pathlib import Path

import pdfplumber
import docx
import gradio as gr

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [31]:
def extract_text_from_file(file_path: str) -> str:
    """
    Extract plain text from a PDF, DOCX, TXT, or Markdown file.

    The format is selected from the file extension (case-insensitive).

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text:
        - .pdf: the text of every page, each followed by a newline
          (a page with no extractable text contributes only the newline);
        - .docx: the paragraph texts joined with newlines;
        - .txt / .md: the decoded file contents.

    Raises:
        ValueError: If the extension is not .pdf, .docx, .txt, or .md.
    """
    ext = Path(file_path).suffix.lower()

    if ext == ".pdf":
        with pdfplumber.open(file_path) as pdf:
            # `or ""` keeps the concatenation safe when a page yields no text.
            return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

    if ext == ".docx":
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    if ext in (".txt", ".md"):
        # "utf-8-sig" reads ordinary UTF-8 unchanged but also drops a leading
        # byte-order mark, which would otherwise leak into the text as "\ufeff".
        # Undecodable bytes are still skipped (errors="ignore"), as before.
        with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
            return f.read()

    raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")


In [32]:
def clean_text(text: str) -> str:
    """Lowercase `text`, squeeze each whitespace run into one space, and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text.lower())
    return collapsed.strip()


def extract_keywords_from_jd(jd: str):
    """
    Derive a keyword list from a job description using a plain word filter.

    The text is normalised with `clean_text`, every run of three or more ASCII
    letters delimited by word boundaries is taken as a candidate, and candidates
    found in a small built-in stop list are discarded.

    Returns:
        The distinct remaining words as an alphabetically sorted list.
    """
    # Generic filler words plus job-posting boilerplate that carries no signal.
    ignored = {
        "with", "from", "this", "that", "have", "will", "your", "about",
        "which", "such", "their", "they", "them", "like", "some", "more",
        "into", "able", "must", "should", "role", "team", "work", "tasks",
        "responsible", "responsibilities", "requirements", "requirement",
        "skills", "skill", "experience", "years", "good", "strong",
        "knowledge", "and", "for", "the", "you", "our", "job", "description"
    }

    candidates = re.findall(r"\b[a-zA-Z]{3,}\b", clean_text(jd))
    distinct = {token for token in candidates if token not in ignored}
    return sorted(distinct)


def tfidf_similarity_scores(jd: str, resume_texts: list):
    """
    Compute TF-IDF cosine similarity between a job description and each resume.

    The JD and all resumes are normalised with `clean_text` and vectorised
    together, so the vocabulary and IDF weights come from the whole batch.

    Args:
        jd: Job-description text.
        resume_texts: Resume texts, one string per resume.

    Returns:
        One float per resume, in the same order as `resume_texts`: the cosine
        similarity to the JD multiplied by 100. An empty `resume_texts` gives
        an empty list; if the vectoriser cannot build any vocabulary, every
        score is 0.0.
    """
    if not resume_texts:
        # Nothing to compare against; avoids handing scikit-learn an empty matrix.
        return []

    corpus_clean = [clean_text(t) for t in [jd, *resume_texts]]

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(corpus_clean)
    except ValueError:
        # fit_transform raises ValueError when no document yields a token
        # ("empty vocabulary"); with no terms at all there is no similarity.
        return [0.0] * len(resume_texts)

    # Row 0 is the JD, the remaining rows are the resumes.
    sims = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]  # 1 x N -> (N,)
    return [float(s * 100) for s in sims]


In [33]:
def rank_resumes(job_description: str, files):
    """
    Score every uploaded resume against a job description and build a Markdown report.

    Each readable resume gets:
    - a TF-IDF cosine similarity to the JD (0-100),
    - a keyword coverage percentage (share of JD keywords that occur as whole
      words in the resume),
    - a final score of 0.6 * similarity + 0.4 * coverage.

    Resumes are listed from highest to lowest final score.

    Args:
        job_description: Raw job-description text.
        files: Uploaded files. Each item is either a path (str / Path) or an
            object whose `.name` attribute is the path on disk.

    Returns:
        A Markdown string with the ranking table, per-candidate highlights and,
        if any, the list of files that could not be used. A one-line message is
        returned instead when the JD is blank, no files were given, or no
        resume text could be read.
    """
    # NOTE(review): `rank_resumes` is defined again in a later cell; after
    # "Run All" that later definition is the one in effect. Consider keeping
    # only one of the two cells.
    if not job_description.strip():
        return "❗ Please paste a Job Description first."

    if not files:
        return "❗ Please upload at least one resume (PDF, DOCX, or TXT)."

    # JD keywords feed the coverage metric and the per-candidate explanation.
    jd_keywords = extract_keywords_from_jd(job_description)

    resume_texts = []
    resume_names = []
    skipped = []  # (file name, reason) for every upload that could not be scored

    for f in files:
        # Accept plain paths as well as file-like objects exposing `.name`.
        file_path = str(f) if isinstance(f, (str, Path)) else f.name
        display_name = Path(file_path).name
        try:
            text = extract_text_from_file(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole ranking.
            print(f"Error reading {file_path}: {e}")
            skipped.append((display_name, str(e)))
            continue
        if not text.strip():
            skipped.append((display_name, "no extractable text"))
            continue
        resume_texts.append(text)
        resume_names.append(display_name)

    if not resume_texts:
        return "⚠️ Could not read any resume text. Make sure files are text-based PDFs or DOCX."

    # TF-IDF similarity scores
    sim_scores = tfidf_similarity_scores(job_description, resume_texts)

    results = []
    for name, text, sim in zip(resume_names, resume_texts, sim_scores):
        # Tokenise the resume with the same pattern used for the JD keywords and
        # match whole words, so that e.g. "java" is not counted as present just
        # because the resume mentions "javascript".
        resume_words = set(re.findall(r"\b[a-zA-Z]{3,}\b", clean_text(text)))
        hits = [kw for kw in jd_keywords if kw in resume_words]
        coverage = round(100 * len(hits) / len(jd_keywords), 2) if jd_keywords else 0.0

        # Blend similarity + coverage (you can tweak weights)
        final_score = round(0.6 * sim + 0.4 * coverage, 2)

        results.append({
            "name": name,
            "sim_score": round(sim, 2),
            "kw_coverage": coverage,
            "final_score": final_score,
            "matched_keywords": hits[:20],  # show up to 20
        })

    # Sort by final_score desc
    results_sorted = sorted(results, key=lambda r: r["final_score"], reverse=True)

    # Build Markdown report
    md = "# 📊 Resume Screening Results\n\n"
    md += "| Rank | Resume | Final Score | JD Similarity | Keyword Coverage |\n"
    md += "|------|--------|-------------|---------------|------------------|\n"
    for i, r in enumerate(results_sorted, start=1):
        md += (
            f"| {i} | {r['name']} | {r['final_score']} | "
            f"{r['sim_score']} | {r['kw_coverage']}% |\n"
        )

    md += "\n---\n"
    md += "### 🔍 Per-candidate highlights\n"
    for i, r in enumerate(results_sorted, start=1):
        md += f"\n**{i}. {r['name']}**  \n"
        md += f"- Final score: `{r['final_score']}`  \n"
        md += f"- JD similarity (TF-IDF): `{r['sim_score']}`  \n"
        md += f"- Keyword coverage: `{r['kw_coverage']}%`  \n"
        if r["matched_keywords"]:
            md += f"- Sample matched keywords: `{', '.join(r['matched_keywords'])}`  \n"
        else:
            md += "- Sample matched keywords: _none detected_  \n"

    if skipped:
        # Tell the user which uploads are missing from the ranking and why.
        md += "\n### ⚠️ Skipped files\n"
        for skipped_name, reason in skipped:
            md += f"- {skipped_name}: {reason}\n"

    md += (
        "\n> ℹ️ This is a local heuristic model (TF-IDF + keywords). "
        "In a production system, you’d plug in OpenAI GPT / Claude / Gemini via LangChain "
        "for deeper semantic scoring and explanations."
    )

    return md


In [34]:
def rank_resumes(job_description: str, files):
    """
    Score every uploaded resume against a job description and build a Markdown report.

    Each readable resume gets:
    - a TF-IDF cosine similarity to the JD (0-100),
    - a keyword coverage percentage (share of JD keywords that occur as whole
      words in the resume),
    - a final score of 0.6 * similarity + 0.4 * coverage.

    Resumes are listed from highest to lowest final score.

    Args:
        job_description: Raw job-description text.
        files: Uploaded files. Each item is either a path (str / Path) or an
            object whose `.name` attribute is the path on disk.

    Returns:
        A Markdown string with the ranking table, per-candidate highlights and,
        if any, the list of files that could not be used. A one-line message is
        returned instead when the JD is blank, no files were given, or no
        resume text could be read.
    """
    # NOTE(review): an earlier cell also defines `rank_resumes`; this later
    # definition shadows it and is the one the UI calls after "Run All".
    # Consider keeping only one of the two cells.
    if not job_description.strip():
        return "❗ Please paste a Job Description first."

    if not files:
        return "❗ Please upload at least one resume (PDF, DOCX, or TXT)."

    # JD keywords feed the coverage metric and the per-candidate explanation.
    jd_keywords = extract_keywords_from_jd(job_description)

    resume_texts = []
    resume_names = []
    skipped = []  # (file name, reason) for every upload that could not be scored

    for f in files:
        # Accept plain paths as well as file-like objects exposing `.name`.
        file_path = str(f) if isinstance(f, (str, Path)) else f.name
        display_name = Path(file_path).name
        try:
            text = extract_text_from_file(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole ranking.
            print(f"Error reading {file_path}: {e}")
            skipped.append((display_name, str(e)))
            continue
        if not text.strip():
            skipped.append((display_name, "no extractable text"))
            continue
        resume_texts.append(text)
        resume_names.append(display_name)

    if not resume_texts:
        return "⚠️ Could not read any resume text. Make sure files are text-based PDFs or DOCX."

    # TF-IDF similarity scores
    sim_scores = tfidf_similarity_scores(job_description, resume_texts)

    results = []
    for name, text, sim in zip(resume_names, resume_texts, sim_scores):
        # Tokenise the resume with the same pattern used for the JD keywords and
        # match whole words, so that e.g. "java" is not counted as present just
        # because the resume mentions "javascript".
        resume_words = set(re.findall(r"\b[a-zA-Z]{3,}\b", clean_text(text)))
        hits = [kw for kw in jd_keywords if kw in resume_words]
        coverage = round(100 * len(hits) / len(jd_keywords), 2) if jd_keywords else 0.0

        # Blend similarity + coverage (you can tweak weights)
        final_score = round(0.6 * sim + 0.4 * coverage, 2)

        results.append({
            "name": name,
            "sim_score": round(sim, 2),
            "kw_coverage": coverage,
            "final_score": final_score,
            "matched_keywords": hits[:20],  # show up to 20
        })

    # Sort by final_score desc
    results_sorted = sorted(results, key=lambda r: r["final_score"], reverse=True)

    # Build Markdown report
    md = "# 📊 Resume Screening Results\n\n"
    md += "| Rank | Resume | Final Score | JD Similarity | Keyword Coverage |\n"
    md += "|------|--------|-------------|---------------|------------------|\n"
    for i, r in enumerate(results_sorted, start=1):
        md += (
            f"| {i} | {r['name']} | {r['final_score']} | "
            f"{r['sim_score']} | {r['kw_coverage']}% |\n"
        )

    md += "\n---\n"
    md += "### 🔍 Per-candidate highlights\n"
    for i, r in enumerate(results_sorted, start=1):
        md += f"\n**{i}. {r['name']}**  \n"
        md += f"- Final score: `{r['final_score']}`  \n"
        md += f"- JD similarity (TF-IDF): `{r['sim_score']}`  \n"
        md += f"- Keyword coverage: `{r['kw_coverage']}%`  \n"
        if r["matched_keywords"]:
            md += f"- Sample matched keywords: `{', '.join(r['matched_keywords'])}`  \n"
        else:
            md += "- Sample matched keywords: _none detected_  \n"

    if skipped:
        # Tell the user which uploads are missing from the ranking and why.
        md += "\n### ⚠️ Skipped files\n"
        for skipped_name, reason in skipped:
            md += f"- {skipped_name}: {reason}\n"

    md += (
        "\n> ℹ️ This is a local heuristic model (TF-IDF + keywords). "
        "In a production system, you’d plug in OpenAI GPT / Claude / Gemini via LangChain "
        "for deeper semantic scoring and explanations."
    )

    return md


In [35]:
# Static Markdown shown in the "Project & AI Stack" tab of the UI.
# It describes a conceptual target architecture; none of the listed services
# are called by this notebook.
PROJECT_INFO_MD = """
# 🧠 ResumeScreening Agent – Project Overview

This project demonstrates an **AI-powered Resume Screening Agent** that:

- Takes a **Job Description (JD)** as input  
- Accepts multiple **candidate resumes** (PDF/DOCX/TXT)  
- Ranks resumes based on **semantic similarity** + **keyword coverage**  
- Shows per-candidate highlights for explanations  

---

## 🔗 Target AI Stack (Conceptual Design)

Even though this notebook uses local Python + TF-IDF, the project is **designed** to plug into a modern AI stack:

### 🤖 AI Models
- **OpenAI GPT** – main reasoning engine (fit scoring, explanations)
- **Claude** – great for long resumes and multi-page JDs
- **Gemini** – ideal if heavily integrated with Google Workspace

### 🧰 Frameworks
- **LangChain** – tools, prompts, chains (JD parsing, scoring tools)
- **CrewAI** – multi-agent setup (Screening Agent + Interview Agent, etc.)
- **LlamaIndex** – building RAG over historical candidate data & job archives

### 🗂️ Vector Databases
- **Pinecone** – managed, scalable semantic search for resumes & JDs
- **ChromaDB / Weaviate** – flexible, self-hosted or managed options
- **FAISS** – fast local vector search for prototypes / small teams

### 🖥️ UI Layer
- **Streamlit** – internal recruiter dashboard (upload JDs/resumes, view rankings)
- **Gradio** – quick demo UI (what you're using now)
- **HTML/JS** – public-facing preview / portfolio website

### 🧾 Databases
- **Firebase / Supabase** – auth, logs, recruiter accounts, audit trails
- **Notion DB** – store candidate pipelines and interview notes as tables
- **Google Sheets** – simple ATS for smaller teams / college projects

### 🌐 APIs & Integrations
- **Google Calendar** – automatically schedule interviews for shortlisted candidates
- **Calendly** – send scheduling links to candidates
- **Notion / Sheets** – sync ranked candidate lists
- **Zapier** – trigger Slack/Email notifications from shortlist events
- **Shopify** – (optional) sync hires or roles for e-commerce teams

---

## 🚀 How to Present This as a Project

You can describe your system as:

1. **Frontend** – Streamlit/Gradio app where HR pastes JD and uploads resumes.  
2. **Backend** – Python + LangChain/CrewAI orchestrating:
   - JD parsing (LLM)
   - Resume embedding (vector DB)
   - Scoring & ranking (LLM + similarity)
3. **Data Layer** – Pinecone/ChromaDB/FAISS for embeddings, plus Notion DB / Sheets for pipeline.
4. **Automation Layer** – Zapier + Google Calendar + Calendly for interview scheduling.

This notebook gives you a **working core (ranking logic + UI)** that you can later extend with actual API keys and real LLM calls.
"""


In [36]:
def screening_interface(job_description, files):
    """Click handler for the UI: pass the JD text and uploaded files to `rank_resumes` and return its result."""
    report_md = rank_resumes(job_description, files)
    return report_md

# Two-tab Gradio app: the screening tool and a static project overview.
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 ResumeScreening Agent – Rank Resumes by Job Description")

    with gr.Tab("🔍 Resume Screening Tool"):
        gr.Markdown(
            "Paste a **Job Description** and upload **multiple resumes**. "
            "The agent will compute similarity scores and rank them."
        )

        jd_input = gr.Textbox(
            label="Job Description",
            lines=10,
            placeholder="Paste the full JD here..."
        )

        # gr.Files is used for the multi-file upload instead of gr.File(multiple=True).
        resumes_input = gr.Files(
            label="Upload resumes (PDF, DOCX, TXT)",
            file_types=[".pdf", ".docx", ".txt"]
        )

        output = gr.Markdown()
        btn = gr.Button("Rank Resumes")
        # The handler receives (JD text, uploaded files); its return value is rendered as Markdown.
        btn.click(fn=screening_interface, inputs=[jd_input, resumes_input], outputs=output)

    with gr.Tab("📚 Project & AI Stack"):
        gr.Markdown(PROJECT_INFO_MD)

demo.launch()


* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


