<a href="https://colab.research.google.com/github/Balavignesh-25/Resume_Parser/blob/main/Secure_AI_Resume_Job_Matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Update package lists
!apt-get update -qq
# Install Tesseract OCR and Poppler utilities for PDF text extraction
!apt-get install -y tesseract-ocr poppler-utils
# Install Python libraries: pytesseract for OCR, pillow for image processing, pdf2image for PDF to image conversion, groq for LLM, and gradio for UI
!pip install -q pytesseract pillow pdf2image groq gradio
# Install PyPDF2 for direct PDF text extraction
!pip install -q PyPDF2

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 54 not upgraded.
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import hashlib
import http.client
import json
import os
import re
import warnings
from datetime import datetime # Although imported, datetime is not used in the final Gradio app logic.
from urllib.parse import quote, urlencode

import gradio as gr
import pandas as pd
import plotly.graph_objects as go # Although imported, plotly is not used in the final Gradio app logic.
import PyPDF2
import pytesseract
import spacy
from groq import Groq
from pdf2image import convert_from_path

In [6]:
# Detect Google Colab by attempting to import its `userdata` helper:
# a successful import sets IN_COLAB to True, an ImportError sets it to False.
try:
    from google.colab import userdata
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

In [7]:
def get_secret(key_name: str) -> "str | None":
    """
    Securely fetch a secret from Colab userdata or environment variables.

    Args:
        key_name: Name of the secret / environment variable to look up.

    Returns:
        The secret value, or None when it is not configured anywhere.
        The caller is responsible for validating the result.
    """
    if IN_COLAB:
        try:
            value = userdata.get(key_name)
        except Exception:
            # userdata.get can raise (e.g. secret not defined, or notebook access
            # not granted). Treat that as "not found" so the caller's own
            # validation can report the problem with a helpful message.
            value = None
        if value:
            return value
    # Outside Colab, or when the Colab secret is unavailable, use the environment.
    return os.getenv(key_name)

# Load API keys for RapidAPI (JSearch) and Groq
RAPIDAPI_KEY = get_secret("RAPIDAPI_KEY")
GROQ_API_KEY = get_secret("GROQ_API_KEY")

# Fail fast and name exactly which keys are absent, so the user knows what to configure
_missing_keys = [
    name
    for name, value in (("RAPIDAPI_KEY", RAPIDAPI_KEY), ("GROQ_API_KEY", GROQ_API_KEY))
    if not value
]
if _missing_keys:
    raise EnvironmentError(
        f"❌ API Keys missing: {', '.join(_missing_keys)}\n"
        "• In Colab → Add them in Secrets and restart runtime\n"
        "• Locally → Export them as environment variables"
    )

print("✅ API keys loaded successfully")

‚úÖ API keys loaded successfully


In [8]:
# Load spaCy's "en_core_web_sm" English pipeline into the module-level name `nlp`.
# This requires the "en_core_web_sm" model package to be installed in the environment.
# NOTE(review): no visible cell references `nlp` (skill extraction is regex-based), so this
# load looks removable — confirm nothing else depends on it before deleting.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Variant spelling -> canonical skill name. Written grouped by canonical skill and
# flattened below; the resulting insertion order is ml, ai, sklearn, dl, nlp,
# reactjs, nodejs, postgresql, mysql, ci/cd.
SKILL_MAP = {
    variant: canonical
    for canonical, variants in (
        ("machine learning", ("ml", "ai", "sklearn")),
        ("deep learning", ("dl",)),
        ("nlp", ("nlp",)),
        ("react", ("reactjs",)),
        ("node", ("nodejs",)),
        ("sql", ("postgresql", "mysql")),
        ("devops", ("ci/cd",)),
    )
    for variant in variants
}

# Every canonical skill searched for in resumes: the explicitly listed names
# plus each canonical name that a variant in SKILL_MAP maps to.
CANONICAL_SKILLS = {
    "python", "java", "c++", "sql", "aws", "docker", "kubernetes",
    "deep learning", "machine learning", "nlp", "pandas", "numpy",
    "tensorflow", "pytorch", "fastapi", "flask", "gradio",
}.union(SKILL_MAP.values())

In [10]:
def extract_text_from_pdf(pdf_path, min_text_chars=200):
    """
    Extracts lower-cased text from a PDF file.

    Direct text extraction with PyPDF2 is tried first. If that yields fewer than
    `min_text_chars` non-whitespace-trimmed characters, the pages are rendered to
    images and OCR'd with Tesseract, and the OCR text is appended.

    Args:
        pdf_path: Path to the PDF file on disk.
        min_text_chars: Minimum length of directly extracted text below which the
            OCR fallback runs. Defaults to 200.

    Returns:
        The extracted text, lower-cased. Text from separate pages is separated by
        a newline so words at page boundaries are not fused together.

    Raises:
        Whatever pdf2image / pytesseract raise if the OCR fallback itself fails.
    """
    chunks = []

    try:
        # Attempt direct text extraction using PyPDF2
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                chunks.append(page.extract_text() or "")
    except Exception as exc:
        # Best effort: a failed direct read is not fatal because OCR is tried next,
        # but it is reported instead of being silently discarded. `Exception` (not a
        # bare except) lets KeyboardInterrupt / SystemExit propagate.
        warnings.warn(f"Direct PDF text extraction failed ({exc!r}); falling back to OCR.")

    # Too little text usually means a scanned PDF: OCR images of the pages instead
    if len("\n".join(chunks).strip()) < min_text_chars:
        for image in convert_from_path(pdf_path):
            chunks.append(pytesseract.image_to_string(image))

    return "\n".join(chunks).lower()

In [11]:
def extract_skills(text, canonical_skills=None, skill_map=None):
    """
    Counts mentions of known skills in the given text.

    Skill names are matched literally (regex metacharacters such as the "+" in
    "c++" are escaped), case-insensitively, and only as whole terms: a match may
    not be directly preceded or followed by a word character. Variant spellings
    are counted under their canonical skill name.

    Args:
        text: Text to search, e.g. the text of a resume.
        canonical_skills: Iterable of canonical skill names. Defaults to the
            module-level CANONICAL_SKILLS.
        skill_map: Mapping of variant spelling -> canonical name. Defaults to the
            module-level SKILL_MAP.

    Returns:
        A dict mapping each found canonical skill to its number of mentions.
        Empty when `text` is empty or None.
    """
    if not text:
        return {}
    if canonical_skills is None:
        canonical_skills = CANONICAL_SKILLS
    if skill_map is None:
        skill_map = SKILL_MAP

    def count_mentions(term):
        # Lookarounds are used instead of \b because \b cannot anchor next to a
        # non-word character, so it would never match after the "+" of "c++".
        pattern = rf"(?<!\w){re.escape(term)}(?!\w)"
        return len(re.findall(pattern, text, flags=re.IGNORECASE))

    found = {}

    # Find canonical skills directly (sorted so the output order is deterministic)
    for skill in sorted(canonical_skills):
        matches = count_mentions(skill)
        if matches:
            found[skill] = matches

    # Find skill variants and map them to canonical skills
    for variant, canonical in skill_map.items():
        if variant == canonical and variant in canonical_skills:
            # Identity mapping (e.g. "nlp" -> "nlp"): already counted above,
            # counting it again would double the total.
            continue
        matches = count_mentions(variant)
        if matches:
            found[canonical] = found.get(canonical, 0) + matches

    return found

In [12]:
# Cache for job search results to avoid redundant API calls.
# Keys are the MD5 hex digest of the (stripped) query; values are the job lists returned by the API.
JOB_CACHE = {}

def fetch_jobs(query, num_jobs):
    """
    Fetches job listings from the JSearch API based on a query.

    Successful responses are cached per query so repeated searches do not hit the
    API again. Failed requests are never cached, so a transient error does not
    make the query return nothing for the rest of the session.

    Args:
        query: Free-text job search query. Leading/trailing whitespace is ignored.
        num_jobs: Maximum number of jobs to return (converted with int()).

    Returns:
        A list of at most `num_jobs` job dicts. Empty when the query is blank,
        `num_jobs` is not positive, or the API answers with a non-200 status.

    Raises:
        ValueError: If a 200 response body is not valid JSON.
        OSError: On network-level failures (connection errors, timeouts).
    """
    query = (query or "").strip()
    limit = max(int(num_jobs), 0)  # a negative limit would otherwise slice from the end
    if not query or limit == 0:
        return []

    key = hashlib.md5(query.encode()).hexdigest()
    if key in JOB_CACHE:
        return JOB_CACHE[key][:limit]

    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "jsearch.p.rapidapi.com",
    }

    # Percent-encode the whole query (not only spaces) so characters such as
    # "&", "#" or "+" cannot corrupt the request; quote_via=quote keeps spaces as %20.
    endpoint = "/search?" + urlencode(
        {"query": query, "page": 1, "num_pages": 1}, quote_via=quote
    )

    conn = http.client.HTTPSConnection("jsearch.p.rapidapi.com", timeout=30)
    try:
        conn.request("GET", endpoint, headers=headers)
        response = conn.getresponse()
        status = response.status
        body = response.read()
    finally:
        # Always release the socket, including when the request raises
        conn.close()

    if status != 200:
        warnings.warn(f"JSearch request failed with HTTP {status}; returning no jobs.")
        return []

    # Parse the API response; "data" may be missing or null
    payload = json.loads(body)
    jobs = (payload.get("data") if isinstance(payload, dict) else None) or []
    JOB_CACHE[key] = jobs
    return jobs[:limit]

In [13]:
# System prompt for the LLM: act as an ATS scoring engine and return only valid JSON.
# The schema is spelled out so the result columns are predictable and line up with
# the keys produced by the fallback scoring.
SYSTEM_PROMPT = """
You are an ATS scoring engine.
Return ONLY valid JSON.
The JSON must be an array with one object per job, each with exactly these keys:
"rank" (integer, 1 = best match), "job_title" (string), "company" (string),
"match_score" (integer from 0 to 100), "matching_skills" (array of strings),
"missing_skills" (array of strings), "reason" (string), "recommendation" (string).
Do not wrap the JSON in markdown code fences and do not add any commentary.
"""


def _fallback_results(jobs):
    """Builds one neutral placeholder result (score 50) per job for when the LLM is unusable."""
    return [{
        "rank": i + 1,
        "job_title": job.get("job_title", ""),
        "company": job.get("employer_name", ""),
        "match_score": 50, # Default score in fallback mode
        "matching_skills": [],
        "missing_skills": [],
        "reason": "LLM unavailable – fallback scoring",
        "recommendation": "Consider"
    } for i, job in enumerate(jobs)]


def _parse_llm_json(content):
    """Parses an LLM reply into a list of results, tolerating code fences and wrapper objects."""
    raw = (content or "").strip()

    # Models frequently wrap JSON in ```json ... ``` fences despite instructions
    fenced = re.search(r"```(?:json)?\s*(.*?)```", raw, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        raw = fenced.group(1).strip()

    parsed = json.loads(raw)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        # A wrapper such as {"results": [...]}: unwrap the first list value;
        # otherwise treat the object as a single result row.
        for value in parsed.values():
            if isinstance(value, list):
                return value
        return [parsed]
    raise ValueError(f"Unexpected JSON payload of type {type(parsed).__name__}")


def analyze_with_llm(resume, jobs):
    """
    Analyzes a resume against a list of jobs using the Groq LLM to generate match
    scores and recommendations.

    Args:
        resume: Resume text to evaluate.
        jobs: List of job dicts (as returned by the job search).

    Returns:
        A list of result dicts, one per job. If anything goes wrong while creating
        the client, calling the model, or parsing its reply, a fallback list with a
        default score of 50 per job is returned instead. An empty `jobs` list
        yields an empty list without calling the LLM.
    """
    if not jobs:
        return []

    try:
        # Client creation is inside the try so that its failures also use the fallback
        client = Groq(api_key=GROQ_API_KEY)

        # Make a request to the Groq LLM
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile", # Specify the LLM model to use
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {
                    "role": "user",
                    # Jobs are sent as JSON rather than as a Python repr
                    "content": (
                        f"Resume:\n{resume}\n"
                        f"Jobs:\n{json.dumps(jobs, ensure_ascii=False, default=str)}"
                    ),
                },
            ],
            temperature=0.2, # Set a low temperature for more deterministic output
            max_tokens=3000 # Limit the response size
        )

        # Parse the JSON response from the LLM
        return _parse_llm_json(response.choices[0].message.content)

    except Exception as exc:
        # Fallback mode: report the failure, then provide generic scores.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        warnings.warn(f"LLM analysis failed ({exc!r}); using fallback scoring.")
        return _fallback_results(jobs)

In [14]:
def process_resume(pdf, query, num_jobs):
    """
    Main function to process a resume PDF, extract skills, fetch jobs, and analyze
    matches with an LLM.

    Args:
        pdf: The uploaded file from the Gradio File component: either an object
            exposing the path as `.name` or a plain filepath string.
        query: Job search query text.
        num_jobs: Maximum number of jobs to fetch and score.

    Returns:
        A tuple of (pandas DataFrame of job matches, JSON string of extracted
        skills and their mention counts).

    Raises:
        gr.Error: If no resume was uploaded or the query is blank, so the UI shows
            a readable message instead of a stack trace.
    """
    if pdf is None:
        raise gr.Error("Please upload a resume PDF before clicking Analyze.")
    if not (query or "").strip():
        raise gr.Error("Please enter a job search query.")

    # Accept both a file-like object with `.name` and a plain path string
    pdf_path = getattr(pdf, "name", pdf)

    # Extract text from the uploaded PDF resume
    text = extract_text_from_pdf(pdf_path)
    # Extract skills from the resume text
    skills = extract_skills(text)

    # Fetch job listings based on the user's query
    jobs = fetch_jobs(query, int(num_jobs))
    # Analyze the resume against the fetched jobs using the LLM
    results = analyze_with_llm(text[:5000], jobs) # Pass only the first 5000 characters of the resume to the LLM

    # Convert the LLM results into a pandas DataFrame
    df = pd.DataFrame(results)
    # Return the DataFrame and a JSON string of the extracted skills
    return df, json.dumps(skills, indent=2)

In [15]:
# Set up the Gradio interface for the Resume Job Matcher application
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Secure AI Resume Job Matcher")

    # Input components
    pdf = gr.File(label="Upload Resume PDF", file_types=[".pdf"]) # Only PDF uploads are accepted
    query = gr.Textbox(value="machine learning engineer", label="Job Search Query") # Text input for job search
    # step=1 keeps the slider on whole numbers; without an explicit step the
    # slider can produce fractional job counts.
    num_jobs = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Number of Jobs to Fetch")

    # Button to trigger the analysis
    btn = gr.Button("Analyze")

    # Output components
    table = gr.Dataframe(label="Job Match Results") # Displays job match results in a table
    # The values shown are mention counts per skill, not confidence scores
    skills = gr.Code(label="Extracted Skills (mention counts)", language="json")

    # Define the interaction: when the button is clicked, call process_resume and update outputs
    btn.click(process_resume, [pdf, query, num_jobs], [table, skills])

# Launch the Gradio application.
# NOTE(review): share=True publishes a public URL; anyone holding the link can upload files
# and consume the RapidAPI / Groq quota behind this app. Confirm that is intended.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://76d73c8f394ef02fe5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


