<a href="https://colab.research.google.com/github/Balavignesh-25/Resume_Parser/blob/main/RESUME_PARSER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!apt-get update -y # Refresh the apt package index before installing anything
!apt-get install -y poppler-utils tesseract-ocr # System packages: Poppler utilities (PDF rendering) and the Tesseract OCR engine

!pip install -q pdf2image pytesseract PyPDF2 spacy groq # Python libraries imported later in this notebook (note: versions are not pinned)
!python -m spacy download en_core_web_sm # Small English spaCy model, loaded later via spacy.load("en_core_web_sm")

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connected to cloud.r-pr                                                                               Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Fetched 255 kB in 1s (198 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not see

In [17]:
# Runtime detection: try the Colab-only import and record the outcome in IN_COLAB
try:
    from google.colab import userdata  # secrets accessor used by get_secret when in Colab
except ImportError:
    # The import is unavailable, so this is not a Colab runtime (e.g. a local run)
    IN_COLAB = False
else:
    # The import worked, so Colab-specific features can be used
    IN_COLAB = True

In [18]:
import os # Import the os module for environment variable access

def get_secret(key_name: str) -> "str | None":
    """
    Fetch a secret by name, preferring Colab userdata and falling back to environment variables.

    Args:
        key_name: Name of the Colab secret / environment variable to look up.

    Returns:
        The secret value, or None when it is found in neither place, so the caller
        can report missing keys with its own error message.
    """
    if IN_COLAB:
        try:
            value = userdata.get(key_name)
        except Exception:
            # userdata.get raises (instead of returning None) when the secret does not
            # exist or the notebook has not been granted access to it. Treat that as
            # "not found here" so the environment fallback and the caller's own
            # missing-key validation still run.
            value = None
        if value:
            return value
    return os.getenv(key_name)

# Credentials for the two external services used below: RapidAPI (JSearch) and Groq
RAPIDAPI_KEY = get_secret("RAPIDAPI_KEY")
GROQ_API_KEY = get_secret("GROQ_API_KEY")

# Fail fast: both keys must be present (non-empty) before any later cell runs
if not all((RAPIDAPI_KEY, GROQ_API_KEY)):
    raise EnvironmentError(
        "‚ùå API Keys missing!\n"
        "‚Ä¢ In Colab ‚Üí Add them in Secrets and restart runtime\n"
        "‚Ä¢ Locally ‚Üí Export them as environment variables"
    )

# Reaching this line means both keys were found
print("‚úÖ API keys loaded successfully")

‚úÖ API keys loaded successfully


In [19]:
# =============================================================================
# OCR + SKILL EXTRACTION + JOB MATCHING
# =============================================================================

# Import necessary modules and libraries
import os # For operating system interaction
import re # For regular expressions
import json # For JSON parsing and serialization
import http.client # For making HTTP connections
from google.colab import files # For file upload functionality in Google Colab
from pdf2image import convert_from_path # For converting PDF pages to images
import pytesseract # For performing Optical Character Recognition (OCR)
import PyPDF2 # For extracting text from PDF documents
import spacy # For Natural Language Processing (NLP) tasks
from groq import Groq # For interacting with the Groq AI API

In [20]:
# =============================================================================
# GUARDRAIL CONFIGURATION
# =============================================================================

# Limits applied throughout the pipeline (upload size, text length, result counts)
MAX_PDF_MB = 5            # largest accepted resume PDF, in megabytes
MAX_RESUME_CHARS = 8000   # extracted resume text is cut to this many characters
MAX_SKILLS = 50           # upper bound on the number of skills returned
MAX_JOBS = 5              # upper bound on the number of job listings analyzed

In [21]:

# =============================================================================
# NLP SETUP
# =============================================================================

# Load the pre-trained small English spaCy model into the module-level `nlp`
try:
    nlp = spacy.load("en_core_web_sm")
except OSError as exc:
    # spaCy reports a model that is not installed as OSError. Catch only that: a bare
    # `except:` would also trap KeyboardInterrupt/SystemExit and mislabel unrelated
    # failures. Chain the original error so its details stay in the traceback.
    raise RuntimeError("‚ùå spaCy model missing. Run: python -m spacy download en_core_web_sm") from exc

# =============================================================================
# SKILL NORMALIZATION MAPPINGS
# =============================================================================

# Lookup table: skill variant found in a resume -> canonical skill name it counts as
SKILL_NORMALIZATION = {
    # SQL family
    "mysql": "sql",
    "postgresql": "sql",
    "sqlite": "sql",
    "database": "sql",
    # JavaScript ecosystem
    "reactjs": "react",
    "nodejs": "node",
    # Machine learning umbrella
    "ml": "machine learning",
    "ai": "machine learning",
    # Entries that map to themselves
    "nlp": "nlp",
    "deep learning": "deep learning",
    # AWS and its services
    "aws": "aws",
    "ec2": "aws",
    "s3": "aws",
    "lambda": "aws",
}

# Canonical (standardized) skill names that are searched for directly in the resume text
CANONICAL_SKILLS = {
    # programming / query languages
    "python",
    "java",
    "c",
    "c++",
    "sql",
    # frameworks, runtimes and cloud
    "react",
    "node",
    "aws",
    # ML topics
    "machine learning",
    "deep learning",
    "nlp",
    # tooling and platforms
    "docker",
    "kubernetes",
    "git",
    "linux",
}

# =============================================================================
# FILE VALIDATION FUNCTIONS
# =============================================================================

def validate_pdf(filename, file_bytes):
    """
    Check an uploaded file's name and size before it is processed.

    Raises ValueError when `filename` does not end in ".pdf" (compared
    case-insensitively) or when `file_bytes` is larger than MAX_PDF_MB megabytes.
    Returns None when both checks pass.
    """
    # Extension check on the lower-cased name
    has_pdf_suffix = filename.lower().endswith(".pdf")
    if not has_pdf_suffix:
        raise ValueError("‚ùå Only PDF files are allowed")

    # Size check: convert the byte count to megabytes (1 MB = 1024 * 1024 bytes)
    bytes_per_mb = 1024 * 1024
    size_mb = len(file_bytes) / bytes_per_mb
    if size_mb > MAX_PDF_MB:
        raise ValueError(f"‚ùå File too large ({size_mb:.2f} MB). Max {MAX_PDF_MB} MB")

In [22]:

# =============================================================================
# OCR + TEXT EXTRACTION FUNCTIONS
# =============================================================================

def extract_text_from_pdf(pdf_path):
    """
    Extract lower-cased text from a PDF, falling back to OCR when little text is found.

    PyPDF2 is tried first. If it yields fewer than 100 non-whitespace-trimmed
    characters, every page is rendered with pdf2image and read with Tesseract,
    and the OCR text is added to whatever PyPDF2 produced.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, lower-cased and truncated to MAX_RESUME_CHARS characters.
        Pages are separated by a newline.
    """
    page_texts = []

    # First pass: direct text extraction with PyPDF2
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                # Extract once per page; the call is repeated work if done twice
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception:
        # Best-effort: a PDF that PyPDF2 cannot read is handled by the OCR fallback
        # below. `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        pass

    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next, which would defeat word-boundary skill matching.
    text = "\n".join(page_texts)

    # Second pass: OCR when direct extraction produced too little text
    if len(text.strip()) < 100:
        ocr_texts = [
            pytesseract.image_to_string(img)
            for img in convert_from_path(pdf_path)
        ]
        text = "\n".join(part for part in [text, *ocr_texts] if part)

    # Normalize case and cap the length
    return text.lower()[:MAX_RESUME_CHARS]

# =============================================================================
# SKILL EXTRACTION FUNCTIONS
# =============================================================================

def _skill_pattern(term):
    """Build a regex that matches `term` only as a standalone token."""
    # re.escape keeps regex metacharacters in names such as "c++" literal.
    # Lookarounds replace \b because \b cannot sit after a trailing "+", and the
    # trailing class stops "c" from matching the start of "c++" or "c#".
    return rf"(?<!\w){re.escape(term)}(?![\w+#])"


def extract_skills(text):
    """
    Find known skills in resume text.

    Three sources are combined: direct matches of CANONICAL_SKILLS, matches of
    SKILL_NORMALIZATION variants (recorded under their canonical name), and spaCy
    noun chunks whose lower-cased text is a SKILL_NORMALIZATION key.

    Args:
        text: Resume text. Regex matching is case-sensitive, so pass lower-cased text.

    Returns:
        A sorted list of unique canonical skill names, at most MAX_SKILLS long.
    """
    skills = set()

    # Direct matches of canonical skill names
    for skill in CANONICAL_SKILLS:
        if re.search(_skill_pattern(skill), text):
            skills.add(skill)

    # Variants are recorded under their canonical form
    for variant, canonical in SKILL_NORMALIZATION.items():
        if re.search(_skill_pattern(variant), text):
            skills.add(canonical)

    # Noun chunks from spaCy that exactly equal a known variant
    doc = nlp(text)
    for chunk in doc.noun_chunks:
        canonical = SKILL_NORMALIZATION.get(chunk.text.lower())
        if canonical is not None:
            skills.add(canonical)

    # Deterministic order, capped at the configured maximum
    return sorted(skills)[:MAX_SKILLS]

# =============================================================================
# JOB FETCHING (JSEARCH API) FUNCTIONS
# =============================================================================

def fetch_jobs(query, timeout=30):
    """
    Fetch job listings from the JSearch RapidAPI for a search query.

    Args:
        query: Free-text job search string (for example "python developer").
        timeout: Socket timeout in seconds for the HTTPS connection.

    Returns:
        A list of at most MAX_JOBS job dicts taken from the response's "data" field.
        An empty list is returned when the request does not succeed (non-200 status),
        the body is not valid JSON, or "data" is missing or not a list.
    """
    # Local import keeps this function self-contained within its notebook cell
    from urllib.parse import quote, urlencode

    host = "jsearch.p.rapidapi.com"

    # Set API key and host in headers for authentication
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": host,
    }

    # Percent-encode every reserved character, not just spaces, so queries such as
    # "c++ & sql" cannot corrupt the query string. quote_via=quote keeps spaces as %20.
    params = urlencode({"query": query, "page": 1, "num_pages": 1}, quote_via=quote)
    endpoint = f"/search?{params}"

    conn = http.client.HTTPSConnection(host, timeout=timeout)
    try:
        conn.request("GET", endpoint, headers=headers)
        res = conn.getresponse()
        status = res.status
        body = res.read().decode("utf-8")
    finally:
        # Always release the socket, including when the request raises
        conn.close()

    if status != 200:
        print(f"JSearch request failed with HTTP status {status}")
        return []

    try:
        payload = json.loads(body)
    except json.JSONDecodeError:
        print("JSearch returned a response that is not valid JSON")
        return []

    # "data" may be absent or null; only a list can be sliced into job entries
    jobs = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(jobs, list):
        return []

    # Return the top N jobs (MAX_JOBS) from the response
    return jobs[:MAX_JOBS]


In [23]:
import os
import re
import json
import http.client
from google.colab import files # Re-adding this import
from pdf2image import convert_from_path
import pytesseract
import PyPDF2
import spacy
from groq import Groq

# =============================================================================
# JOB MATCHING VIA GROQ (AI) FUNCTIONS
# =============================================================================

# System message sent with every Groq chat request made by analyze_resume.
# The text below is part of the prompt itself, so editing it changes the
# instructions the model receives.
SYSTEM_PROMPT = """
You are an ATS-grade resume matcher.
Rules:
- Do NOT hallucinate skills
- Match only from resume
- Return valid JSON only
"""

def _parse_json_reply(raw):
    """Parse a model reply as JSON, tolerating Markdown fences or prose around a JSON array."""
    text = (raw or "").strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost [...] span, which is what remains once code
        # fences or surrounding sentences are ignored.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


def analyze_resume(resume, skills, jobs):
    """
    Ask the Groq chat API to score a resume against a list of jobs.

    Args:
        resume: Resume text included verbatim in the prompt.
        skills: Iterable of skill strings, joined with ", " in the prompt.
        jobs: List of job dicts; 'job_title', 'employer_name' and 'job_description'
            are read with .get().

    Returns:
        The model's reply parsed from JSON. The prompt requests a list of objects
        with rank, job_title, company, match_score, matching_skills, missing_skills,
        reason and recommendation.

    Raises:
        json.JSONDecodeError: If no valid JSON can be recovered from the reply.
    """
    # Initialize the Groq client with the API key
    client = Groq(api_key=GROQ_API_KEY)

    job_blocks = []
    # Format each job description into a structured block for the AI prompt
    for i, j in enumerate(jobs, 1):
        # .get() returns None when the key is absent (and the value itself may be
        # null), so default to "" before slicing. The 1200-character cap limits
        # prompt size. This note lives in code on purpose: placed inside the
        # f-string it would be sent to the model as part of the job description.
        description = (j.get('job_description') or "")[:1200]
        job_blocks.append(f"""
Job {i}:
Title: {j.get('job_title')}
Company: {j.get('employer_name')}
Description: {description}
""")

    skills_line = ", ".join(skills)
    jobs_section = " ".join(job_blocks)

    # Construct the user prompt: extracted skills, full resume text, formatted jobs,
    # and the JSON shape the reply must follow
    user_prompt = f"""
Resume Skills:
{skills_line}

Resume:
{resume}

Jobs:
{jobs_section}

Return JSON:
[
  {{
    "rank": 1,
    "job_title": "",
    "company": "",
    "match_score": 0,
    "matching_skills": [],
    "missing_skills": [],
    "reason": "",
    "recommendation": ""
  }}
]
"""

    # Call the Groq API to get completion for chat messages
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile", # Specify the AI model to use
        messages=[
            {"role":"system","content":SYSTEM_PROMPT}, # System message guides the AI's behavior
            {"role":"user","content":user_prompt} # User's query with resume and job details
        ],
        temperature=0.2, # Controls the randomness of the output (lower = more deterministic)
        max_tokens=3000 # Maximum number of tokens in the AI's response
    )

    # Parse the reply; models often wrap JSON in ``` fences despite instructions
    return _parse_json_reply(response.choices[0].message.content)

# =============================================================================
# MAIN FUNCTION (Application Entry Point)
# =============================================================================

def main():
    """
    Run the interactive pipeline end to end.

    Steps: upload a resume PDF, validate it, extract text and skills, ask for a
    job role, fetch matching jobs, rank them with Groq, then print the results
    and save them to job_match_results.json.

    Returns None. Stops early, after printing a message, when no file is uploaded,
    the upload fails validation, or no jobs are found.
    """
    print("\nüöÄ AI Resume Skill & Job Matcher (COLAB)\n")

    # 1. Upload Resume PDF from the user
    print("üì§ Upload resume PDF")
    uploaded = files.upload()

    # Handle case where no file was uploaded
    if not uploaded:
        print("‚ùå No file uploaded")
        return

    # Only the first uploaded file is processed
    filename = next(iter(uploaded))
    file_bytes = uploaded[filename]

    # Validate the upload; report a rejected file as a message instead of a traceback
    try:
        validate_pdf(filename, file_bytes)
    except ValueError as err:
        print(err)
        return

    # Save the uploaded binary content to a local file for processing
    with open(filename, "wb") as f:
        f.write(file_bytes)

    # 2. Extract Text content from the PDF resume
    print("üìÑ Extracting text...")
    resume_text = extract_text_from_pdf(filename)

    # 3. Extract relevant skills from the extracted resume text
    print("üß† Extracting skills...")
    skills = extract_skills(resume_text)
    print(f"\n‚úÖ Skills ({len(skills)}): {skills}")

    # 4. Prompt user for a job role query; uses 'python developer' as default
    query = input("\nüîç Job role (default: python developer): ").strip() or "python developer"

    # 5. Fetch job listings based on the user's query from an external API
    jobs = fetch_jobs(query)

    # Handle case where no jobs were found for the given query
    if not jobs:
        print("‚ùå No jobs found")
        return

    # 6. Use AI (Groq) to match the extracted resume skills and content with the fetched jobs
    print("\nü§ñ Matching resume with jobs...")
    results = analyze_resume(resume_text, skills, jobs)

    # The model is asked for a list, but a single object is still usable
    if isinstance(results, dict):
        results = [results]

    # 7. Display the ranked job matching results to the user
    print("\nüéØ RANKED RESULTS\n")
    for r in results:
        # Model output is not guaranteed to follow the requested schema: skip
        # entries that are not objects and use .get() so an omitted field prints
        # a placeholder instead of raising KeyError.
        if not isinstance(r, dict):
            continue
        print("="*80)
        print(f"#{r.get('rank', '?')} | {r.get('job_title', '')} @ {r.get('company', '')}")
        print(f"Score: {r.get('match_score', '?')}% | {r.get('recommendation', '')}")
        print(f"Matched: {r.get('matching_skills', [])}")
        print(f"Missing: {r.get('missing_skills', [])}")
        print(f"Reason: {r.get('reason', '')}")

    # 8. Save the detailed job matching results to a JSON file
    with open("job_match_results.json","w") as f:
        json.dump(results,f,indent=2)
    print("\nüíæ Results saved to job_match_results.json")
    print("\nüéâ DONE")

# =============================================================================
# RUN APPLICATION
# =============================================================================

# Entry point: call main() only when this code runs as the top-level module
# (__name__ is "__main__"), not when it is imported from elsewhere
if __name__ == "__main__":
    main()



üöÄ AI Resume Skill & Job Matcher (COLAB)

üì§ Upload resume PDF


Saving Bala Vignesh Resume - AI (Integra connect) .pdf to Bala Vignesh Resume - AI (Integra connect) .pdf
üìÑ Extracting text...
üß† Extracting skills...

‚úÖ Skills (7): ['c', 'c++', 'deep learning', 'java', 'machine learning', 'python', 'sql']

üîç Job role (default: python developer): Machine Learning Intern

ü§ñ Matching resume with jobs...

üéØ RANKED RESULTS

#1 | Summer 2026 Artificial Intelligence/Machine Learning Associate Internship @ General Dynamics
Score: 80% | Consider highlighting any data analytics or database experience in the resume, even if it's not explicitly mentioned.
Matched: ['machine learning', 'python']
Missing: ['data analytics', 'database']
Reason: The job description mentions AI and machine learning, which are skills mentioned in the resume. However, it requires data analytics and database skills, which are not explicitly mentioned.
#2 | AI Applied Research & Machine Learning Internship (PhD) @ GEICO
Score: 70% | Consider applying for other internships