In [None]:
# Install necessary packages
# pymupdf provides the `fitz` module imported below; the last command downloads
# the large English spaCy model loaded via spacy.load("en_core_web_lg").
# NOTE: the spaCy download asks for a kernel/runtime restart (see this cell's
# output) before the freshly installed model and its dependencies can be loaded.
!pip install --upgrade pymupdf
!pip install tqdm
!pip install -U sentence-transformers
!pip install spacy
!pip install transformers
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import os
import fitz
import re
import string
import json
import numpy as np
import pandas as pd
import nltk
import spacy
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from collections import defaultdict
from google.colab import drive
from transformers import AutoTokenizer, AutoModel

In [None]:
# Minimal stand-in for sklearn's cosine_similarity, used to compare single transformer embeddings
def cosine_similarity(a, b):
    """
    Cosine similarity between two single vectors, returned sklearn-style.

    Parameters:
    - a, b: array-likes holding ONE vector each. Both flat vectors of shape (d,)
      and row vectors of shape (1, d) are accepted; inputs are flattened first.

    Returns:
    - A numpy array of shape (1, 1) holding the similarity, so callers can keep
      indexing the result with [0][0] and get a plain scalar.
      If either vector has zero norm the similarity is defined as 0.0 instead
      of dividing by zero.
    """
    # Flatten so that (1, d) embeddings do not turn the dot product into a
    # nested (1, 1) matrix, which previously produced a 4-D return value.
    vec_a = np.asarray(a, dtype=float).ravel()
    vec_b = np.asarray(b, dtype=float).ravel()

    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0:
        return np.array([[0.0]])

    return np.array([[float(np.dot(vec_a, vec_b) / denom)]])

def sort_pdfs(file_path):
    """
    Sort key for resume PDFs: the integer embedded in 'candidate_<number>.pdf'.

    The directory part of `file_path` is ignored; the 'candidate_' and '.pdf'
    fragments are removed from the file name and the remainder is parsed as int
    (raising ValueError if anything non-numeric is left over).
    """
    stem = os.path.basename(file_path)
    for fragment in ('candidate_', '.pdf'):
        stem = stem.replace(fragment, '')
    return int(stem)

In [None]:
class NLPResumeParser:
    """
    Layout- and semantics-aware resume parser.

    Text spans are pulled out of a PDF with PyMuPDF, short prominent spans are
    scored as candidate section headers, each header is classified into one of
    the known section types, and the text between consecutive headers becomes
    that section's content.
    """

    # PyMuPDF encodes font properties in the span "flags" bit field:
    # bit 1 (value 2) is italic, bit 4 (value 16) is bold.
    _BOLD_FLAG = 1 << 4

    def __init__(self, use_transformers=True):
        """
        Initialize the parser with spaCy and optional transformer model.

        Parameters:
        - use_transformers: If True, will use a transformer model for better section classification
        """
        self.nlp = spacy.load("en_core_web_lg")
        self.use_transformers = use_transformers

        # Common section identifiers with example text (for semantic matching)
        self.section_examples = {
            'contact': 'Email phone address location linkedin github',
            'summary': 'Professional summary about experience and skills overview profile',
            'experience': 'Work experience professional experience employment history job positions companies',
            'education': 'University college school degree bachelor master PhD GPA courses academic',
            'skills': 'Technical skills programming languages frameworks tools software proficiency expertise',
            'projects': 'Projects portfolio Github personal projects academic projects team projects',
            'certifications': 'Certifications certificate license credentials accreditation',
            'achievements': 'Awards honors achievements recognitions accomplishments',
            'languages': 'Language proficiency fluent native bilingual',
            'interests': 'Hobbies interests activities personal interests passions'
        }

        # Initialize transformer model if requested
        if self.use_transformers:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
                self.model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
            except Exception as e:
                print(f"Failed to load transformer models: {e}")
                print("Falling back to spaCy-only mode")
                self.use_transformers = False

            # Pre-compute embeddings for section examples
            if self.use_transformers:
                self.section_embeddings = {}
                for section, text in self.section_examples.items():
                    self.section_embeddings[section] = self._get_transformer_embedding(text)

    def _get_transformer_embedding(self, text):
        """
        Get a sentence embedding from the transformer model.

        Returns a numpy array of shape (1, hidden_size) for a single input text.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)

        # all-mpnet-base-v2 is a mean-pooling sentence model: average the token
        # embeddings, weighting by the attention mask so padding is ignored.
        # (The first-token embedding is not what this checkpoint was trained to use.)
        token_embeddings = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return (summed / counts).numpy()

    def extract_text_with_layout(self, pdf_path):
        """
        Extract text from PDF with basic layout information.
        Returns a list of blocks with text, font info and position.

        Each returned item describes one non-empty text span. On any extraction
        error the problem is printed and an empty list is returned.
        """
        try:
            blocks = []

            # Context manager guarantees the document handle is closed again
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc):
                    # Get the blocks which include text, font and position info
                    page_blocks = page.get_text("dict")["blocks"]

                    for block in page_blocks:
                        # Blocks without a "lines" entry carry no text spans
                        if "lines" not in block:
                            continue

                        for line in block["lines"]:
                            for span in line["spans"]:
                                # Extract useful properties
                                text = span["text"].strip()
                                if not text:
                                    continue

                                blocks.append({
                                    "text": text,
                                    "font": span["font"],
                                    "size": span["size"],
                                    "flags": span["flags"],  # Bold, italic, etc.
                                    "page": page_num,
                                    "bbox": span["bbox"],    # Position
                                    "is_upper": text.isupper(),
                                    "is_title": text[0].isupper(),
                                    "char_count": len(text),
                                    "word_count": len(text.split())
                                })

            return blocks
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {str(e)}")
            return []

    def identify_sections(self, blocks):
        """
        Identify potential section headers from the extracted blocks.
        Returns a list of (index, section_name, confidence) tuples,
        ordered by block index.
        """
        potential_headers = []

        # First, filter blocks that could be headers based on layout features
        for i, block in enumerate(blocks):
            # Skip very long text blocks - headers are usually short
            if block["word_count"] > 6 or block["char_count"] > 40:
                continue

            # Header heuristics
            header_confidence = 0

            # Feature 1: Uppercase or Title case
            if block["is_upper"]:
                header_confidence += 0.3
            elif block["is_title"]:
                header_confidence += 0.2

            # Feature 2: Larger font than surroundings
            if i > 0 and i < len(blocks) - 1:
                if block["size"] > blocks[i-1]["size"] and block["size"] >= blocks[i+1]["size"]:
                    header_confidence += 0.3

            # Feature 3: Bold text (bit 4 of the PyMuPDF span flags)
            if block["flags"] & self._BOLD_FLAG:
                header_confidence += 0.2

            # Feature 4: First block on a new page
            if i > 0 and blocks[i]["page"] > blocks[i-1]["page"]:
                header_confidence += 0.1

            # Only consider blocks with reasonable confidence
            if header_confidence >= 0.3:
                # Identify section type using NLP
                section_name, section_confidence = self._classify_section(block["text"])
                if section_name is None:
                    # Classifier could not assign any section; not a usable header
                    continue

                # Combine layout and semantic confidence
                combined_confidence = header_confidence * 0.6 + section_confidence * 0.4

                potential_headers.append((i, section_name, combined_confidence))

        # Sort by confidence
        potential_headers.sort(key=lambda x: x[2], reverse=True)

        # Deduplicate overlapping sections
        final_headers = []
        used_indices = set()

        for idx, name, conf in potential_headers:
            # Skip if too close to another header with higher confidence
            if any(abs(idx - used_idx) < 3 for used_idx in used_indices):
                continue

            final_headers.append((idx, name, conf))
            used_indices.add(idx)

        # Sort by position
        final_headers.sort(key=lambda x: x[0])

        return final_headers

    def _classify_section(self, text):
        """
        Classify a potential section header into a known category.
        Returns (section_name, confidence).

        Matching order:
        1. the normalized header equals a section name (e.g. "SKILLS:" -> 'skills');
        2. the normalized header is one of a section's example keywords;
        3. semantic similarity (transformer embeddings, or spaCy vectors as fallback).
        """
        text_lower = text.lower().strip()

        # Direct matching for common headers (with normalization); punctuation
        # such as a trailing colon is removed here.
        normalized_text = re.sub(r'[^a-z0-9\s]', '', text_lower).strip()

        # A header that IS a section name must win over keyword hits: the
        # 'summary' examples contain the words "experience" and "skills", so a
        # keyword-only scan would label those headers as 'summary'.
        if normalized_text in self.section_examples:
            return normalized_text, 1.0

        # Check exact matches with the example keywords of each section
        for section, examples in self.section_examples.items():
            if normalized_text in examples.lower().split():
                return section, 1.0

        # Try semantic matching using transformers or spaCy
        if self.use_transformers:
            # Get embedding for the header text
            header_embedding = self._get_transformer_embedding(text_lower)

            # Calculate similarity with each section example; squeeze to a plain
            # float regardless of how many singleton axes the helper returns.
            similarities = {
                section: float(np.squeeze(cosine_similarity(header_embedding, embedding)))
                for section, embedding in self.section_embeddings.items()
            }

            # Get the highest similarity section
            best_section = max(similarities, key=similarities.get)
            return best_section, similarities[best_section]

        # Use spaCy's word vectors (fallback)
        header_doc = self.nlp(text_lower)

        # Parse the example texts only once; they do not change between calls
        example_docs = getattr(self, "_example_docs", None)
        if example_docs is None:
            example_docs = {
                section: self.nlp(example.lower())
                for section, example in self.section_examples.items()
            }
            self._example_docs = example_docs

        # Start below any real similarity so the best section is always picked,
        # even when every similarity is zero or negative.
        best_section = None
        best_score = float("-inf")

        for section, example_doc in example_docs.items():
            similarity = header_doc.similarity(example_doc)

            if similarity > best_score:
                best_score = similarity
                best_section = section

        if best_section is None:
            return None, 0.0

        return best_section, best_score

    def extract_sections(self, pdf_path):
        """
        Extract sections from a resume PDF.
        Returns a dictionary of section name -> content.

        The 'contact' section always starts with the first ten text spans, one
        per line, so the first line is the first span of the document. When the
        same section type is detected more than once, the contents are
        concatenated instead of the later one replacing the earlier one.
        """
        # Extract text blocks with layout info
        blocks = self.extract_text_with_layout(pdf_path)
        if not blocks:
            return {}

        # Identify section headers
        sections = self.identify_sections(blocks)

        # Extract content for each section
        section_contents = defaultdict(str)

        # Special case for contact section (usually at the top).
        # Newline-separated so consumers can take the first line as the name.
        section_contents["contact"] = "\n".join(b["text"] for b in blocks[:10])

        # Process the rest of the sections
        for position, (idx, section_name, _) in enumerate(sections):
            # Get content until the next section (or the end of the document)
            if position < len(sections) - 1:
                end_idx = sections[position + 1][0]
            else:
                end_idx = len(blocks)

            # Join the text of all blocks in this section
            content = " ".join(b["text"] for b in blocks[idx + 1:end_idx])

            # Accumulate rather than overwrite repeated section types
            existing = section_contents[section_name]
            if existing and content:
                section_contents[section_name] = existing + " " + content
            elif content:
                section_contents[section_name] = content

        return dict(section_contents)

    def process_for_embedding(self, pdf_path):
        """
        Process a resume PDF into structured sections for embedding.
        Returns a dictionary with normalized sections and structured text.

        'structured_text' wraps every non-blank section in <NAME> ... </NAME>
        markers, priority sections first, one element per line.
        """
        # Extract sections
        sections = self.extract_sections(pdf_path)

        # Create structured text with section markers
        structured_text = []

        # Prioritize important sections
        priority_sections = ['summary', 'contact', 'experience', 'education', 'skills', 'projects',
                            'certifications', 'achievements', 'languages', 'interests']

        for section in priority_sections:
            if section in sections and sections[section].strip():
                structured_text.append(f"<{section.upper()}>")
                structured_text.append(sections[section])
                structured_text.append(f"</{section.upper()}>")

        # Add any remaining sections
        for section, content in sections.items():
            if section not in priority_sections and content.strip():
                structured_text.append(f"<{section.upper()}>")
                structured_text.append(content)
                structured_text.append(f"</{section.upper()}>")

        # Join all sections into a single text
        combined_text = "\n".join(structured_text)

        return {
            "sections": sections,
            "structured_text": combined_text
        }

In [None]:
# Mount Google Drive
drive.mount('/content/drive') # Put resumes within your google drive account


def _list_pdf_paths(directory):
    """Return the full path of every entry in `directory` whose name ends with '.pdf'."""
    return [os.path.join(directory, entry) for entry in os.listdir(directory) if entry.endswith('.pdf')]


# Dataset locations inside the mounted Drive
training_dir = '/content/drive/MyDrive/jobfiles/dataset/trainResumes'
testing_dir = '/content/drive/MyDrive/jobfiles/dataset/testResumes'
job_dir = '/content/drive/MyDrive/jobfiles/dataset/jobPostings'

# Collect the PDF files of each dataset split
training_pdfs = _list_pdf_paths(training_dir)  # Training
testing_pdfs = _list_pdf_paths(testing_dir)    # Testing
job_pdfs = _list_pdf_paths(job_dir)            # Job Postings

print("Found training PDFs:", len(training_pdfs))
print("Found testing PDFs:", len(testing_pdfs))
print("Found job PDFs:", len(job_pdfs))

# Order the resumes by candidate number (job postings keep directory order)
training_pdfs.sort(key=sort_pdfs)
testing_pdfs.sort(key=sort_pdfs)

print("Sorted training PDFs:", training_pdfs)
print("Sorted testing PDFs:", testing_pdfs)

Mounted at /content/drive
Found training PDFs: 90
Found testing PDFs: 60
Found job PDFs: 1
Sorted training PDFs: ['/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_000.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_001.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_002.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_003.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_006.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_007.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_008.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_009.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_010.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_011.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_012.pdf', '/content/drive/MyDrive/jobfiles/dataset/trainResumes/candidate_013.pdf'

In [None]:
# Build the single shared parser instance (loads spaCy and the transformer model once)
parser = NLPResumeParser(use_transformers=True)

# Thin wrapper so the rest of the notebook does not touch the parser directly
def extract_structured_text(pdf_path):
    """
    Parse one PDF with the shared `parser` and return its result dictionary
    (the section mapping plus the section-tagged text), exactly as produced by
    `parser.process_for_embedding`.
    """
    return parser.process_for_embedding(pdf_path)

# Function to save data to JSON
def save_to_json(data_dict, filename):
    """
    Flatten parsed documents into JSON records, write them to `filename`
    (UTF-8, indented, non-ASCII kept as-is) and return the list of records.

    `data_dict` maps a PDF path to a dict with "sections" and "structured_text".
    The candidate name is the stripped first line of the "contact" section, or
    an empty string when there is no contact section.
    """
    def _to_record(pdf_path, processed):
        # Extract candidate name from the contact section
        name = ""
        if "contact" in processed["sections"]:
            first_line, _, _ = processed["sections"]["contact"].partition("\n")
            name = first_line.strip()

        return {
            "file_name": os.path.basename(pdf_path),
            "candidate_name": name,
            "sections": processed["sections"],
            "structured_text": processed["structured_text"]
        }

    json_data = [_to_record(path, parsed) for path, parsed in data_dict.items()]

    with open(filename, 'w', encoding='utf-8') as handle:
        json.dump(json_data, handle, indent=2, ensure_ascii=False)

    return json_data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Parse every training resume; results are keyed by PDF path in sorted order
training_data = {
    pdf: extract_structured_text(pdf)
    for pdf in tqdm(training_pdfs, desc="Extracting training data")
}

# Write the parsed resumes to Drive and keep the JSON records for matching
training_json = save_to_json(
    training_data,
    '/content/drive/MyDrive/jobfiles/normalized_resumes.json',
)
print(f"Saved {len(training_json)} training resumes")

Extracting training data: 100%|██████████| 90/90 [01:13<00:00,  1.22it/s]


Saved 90 training resumes


In [None]:
# Parse every held-out test resume; results are keyed by PDF path in sorted order
testing_data = {
    pdf: extract_structured_text(pdf)
    for pdf in tqdm(testing_pdfs, desc="Extracting test data")
}

# Write the parsed test resumes to Drive and keep the JSON records in memory
testing_json = save_to_json(
    testing_data,
    '/content/drive/MyDrive/jobfiles/normalized_testing_resumes.json',
)
print(f"Saved {len(testing_json)} testing resumes")

Extracting test data: 100%|██████████| 60/60 [00:46<00:00,  1.28it/s]

Saved 60 testing resumes





In [None]:
# Parse the job posting PDFs with the same pipeline used for resumes
job_data = {
    pdf: extract_structured_text(pdf)
    for pdf in tqdm(job_pdfs, desc="Extracting job data")
}

# Write the parsed postings to Drive and keep the JSON records for matching
job_json = save_to_json(
    job_data,
    '/content/drive/MyDrive/jobfiles/normalized_jobs.json',
)
print(f"Saved {len(job_json)} job descriptions")

Extracting job data: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]


Saved 1 job descriptions


In [None]:
# Sentence-embedding model used for every whole-document comparison below
model = SentenceTransformer('all-mpnet-base-v2')

# Section-tagged text of each resume, and of the job posting we match against
training_texts = [item["structured_text"] for item in training_json]
job_desc = job_json[0]["structured_text"]  # For our matching task, take the first job description

# Embed the posting once and all resumes in a single batch
job_embedding = model.encode(job_desc, convert_to_tensor=True)
resume_embeddings = model.encode(training_texts, convert_to_tensor=True)

# One cosine score per resume (row 0 of the 1 x N similarity matrix)
cosine_scores = util.pytorch_cos_sim(job_embedding, resume_embeddings)[0]

# Pair each resume record with its score, best match first
ranked_resumes = list(zip(training_json, cosine_scores.cpu().numpy()))
ranked_resumes.sort(key=lambda pair: pair[1], reverse=True)

# Report the five highest-scoring candidates
print("Top 5 Candidate Matches:")
for i, (resume, score) in enumerate(ranked_resumes[:5]):
    print(f"Rank {i+1}: {resume['candidate_name']} (File: {resume['file_name']}) - Match Score: {score*100:.2f}%")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Top 5 Candidate Matches:
Rank 1: Ellie Mackey F R E S H E R  I N T E R N Executive Profile Actively seeking job. An enthusiast, with a Master's degree in Electronics & Telecommunication, very keen to learn & eagerly looking for opportunities. Other Activities MACHINE LEARNING with Python. Projects (File: candidate_113.pdf) - Match Score: 81.11%
Rank 2: P R O F I L E Software Developer and Data Analytics fresher. Looking to get into the field of Machine Learning. LTT DIGITAL SERVICES JUNIOR SDE, FEB 2019 - TILL DATE Understanding text models and using them to create chatbots on webpages of clients. MARIANA HOLDEN S K I L L S Machine Learning, Natural (File: candidate_134.pdf) - Match Score: 78.36%
Rank 3: SYDNEY JONES PROFILE I am currently working as a junior Machine Learning Engineer and I want to learn more about how Machine Learning projects impact real work scenarios. SKILLS Data Mining, Deep Learning, Machine Learning, Software Testing , Algorithms, Python, JIRA, NLP, SQL, Unsuper

In [None]:
# Save model for later use
# model.save writes the SentenceTransformer into the local directory
# "sentence-transformer-model"; the shell command then zips that directory
# recursively into sentence-transformer-model.zip next to it.
model.save("sentence-transformer-model")
!zip -r sentence-transformer-model.zip sentence-transformer-model

In [None]:
def section_based_matching(job_data, resume_data, model):
    """
    Perform section-by-section matching between job and resumes.

    Parameters:
    - job_data: record with a "sections" dict (section name -> text) for the job.
    - resume_data: iterable of resume records, each with a "sections" dict.
    - model: object exposing encode(text, convert_to_tensor=True), e.g. a
      SentenceTransformer.

    Returns:
    - List of dicts with 'resume', 'overall_score' and 'section_scores',
      sorted by 'overall_score' in descending order. The overall score is the
      weight-normalized average over the sections that could be compared
      (0 when none could).
    """
    # Define which sections to compare and their weights
    section_comparisons = {
        'experience': {'job_sections': ['experience'], 'weight': 0.35},
        'education': {'job_sections': ['education'], 'weight': 0.15},
        'skills': {'job_sections': ['skills', 'requirements'], 'weight': 0.30},
        'summary': {'job_sections': ['summary', 'description'], 'weight': 0.20},
    }

    # Get the job sections
    job_sections = job_data["sections"]

    # The job side is identical for every resume, so assemble its text once.
    # A section type only gets an entry when the job has a matching section.
    job_text_by_section = {}
    for resume_section, config in section_comparisons.items():
        job_texts = [job_sections[name] for name in config['job_sections'] if name in job_sections]
        if job_texts:
            # Join multiple job sections if needed
            job_text_by_section[resume_section] = " ".join(job_texts)

    # Job embeddings are computed lazily (only for sections some resume has)
    # and reused across resumes instead of being re-encoded every time.
    job_embedding_cache = {}

    # Calculate scores for each resume
    resume_scores = []

    for resume in resume_data:
        resume_sections = resume["sections"]
        section_scores = {}
        weighted_score = 0
        total_weight = 0

        # Compare each relevant section
        for resume_section, config in section_comparisons.items():
            resume_text = resume_sections.get(resume_section)

            # Skip when either side has nothing to compare for this section
            if not resume_text or resume_section not in job_text_by_section:
                continue

            if resume_section not in job_embedding_cache:
                job_embedding_cache[resume_section] = model.encode(
                    job_text_by_section[resume_section], convert_to_tensor=True
                )
            job_embedding = job_embedding_cache[resume_section]

            # Encode the resume side
            resume_embedding = model.encode(resume_text, convert_to_tensor=True)

            # Calculate similarity
            similarity = util.pytorch_cos_sim(job_embedding, resume_embedding)[0].item()

            # Store score and add to weighted total
            section_scores[resume_section] = similarity
            weighted_score += similarity * config['weight']
            total_weight += config['weight']

        # Normalize the weighted score
        final_score = weighted_score / total_weight if total_weight > 0 else 0

        resume_scores.append({
            'resume': resume,
            'overall_score': final_score,
            'section_scores': section_scores
        })

    # Sort by overall score
    resume_scores.sort(key=lambda x: x['overall_score'], reverse=True)
    return resume_scores

# Perform section-based matching
# Uses the first job posting record, the training resume records and the
# sentence-embedding `model` created in an earlier cell.
section_matches = section_based_matching(job_json[0], training_json, model)

# Print top matches with section details
# Each match carries the resume record, the weight-normalized overall score and
# the per-section similarities; scores are shown as percentages.
print("\nTop 5 Candidates (Section-Based Matching):")
for i, match in enumerate(section_matches[:5]):
    resume = match['resume']
    print(f"\nRank {i+1}: {resume['candidate_name']} (File: {resume['file_name']}) - Overall Score: {match['overall_score']*100:.2f}%")
    print("Section Scores:")
    # Only sections present on both the job and the resume have a score here
    for section, score in match['section_scores'].items():
        print(f"  - {section.title()}: {score*100:.2f}%")


Top 5 Candidates (Section-Based Matching):

Rank 1: NOLAN CLARK MACHINE LEARNING GRADUATE RESEARCH ASSISTANT PROFESSIONAL PROFILE As a Graduate Research and Teaching assistant I have an accumulated experience of nearly 1 year and a half. The key areas of focus during this time was on Machine Learning based business attunement methods, along with the (File: candidate_032.pdf) - Overall Score: 44.15%
Section Scores:
  - Experience: 29.64%
  - Skills: 61.07%

Rank 2: JOSE GARCIA INTERN PROFESSIONAL PROFILE Graduate database management engineer looking to join a role in data analytics or cloud-based machine learning solutions. PROFICIENT SKILLS Software Development, Data Science, Data Analysis, Data Structure, Business Analytics, Java, Powerbi, Tableau, SQL, (File: candidate_088.pdf) - Overall Score: 42.48%
Section Scores:
  - Experience: 28.26%
  - Skills: 59.06%

Rank 3: BENJAMIN OSTA FRESHER DEVELOPER PROFESSIONAL PROFILE As a fresher I have in the field of Software Development and bus

In [None]:
# Import the required libraries if not already imported
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

In [None]:
def tfidf_weighted_matching(job_data, resume_data):
    """
    Rank resumes against a job posting by whole-document TF-IDF cosine similarity.

    `job_data` is either one job record or a list of them (the first is used);
    `resume_data` is a sequence of resume records. Every record provides
    "structured_text"; resumes also provide "file_name" and "candidate_name".

    Returns a list of dicts ('resume', 'file_name', 'candidate_name',
    'tfidf_score') ordered from highest to lowest score.
    """
    # Pick the job record and build the corpus: job first, then every resume
    job_record = job_data[0] if isinstance(job_data, list) else job_data
    corpus = [job_record["structured_text"]]
    corpus.extend(resume["structured_text"] for resume in resume_data)

    # One vocabulary fitted on job + resumes together
    vectorizer = TfidfVectorizer(
        max_features=10000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.85
    )
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Row 0 is the job; the remaining rows line up with resume_data
    similarity_scores = sklearn_cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    results = [
        {
            'resume': resume,
            'file_name': resume["file_name"],
            'candidate_name': resume["candidate_name"],
            'tfidf_score': float(score)
        }
        for resume, score in zip(resume_data, similarity_scores)
    ]

    # Best match first (stable for ties)
    return sorted(results, key=lambda entry: entry['tfidf_score'], reverse=True)

In [None]:
def tfidf_section_matching(job_data, resume_data):
    """
    Perform TF-IDF weighted matching between sections of job descriptions and resumes.

    Parameters:
    - job_data: one job record (dict) or a sequence of them (the first is used);
      the record provides a "sections" dict.
    - resume_data: iterable of resume records with "sections", "file_name" and
      "candidate_name".

    Returns:
    - List of dicts with 'resume', 'resume_file', 'candidate_name',
      'overall_tfidf_score' and 'section_scores', sorted by overall score in
      descending order. A section whose texts cannot be vectorized (for
      example, only stop words remain) scores 0.0 but still counts towards
      the weight normalization.
    """
    # Define which sections to compare and their weights
    section_comparisons = {
        'experience': {'job_sections': ['experience'], 'weight': 0.35},
        'education': {'job_sections': ['education'], 'weight': 0.15},
        'skills': {'job_sections': ['skills', 'requirements'], 'weight': 0.30},
        'summary': {'job_sections': ['summary', 'description'], 'weight': 0.20},
    }

    # Get job sections
    job_sections = job_data["sections"] if isinstance(job_data, dict) else job_data[0]["sections"]

    # The job side does not depend on the resume: build its text per section once.
    # A section type only gets an entry when the job has a matching section.
    job_text_by_section = {}
    for section_type, config in section_comparisons.items():
        job_section_texts = [job_sections[name] for name in config['job_sections'] if name in job_sections]
        if job_section_texts:
            job_text_by_section[section_type] = " ".join(job_section_texts)

    # Results for each resume
    resume_scores = []

    # Process each resume
    for resume in resume_data:
        resume_sections = resume["sections"]
        section_scores = {}
        weighted_score = 0
        total_weight = 0

        # Process each section type
        for section_type, config in section_comparisons.items():
            resume_section_text = resume_sections.get(section_type)

            # Skip if either job or resume is missing the section
            if not resume_section_text or section_type not in job_text_by_section:
                continue

            # The vocabulary is fitted on just this job/resume pair, so a fresh
            # vectorizer per comparison is equivalent to refitting a shared one.
            vectorizer = TfidfVectorizer(
                max_features=5000,
                stop_words='english',
                ngram_range=(1, 2)
            )

            try:
                tfidf_matrix = vectorizer.fit_transform(
                    [job_text_by_section[section_type], resume_section_text]
                )
            except ValueError:
                # Vectorization failed (e.g., empty vocabulary after stop-word
                # removal). Only this expected failure is treated as "no
                # similarity"; interrupts and unrelated errors propagate.
                similarity = 0.0
            else:
                similarity = float(sklearn_cosine_similarity(
                    tfidf_matrix[0:1],  # Job vector
                    tfidf_matrix[1:2]   # Resume vector
                )[0][0])

            # Store score and update weighted total
            section_scores[section_type] = similarity
            weighted_score += similarity * config['weight']
            total_weight += config['weight']

        # Calculate final weighted score
        final_score = weighted_score / total_weight if total_weight > 0 else 0.0

        # Add to results
        resume_scores.append({
            'resume': resume,
            'resume_file': resume["file_name"],
            'candidate_name': resume["candidate_name"],
            'overall_tfidf_score': float(final_score),
            'section_scores': section_scores
        })

    # Sort by overall score
    resume_scores.sort(key=lambda x: x['overall_tfidf_score'], reverse=True)

    return resume_scores

In [None]:
def hybrid_matching(job_data, resume_data, embedding_model, weights=(0.4, 0.3, 0.3)):
    """
    Combine transformer embeddings with TF-IDF for improved matching.

    Three signals are computed per resume and blended into one score:
    1. cosine similarity between the embeddings of the "structured_text" fields,
    2. the 'tfidf_score' reported by tfidf_weighted_matching,
    3. the 'overall_tfidf_score' reported by tfidf_section_matching.

    Parameters:
    - job_data: Job dict with a "structured_text" key, or a list of such dicts.
      For a list, only the first entry is embedded; the value is handed to the
      two TF-IDF matchers unchanged.
    - resume_data: Sequence of resume dicts. This function reads the keys
      "structured_text", "file_name" and "candidate_name".
    - embedding_model: Object exposing encode(..., convert_to_tensor=True)
      (presumably a SentenceTransformer - confirm against the caller).
    - weights: (transformer, tfidf, section) multipliers applied to the three
      scores. The default reproduces the original 40% / 30% / 30% blend.
      Must contain exactly three values, otherwise unpacking raises ValueError.

    Returns:
    - List of dicts sorted by 'combined_score' (descending) with the keys
      'resume_file', 'candidate_name', 'transformer_score', 'tfidf_score',
      'section_score', 'combined_score' and 'section_details'.
      An empty list is returned when resume_data is empty.
    """
    # Nothing to rank: bail out before encoding an empty batch.
    if len(resume_data) == 0:
        return []

    transformer_weight, tfidf_weight, section_weight = weights

    # Get transformer embedding scores (only the first job of a list is embedded)
    job_record = job_data[0] if isinstance(job_data, list) else job_data
    job_embedding = embedding_model.encode(job_record["structured_text"], convert_to_tensor=True)

    resume_embeddings = embedding_model.encode(
        [resume["structured_text"] for resume in resume_data],
        convert_to_tensor=True
    )

    transformer_scores = util.pytorch_cos_sim(job_embedding, resume_embeddings)[0].cpu().numpy()

    # Get TF-IDF scores (job_data is passed through exactly as received)
    tfidf_results = tfidf_weighted_matching(job_data, resume_data)

    # Get section-based TF-IDF scores
    section_results = tfidf_section_matching(job_data, resume_data)

    # Index both result lists by file name once instead of rescanning them for
    # every resume. setdefault keeps the FIRST record seen per file name, which
    # is what a linear first-match search would have returned.
    tfidf_by_file = {}
    for record in tfidf_results:
        tfidf_by_file.setdefault(record['file_name'], record)

    section_by_file = {}
    for record in section_results:
        section_by_file.setdefault(record['resume_file'], record)

    # Calculate combined score
    combined_results = []
    for i, resume in enumerate(resume_data):
        file_name = resume['file_name']
        transformer_score = float(transformer_scores[i])

        # Resumes missing from a result list score 0.0 for that signal.
        # float(...) turns NumPy scalars into plain Python floats so the
        # returned structure can be serialized with json.dump.
        tfidf_record = tfidf_by_file.get(file_name)
        tfidf_score = float(tfidf_record['tfidf_score']) if tfidf_record is not None else 0.0

        section_record = section_by_file.get(file_name)
        if section_record is not None:
            section_score = float(section_record['overall_tfidf_score'])
            resume_section_scores = section_record['section_scores']
        else:
            section_score = 0.0
            resume_section_scores = {}

        # Combined score (weighted sum of the three signals)
        combined_score = (
            transformer_score * transformer_weight +
            tfidf_score * tfidf_weight +
            section_score * section_weight
        )

        combined_results.append({
            'resume_file': file_name,
            'candidate_name': resume['candidate_name'],
            'transformer_score': transformer_score,
            'tfidf_score': tfidf_score,
            'section_score': section_score,
            'combined_score': combined_score,
            'section_details': resume_section_scores
        })

    # Sort by combined score
    combined_results.sort(key=lambda x: x['combined_score'], reverse=True)

    return combined_results

In [None]:
# Score the training resumes against the first entry of job_json with
# whole-document TF-IDF matching.
print("\nRunning TF-IDF Weighted Matching...")
tfidf_results = tfidf_weighted_matching(job_json[0], training_json)

# Report the first five entries of the returned list; scores are multiplied
# by 100 so they read as percentages.
print("\nTop 5 Candidates (TF-IDF Matching):")
for i, match in enumerate(tfidf_results[:5]):
    print(
        f"Rank {i+1}: {match['candidate_name']} (File: {match['file_name']})"
        f" - TF-IDF Score: {match['tfidf_score']*100:.2f}%"
    )


Running TF-IDF Weighted Matching...

Top 5 Candidates (TF-IDF Matching):
Rank 1: P R O F I L E Software Developer and Data Analytics fresher. Looking to get into the field of Machine Learning. LTT DIGITAL SERVICES JUNIOR SDE, FEB 2019 - TILL DATE Understanding text models and using them to create chatbots on webpages of clients. MARIANA HOLDEN S K I L L S Machine Learning, Natural (File: candidate_134.pdf) - TF-IDF Score: 20.16%
Rank 2: MIRAD YASTEIN A N A L Y S T  I N T E R N SKILLS Artificial Intelligence, Deep Learning, Reinforcement Learning, Tensorflow Keras, Scikit learn, Numpy, Pandas, Matplotlib. EDUCATION B.Tech(IT) from IIIT D&M Kancheepuram, Chennai in 2019 PERSONAL PROFILE (File: candidate_149.pdf) - TF-IDF Score: 17.82%
Rank 3: Joseline Hernandez D A T A  S C I E N T I S T  A N D  M A C H I N E  L E A N R I N G E N G I N E E R Executive Profile Good understanding of Data Preprocessing, training and deployment of machine learning and deep learning models.. Worked on Time S

In [None]:
# Score the training resumes against the first entry of job_json, one
# section type at a time.
print("\nRunning Section-Based TF-IDF Matching...")
section_tfidf_results = tfidf_section_matching(job_json[0], training_json)

# Report the first five entries of the returned list, each followed by its
# per-section breakdown; scores are multiplied by 100 for display.
print("\nTop 5 Candidates (Section-Based TF-IDF Matching):")
for i, match in enumerate(section_tfidf_results[:5]):
    print(
        f"\nRank {i+1}: {match['candidate_name']} (File: {match['resume_file']})"
        f" - Overall Score: {match['overall_tfidf_score']*100:.2f}%"
    )
    print("Section Scores:")
    for section, score in match['section_scores'].items():
        print(f"  - {section.title()}: {score*100:.2f}%")


Running Section-Based TF-IDF Matching...

Top 5 Candidates (Section-Based TF-IDF Matching):

Rank 1: WYATT COOPER DATA STRATEGY AND DELIVERY SPECIALIST PROFESSIONAL PROFILE A quality-oriented professional with a keen interest in solving complex business problems by way of AI-based design & thinking.  A technology-agnostic individual with a strong background in data science, statistics, and economics (File: candidate_025.pdf) - Overall Score: 12.58%
Section Scores:
  - Experience: 7.40%
  - Skills: 18.63%

Rank 2: NOLAN CLARK MACHINE LEARNING GRADUATE RESEARCH ASSISTANT PROFESSIONAL PROFILE As a Graduate Research and Teaching assistant I have an accumulated experience of nearly 1 year and a half. The key areas of focus during this time was on Machine Learning based business attunement methods, along with the (File: candidate_032.pdf) - Overall Score: 7.57%
Section Scores:
  - Experience: 7.74%
  - Skills: 7.38%

Rank 3: SHAWN BUFFET DATA PROCESSING INTERN PROFESSIONAL PROFILE Seeking R

In [None]:
# Blend transformer, TF-IDF and section-based scores for every training
# resume against the first entry of job_json.
print("\nRunning Hybrid Matching (Transformer + TF-IDF)...")
hybrid_results = hybrid_matching(job_json[0], training_json, model)

# Report the first five entries of the returned list with their component
# scores; values are multiplied by 100 for display.
print("\nTop 5 Candidates (Hybrid Matching):")
for i, match in enumerate(hybrid_results[:5]):
    print(f"\nRank {i+1}: {match['candidate_name']} (File: {match['resume_file']}) - Combined Score: {match['combined_score']*100:.2f}%")
    print(f"  - Transformer Score: {match['transformer_score']*100:.2f}%")
    print(f"  - TF-IDF Score: {match['tfidf_score']*100:.2f}%")
    print(f"  - Section Score: {match['section_score']*100:.2f}%")
    print("Section Details:")
    for section, score in match['section_details'].items():
        print(f"    * {section.title()}: {score*100:.2f}%")

# Save the hybrid results to JSON.
# json is already imported in the notebook's import cell, so no import is
# repeated here while the file handle is open.
with open('/content/drive/MyDrive/jobfiles/hybrid_matching_results.json', 'w') as f:
    json.dump(hybrid_results, f, indent=2)

print("\nHybrid matching results saved to 'hybrid_matching_results.json'")


Running Hybrid Matching (Transformer + TF-IDF)...

Top 5 Candidates (Hybrid Matching):

Rank 1: HAYLEE ROGERS SOFTWARE DEVELOPER(AI/ML) PROFESSIONAL PROFILE I am a software developer working with cloud infrastructure and management. I have been part of different projects that were developed on different Machine Learning algorithms. PROFICIENT SKILLS Machine Learning, Deep Learning (File: candidate_074.pdf) - Combined Score: 37.40%
  - Transformer Score: 76.93%
  - TF-IDF Score: 15.81%
  - Section Score: 6.30%
Section Details:
    * Experience: 0.00%
    * Skills: 13.66%

Rank 2: P R O F I L E Software Developer and Data Analytics fresher. Looking to get into the field of Machine Learning. LTT DIGITAL SERVICES JUNIOR SDE, FEB 2019 - TILL DATE Understanding text models and using them to create chatbots on webpages of clients. MARIANA HOLDEN S K I L L S Machine Learning, Natural (File: candidate_134.pdf) - Combined Score: 37.40%
  - Transformer Score: 78.36%
  - TF-IDF Score: 20.16%
  - 