In [2]:
%pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m114.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.5


In [None]:
import os
import pandas as pd
import fitz  # PyMuPDF
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
resumes_root = "/content/drive/MyDrive/resume_data"
resumes_csv = "/content/drive/MyDrive/Resume.csv"
jobs_csv = "/content/drive/MyDrive/data_job_posts.csv"

# Load Resume CSV (ID, Category)

resumes_df = pd.read_csv(resumes_csv)
resumes_df = resumes_df[['ID', 'Category']]


In [None]:
# Fast PDF Text Extraction
def extract_pdf_text_fast(pdf_path):
    try:
        with fitz.open(pdf_path) as doc:
            return " ".join(page.get_text("text") for page in doc)
    except Exception:
        return ""

def process_resume(row):
    cat = row["Category"]
    resume_id = str(row["ID"])
    pdf_path = os.path.join(resumes_root, cat, f"{resume_id}.pdf")
    if os.path.exists(pdf_path):
        return extract_pdf_text_fast(pdf_path)
    return ""

# Extract and Cache Resume Texts
cache_file = "resumes_with_text.csv"

if os.path.exists(cache_file):
    print("✅ Loaded cached resume text data.")
    resumes_df = pd.read_csv(cache_file)
else:
    print("⚙️ Extracting text from PDF resumes...")
    with ThreadPoolExecutor(max_workers=12) as executor:
        results = list(tqdm(executor.map(process_resume, [row for _, row in resumes_df.iterrows()]),
                            total=len(resumes_df),
                            desc="Extracting Resume PDFs"))
    resumes_df["Resume_str"] = results
    resumes_df = resumes_df[resumes_df["Resume_str"].str.strip() != ""].reset_index(drop=True)
    resumes_df.to_csv(cache_file, index=False)
    print("✅ Cached extracted resumes to resumes_with_text.csv")

In [None]:
# Load Job Descriptions
job_desc_df = pd.read_csv(jobs_csv)
job_desc_df = job_desc_df.dropna(subset=["JobDescription"])
job_desc_df["Category"] = job_desc_df["IT"].fillna("General")

# Create Positive and Negative Pairs (Safe & Balanced)
job_desc_df["Category"] = job_desc_df["Category"].fillna("General").astype(str)
resumes_df["Category"] = resumes_df["Category"].fillna("General").astype(str)

positive_pairs = []
negative_pairs = []

for _, r in tqdm(resumes_df.iterrows(), total=len(resumes_df), desc="Pairing Data"):
    resume_cat = str(r["Category"]).strip().lower()

    # Filter jobs by category safely
    cat_jobs = job_desc_df[job_desc_df["Category"].str.lower() == resume_cat]
    diff_jobs = job_desc_df[job_desc_df["Category"].str.lower() != resume_cat]

    # ✅ Positive pair (category match or fallback random)
    if not cat_jobs.empty:
        job_pos = cat_jobs.sample(1).iloc[0]
    else:
        job_pos = job_desc_df.sample(1).iloc[0]  # fallback random positive
    positive_pairs.append([r["Resume_str"], job_pos["JobDescription"], 1])

    # ✅ Negative pair (always different category)
    if not diff_jobs.empty:
        job_neg = diff_jobs.sample(1).iloc[0]
    else:
        job_neg = job_desc_df.sample(1).iloc[0]
    negative_pairs.append([r["Resume_str"], job_neg["JobDescription"], 0])

pairs_df = pd.DataFrame(positive_pairs + negative_pairs, columns=["Resume_str", "JobDescription", "label"])

print(f"✅ Created {len(pairs_df)} total pairs.")
print(pairs_df['label'].value_counts())


In [None]:
# Split Data
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    pairs_df,
    test_size=0.2,
    random_state=42,
    stratify=pairs_df["label"]
)

# Prepare Training Data
from sentence_transformers import InputExample

train_examples = [
    InputExample(texts=[r, j], label=float(l))
    for r, j, l in zip(train_df["Resume_str"], train_df["JobDescription"], train_df["label"])
]

test_examples = [
    InputExample(texts=[r, j], label=float(l))
    for r, j, l in zip(test_df["Resume_str"], test_df["JobDescription"], test_df["label"])
]

# Initialize Transformer Model
from sentence_transformers import SentenceTransformer, losses, evaluation
from torch.utils.data import DataLoader

model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Prepare Dataloaders and Loss Function
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model)

# Evaluator (Optional, for validation)
test_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(test_examples, name='test')

# Train the Model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=test_evaluator,
    epochs=3,
    warmup_steps=100,
    output_path="./outputs/resume_matcher_model",
    show_progress_bar=True
)


In [None]:
model.save("./outputs/resume_matcher_model")
from sentence_transformers import SentenceTransformer, util

# Load your saved model
model = SentenceTransformer("./outputs/resume_matcher_model")


In [3]:
# ===============================================
# 🔧 Imports
# ===============================================
import os
import pandas as pd
import fitz
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch

# ===============================================
# 📂 Paths
# ===============================================
resumes_csv = "/content/drive/MyDrive/Resume.csv"
jobs_csv = "/content/drive/MyDrive/data_job_posts.csv"
output_dir = "./outputs/resume_matcher_model"

# ===============================================
# 🧾 Load Resume and Job Data
# ===============================================
resumes_df = pd.read_csv(resumes_csv)
jobs_df = pd.read_csv(jobs_csv)

# Clean up
resumes_df["Category"] = resumes_df["Category"].astype(str).str.strip().str.lower()
jobs_df["Category"] = jobs_df["IT"].fillna("general").astype(str).str.strip().str.lower()
jobs_df = jobs_df.dropna(subset=["JobDescription"])

# ===============================================
# 🔁 Create Positive + Negative Pairs
# ===============================================
positive_pairs, negative_pairs = [], []

for _, r in tqdm(resumes_df.iterrows(), total=len(resumes_df)):
    cat_jobs = jobs_df[jobs_df["Category"] == r["Category"]]
    diff_jobs = jobs_df[jobs_df["Category"] != r["Category"]]

    if not cat_jobs.empty:
        job = cat_jobs.sample(1).iloc[0]
        positive_pairs.append([r["Resume_str"], job["JobDescription"], 1])

    if not diff_jobs.empty:
        job = diff_jobs.sample(1).iloc[0]
        negative_pairs.append([r["Resume_str"], job["JobDescription"], 0])

pairs_df = pd.DataFrame(positive_pairs + negative_pairs,
                        columns=["Resume_str", "JobDescription", "label"])
print("✅ Total pairs created:", len(pairs_df))

# ===============================================
# ✂️ Split Data
# ===============================================
train_df, test_df = train_test_split(
    pairs_df,
    test_size=0.2,
    random_state=42,
    stratify=pairs_df["label"]
)

train_examples = [
    InputExample(texts=[r, j], label=float(l))
    for r, j, l in zip(train_df["Resume_str"], train_df["JobDescription"], train_df["label"])
]

test_examples = [
    InputExample(texts=[r, j], label=float(l))
    for r, j, l in zip(test_df["Resume_str"], test_df["JobDescription"], test_df["label"])
]

# ===============================================
# ⚙️ Model Setup
# ===============================================
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens', device=device)

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
train_loss = losses.MultipleNegativesRankingLoss(model)
test_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(test_examples, name='test')

# ===============================================
# 🏋️ Train (Memory-Safe)
# ===============================================
torch.cuda.empty_cache()
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=test_evaluator,
    epochs=3,
    warmup_steps=100,
    output_path=output_dir,
    show_progress_bar=True
)

print("✅ Model trained and saved at:", output_dir)


100%|██████████| 2484/2484 [00:26<00:00, 92.43it/s] 


✅ Total pairs created: 2484


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshivika2934[0m ([33mshivika2934-amity-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Test Pearson Cosine,Test Spearman Cosine
497,No log,No log,,
500,1.558300,No Log,No Log,No Log
994,1.558300,No log,,
1000,1.343700,No Log,No Log,No Log
1491,1.343700,No log,,


  eval_pearson, _ = pearsonr(labels, scores)
  eval_spearman, _ = spearmanr(labels, scores)


✅ Model trained and saved at: ./outputs/resume_matcher_model


In [5]:
import fitz
from sentence_transformers import SentenceTransformer, util

# Load trained model
model = SentenceTransformer("./outputs/resume_matcher_model")

# Function to extract text
def extract_pdf_text(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text.strip()

# Example
resume_pdf = "/content/drive/MyDrive/resume_data/HR/10399912.pdf"
job_description = """We are seeking a detail-oriented HR Personnel Assistant / Administrative Data Entry Clerk to support the Human Resources and Administrative departments with day-to-day operations, employee data management, documentation, and clerical support. The ideal candidate will have proven experience in maintaining accurate records, assisting with new hire processes, managing correspondence, and providing exceptional organizational and customer service support."""

# Extract resume text
resume_text = extract_pdf_text(resume_pdf)

# Compute embeddings
resume_emb = model.encode(resume_text, convert_to_tensor=True)
job_emb = model.encode(job_description, convert_to_tensor=True)

# Compute similarity
similarity = util.cos_sim(resume_emb, job_emb)[0][0].item() * 100
print(f"🔍 Similarity Score: {similarity:.2f}%")


🔍 Similarity Score: 99.66%


In [6]:
import fitz
import re
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import pipeline

# ===========================================
# 🔹 Load Models
# ===========================================
semantic_model = SentenceTransformer("./outputs/resume_matcher_model")
gpt_analyzer = pipeline("text-generation", model="gpt2")  # You can replace with a stronger LLM locally or via API

# ===========================================
# 📄 Extract PDF Text
# ===========================================
def extract_pdf_text(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text.strip()

# ===========================================
# 🧩 Split Resume into Sections
# ===========================================
def split_resume_sections(text):
    sections = {}
    section_titles = ["Summary", "Highlights", "Experience", "Education", "Skills"]
    current_section = "General"
    sections[current_section] = ""

    for line in text.splitlines():
        line_stripped = line.strip()
        if not line_stripped:
            continue
        for title in section_titles:
            if re.search(rf"\b{title}\b", line_stripped, re.IGNORECASE):
                current_section = title
                sections[current_section] = ""
                break
        else:
            sections[current_section] += line_stripped + " "
    return sections

# ===========================================
# 🧠 Semantic Similarity
# ===========================================
def compute_similarity(text1, text2):
    emb1 = semantic_model.encode(text1, convert_to_tensor=True)
    emb2 = semantic_model.encode(text2, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item() * 100

# ===========================================
# 🤖 AI Resume Analyzer
# ===========================================
def ai_resume_analysis(resume_text, job_description, section_scores):
    prompt = f"""
You are an AI resume evaluation expert. Analyze the following resume against the given job description.
Provide an overall evaluation, skill gap analysis, and improvement suggestions.

Resume:
{resume_text}

Job Description:
{job_description}

Section Similarity Scores:
{section_scores}

Return a structured evaluation in this format:
1️⃣ Summary of Match
2️⃣ Strengths
3️⃣ Weaknesses
4️⃣ Skill Gaps
5️⃣ Improvement Suggestions
    """
    analysis = gpt_analyzer(prompt, max_length=600, temperature=0.7)[0]['generated_text']
    return analysis.strip()

# ===========================================
# 🚀 Main Analysis Function
# ===========================================
def analyze_resume_with_ai(resume_pdf, job_description):
    resume_text = extract_pdf_text(resume_pdf)
    resume_sections = split_resume_sections(resume_text)

    # Section-wise semantic scores
    section_scores = {}
    for section, content in resume_sections.items():
        if content.strip():
            section_scores[section] = round(compute_similarity(content, job_description), 2)

    overall_score = round(compute_similarity(resume_text, job_description), 2)

    # AI-based qualitative reasoning
    ai_report = ai_resume_analysis(resume_text, job_description, section_scores)

    return {
        "Overall Similarity (%)": overall_score,
        "Section Scores": section_scores,
        "AI Analysis": ai_report
    }

# ===========================================
# 🧾 Example Usage
# ===========================================
resume_pdf = "/content/drive/MyDrive/resume_data/HR/10399912.pdf"
job_description = """
We are seeking a detail-oriented HR Personnel Assistant / Administrative Data Entry Clerk to support the Human Resources and Administrative departments with day-to-day operations, employee data management, documentation, and clerical support. The ideal candidate will have proven experience in maintaining accurate records, assisting with new hire processes, managing correspondence, and providing exceptional organizational and customer service support.
"""

report = analyze_resume_with_ai(resume_pdf, job_description)

print("🔍 AI Resume Analysis\n" + "="*50)
print(f"Overall Similarity: {report['Overall Similarity (%)']}%")
print("\nSection-wise Similarity:")
for sec, score in report["Section Scores"].items():
    print(f"  - {sec}: {score}%")

print("\n🤖 AI Insights:\n")
print(report["AI Analysis"])


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=600) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


🔍 AI Resume Analysis
Overall Similarity: 99.66%

Section-wise Similarity:
  - General: 98.96%
  - Summary: 99.5%
  - Skills: 87.85%
  - Experience: 99.78%
  - Education: 99.12%

🤖 AI Insights:

You are an AI resume evaluation expert. Analyze the following resume against the given job description.
Provide an overall evaluation, skill gap analysis, and improvement suggestions.

Resume:
HR PERSONNEL ASSISTANT
Summary
I am a U.S. citizen who is authorized to work in the US for any employer. I have worked 8 years as an Office Clerk, 2 years as a Student
Intern/Office Assistant, and 4 years as a Contractor. I am applying for the Data Entry Clerk position (Advert ID# 224278 Advert ID# 224278).
My skills and experiences include: Administrative Support, Auditing, File Management, Meeting Facilitation, Office Materials Management, &
Inventory Management.
Highlights
COMPUTER SKILLS: Microsoft Word, MS Excel, MS Outlook, MS PowerPoint, PeopleSoft. TYPING SKILLS: 40-60 WPM.
ADDITIONAL SKILLS: Admin

In [8]:
mv /content/outputs/resume_matcher_model ./outputs/transformer_resume_matcher_model