In [1]:
import sys
import tensorflow as tf
import pandas as pd

In [2]:
path = r'D:\Projects\ATS_Resume_Matcher\data\Resume\Resume.csv'
df = pd.read_csv(path)

In [3]:
print(f"\nSuccessfully loaded {len(df)} resumes!")
df.head()


Successfully loaded 2484 resumes!


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
# 1. Drop the HTML column as we don't need it for training
df = df[['Category', 'Resume_str']]

In [5]:
# 2. Rename the column for easier use
df.columns = ['Category', 'Resume']

In [6]:
print("Resumes per Category:")
print(df['Category'].value_counts())

Resumes per Category:
Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
ADVOCATE                  118
CHEF                      118
FINANCE                   118
ENGINEERING               118
ACCOUNTANT                118
FITNESS                   117
AVIATION                  117
SALES                     116
HEALTHCARE                115
CONSULTANT                115
BANKING                   115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64


In [7]:
print("\nMissing Values:\n", df.isnull().sum())


Missing Values:
 Category    0
Resume      0
dtype: int64


### we need to perform Text Preprocessing. Raw resumes are messy—they have extra spaces, special characters, and "stopwords" (like "the", "and", "is") that don't help the AI understand the actual skills.

In [8]:
import re

def clean_resume(text):
    # 1. Remove URLs
    text = re.sub(r'http\S+\s*', ' ', text)
    # 2. Remove RT and cc (common in email/social patterns)
    text = re.sub(r'RT|cc', ' ', text)
    # 3. Remove hashtags and mentions
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', '  ', text)
    # 4. Remove punctuations and special characters
    text = re.sub(r'[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    # 5. Remove non-ascii characters
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # 6. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Apply the cleaning function to the Resume column
df['Clean_Resume'] = df['Resume'].apply(lambda x: clean_resume(x))

# Show a comparison of the first resume
print("Original Text (First 100 chars):", df['Resume'].iloc[0][:100])
print("\nCleaned Text (First 100 chars):", df['Clean_Resume'].iloc[0][:100])

Original Text (First 100 chars):          HR ADMINISTRATOR/MARKETING ASSOCIATE

HR ADMINISTRATOR       Summary     Dedicated Customer

Cleaned Text (First 100 chars): hr administrator marketing associate hr administrator summary dedicated customer service manager wit


### Specifically, we'll use a model called all-MiniLM-L6-v2. It is fast, lightweight, and excellent at finding "semantic similarity" (meaning it knows that "Developer" and "Programmer" are similar).

In [9]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Let's test it with a simple example
sentences = ["I am a software engineer", "I develop mobile applications"]
embeddings = model.encode(sentences)

print("Embedding Shape:", embeddings.shape)
print("\nSuccess! The model is ready to convert text into math.")

  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 154.02it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embedding Shape: (2, 384)

Success! The model is ready to convert text into math.


In [10]:
import numpy as np

resume_list = df['Clean_Resume'].tolist()

print(f"Encoding all {len(df)} resumes... This might take a minute.")

# Convert text to embeddings
resume_embeddings = model.encode(resume_list, show_progress_bar=True)

print("\nEncoding Complete!")
print("Shape of Resume Embeddings:", resume_embeddings.shape)

Encoding all 2484 resumes... This might take a minute.


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 78/78 [05:29<00:00,  4.23s/it]


Encoding Complete!
Shape of Resume Embeddings: (2484, 384)





In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# 1. sample Job Description
job_description = """
We are looking for a Machine Learning Engineer with experience in Python, 
TensorFlow, and Data Science. The candidate should be able to build 
predictive models and work with NLP and Transformers.
"""

# 2. Clean and Encode the Job Description
cleaned_jd = clean_resume(job_description) # We use the same cleaning function
jd_embedding = model.encode([cleaned_jd])

# 3. Calculate Similarity between the JD and ALL Resumes
# This compares the JD vector to all 2484 resume vectors
similarities = cosine_similarity(jd_embedding, resume_embeddings)[0]

# 4. Get the Top 5 most similar resumes
top_indices = similarities.argsort()[-5:][::-1]

print("Top 5 Matching Resumes for the Job:")
print("-" * 30)
for i in top_indices:
    print(f"Category: {df['Category'].iloc[i]}")
    print(f"Similarity Score: {similarities[i]:.4f}")
    print(f"Resume Snippet: {df['Resume'].iloc[i][:150]}...")
    print("-" * 30)

Top 5 Matching Resumes for the Job:
------------------------------
Category: ENGINEERING
Similarity Score: 0.4356
Resume Snippet:          ENGINEERING AND QUALITY TECHNICIAN       Career Overview    A highly experienced skilled graduate with Analytics degree with a very good expe...
------------------------------
Category: AGRICULTURE
Similarity Score: 0.4253
Resume Snippet:          VP, PRINCIPAL       Summary     I am highly skilled,growth mindset IT professional having more than 20 years experience mostly in financial i...
------------------------------
Category: ENGINEERING
Similarity Score: 0.4218
Resume Snippet:          ENGINEERING INTERN           Profile     Proficient Entry Level Electrical Engineer with excellent technical, analytical and communication sk...
------------------------------
Category: BPO
Similarity Score: 0.3991
Resume Snippet:          TEST ANALYST-INTERN/CONTRACTOR       Profile    3+ years of professional experience in Software Testing. Experience in workin

In [12]:
def extract_skills(text, skill_list):
    # A simple list of skills to look for - you can expand this!
    found_skills = [skill for skill in skill_list if skill.lower() in text.lower()]
    return found_skills

# "Master List" of technical skills to track
technical_skills = ['Python', 'TensorFlow', 'Keras', 'NLP', 'Machine Learning', 'Data Science', 'SQL', 'Tableau', 'PyTorch', 'C++']

# Let's analyze the #1 match from your previous step
top_resume_index = top_indices[0]
top_resume_text = df['Resume'].iloc[top_resume_index]

jd_skills = extract_skills(job_description, technical_skills)
resume_skills = extract_skills(top_resume_text, technical_skills)

# Find missing skills
missing_skills = list(set(jd_skills) - set(resume_skills))

print(f"Analysis for Top Match (Category: {df['Category'].iloc[top_resume_index]}):")
print(f"Skills found in JD: {jd_skills}")
print(f"Skills found in Resume: {resume_skills}")
print(f"--- MISSING SKILLS: {missing_skills} ---")

Analysis for Top Match (Category: ENGINEERING):
Skills found in JD: ['Python', 'TensorFlow', 'NLP', 'Machine Learning', 'Data Science']
Skills found in Resume: ['Python', 'Machine Learning', 'SQL', 'Tableau']
--- MISSING SKILLS: ['Data Science', 'NLP', 'TensorFlow'] ---
