In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("Resume.csv", encoding='latin1')
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [3]:
df.columns

Index(['ID', 'Resume_str', 'Resume_html', 'Category'], dtype='object')

In [4]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.replace('\n', ' ')
    return text

In [6]:
df['cleaned_resume'] = df['Resume_str'].apply(clean_text)

In [7]:
job_description = """
Looking for a Data Scientist with experience in Python, Machine Learning,
SQL, data analysis, statistics, pandas, numpy, and visualization.
"""

In [8]:
job_cleaned = clean_text(job_description)

In [9]:
vectorizer = TfidfVectorizer(stop_words='english')

tfidf_matrix = vectorizer.fit_transform(
    df['cleaned_resume'].tolist() + [job_cleaned]
)

In [10]:
similarity_scores = cosine_similarity(
    tfidf_matrix[-1],   # job description
    tfidf_matrix[:-1]   # resumes
)

df['Similarity Score'] = similarity_scores[0]

In [11]:
ranked_df = df.sort_values(by='Similarity Score', ascending=False)

ranked_df[['Category', 'Similarity Score']].head(10)

Unnamed: 0,Category,Similarity Score
1218,CONSULTANT,0.248044
1339,AUTOMOBILE,0.24323
1762,ENGINEERING,0.236332
926,AGRICULTURE,0.216655
1303,DIGITAL-MEDIA,0.137732
1040,SALES,0.129356
1142,CONSULTANT,0.126775
1091,SALES,0.117274
331,INFORMATION-TECHNOLOGY,0.115478
2153,BANKING,0.109864


In [12]:
required_skills = ["python", "machine learning", "sql", "statistics"]

def missing_skills(resume):
    missing = []
    for skill in required_skills:
        if skill not in resume:
            missing.append(skill)
    return missing

ranked_df['Missing Skills'] = ranked_df['cleaned_resume'].apply(missing_skills)