In [None]:
!pip install PyPDF2 python-docx spacy wordcloud
!pip install nltk
!python -m spacy download en_core_web_sm
!pip install pandas


In [39]:
# -------------------------------
# 1️⃣ Imports
# -------------------------------
from PyPDF2 import PdfReader
import re
import string
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import kagglehub

# Dataset download + load.
# kagglehub.dataset_download returns the local directory that holds the
# dataset files, so that return value is used directly instead of a
# hard-coded "/root/.cache/..." path (which only works for the root user and
# for dataset version 1).
import os

path = kagglehub.dataset_download("shamimhasan8/resume-vs-job-description-matching-dataset")
file_path = os.path.join(path, "resume_job_matching_dataset.csv")

df = pd.read_csv(file_path)
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
df.info()




#the cleaning of text part
def clean_text(text):
    """Normalize a piece of free text for embedding.

    Steps, in order: lowercase, drop ``[...]`` bracketed spans, drop URLs
    (``http(s)://...`` and ``www....``), drop ``<...>`` tags, drop every
    character in ``string.punctuation``, turn newlines into spaces, and strip
    leading/trailing whitespace. Interior runs of spaces are left as-is.

    Args:
        text: Any value; non-strings are converted with ``str()``.
            ``None`` and float NaN (how pandas represents a missing cell)
            are treated as missing.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    # Without this guard str(None) / str(nan) would become the literal words
    # "none" / "nan" and be treated as real resume/JD text downstream.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)                    # bracketed spans
    text = re.sub(r'https?://\S+|www\.\S+', '', text)      # URLs
    text = re.sub(r'<.*?>+', '', text)                     # markup tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    return text.strip()

# Add a cleaned-text column next to each raw text column.
for raw_col, clean_col in (('resume', 'clean_resume'),
                           ('job_description', 'clean_jd')):
    df[clean_col] = df[raw_col].apply(clean_text)

# Work on a fixed, seeded 500-row subset; embedding every row made the
# run time too long.
df_sample = df.sample(n=500, random_state=42)

# Sentence-embedding model used for both resumes and job descriptions
# (384-dim embeddings).
emb_model = SentenceTransformer("all-MiniLM-L6-v2")

resume_texts = df_sample['clean_resume'].tolist()
jd_texts = df_sample['clean_jd'].tolist()

# Encode resumes first, then job descriptions, with identical settings.
resume_embs, jd_embs = (
    emb_model.encode(texts, batch_size=32, normalize_embeddings=True)
    for texts in (resume_texts, jd_texts)
)

# Cosine similarity between each resume and the job description on the same
# row, computed for all rows in one vectorized pass (the previous version
# made one sklearn call per row inside a Python loop).
pair_dots = np.einsum('ij,ij->i', resume_embs, jd_embs)   # row-wise dot products
pair_norms = np.linalg.norm(resume_embs, axis=1) * np.linalg.norm(jd_embs, axis=1)
pair_norms[pair_norms == 0] = 1.0   # all-zero embedding -> similarity 0, not NaN
sim_scores = (pair_dots / pair_norms).astype(float).tolist()

# Cosine expressed as a percentage. Cosine lies in [-1, 1], so this value can
# be negative; it is not clipped to 0..100.
df_sample['match_score'] = np.array(sim_scores) * 100


def clean_resume_pdf(pdf_path):
    """Extract the text of a PDF and normalize it for embedding.

    Every page's text is concatenated (newline-separated), lowercased,
    stripped of any character other than ``a-z``, ``0-9``, ``. , ; : ! ? ( )``,
    ``+`` and whitespace, and then has whitespace runs collapsed to a single
    space.

    Args:
        pdf_path: Path (or file object) handed to ``PdfReader``.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    reader = PdfReader(pdf_path)
    # `or ""` guards against a page whose extract_text() yields None, which
    # would otherwise raise TypeError on the string concatenation.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    text = text.lower()
    # Remove disallowed characters first, then collapse whitespace: doing it
    # the other way round leaves double spaces where a character was removed
    # between two spaces (e.g. "c++ & python" -> "c++  python").
    text = re.sub(r'[^a-z0-9.,;:!?()\s+]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# Score one resume PDF against an example job description.
resume_pdf_text = clean_resume_pdf('Resume_Latest.pdf')
resume_pdf_emb = emb_model.encode([resume_pdf_text], normalize_embeddings=True)

# Example job description ("Wb" typo corrected to "Web": this string is fed
# to the embedding model, so the misspelling affects the score).
jd_example ='''Web developer with experience in nextjs , react , three js , react fiber , tailwind , vanilla '''
jd_emb = emb_model.encode([jd_example], normalize_embeddings=True)

# Both embeddings come from encode() on a one-element list, so they are
# already 2-D (one row each) and can be passed to cosine_similarity as-is.
score = float(cosine_similarity(resume_pdf_emb, jd_emb)[0,0]) * 100
print(f"ATS match score: {round(score,2)}%")

# Preview of the per-row match scores computed for the sampled dataset.
print(df_sample[['clean_resume','clean_jd','match_score']].head())


                                     job_description  \
0  Data Analyst needed with experience in SQL, Ex...   
1  Data Scientist needed with experience in Stati...   
2  Software Engineer needed with experience in Sy...   
3  ML Engineer needed with experience in Python, ...   
4  Software Engineer needed with experience in RE...   

                                              resume  match_score  
0  Experienced professional skilled in SQL, Power...            4  
1  Experienced professional skilled in Python, De...            4  
2  Experienced professional skilled in wait, Git,...            5  
3  Experienced professional skilled in return, De...            4  
4  Experienced professional skilled in REST APIs,...            5  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_description  10000 non-null  object
 1   resume  