In [4]:
!pip install PyPDF2 python-docx spacy wordcloud
!pip install nltk
!python -m spacy download en_core_web_sm
!pip install pandas


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.2.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can no

In [5]:
# -------------------------------
# 1️⃣ Imports
# -------------------------------
from PyPDF2 import PdfReader
import re
import string
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import kagglehub

#this is just the database stuff
path = kagglehub.dataset_download("shamimhasan8/resume-vs-job-description-matching-dataset")
file_path = os.path.join(path, "resume_job_matching_dataset.csv")

df = pd.read_csv(file_path)
print(df.head())
print(df.info())




#the cleaning of text part
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    return text.strip()

df['clean_resume'] = df['resume'].apply(clean_text)
df['clean_jd'] = df['job_description'].apply(clean_text)

#just to make it small otherwise executing time was going very high
df_sample = df.sample(500, random_state=42)

#our ml model
emb_model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim embeddings

emb_model.save("/content/saved_emu_model")

resume_texts = df_sample['clean_resume'].tolist()
jd_texts = df_sample['clean_jd'].tolist()

resume_embs = emb_model.encode(resume_texts, batch_size=32, normalize_embeddings=True)
jd_embs = emb_model.encode(jd_texts, batch_size=32, normalize_embeddings=True)

#cosines part
sim_scores = [float(cosine_similarity(resume_embs[i].reshape(1,-1),
                                     jd_embs[i].reshape(1,-1))[0,0])
              for i in range(len(resume_embs))]

df_sample['match_score'] = np.array(sim_scores) * 100  # scale 0..100


def clean_resume_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    # Clean text
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z0-9.,;:!?()\s+]', '', text)
    return text.strip()


resume_pdf_text = clean_resume_pdf('Resume_Latest.pdf')
resume_pdf_emb = emb_model.encode([resume_pdf_text], normalize_embeddings=True)

#exmaple job desc
jd_example ='''Wb developer with experience in nextjs , react , three js , react fiber , tailwind , vanilla '''
jd_emb = emb_model.encode([jd_example], normalize_embeddings=True)

score = float(cosine_similarity(resume_pdf_emb.reshape(1,-1), jd_emb.reshape(1,-1))[0,0]) * 100
print(f"ATS match score: {round(score,2)}%")

 #ats part
print(df_sample[['clean_resume','clean_jd','match_score']].head())


Downloading from https://www.kaggle.com/api/v1/datasets/download/shamimhasan8/resume-vs-job-description-matching-dataset?dataset_version_number=1...


100%|██████████| 927k/927k [00:00<00:00, 98.3MB/s]

Extracting files...
                                     job_description  \
0  Data Analyst needed with experience in SQL, Ex...   
1  Data Scientist needed with experience in Stati...   
2  Software Engineer needed with experience in Sy...   
3  ML Engineer needed with experience in Python, ...   
4  Software Engineer needed with experience in RE...   

                                              resume  match_score  
0  Experienced professional skilled in SQL, Power...            4  
1  Experienced professional skilled in Python, De...            4  
2  Experienced professional skilled in wait, Git,...            5  
3  Experienced professional skilled in return, De...            4  
4  Experienced professional skilled in REST APIs,...            5  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_description  10000 non-null  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ATS match score: 32.58%
                                           clean_resume  \
6252  experienced professional skilled in agile user...   
4684  experienced professional skilled in life scrum...   
1731  experienced professional skilled in night alth...   
4742  experienced professional skilled in book sql w...   
4521  experienced professional skilled in computer v...   

                                               clean_jd  match_score  
6252  product manager needed with experience in scru...    68.238866  
4684  product manager needed with experience in prod...    54.737926  
1731  data scientist needed with experience in machi...    42.229700  
4742  data analyst needed with experience in tableau...    45.346928  
4521  ml engineer needed with experience in python t...    67.588079  


In [6]:
from google.colab import files
import shutil

# Zip the saved model folder
shutil.make_archive("/content/saved_emu_model", 'zip', "/content/saved_emu_model")

# Download the zip file to your local machine
files.download("/content/saved_emu_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>