In [2]:
!pip install PyPDF2 fuzzywuzzy unidecode

In [46]:
import PyPDF2
from fuzzywuzzy import fuzz
import pandas as pd
from unidecode import unidecode
import re
import os
from os.path import isfile, join

In [47]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(len(pdf_reader.pages)):
            if page_num > 0:  # Add newline for page breaks (except the first page)
                text += "\n"
            text += pdf_reader.pages[page_num].extract_text()
    text = unidecode(text)
    return text

In [48]:
def identify_headings(text):
    headings = []
    lines = text.split('\n')

    # Define synonyms and their corresponding fuzz ratio thresholds
    heading_synonyms = {
        "Education": ["Education", "Qualifications", "Educational Qualifications", "Academic Background", "Educational Details", "Education and Training"],
        "Skills": ["Skills", "Technical Skills", "Key Competencies", "Skill Highlights", "Primary Skills", "Specializations", "Areas of Expertise", "Expertise", "Programming Languages"],
        "Experience": ["Work Experience", "Professional Background", "Professional Experience", "Work History", "Teaching Experience", "Employment History"],
        "Achievements": ["Accomplishments", "Achievements", "Notable Projects", "Qualifications"],
        "Others": ["Awards", "Honors", "Recognition", "Publications", "Certifications", "Presentations", "Volunteer Experience", "Leadership Experience","Interests","Hobbies", "Languages", "Licenses"],
        "Summary": ["Career Overview", "Summary", "About Me", "Profile Summary", "Highlights", "Objective"]
    }

    for line in lines:
        line = line.strip()
        for heading, synonyms in heading_synonyms.items():
            for synonym in synonyms:
                if re.match(r'^\s*{}\s*'.format(synonym), line, re.IGNORECASE):
                    headings.append((line, heading))

    found = False
    for tup in headings:
      if tup[1] == "Experience":
          found = True

    if not found:
      for line in lines:
        line = line.strip()
        for heading, synonyms in heading_synonyms.items():
          if heading == "Experience":
            synonyms = ["Work Experience", "Professional Background", "Professional Experience", "Work History", "Teaching Experience", "Employment History", "Experience"]
            for synonym in synonyms:
                if re.match(r'^\s*{}\s*'.format(synonym), line, re.IGNORECASE):
                    headings.append((line, heading))
    return headings

In [49]:
def clean_text(text):
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove leading and trailing spaces
    text = text.strip()
    # Remove newline characters
    text = text.replace('\n', ' ')
    # Remove tab characters
    text = text.replace('\t', ' ')

    return text

In [50]:
def parse_resume(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    headings = identify_headings(text)

    resume_parts = {}
    current_heading = ""
    current_part = ""
    capturing_work_experience = False

    for line in text.split('\n'):
        line = line.strip()
        if (line, "Education") in headings:
            current_heading = "Education"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Skills") in headings:
            current_heading = "Skills"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Experience") in headings:
            current_heading = "Experience"
            current_part = ""
            capturing_work_experience = True
        elif (line, "Achievements") in headings:
            current_heading = "Achievements"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Others") in headings:
            current_heading = "Others"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Summary") in headings:
            current_heading = "Summary"
            current_part = ""
            capturing_work_experience = False
        else:
            if capturing_work_experience:
                current_part += line + "\n"
                resume_parts.setdefault(current_heading, "")  # Initialize the dictionary key if not present
                resume_parts[current_heading] += line + "\n"
            else:
               if current_heading:
                current_part += line + "\n"
                resume_parts[current_heading] = current_part

    # Create a DataFrame from the parsed resume parts
    fixed_columns = {
        'ResumeID': '',
        'Category':'',
        'Education': '',
        'Skills': '',
        'Experience': '',
        'Achievements': '',
        'Others':'',
        'Summary':''
    }

    # Update the fixed_columns dictionary with data from the input dictionary
    fixed_columns.update(resume_parts)

    # Create a DataFrame from the updated dictionary
    df = pd.DataFrame([fixed_columns])
    df = df.applymap(clean_text)
    return df

In [51]:
dfs = []

root_dir = 'uploads/resumes'
for root, dirs, files in os.walk(root_dir):
    for file in files:
        if file.endswith(".pdf"):
            # Extract text from the PDF file
            pdf_path = os.path.join(root, file)
            parsed_resume_df = parse_resume(pdf_path)
            parsed_resume_df["ResumeID"] = file.replace(".pdf","")
            parsed_resume_df["Category"] = os.path.basename(root)
            dfs.append(parsed_resume_df)
dfs = pd.concat(dfs).reset_index(drop=True)

In [52]:
dfs.to_csv("Resume_extracted.csv", index=False)

In [None]:
!pip install -qU datasets transformers sentence-transformers git+https://github.com/naver/splade.git
!pip install einops

# 

In [53]:
from datasets import load_dataset
import pandas as pd
import ast
import torch
from splade.models.transformer_rep import Splade
from transformers import AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [54]:
sparse_model_id = 'naver/splade-cocondenser-selfdistil'

sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to(device)  # move to GPU if possible
sparse_model.eval()
tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

In [55]:
dataset = load_dataset("jacob-hugging-face/job-descriptions")
dataset = dataset['train']

In [56]:
df = pd.DataFrame(data=dataset)
df = df[["job_description", "position_title", "model_response"]]

In [57]:
current_directory = os.getcwd()

# Print the current working directory
print("Current Directory:", current_directory)

Current Directory: C:\Users\deepv\Desktop\IPD\trial


In [58]:
df_candidates = pd.read_csv("Resume_extracted.csv")

In [59]:
df_candidates.isna().sum() #shows statistically how well the extractor has performed?

ResumeID        0
Category        0
Education       2
Skills          4
Experience      5
Achievements    8
Others          6
Summary         7
dtype: int64

In [60]:
df['model_response'] = df['model_response'].apply(lambda x: ast.literal_eval(x))
df_jobdesc = df[df['model_response'].apply(lambda x: x.get('Educational Requirements') != 'N/A' and x.get('Required Skills') != 'N/A' and x.get('Preferred Qualifications') != 'N/A')]
df_jobdesc = df_jobdesc.sample(n=15, random_state=42).reset_index(drop=True)

In [61]:
def process_row1(row):
    text_data = str(row['Education'])
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

In [62]:
def process_row2(row):
    text_data = str(row['Skills'])
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

In [63]:
def process_row3(row):
    text_data = row['model_response'].get('Educational Requirements') + row['model_response'].get('Preferred Qualifications')
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

In [64]:
def process_row4(row):
    text_data = row['model_response'].get('Required Skills')
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

In [65]:
from scipy.spatial import distance
import numpy as np

In [66]:
def score(a, b):
  return distance.cosine(np.array(a),np.array(b))

In [67]:
def find_min_match(row):
    row_score = df_candidates['edu_embed'].apply(lambda x: score(row['edu_embed'], x)) + df_candidates['skill_embed'].apply(lambda x: score(row['skill_embed'], x))
    min_score_indices = row_score.nsmallest(5).index.tolist()
    return df_candidates.loc[min_score_indices, 'ResumeID'].tolist()

In [68]:
df_candidates['edu_embed'] = df_candidates.apply(process_row1, axis=1)
df_candidates['skill_embed'] = df_candidates.apply(process_row2, axis=1)
df_jobdesc['edu_embed'] = df_jobdesc.apply(process_row3, axis=1)
df_jobdesc['skill_embed'] = df_jobdesc.apply(process_row4, axis=1)



In [69]:
from tqdm import tqdm

In [70]:
tqdm.pandas()
df_jobdesc['qualified_candidate_ResumeID'] = df_jobdesc.progress_apply(find_min_match, axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 110.36it/s]


In [71]:
final_df = df_jobdesc[["position_title","job_description", "qualified_candidate_ResumeID"]]
final_df

Unnamed: 0,position_title,job_description,qualified_candidate_ResumeID
0,Administrative Assistant,this position is located on rikers islandphysi...,"[Samkit Shah, Prince Doshi, Deep Vyas, Neel Oz..."
1,"Vice President, Marketing, MSNBC",responsibilities\nthe vice president marketing...,"[Prince Doshi, Samkit Shah, Ayushi Uttamani, N..."
2,"Senior Vice President and Managing Director, I...",mathematica applies expertise at the intersect...,"[Samkit Shah, Neel Oza, Deep Vyas, Prince Dosh..."
3,"Regional Sales Manager, Video Conferencing (East)",the regional sales manager will report to the...,"[Prince Doshi, Samkit Shah, Deep Vyas, Preksha..."
4,Chief Executive Officer,the role of the chief executive officer ceo is...,"[Samkit Shah, Deep Vyas, Prince Doshi, Neel Oz..."
5,Project Manager (CCR focused),at haley aldrich we pride ourselves on our sm...,"[Samkit Shah, Prince Doshi, Deep Vyas, Neel Oz..."
6,Vice President/Sr. Director Business Developme...,spire learning united states remote\n\nspire i...,"[Samkit Shah, Prince Doshi, Neel Oza, Deep Vya..."
7,Director of Finance,company description\nproject finds mission is ...,"[Samkit Shah, Prince Doshi, Neel Oza, Preksha ..."
8,Software Engineer (JavaScript Backend),job description\nour vision\n\nin the new and ...,"[Deep Vyas, Prince Doshi, Neel Oza, Samkit Sha..."
9,Senior Financial Analyst,senior financial analyst\n\ncompany highlights...,"[Samkit Shah, Prince Doshi, Neel Oza, Preksha ..."


In [72]:
print(final_df.iloc[7]['job_description'])

company description
project finds mission is to provide low and moderateincome and homeless seniors with the services and support they need to enrich their lives and live independently
today project find operates three supportive housing residences that are home to about  people and four senior centers with over  members
our members and residents range from the healthy and active to the frail and homebound to the homeless
for all these individuals project find is a critical resource providing housing meals and programs that help individuals navigate the challenges of aging by encouraging community engagement and healthy living
please visit our website at
projectfindorg 
job overview
as the director of finance you will be responsible for building and enhancing the financial infrastructure of project find
the director of finance is also responsible for all aspects of the financial operations for project find
reporting directly to the executive director the director of finance provides ac

In [74]:
dfz = df_candidates[df_candidates['ResumeID'].isin(final_df['qualified_candidate_ResumeID'].iloc[7])]
dfz = dfz[["ResumeID","Category", "Education", "Skills"]]
dfz

Unnamed: 0,ResumeID,Category,Education,Skills
2,Deep Vyas,resumes,"Bhayander , Thane, India , 401105 deepvyas2003...","* Languages : C, JavaScript, Python, CSS, HTML..."
4,Neel Oza,resumes,Dwarkadas J. Sanghvi College of Engineering 20...,"Web Development: HTML5, CSS3, Bootstrap, Tailw..."
5,Preksha Shah,resumes,Dwarkadas .J .Sanghvi College of Engineering(D...,
6,Prince Doshi,resumes,Bachelors in Information Technology May 2025 D...,Threads Clone (Aug 2023) *Architected a multi-...
8,Samkit Shah,resumes,Bachelor of Technology - Information Technolog...,"JavaScript Frameworks/Libraries: NumPy, pandas..."


In [75]:
import PyPDF2
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

In [76]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [77]:
pdf_path = 'uploads/JD'  

In [78]:
import os
import PyPDF2

# Define the path to the folder containing the PDF files
pdf_folder_path = "uploads/JD"

# Get a list of PDF files in the specified folder
pdf_files = [file for file in os.listdir(pdf_folder_path) if file.lower().endswith('.pdf')]

# Check if there are any PDF files in the folder
if pdf_files:
    # Choose the first PDF file in the list
    selected_pdf_file = pdf_files[0]

    # Construct the full path to the selected PDF file
    pdf_path = os.path.join(pdf_folder_path, selected_pdf_file)

    def extract_text_from_pdf(pdf_path):
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            text = ""
            for page_num in range(num_pages):
                text += pdf_reader.pages[page_num].extract_text()
            return text

    job_description_text = extract_text_from_pdf(pdf_path)
    print("Job Description Text:")
    print(job_description_text)

else:
    print("No PDF files found in the specified folder.")


Job Description Text:
 
Frontend Developer  
 
We are looking for a qualified Front -end developer to join our IT team. You should be able to translate 
our company and customer needs into functional and appealing interactive applications.  
Responsibilities  
• Use markup languages like HTML to create user -friendly web pages . 
• Maintain and improve website . 
• Optimize applications for maximum speed . 
• Design mobile -based features  
• Collaborate with back -end developers and web designers to improve usability . 
• Get feedback from, and b uild solutions for, users and customers . 
• Write functional requirement documents and guides . 
• Create quality mock  ups and prototypes . 
• Help back -end developers with coding and troubleshooting . 
• Ensure high quality graphic standards and brand consistency . 
• Stay up -to-date on emerging technologies . 
Requirements  
• Proven work experience as a Front -end developer  
• Basic understanding of server -side CSS pre -processing pl

In [79]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [80]:
def preprocess_text(text):
    # Tokenization and removing stopwords
    stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return " ".join(tokens)

# Preprocess job description text
processed_job_description = preprocess_text(job_description_text)

# Display preprocessed text
print("\nPreprocessed Job Description:")
print(processed_job_description)
print(job_description_text)


Preprocessed Job Description:
frontend developer looking qualified developer join team able translate company customer needs functional appealing interactive applications responsibilities use markup languages like html create user web pages maintain improve website optimize applications maximum speed design mobile features collaborate developers web designers improve usability feedback b uild solutions users customers write functional requirement documents guides create quality mock ups prototypes help developers coding troubleshooting ensure high quality graphic standards brand consistency stay emerging technologies requirements proven work experience developer basic understanding server css pre platforms sass proficient understanding client scripting advanced javascript libraries angularjs backbonejs reactjs proficient understanding code versioning tools git mercurial svn familiarity browser testing debugging understanding entire web development process design development deployment

In [81]:
import nltk

# Download the punkt tokenizer
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [82]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [83]:
def preprocess_text(text):
    # Tokenization and removing stopwords
    stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return " ".join(tokens)

# Preprocess job description text
processed_job_description = preprocess_text(job_description_text)

# Display preprocessed text
print("\nPreprocessed Job Description:")
print(processed_job_description)


Preprocessed Job Description:
frontend developer looking qualified developer join team able translate company customer needs functional appealing interactive applications responsibilities use markup languages like html create user web pages maintain improve website optimize applications maximum speed design mobile features collaborate developers web designers improve usability feedback b uild solutions users customers write functional requirement documents guides create quality mock ups prototypes help developers coding troubleshooting ensure high quality graphic standards brand consistency stay emerging technologies requirements proven work experience developer basic understanding server css pre platforms sass proficient understanding client scripting advanced javascript libraries angularjs backbonejs reactjs proficient understanding code versioning tools git mercurial svn familiarity browser testing debugging understanding entire web development process design development deployment

In [84]:
resume_skills_df = pd.read_csv('Resume_extracted.csv')

In [85]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    if isinstance(text, str):  # Check if the value is a string
        # Tokenization and removing stopwords
        stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
        tokens = nltk.word_tokenize(text)
        tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
        return " ".join(tokens)
    else:
        return ""  # Return an empty string for non-string values (NaN, floats, etc.)

# Apply the preprocessing function to the 'Skills' column
resume_skills_df['Skills'] = resume_skills_df['Skills'].apply(preprocess_text)


In [86]:
vectorizer = CountVectorizer(lowercase=True, analyzer='word', binary=True)
skills_matrix_resumes = vectorizer.fit_transform(resume_skills_df['Skills'])
skills_matrix_jd = vectorizer.transform([processed_job_description])

# Convert skills to a DataFrame for better visualization
skills_resumes_df = pd.DataFrame(skills_matrix_resumes.toarray(), columns=vectorizer.get_feature_names_out())
skills_jd_df = pd.DataFrame(skills_matrix_jd.toarray(), columns=vectorizer.get_feature_names_out())

In [87]:
print("\nTokenized Skills in Resumes:")
print(skills_resumes_df)

print("\nTokenized Skills in JD:")
print(skills_jd_df)


Tokenized Skills in Resumes:
   access  accuracy  achieving  adaptive  amazon  analytics  api  app   
0       0         0          0         0       0          0    0    0  \
1       0         0          0         0       0          1    0    0   
2       0         0          0         0       0          0    0    1   
3       0         0          0         0       0          0    0    0   
4       0         0          0         0       0          0    0    0   
5       0         0          0         0       0          0    0    0   
6       1         1          1         0       0          0    1    0   
7       0         0          0         0       0          0    0    0   
8       0         0          0         1       1          0    0    0   

   application  architected  ...  user  using  utilize  utilized  video   
0            0            0  ...     0      0        0         0      0  \
1            0            0  ...     0      0        0         0      1   
2            0

In [88]:
similarity_scores = cosine_similarity(skills_matrix_resumes, skills_matrix_jd)

# Convert similarity scores to a DataFrame
similarity_df = pd.DataFrame(similarity_scores, columns=['Similarity'], index=resume_skills_df.index)

# Display similarity scores
print("\nSimilarity Scores:")
print(similarity_df)


Similarity Scores:
   Similarity
0    0.000000
1    0.181848
2    0.445742
3    0.000000
4    0.263181
5    0.000000
6    0.223814
7    0.000000
8    0.227710


In [89]:
ranked_resumes = similarity_df.sort_values(by='Similarity', ascending=False)

# Display ranked resumes
print("\nRanked Resumes:")
print(ranked_resumes)


Ranked Resumes:
   Similarity
2    0.445742
4    0.263181
8    0.227710
6    0.223814
1    0.181848
0    0.000000
3    0.000000
5    0.000000
7    0.000000


In [90]:
# Assuming similarity_df is the DataFrame with similarity scores
# and 'ResumeID' is a column in your CSV file

# Merge the similarity scores DataFrame with the original resume_skills_df
result_df = pd.concat([resume_skills_df['ResumeID'], similarity_df], axis=1)

# Sort the DataFrame by the 'Similarity' column in descending order
result_df = result_df.sort_values(by='Similarity', ascending=False)

# Display the ranked resumes with 'ResumeID'
print("Ranked Resumes:")
print(result_df)

Ranked Resumes:
           ResumeID  Similarity
2         Deep Vyas    0.445742
4          Neel Oza    0.263181
8       Samkit Shah    0.227710
6      Prince Doshi    0.223814
1   Ayushi Uttamani    0.181848
0     Ashish Shukla    0.000000
3     Krish Panchal    0.000000
5      Preksha Shah    0.000000
7  Rishikesh Sharma    0.000000


In [91]:
result_df.to_csv('result_df.csv', index=False)