In [1]:
!pip install pdfplumber




In [2]:
#import all the necessary libraries
import os
import pdfplumber
import re
import pandas as pd

In [26]:
#Step 1
#Resume Parsing and DataFrame Creation

#Step 1.1 PDF Text Extraction Script
def extract_text_from_pdf(pdf_path):  
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

#Step 1.2  Processing pdf from folder 

resume_folder = 'SampleResumes'  # my folder with PDFs
for file in os.listdir(resume_folder):
    if file.endswith('.pdf'):
        file_path = os.path.join(resume_folder, file)
        resume_text = extract_text_from_pdf(file_path)
        print(f"✅Extracted from {file}")
        print(resume_text[:300])

✅Extracted from resume 1.pdf
Sanket Sarwade
Data Scientist
As a highly motivated and detail-oriented data scientist, I am eager to begin my career in the  eld of data science. With a solid
foundation in statistics, programming, and machine learning techniques, I am well-equipped to tackle complex data problems
and deliver meani
✅Extracted from resume 2.pdf
Data Scientist Phone: (123) 456 78 99
Email: info@qwikresume.com
Website: www.qwikresume.com
ROBERT SMITH
LinkedIn:
linkedin.com/qwikresume
Address: 1737 Marshville Road,
Alabama.
Objective
Data Scientist with PhD in Physics and 1+ industrial experience. Two years of working experience
in Data Analy
✅Extracted from resume 3.pdf
SNEHA B
Vittal, India-574243
7019932906 | snehaboja@gmail.com
 www.linkedin.com/in/sneha-b-7242bb284
Objective
A graduated Engineering student from Artificial Intelligence & Machine Learning equipped with certifications in
Python, SQL and Machine Learning through Coursera platform. My academic jour
✅Extracted

In [4]:
#Step 2: Parsing Function

import re     

def parse_resume(text):
    data = {
        "Institute": None,
        "CGPA": None,
        "Year": None,
        "Branch": None,
        "Projects": None
    }

    # Institute
    for line in text.split('\n'):
        if "Institute" in line or "University" in line or "College" in line:
            data["Institute"] = line.strip()
            break

    # CGPA
    cgpa_match = re.search(r'CGPA[:\s]+([\d.]+)', text)
    if cgpa_match:
        data["CGPA"] = float(cgpa_match.group(1))

    # Year
    year_match = re.search(r'\b(20\d{2})\b', text)
    if year_match:
        data["Year"] = int(year_match.group(1))

    # Branch
    branch_match = re.search(r"(Computer Science|Electronics|Mechanical|AI|IT|Electrical)", text, re.I)
    if branch_match:
        data["Branch"] = branch_match.group()


    # Projects
    if "project" in text.lower():
        try:
            snippet = text.lower().split("project", 1)[1][:300]
            data["Projects"] = "Project" + snippet
        except:
            data["Projects"] = None

    return data

In [5]:

#Step 2.1 – Apply to All Resumes

parsed_resumes = []

for file in os.listdir(resume_folder):
    if file.endswith('.pdf'):
        file_path = os.path.join(resume_folder, file)
        text = extract_text_from_pdf(file_path)
        parsed = parse_resume(text)
        parsed['Resume'] = file
        parsed_resumes.append(parsed)

df_resumes = pd.DataFrame(parsed_resumes)
df_resumes

Unnamed: 0,Institute,CGPA,Year,Branch,Projects,Resume
0,Sinhgad College of Science Pune,,2001,ai,"Projects, i have honed my skills in data analy...",resume 1.pdf
1,,,2016,ai,Projects concerning a custom study for a manuf...,resume 2.pdf
2,Vivekananda College of Engineering and Technol...,,2020,it,"Projects\nmedical management system\nmysql, ph...",resume 3.pdf
3,Indian Institute of Technology Delhi (2019–2021),9.2,2019,ai,Projects:\n- predictive pricing model\n• devel...,resume11.pdf
4,Vellore Institute of Technology (2018–2020),8.3,2018,ai,Projects:\n- fraud detection system\n• impleme...,resume22.pdf
5,"Netaji Subhas Institute of Technology, Delhi (...",8.6,2016,it,Projects:\n• food image classification\n· buil...,resume33.pdf
6,"Indian Statistical Institute, Kolkata (2018–2020)",9.0,2018,ai,Projects:\n• customer lifetime value predictio...,resume44.pdf
7,,8.9,2019,ai,Projects:\n• multilingual chatbot\n· developed...,resume55.pdf


In [6]:
# Step  3 College tier mapping

# step 3.1: Load College Tier CSV

college_df = pd.read_csv("Indian Institution Tier - New 2023-1-15.csv")
college_df.columns = college_df.columns.str.strip()
college_df

Unnamed: 0,Institute_Name,Tier
0,Acharya Nagarjuna University,Tier 1
1,Alagappa University,Tier 1
2,Aligarh Muslim University AMU,Tier 1
3,All India Institute of Medical Sciences (AIIMS...,Tier 1
4,All India Institute of Medical Sciences (AIIMS...,Tier 1
...,...,...
981,William Carey University (WCU),Tier 3
982,World University of Design (WUD),Tier 3
983,Xavier University (Xavier University Bhubaneswar),Tier 3
984,Yashwantrao Chavan Maharashtra Open University...,Tier 3


In [7]:
#Step 3.2: Define Tier Mapping Function

def get_college_tier(institute):
    if pd.isna(institute): return "Unknown"
    for _, row in college_df.iterrows():
        if str(row['Institute_Name']).lower() in str(institute).lower():
            return row['Tier']
    return "Unknown"

In [8]:
# Step 3.3: Apply to all  Resumes

df_resumes['Tier'] = df_resumes['Institute'].apply(get_college_tier)
df_resumes

Unnamed: 0,Institute,CGPA,Year,Branch,Projects,Resume,Tier
0,Sinhgad College of Science Pune,,2001,ai,"Projects, i have honed my skills in data analy...",resume 1.pdf,Unknown
1,,,2016,ai,Projects concerning a custom study for a manuf...,resume 2.pdf,Unknown
2,Vivekananda College of Engineering and Technol...,,2020,it,"Projects\nmedical management system\nmysql, ph...",resume 3.pdf,Unknown
3,Indian Institute of Technology Delhi (2019–2021),9.2,2019,ai,Projects:\n- predictive pricing model\n• devel...,resume11.pdf,Unknown
4,Vellore Institute of Technology (2018–2020),8.3,2018,ai,Projects:\n- fraud detection system\n• impleme...,resume22.pdf,Unknown
5,"Netaji Subhas Institute of Technology, Delhi (...",8.6,2016,it,Projects:\n• food image classification\n· buil...,resume33.pdf,Unknown
6,"Indian Statistical Institute, Kolkata (2018–2020)",9.0,2018,ai,Projects:\n• customer lifetime value predictio...,resume44.pdf,Unknown
7,,8.9,2019,ai,Projects:\n• multilingual chatbot\n· developed...,resume55.pdf,Unknown


In [9]:
#Error encountered: 
# In the step above the the Tier column began showing "Unknown", 
# which meant the function could not find a match between: The Institute name in the resume, and 
#The College Name in the tier  CSV. so in order to solve thiat problem i did text normalization for Matching.
# And then Reapply the Mapping

# Normalize Text for Matching 

def get_college_tier(institute):
    if pd.isna(institute): return "Unknown"
    institute_clean = str(institute).lower().strip()

    for _, row in college_df.iterrows():
        college_name = str(row['Institute_Name']).lower().strip()
        if college_name in institute_clean:
            return row['Tier']
    return "Unknown"

In [10]:

#Reapply the Mapping
df_resumes['Tier'] = df_resumes['Institute'].apply(get_college_tier)


In [11]:
df_resumes['Tier']

0    Unknown
1    Unknown
2    Unknown
3    Unknown
4    Unknown
5    Unknown
6    Unknown
7    Unknown
Name: Tier, dtype: object

In [12]:
#Since the problem was not solved  i decided to use fuzzywuzzy. As the Resume text may say “Indian Institute of Technology Delhi” 
# while CSV says “IIT Delhi”. so  FuzzyWuzzy was used as it enables flexible, similarity-based matching , making 
# the system more robust and reliable when handling real-world, inconsistent text data.



# Tier Mapping problem solved. through fuzzy

from fuzzywuzzy import fuzz

def get_college_tier_fuzzy(institute, threshold=85):
    if pd.isna(institute): return "Unknown"
    institute_clean = str(institute).lower().strip()
    
    best_score = 0
    best_tier = "Unknown"

    for _, row in college_df.iterrows():
        college_name = str(row['Institute_Name']).lower().strip()
        score = fuzz.partial_ratio(college_name, institute_clean)

        if score > best_score and score >= threshold:
            best_score = score
            best_tier = row['Tier']

    return best_tier





In [13]:
df_resumes['Tier'] = df_resumes['Institute'].apply(get_college_tier_fuzzy)

df_resumes

Unnamed: 0,Institute,CGPA,Year,Branch,Projects,Resume,Tier
0,Sinhgad College of Science Pune,,2001,ai,"Projects, i have honed my skills in data analy...",resume 1.pdf,Unknown
1,,,2016,ai,Projects concerning a custom study for a manuf...,resume 2.pdf,Unknown
2,Vivekananda College of Engineering and Technol...,,2020,it,"Projects\nmedical management system\nmysql, ph...",resume 3.pdf,Unknown
3,Indian Institute of Technology Delhi (2019–2021),9.2,2019,ai,Projects:\n- predictive pricing model\n• devel...,resume11.pdf,Tier 1
4,Vellore Institute of Technology (2018–2020),8.3,2018,ai,Projects:\n- fraud detection system\n• impleme...,resume22.pdf,Unknown
5,"Netaji Subhas Institute of Technology, Delhi (...",8.6,2016,it,Projects:\n• food image classification\n· buil...,resume33.pdf,Unknown
6,"Indian Statistical Institute, Kolkata (2018–2020)",9.0,2018,ai,Projects:\n• customer lifetime value predictio...,resume44.pdf,Tier 1
7,,8.9,2019,ai,Projects:\n• multilingual chatbot\n· developed...,resume55.pdf,Unknown


In [14]:
# Step4: Use BERT embeddings to measure how closely a resume matches the job description, and compute a Job Fit Probability.
from sentence_transformers import SentenceTransformer, util


  _torch_pytree._register_pytree_node(


In [15]:
# Load BERT Model
model = SentenceTransformer('all-MiniLM-L6-v2')


  _torch_pytree._register_pytree_node(


In [16]:
jd_df = pd.read_csv("Job_Description1.csv")
jd_df.columns = jd_df.columns.str.strip()  # remove extra spaces
job_description = jd_df['Job_Description'].iloc[0]  # take the first one

In [17]:
def calculate_job_fit(resume_text, job_description, model):
    embeddings = model.encode([resume_text, job_description], convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return round(float(similarity[0][0]) * 100, 2)  # Convert to percentage


In [18]:
job_fit_scores = []

for file in df_resumes['Resume']:
    file_path = os.path.join(resume_folder, file)
    text = extract_text_from_pdf(file_path)
    score = calculate_job_fit(text, job_description, model)
    print(f"Score: {score}")  # DEBUG PRINT
    job_fit_scores.append(score)

df_resumes['Job_Fit_Probability'] = job_fit_scores



Score: 50.7
Score: 40.72
Score: 32.59
Score: 33.04
Score: 42.72
Score: 50.48
Score: 28.98
Score: 36.77


In [19]:
df_resumes[['Resume', 'Institute', 'Tier', 'CGPA', 'Projects', 'Job_Fit_Probability']]


Unnamed: 0,Resume,Institute,Tier,CGPA,Projects,Job_Fit_Probability
0,resume 1.pdf,Sinhgad College of Science Pune,Unknown,,"Projects, i have honed my skills in data analy...",50.7
1,resume 2.pdf,,Unknown,,Projects concerning a custom study for a manuf...,40.72
2,resume 3.pdf,Vivekananda College of Engineering and Technol...,Unknown,,"Projects\nmedical management system\nmysql, ph...",32.59
3,resume11.pdf,Indian Institute of Technology Delhi (2019–2021),Tier 1,9.2,Projects:\n- predictive pricing model\n• devel...,33.04
4,resume22.pdf,Vellore Institute of Technology (2018–2020),Unknown,8.3,Projects:\n- fraud detection system\n• impleme...,42.72
5,resume33.pdf,"Netaji Subhas Institute of Technology, Delhi (...",Unknown,8.6,Projects:\n• food image classification\n· buil...,50.48
6,resume44.pdf,"Indian Statistical Institute, Kolkata (2018–2020)",Tier 1,9.0,Projects:\n• customer lifetime value predictio...,28.98
7,resume55.pdf,,Unknown,8.9,Projects:\n• multilingual chatbot\n· developed...,36.77


In [20]:
#Step 5: Export Final Output to Excel
output_file = "Final_Resume_Match_Output.xlsx"
df_resumes.to_excel(output_file, index=False)
print("Data exported to:", output_file)


Data exported to: Final_Resume_Match_Output.xlsx


In [21]:
import os
os.getcwd()


'C:\\Users\\aaish\\practice for minterview\\resume parser'

In [22]:
os.startfile("Final_Resume_Match_Output.xlsx")