# Resume Scanner

# Step 1 : Extract Text from resumes

In [3]:
# pip install pypdf2

In [4]:
# read pdf or text file from a folder
import os
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    text = " "
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text()
    return text

folder = 'Resumes/'
resumes = {}
for file in os.listdir('Resumes/'):
    if file.endswith('.pdf'):
        resumes[file] = extract_text_from_pdf(os.path.join(folder, file))

# Step 2: Preprocess the text

In [6]:
# clean and normalize both the resume text and job description.
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
# nltk.download('stopwords')
# nltk.download('wordnet')

In [8]:
stopwords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text) # this line will help to keep only numbers
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    return " ".join(tokens)

# Step 3 : Convert text to numerical vector(TF-IDF)

In [11]:
# Use TF-IDF to represent text for comparison
from sklearn.feature_extraction.text import TfidfVectorizer
jd = """ Biocube is looking for Data Scientist to join our dynamic team and embark on a rewarding career journey
Undertaking data collection, preprocessing and analysis
Building models to address business problems
Presenting information using data visualization techniques
Identify valuable data sources and automate collection processes
Undertake preprocessing of structured and unstructured data
Analyze large amounts of information to discover trends and patterns
Build predictive models and machine-learning algorithms
Combine models through ensemble modeling
Present information using data visualization techniques
Propose solutions and strategies to business challenges
Collaborate with engineering and product development teams """

In [12]:
documents = [preprocess_text(jd)] + [preprocess_text(text) for text in resumes.values()]
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(documents)

# Step 4 : Compute similarity between Job description and Each resume

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
similarity_score = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

for i, (filename, score) in enumerate(zip(resumes.keys(), similarity_score)):
    print(f"{filename} : {score :.2f}") 

Ankush_Bhonde_DA.pdf : 0.19
Ankush_Bhonde_Data_Science.pdf : 0.19
Ankush_Bhonde_DS.pdf : 0.19


# Step 5 : Rank the Candidates

In [17]:
import pandas as pd
results = pd.DataFrame({
    "Resume" : list(resumes.keys()),
    "Similarity" : similarity_score
}).sort_values(by = "Similarity", ascending = False)
print(results)

                           Resume  Similarity
0            Ankush_Bhonde_DA.pdf    0.192588
1  Ankush_Bhonde_Data_Science.pdf    0.186466
2            Ankush_Bhonde_DS.pdf    0.186144
