In [1]:
import PyPDF2
import docx
import spacy
import re
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Step 1: Resume Parsing
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
    return text

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    text = ''
    for paragraph in doc.paragraphs:
        text += paragraph.text
    return text


In [3]:
# Step 2: Feature Extraction
def extract_features(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    
    entities = {ent.label_: ent.text for ent in doc.ents}
    
    keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]
    
    return entities, keywords

In [4]:
# Step 3: Job Description Parsing (Similar to Step 2)
nlp = spacy.load("en_core_web_sm")
def parse_job_description(job_description):
    doc = nlp(job_description)
    
    entities = {ent.label_: ent.text for ent in doc.ents}
    
    keywords = [token.text for token in doc if not token.is_stop and not token.is_punct]
    
    return entities, keywords


In [5]:
# step 4: preprocess the data
def preprocess_text(text_tuple):
    stop_words = set(stopwords.words('english'))
    preprocessed_entities = []
    for label, entity in text_tuple[0].items():
        preprocessed_entity = re.sub(r'[^a-zA-Z\s]', '', entity).lower()
        preprocessed_entities.extend([word for word in preprocessed_entity.split() if word not in stop_words])
    
    preprocessed_keywords = [word.lower() for word in text_tuple[1] if word.lower() not in stop_words]
    
    return preprocessed_entities, preprocessed_keywords


In [6]:
# Step 5: Similarity Calculation
def calculate_similarity(resume_features, job_features):
    # Convert text features into numerical vectors
    vectorizer = CountVectorizer()
    
    
    entities = resume_features[0]
    keywords = resume_features[1]
    
    
    resume_text = entities + keywords

        
    entities = job_features[0]
    keywords = job_features[1]
    
    
    job_text = entities + keywords
    
    resume_vector = vectorizer.fit_transform([" ".join(resume_text)])  
    job_vector = vectorizer.transform([" ".join(job_text)])  
    
    # Calculate cosine similarity
    similarity_score = cosine_similarity(resume_vector, job_vector)
    return similarity_score[0][0]

In [7]:
# Step 6: Ranking
def rank_candidates(resumes, job_description):
    ranked_resumes = []
    job_features = parse_job_description(job_description)
    job_features = preprocess_text(job_features)
    for resume_path in resumes:
        resume_text = ''
        if resume_path.endswith('.pdf'):
            resume_text = extract_text_from_pdf(resume_path)
        elif resume_path.endswith('.docx'):
            resume_text = extract_text_from_docx(resume_path)
        
        
        if resume_text.strip():
            resume_features = extract_features(resume_text)
            resume_features = preprocess_text(resume_features)
            
            similarity_score = calculate_similarity(resume_features, job_features)
            ranked_resumes.append((resume_path, similarity_score))
        else:
            
            ranked_resumes.append((resume_path, 0))
    
    ranked_resumes.sort(key=lambda x: x[1], reverse=True)
    return ranked_resumes

In [8]:
# Example usage
resumes =  ['Mukhesh resume software.pdf', 'Mukhesh resume final.docx', 'Mukhesh sure_resume.pdf','Mukhesh_sure resume (1).pdf']  


job_description = """We are seeking a talented and experienced Data Scientist to join our team. The ideal candidate will have a strong background in statistical analysis, machine learning, and data visualization, with a passion for solving complex problems using data-driven approaches. The Data Scientist will work closely with cross-functional teams to extract insights from large datasets, develop predictive models, and drive data-driven decision-making across the organization.

Key Responsibilities:

Conduct exploratory data analysis to understand patterns and trends in complex datasets.
Develop predictive models and algorithms to solve business problems and optimize processes.
Design and implement machine learning pipelines for data preprocessing, feature engineering, model training, and evaluation.
Collaborate with stakeholders to define project requirements, objectives, and success criteria.
Communicate findings and insights to technical and non-technical audiences through data visualization, reports, and presentations.
Stay updated on the latest trends and advancements in data science, machine learning, and artificial intelligence.
Mentor junior team members and contribute to the continuous improvement of data science practices within the organization.
Requirements:

Bachelor's or Master's degree in Computer Science, Statistics, Mathematics, or a related field.
Proven experience as a Data Scientist or similar role, with a strong portfolio of projects demonstrating expertise in data analysis and machine learning.
Proficiency in programming languages such as Python or R, and experience with data manipulation and analysis libraries (e.g., pandas, numpy, scikit-learn).
Solid understanding of statistical concepts, machine learning algorithms, and experimental design.
Experience with big data technologies and distributed computing frameworks (e.g., Hadoop, Spark) is a plus.
Strong problem-solving skills and the ability to work independently or as part of a team in a fast-paced environment.
Excellent communication and interpersonal skills, with the ability to explain complex technical concepts to non-technical stakeholders."""  # Example job description


job_features = parse_job_description(job_description)
ranked_resumes = rank_candidates(resumes, job_description)
for rank, (resume, score) in enumerate(ranked_resumes, start=1):
    print(f"Rank: {rank}, Resume: {resume}, Similarity Score: {score}") 



Rank: 1, Resume: Mukhesh resume final.docx, Similarity Score: 0.6662000509933521
Rank: 2, Resume: Mukhesh_sure resume (1).pdf, Similarity Score: 0.6197186765525231
Rank: 3, Resume: Mukhesh sure_resume.pdf, Similarity Score: 0.6149354132834033
Rank: 4, Resume: Mukhesh resume software.pdf, Similarity Score: 0.2302097210793726


the above code wil parse the pdf or docx files and rank them according to the cosine similarity with the job discription. the highly matched resume with the job discription will be ranked first and the similaritt score will be close to 1 than compared to others. this will fellow the order till end of resumes

In [9]:
resumes =  ['Mukhesh resume software.pdf', 'Mukhesh resume final.docx', 'Mukhesh sure_resume.pdf','Mukhesh_sure resume (1).pdf']  


job_description = """Responsibilities:
** Assist in the design, development, testing, and deployment of software solutions.
** Help troubleshoot software issues and maintain documentation of software functionalities.
** Participate in our product development from ideation to deployment and beyond.
** Work alongside the product team to design, develop, and review product requirements and feasibility.
** Develop tools and applications by producing clean, efficient code.
** Participate in code reviews to maintain code quality and share knowledge.

A Ninja is resilient, smart, and ambitious. Sounds like you? Here’s what you will need to have to join the Ninja Clan:
** Currently pursuing a Bachelor's or Master’s degree in Computer Science, Engineering, or a related field.
** Basic understanding of software development principles.
** Good analytical and problem-solving abilities.
** Strong knowledge of data structures and algorithms.
** Familiarity with one or more programming languages such as Python, Java, or C++.
** Ability to learn new technologies quickly.
** Excellent verbal and written communication skills."""  # Example job description


job_features = parse_job_description(job_description)
ranked_resumes = rank_candidates(resumes, job_description)
for rank, (resume, score) in enumerate(ranked_resumes, start=1):
    print(f"Rank: {rank}, Resume: {resume}, Similarity Score: {score}") 


Rank: 1, Resume: Mukhesh resume final.docx, Similarity Score: 0.3505847957294434
Rank: 2, Resume: Mukhesh resume software.pdf, Similarity Score: 0.33234018715767727
Rank: 3, Resume: Mukhesh sure_resume.pdf, Similarity Score: 0.306113355005811
Rank: 4, Resume: Mukhesh_sure resume (1).pdf, Similarity Score: 0.30452667754203255
