In [22]:
!pip install PyPDF2 streamlit



In [23]:
# import libraries
import PyPDF2
import streamlit as st
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

# def extract_text_from_pdf(pdf_file):
#   reader = PyPDF2.PdfReader(pdf_file)
#   text = ''
#   for page in reader.pages:
#     text += page.extract_text()
#   return text

In [25]:
def skills_extract(text):

  nlp = spacy.load('en_core_web_sm')

  # Use pre-trained NER
  doc = nlp(text)
  skills = []
  for ent in doc.ents:
    skills.append(ent.text)
  return skills

In [26]:
def score_resume_cosine(job_desc_skills, resume_skills):

    all_skills = job_desc_skills + resume_skills

    vectorizer = TfidfVectorizer()

    tfidf_matrix = vectorizer.fit_transform(all_skills)

    # Separate job description vector and resume vector
    job_desc_vector = tfidf_matrix[:len(job_desc_skills)]  # First part is job description skills
    resume_vector = tfidf_matrix[len(job_desc_skills):]     # Second part is resume skills

    # Compute cosine similarity between the job description and resume vectors
    similarity = cosine_similarity(resume_vector, job_desc_vector)

    # Average the similarity scores (To take combined mean of all the probabilities vectors)
    average_similarity = similarity.mean()

    # Return the score as a percentage
    return round(average_similarity * 100, 2)

In [27]:
def suggested_skills(job_skills, resume_skills):
  suggested_skills = []
  for skill in job_skills:
    if skill not in resume_skills:
      suggested_skills.append(skill)
  return suggested_skills

In [28]:
def process_skills_text(j_skills, r_skills):

  # split the job skills into list
  job_skills = j_skills.split(',')

  #clean both resume and job skills texts
  job_skills = [job_skill.strip() for job_skill in job_skills]
  job_skills = [job_skill.lower() for job_skill in job_skills]

  resume_skills = r_skills
  resume_skills = [resume_skill.strip() for resume_skill in resume_skills]
  resume_skills = [resume_skill.lower() for resume_skill in resume_skills]

  return job_skills, resume_skills

In [29]:
job_skills = "Machine learning, deep learning, C++, Python"
text = extract_text_from_pdf('/content/My_CV.pdf')
resume_skills = skills_extract(text)

job_skills, resume_skills = process_skills_text(job_skills, resume_skills)
score = score_resume_cosine(job_skills, resume_skills)
suggested_skills = suggested_skills(job_skills, resume_skills)

print("Job Skills:", job_skills)
print("Resume Skills:", resume_skills)
print(f"Score: {score}%")
print("Suggested Skills:", suggested_skills)

Job Skills: ['machine learning', 'deep learning', 'c++', 'python']
Resume Skills: ['cnn', 'cnn', 'flower images classification', 'abdul rehman \njunior', 'machine learning', 'random forest', 'xgboost', 'cnn', 'ml', 'python', 'tensorflow', 'ml/dl', 'bachelor of science: economics', '2024, july', 'sukkur iba university - sukkur', 'sindh', 'econometrics', '2024', '2024', 'classification - coursera', '2024', 'coursera', '2024', 'coursera', '2024', 'sarcasm detection using', 'rnn', 'roc', '0.999', 'apple quality prediction using machine learning', 'random forest', 'normalization', 'randomsearchcv', 'islamabad', 'pakistan', 'c++ \nmachine learning', 'tensorflow', 'keras', 'data visualization', 'matplotlib', 'random forest', 'xgboost', 'gradient boost', 'svm', 'knn', 'cnn', 'rnn', 'nlp', 'pandas', 'excel', 'jupyter notebooks', 'pycharm']
Score: 2.17%
Suggested Skills: ['deep learning', 'c++']
