In [161]:
import json
import os
from typing import List
import networkx as nx
import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from annotated_text import annotated_text, parameters
from streamlit_extras import add_vertical_space as avs
from streamlit_extras.badges import badge
from scripts.similarity import get_similarity_score, find_path, read_config
from scripts.utils import get_filenames_from_dir
from scripts import ReadPdf, JobDescriptionProcessor, ResumeProcessor, KeytermsExtraction
import cohere
from scripts.KeytermsExtraction import KeytermExtractor
from scripts.similarity.get_similarity_score import get_similarity_score



In [162]:
import yaml
import logging
logging.basicConfig(
    filename='app_similarity_score.log',
    filemode='w',
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

console_handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.DEBUG)

file_handler = logging.FileHandler("app_similarity_score.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)
cwd = find_path('Resume-Matcher')
config_path = os.path.join(cwd, "scripts", "similarity")
def read_config(filepath):
    try:
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return config
    except FileNotFoundError as e:
        logger.error(f"Configuration file {filepath} not found: {e}")
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML in configuration file {filepath}: {e}", exc_info=True)
    except Exception as e:
        logger.error(f"Error reading configuration file {filepath}: {e}")
    return None
config = read_config(config_path + "/config.yml")
PROJECT_ID = config['vertex']['api_key']
REGION = config['vertex']['REGION']

In [163]:
import os

job_desc_directory = "Data/JobDescription/"
resumes_directory = "Data/Resumes/"

# Automatically get all job description and resume files
job_desc_files = [file for file in os.listdir(job_desc_directory) if file.endswith('.pdf')]
resume_files = [file for file in os.listdir(resumes_directory) if file.endswith('.pdf')]

job_desc_file = job_desc_files[3]  # This will select the first job description file


In [164]:
job_desc_file

'job_desc_product_manager.pdf'

In [165]:
# Define paths
# Read raw job description
job_desc_text = ReadPdf.read_single_pdf("Data/JobDescription/" + job_desc_file)

# Process job description
job_desc_processor = JobDescriptionProcessor(job_desc_file)
job_desc_processed = job_desc_processor._read_job_desc()

# Read and process resumes
resumes_text = [ReadPdf.read_single_pdf("Data/Resumes/" + resume_file) for resume_file in resume_files]
resumes_processed = []
for resume_text in resumes_text:
    resume_processor = ResumeProcessor(resume_text)
    resume_data = resume_processor._read_resumes()
    resumes_processed.append(resume_data)




Error reading file 'Data/Resumes/JOHN DOE
123 Main St, Anywhere, USA — (123) 456-7890 — john.doe@email.com
LinkedIn: linkedin.com/in/johndoe — GitHub: github.com/johndoe
PROFESSIONAL SUMMARY
Highly skilled Full Stack Developer with over 5 years of experience in Java and Angular development,
specializing in designing, building, testing, and maintaining web applications. Proficient in an assortment
of technologies, including Java, Spring Boot, Angular, HTML5, CSS3, and SQL. Exceptional ability to
work in a team and self-direct. Committed to providing high-quality results with little supervision.
SKILLS
•Java and J2EE
•Spring Boot, Spring MVC, and Hibernate
•Angular (versions 2+)
•JavaScript, TypeScript, HTML5, CSS3, and Bootstrap
•RESTful APIs
•SQL and NoSQL databases (MySQL, MongoDB)
•Agile and Scrum
•Git and GitHub
•Junit and Mockito
•Docker
WORK EXPERIENCE
Full Stack Java Developer , ABC Company, Inc., Anywhere, USA, June 2018 - Present
•Developed scalable, robust, and maintainable en

In [166]:

job_desc_clean_text = job_desc_text.replace("\n", " ")
job_desc_clean_text

'Job Description: Product Manager (10+ Years of Experience) Tech Solutions, San Francisco, CA, USA About Us Tech Solutions is a global leader in the technology industry, specializing in the development of cutting-edge software products. We’re currently looking for a seasoned Product Manager with over 10 years of experience to join our dynamic team. Job Description The Product Manager will be responsible for guiding the success of a product and leading the cross-functional team that is responsible for improving it. This is an important organizational role that sets the strategy, roadmap, and feature definition for a product or product line. Responsibilities •Define the product strategy and roadmap. •Deliver MRDs and PRDs with prioritized features and corresponding justification. •Work with external third parties to assess partnerships and licensing opportunities. •Run beta and pilot programs with early-stage products and samples. •Be an expert with respect to the competition. •Act as a 

In [167]:
def extract_candidate_name_from_filename(filename: str) -> str:
    name_parts = filename.split('_')[:-1]  # Exclude the last part which is the position
    return ' '.join(name_parts).title()

In [168]:

# Initialize the KeytermExtractor for the job description
job_desc_extractor = KeytermExtractor(job_desc_text)

# Extract key terms using the TextRank algorithm (or any other method you prefer)
job_desc_keywords = job_desc_extractor.get_keyterms_based_on_textrank()

# Extract key terms for each resume
resumes_keywords = []
for resume_text in resumes_text:
    resume_extractor = KeytermExtractor(resume_text)
    resume_keywords = resume_extractor.get_keyterms_based_on_textrank()
    resumes_keywords.append(resume_keywords)




In [169]:
# Convert the extracted keywords into strings
job_desc_keywords_str = ' '.join([keyword[0] for keyword in job_desc_keywords])
resumes_keywords_str_list = [' '.join([keyword[0] for keyword in resume_keywords]) for resume_keywords in resumes_keywords]

In [170]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

In [171]:
# Path to your service account key file
key_path = '../Resume-Matcher/vertexai-401621-937aefae595d.json' #Path to the json key associated with your service account from google cloud

In [172]:
# Create credentials object

credentials = Credentials.from_service_account_file(
    key_path,
    scopes=['https://www.googleapis.com/auth/cloud-platform'])

if credentials.expired:
    credentials.refresh(Request())

In [174]:
import vertexai
import numpy as np
# initialize vertex
vertexai.init(project = PROJECT_ID, location = REGION, credentials = credentials)

In [175]:
from vertexai.language_models import TextEmbeddingModel
from sklearn.metrics.pairwise import cosine_similarity
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [176]:
# Get embeddings using the TextEmbeddingModel
# def get_vertex_embeddings(texts: List[str]) -> List[List[float]]:
#     embeddings = embedding_model.get_embeddings(texts)
#     return embeddings
# def get_vertex_embeddings(texts: List[str]) -> List[List[float]]:
#     embedding_objects = embedding_model.get_embeddings(texts)
#     embeddings = [obj.values for obj in embedding_objects]
#     return embeddings
def get_vertex_embeddings(texts: List[str]) -> List[List[float]]:
    CHUNK_SIZE = 5  # Maximum number of instances allowed per prediction
    embeddings = []

    # Split the texts into chunks of size CHUNK_SIZE
    for i in range(0, len(texts), CHUNK_SIZE):
        chunk = texts[i:i + CHUNK_SIZE]
        chunk_embeddings = embedding_model.get_embeddings(chunk)
        embeddings.extend(chunk_embeddings)
        embedding_value = [obj.values for obj in embeddings]
    #print(embeddings)  # Add this line to inspect the embeddings
    return embedding_value



# Compute cosine similarity
def compute_cosine_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    similarity = cosine_similarity([embedding1], [embedding2])[0][0]
    return similarity

# Get embeddings for job description and resumes
job_desc_embedding = get_vertex_embeddings([job_desc_clean_text])[0]
resumes_embeddings = get_vertex_embeddings(resumes_keywords_str_list)


In [177]:
#candidate_names = [os.path.splitext(resume_file)[0] for resume_file in resume_files]
candidate_names = [extract_candidate_name_from_filename(resume_file) for resume_file in resume_files]

In [178]:
similarities_with_names = [
    {
        "name": candidate_names[i],
        "similarity": cosine_similarity([job_desc_embedding], [resume_embedding])[0][0]
    }
    for i, resume_embedding in enumerate(resumes_embeddings)
]


In [179]:
ranked_candidates = sorted(similarities_with_names, key=lambda x: x["similarity"], reverse=True)

# Print the ranked candidates
for index, candidate in enumerate(ranked_candidates, 1):  # Starting the index from 1
    print(f"Candidate {index}: {candidate['name']}, Similarity Score: {candidate['similarity']:.4f}")


Candidate 1: Alfred Pennyworth, Similarity Score: 0.7649
Candidate 2: Maria Gonzalez, Similarity Score: 0.7344
Candidate 3: John, Similarity Score: 0.7340
Candidate 4: Hanfei He, Similarity Score: 0.7185
Candidate 5: Bruce Wayne, Similarity Score: 0.7135
Candidate 6: Xiaoda Li, Similarity Score: 0.7005
Candidate 7: Sheen Huang, Similarity Score: 0.6856
Candidate 8: Harvey Dent, Similarity Score: 0.6849
Candidate 9: Barry Allen, Similarity Score: 0.6762
Candidate 10: Arif Demirkan, Similarity Score: 0.6601
Candidate 11: John Zhou, Similarity Score: 0.6504
