In [1]:
import json
import os
from typing import List
import networkx as nx
import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from annotated_text import annotated_text, parameters
from streamlit_extras import add_vertical_space as avs
from streamlit_extras.badges import badge
from scripts.similarity import get_similarity_score, find_path, read_config
from scripts.utils import get_filenames_from_dir
from scripts import ReadPdf, JobDescriptionProcessor, ResumeProcessor, KeytermsExtraction
import cohere
from scripts.KeytermsExtraction import KeytermExtractor
from scripts.similarity.get_similarity_score import get_similarity_score
import uuid
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
import yaml
import logging
logging.basicConfig(
    filename='app_similarity_score.log',
    filemode='w',
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

console_handler = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.DEBUG)

file_handler = logging.FileHandler("app_similarity_score.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)
cwd = find_path('Resume-Matcher')
config_path = os.path.join(cwd, "scripts", "similarity")


def read_config(filepath):
    try:
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return config
    except FileNotFoundError as e:
        logger.error(f"Configuration file {filepath} not found: {e}")
    except yaml.YAMLError as e:
        logger.error(
            f"Error parsing YAML in configuration file {filepath}: {e}", exc_info=True)
    except Exception as e:
        logger.error(f"Error reading configuration file {filepath}: {e}")
    return None


config = read_config(config_path + "/config.yml")
PROJECT_ID = config['vertex']['api_key']
REGION = config['vertex']['REGION']

In [3]:
FULL_STACK = 0

In [4]:
import os

job_desc_directory = "Data/JobDescription/"
resumes_directory = "Data/Resumes/"

# Automatically get all job description and resume files
job_desc_files = [file for file in os.listdir(
    job_desc_directory) if file.endswith('.pdf')]
resume_files = [file for file in os.listdir(
    resumes_directory) if file.endswith('.pdf')]

# This will select the first job description file
job_desc_file = job_desc_files[FULL_STACK]

In [5]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://chl155:3DiufR0lg70F9s4P@cluster0.bkmn6rk.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [6]:
# Choose or create a database named 'resume_db'
db = client['resume_db']

# Choose or create a collection named 'candidates'
candidates_collection = db['candidates']

In [7]:
# Choose or create a database named 'job_db'
db = client['job_db']
job_collection = db['job']

In [8]:
# Define paths
# Read raw job description
# job_desc_text = ReadPdf.read_single_pdf("Data/JobDescription/" + job_desc_file)
job_desc_text = [ReadPdf.read_single_pdf(os.path.join(
    "Data/JobDescription/", job_desc_file)) for job_desc_file in job_desc_files]
# Process job description
job_desc_processor = JobDescriptionProcessor(job_desc_file)
job_desc_processed = job_desc_processor._read_job_desc()

job_files = [f for f in os.listdir(
    "Data/JobDescription/") if os.path.isfile(os.path.join("Data/JobDescription/", f))]


job_processed = []
for job_file in job_files:
    job_processor = JobDescriptionProcessor(job_file)
    job_data = job_processor._read_job_desc()
    job_processor._write_json_file(job_data)
    job_processed.append(job_data)

In [9]:

resumes_text = [ReadPdf.read_single_pdf(os.path.join(
    "Data/Resumes/", resume_file)) for resume_file in resume_files]


# Process resumes
resume_files = [f for f in os.listdir(
    "Data/Resumes/") if os.path.isfile(os.path.join("Data/Resumes/", f))]

resumes_processed = []
for resume_file in resume_files:
    resume_processor = ResumeProcessor(resume_file)
    resume_data = resume_processor._read_resumes()
    resume_processor._write_json_file(resume_data)
    resumes_processed.append(resume_data)

In [10]:
def extract_candidate_name_from_filename(filename: str) -> str:
    # Exclude the last part which is the position
    name_parts = filename.split('_')[:-1]
    return ' '.join(name_parts).title()

In [11]:
candidate_names = [extract_candidate_name_from_filename(
    resume_file) for resume_file in resume_files]
candidate_names

['Yiching Liu',
 'Angela Zhu',
 'Maria Chinkan',
 'Arpi Melik Parsadanyan',
 'Zihui Lin',
 'Jose Felix Villasenor',
 'Anna Gasparyan',
 'Amitesh Rathore',
 'Yunrui Shao',
 'Jaykumar',
 'Minyue Yao',
 'John',
 'Zane Rouguine',
 'Brandon Penner',
 'Zhe Wang',
 'Tsubasa Lin',
 'Danny Mai',
 'David Boutwell',
 'Alexandra',
 'Sarah Sherman',
 'David Botbol',
 'Ryan Pintar',
 'Eloise Yu',
 'Vasil Klimovich',
 'Ming Jin',
 'Robert Scozzari',
 'Cody Romero',
 'Carnell Brame',
 'Timothy Wang',
 'Nico Santoso',
 'Shirley Zhao',
 'Yuan Wang',
 'Divya Harshini',
 'Deekshitha Pullaiah',
 'Meredith Cheng',
 'Grace Li',
 'Anya Hsu',
 'Nandini Seth',
 'Andrew Knuppel',
 'John Hinnegan',
 'Sharad Dangol',
 'Salvador Campos',
 'Lauren Aubrey Lee',
 'Xiao Li',
 'Bruce Wayne',
 'Barry Allen',
 'Balraj Rai',
 'Annie Zhou',
 'Federico De Marines',
 'Ray Lee',
 'Jagriti Sharma',
 'Michelle Wang',
 'Galen Fink',
 'Dennis Mo',
 'Mengyao Zhang',
 'Johann C',
 'Yixin-Ying',
 'Peggy Lai',
 'Yuanhuang Lo',
 'Serle

In [12]:
# resumes_processed[0]['name']

for i in range(len(resumes_processed)):
    resumes_processed[i]['name'] = candidate_names[i]

In [13]:
candidates_collection.insert_many(resumes_processed)

<pymongo.results.InsertManyResult at 0x2fed1a020>

In [14]:
def extract_jobdes_name_from_filename(filename: str) -> str:
    # Exclude the last part which is the position
    name_parts = filename.split('_')[:-1]
    return ' '.join(name_parts).title()

In [15]:
job_des = [extract_jobdes_name_from_filename(
    job_desc_file) for job_desc_file in job_desc_files]
job_des = [item.replace('Job Desc ', '') for item in job_des]

In [16]:
for i in range(len(job_processed)):
    job_processed[i].update({'job_title': job_des[i]})

In [17]:
job_collection.insert_many(job_processed)

<pymongo.results.InsertManyResult at 0x31e532890>

In [18]:
job_des_parse_data = job_collection.find({}, {'unique_id':1,'clean_data':1,'extracted_keywords':1,'_id':0,'job_title':1})

In [19]:
job_des_parse_data = list(job_des_parse_data)

In [20]:
job_title_to_index = {}
job_unique_id = []

for index, data in enumerate(job_des_parse_data):
    # Convert the job_title to uppercase and replace spaces with underscores
    constant_name = data['job_title'].upper().replace(' ', '_')
    job_unique_id.append(data['unique_id'])
    # Set the constant name as a key in the dictionary with its index as the value
    job_title_to_index[constant_name] = index

# To access a particular index:
# index_for_full_stack = job_title_to_index['FULL_STACK']

In [21]:
# Get all the candidates from the MongoDB collection
candidates_parse_data = candidates_collection.find(
    {}, {"name": 1, "unique_id": 1, "_id": 0, "clean_data": 1, "extracted_keywords": 1})

In [22]:
candidates_parse_data = list(candidates_parse_data)

In [23]:
candidates_parse_data_holder = candidates_parse_data

In [24]:
candidates_unique_id = []

for index, data in enumerate(candidates_parse_data):
    # Convert the job_title to uppercase and replace spaces with underscores
    candidates_unique_id.append(data['unique_id'])
    # Set the constant name as a key in the dictionary with its index as the value

In [25]:
# Initialize an empty list for resumes_keywords if it doesn't exist yet


# Iterate over the cursor and update resumes_keywords
# Initialize an empty list for resumes_keywords if it doesn't exist yet
# Initialize an empty list for resumes_keywords if it doesn't exist yet
resumes_keywords_str_list = []

# Iterate over the cursor and update resumes_keywords
for document in candidates_parse_data_holder:
    keyword_string = ' '.join(document['extracted_keywords'])
    resumes_keywords_str_list.append(keyword_string)
    # resumes_keywords.append(document['extracted_keywords'])


# for index in range(len(candidates_parse_data)):
#     candidates_parse_data[index]['extracted_keywords'] = resumes_keywords[index]

In [26]:
resumes_clean_data_str_list = []

# Iterate over the cursor and update resumes_keywords
for document in candidates_parse_data:
    keyword_string = ' '.join(document['clean_data'])
    resumes_clean_data_str_list.append(keyword_string)

In [38]:
# Convert the extracted keywords into strings
# job_desc_keywords_str = ' '.join([keyword[0] for keyword in job_desc_keywords])
job_desc_keywords_str_list = []
for index in range(len(job_des_parse_data)):
    keyword_string = ' '.join(document['extracted_keywords'])
    job_desc_keywords_str_list.append(keyword_string)
    #job_desc_keywords_str_list.append(job_des_parse_data[index]['clean_data'])
    #job_desc_keywords_str_list.append(job_des_parse_data[index]['extracted_keywords'])
    

In [39]:
candidate_names = [extract_candidate_name_from_filename(
    resume_file) for resume_file in resume_files]

In [40]:
embeddings = HuggingFaceEmbeddings()

In [41]:
#job_desc_embeddings = embeddings.embed_documents(job_desc_keywords_str_list)
# Assuming job_desc_keywords_str_list is a list of lists
# flattened_list = [
#     item for sublist in job_desc_keywords_str_list for item in sublist]

# Now, each element of flattened_list should be a string
job_desc_embeddings = embeddings.embed_documents(job_desc_keywords_str_list)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]


In [42]:
resumes_embeddings = embeddings.embed_documents(resumes_keywords_str_list)

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches: 100%|██████████| 9/9 [00:15<00:00,  1.72s/it]


In [43]:
import pinecone

pinecone.init(
	api_key='75a74122-c7d4-470d-a892-df788c806fb7',
	environment='gcp-starter'
)

index = pinecone.Index(index_name="jobmatcher")

In [44]:
index_for_full_stack = job_title_to_index['FULL_STACK']

In [45]:
job_des_underscore = [name.replace(' ', '_') for name in job_des]

In [46]:
candidate_names_underscore = [name.replace(
    ' ', '_') for name in candidate_names]

In [47]:
print(len(job_desc_embeddings), len(job_des_underscore), len(job_unique_id))

12 4 12


In [48]:
job_vectors_with_metadata = []

for i, embedding in enumerate(job_desc_embeddings):
    vector_data = {
        "id": job_des_underscore[i],
        "values": embedding,
        "metadata": {"unique_id": job_unique_id[i]}
    }
    job_vectors_with_metadata.append(vector_data)

IndexError: list index out of range

In [None]:
candidates_vectors_with_metadata = []

for i, embedding in enumerate(resumes_embeddings):
    vector_data = {
        "id": candidate_names_underscore[i],
        "values": embedding,
        "metadata": {"unique_id": candidates_unique_id[i]}
    }
    candidates_vectors_with_metadata.append(vector_data)

IndexError: list index out of range

In [None]:
all_vectors_with_metadata = job_vectors_with_metadata + \
    candidates_vectors_with_metadata
index.upsert(vectors=all_vectors_with_metadata)

In [None]:
candidates_fetched_vectors = index.fetch(ids=candidate_names_underscore)

In [None]:
jobs_fetched_vectors = index.fetch(ids=job_des_underscore)

In [None]:
all_job_vectors = {}

for name, data in jobs_fetched_vectors['vectors'].items():
    all_job_vectors[name] = data['values']

In [None]:
all_candidate_vectors = {}

for name, data in candidates_fetched_vectors['vectors'].items():
    all_candidate_vectors[name] = data['values']

In [None]:
all_job_vectors['Full_Stack']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    similarity = cosine_similarity([embedding1], [embedding2])[0][0]
    return similarity

In [None]:

similarities_with_names = []

# Assuming all_job_vectors['Full_Stack'] gives you the vector for the 'Full_Stack' job.
full_stack_vector = all_job_vectors['Full_Stack']

for candidate_name, candidate_vector in all_candidate_vectors.items():
    similarity_score = cosine_similarity(
        [full_stack_vector], [candidate_vector])[0][0]
    similarities_with_names.append({
        "name": candidate_name,
        "similarity": similarity_score
    })

In [None]:
ranked_candidates = sorted(similarities_with_names,
                           key=lambda x: x["similarity"], reverse=True)

# Print the ranked candidates
# Starting the index from 1
for index, candidate in enumerate(ranked_candidates, 1):
    print(
        f"Candidate {index}: {candidate['name']}, Similarity Score: {candidate['similarity']:.4f}")

In [None]:
import csv

# Set the file path
file_path = 'Data/Result/ranked_candidates_hugging.csv'

# Prepare the data for CSV
rows = [["Rank", "Candidate Name", "Similarity Score"]]
for index, candidate in enumerate(ranked_candidates, 1):
    rows.append([index, candidate['name'], candidate['similarity']])

# Save to CSV
with open(file_path, "w", newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)