In [2]:
import json
import os
from typing import List
import networkx as nx
import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from annotated_text import annotated_text, parameters
from streamlit_extras import add_vertical_space as avs
from streamlit_extras.badges import badge
from scripts.similarity import get_similarity_score, find_path, read_config
from scripts.utils import get_filenames_from_dir
from scripts import ReadPdf, JobDescriptionProcessor, ResumeProcessor, KeytermsExtraction
import cohere
from scripts.KeytermsExtraction import KeytermExtractor
from scripts.similarity.get_similarity_score import get_similarity_score
import uuid
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
import yaml
import logging
# Root logger: records go to the log file, which is truncated on start-up.
logging.basicConfig(
    filename='app_similarity_score.log',
    filemode='w',
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# getLogger returns the same object every time this cell runs, so remove the
# handlers attached by a previous run instead of stacking new ones on top.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

console_handler = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.DEBUG)

file_handler = logging.FileHandler("app_similarity_score.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)

# The root logger already owns a handler for the same file (basicConfig above).
# Without this, every record from `logger` is written to the file twice:
# once by file_handler and once more after propagating to the root handler.
logger.propagate = False
# Resolve the location named 'Resume-Matcher' through the project helper,
# then build the path of its scripts/similarity folder, where config.yml is read from.
cwd = find_path('Resume-Matcher')
config_path = os.path.join(cwd, "scripts", "similarity")


def read_config(filepath):
    """Load a YAML configuration file.

    This definition replaces the ``read_config`` imported from
    ``scripts.similarity`` at the top of the notebook.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load``, or ``None`` when the file
        is missing, is not valid YAML, or cannot be read. Failures are logged,
        never raised, so callers must check for ``None``.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(filepath, encoding="utf-8") as f:
            config = yaml.safe_load(f)
        return config
    except FileNotFoundError as e:
        logger.error(f"Configuration file {filepath} not found: {e}")
    except yaml.YAMLError as e:
        logger.error(
            f"Error parsing YAML in configuration file {filepath}: {e}", exc_info=True)
    except Exception as e:
        logger.error(f"Error reading configuration file {filepath}: {e}")
    return None


config = read_config(config_path + "/config.yml")
if config is None:
    # read_config logs the cause and returns None; stop here with a clear
    # message instead of failing below with "'NoneType' object is not subscriptable".
    raise RuntimeError(
        f"Could not load {config_path}/config.yml - see app_similarity_score.log for the cause")

# NOTE(review): PROJECT_ID is filled from the 'api_key' entry of the 'vertex'
# section - confirm that this is the intended key.
PROJECT_ID = config['vertex']['api_key']
REGION = config['vertex']['REGION']

In [4]:
# Position, within the list of job description PDFs, of the posting to select.
FULL_STACK = 0

In [6]:
import os

# Input folders, relative to the working directory
job_desc_directory = "Data/JobDescription/"
resumes_directory = "Data/Resumes/"

# Keep only the PDF entries of each folder.
# NOTE(review): os.listdir returns entries in arbitrary order, so positions in
# these lists are not guaranteed to be stable across machines.
job_desc_files = list(
    filter(lambda name: name.endswith('.pdf'), os.listdir(job_desc_directory)))
resume_files = list(
    filter(lambda name: name.endswith('.pdf'), os.listdir(resumes_directory)))

# The job description to work with is picked by position
job_desc_file = job_desc_files[FULL_STACK]

## Connect your MongoDB database

In [None]:
# NOTE(review): `client` is not defined in any visible cell; presumably a
# MongoDB client created elsewhere - it must exist before this cell runs.
# Choose or create a database named 'resume_db'
db = client['resume_db']

# Choose or create a collection named 'candidates'
candidates_collection = db['candidates']

In [None]:
# Choose or create a database named 'job_db'
# Rebinds `db`: from here on it refers to 'job_db', no longer 'resume_db'.
db = client['job_db']
# Choose or create a collection named 'job'
job_collection = db['job']

In [7]:
# Raw text of every job description PDF listed in job_desc_files
# job_desc_text = ReadPdf.read_single_pdf("Data/JobDescription/" + job_desc_file)
job_desc_text = [ReadPdf.read_single_pdf(os.path.join(
    "Data/JobDescription/", job_desc_file)) for job_desc_file in job_desc_files]
# Process the single selected job description. `job_desc_file` here is the
# module-level selection: the comprehension variable above does not leak.
job_desc_processor = JobDescriptionProcessor(job_desc_file)
job_desc_processed = job_desc_processor._read_job_desc()

# Every regular file in the folder (fixed: the original read `o s.listdir`,
# which is a syntax error and stopped the whole cell from running).
job_files = [f for f in os.listdir(
    "Data/JobDescription/") if os.path.isfile(os.path.join("Data/JobDescription/", f))]


# Process each job description, persist it as JSON, and keep the result in memory
job_processed = []
for job_file in job_files:
    job_processor = JobDescriptionProcessor(job_file)
    job_data = job_processor._read_job_desc()
    job_processor._write_json_file(job_data)
    job_processed.append(job_data)

In [10]:
# Display the processed file names
job_files

['job_desc_full_stack_engineer.pdf',
 'job_desc_front_end_engineer.pdf',
 'job_desc_java_developer.pdf',
 'job_desc_product_manager.pdf']

In [8]:
# Display the processed record of the selected job description
job_desc_processed

{'unique_id': '2e3cb665-8a86-40d0-b721-2fc4114c04d3',
 'job_desc_data': 'Job Description: Senior Full Stack Engineer (5+ Years of\nExperience)\nTech Solutions, San Francisco, CA, USA\nAbout Us\nTech Solutions is a leading technology company that creates innovative solutions across a variety of industries.\nOur mission is to improve lives through advanced technology. We’re currently seeking a Senior Full Stack\nEngineer to join our dynamic team.\nJob Description\nWe’re looking for a Senior Full Stack Engineer with 5+ years of experience in developing web applications.\nThe successful candidate will have experience working with both front-end and back-end technologies, and\nwill be capable of overseeing projects from conception to production deployment.\nResponsibilities\n•Developing front end website architecture.\n•Designing user interactions on web pages.\n•Developing back end website applications.\n•Creating servers and databases for functionality.\n•Ensuring cross-platform optimizat

In [None]:

# Raw text of every resume currently listed in resume_files
resumes_text = list(map(
    lambda pdf_name: ReadPdf.read_single_pdf(
        os.path.join("Data/Resumes/", pdf_name)),
    resume_files))


# Re-list the folder, this time keeping every regular file
resume_files = list(filter(
    lambda entry: os.path.isfile(os.path.join("Data/Resumes/", entry)),
    os.listdir("Data/Resumes/")))

# Process each resume, persist it as JSON, and keep the result in memory
resumes_processed = []
for resume_file in resume_files:
    resume_processor = ResumeProcessor(resume_file)
    resume_data = resume_processor._read_resumes()
    resume_processor._write_json_file(resume_data)
    resumes_processed += [resume_data]

In [None]:
def extract_candidate_name_from_filename(filename: str) -> str:
    """Derive a display name from an underscore-separated file name.

    Everything before the last underscore is treated as the name; the final
    chunk (the position, plus the extension) is dropped.

    Args:
        filename: File name such as ``"john_doe_engineer.pdf"``.

    Returns:
        The name with underscores turned into spaces, title-cased
        (``"John Doe"``). When the file name contains no underscore, the
        file stem is used instead of returning an empty name.
    """
    name_part, separator, _ = filename.rpartition('_')
    if not separator:
        # No position suffix to strip: fall back to the stem so the result
        # is not an empty string (which is useless as a name or vector id).
        name_part = os.path.splitext(filename)[0]
    return name_part.replace('_', ' ').title()

In [None]:
# One display name per resume file, in listing order
candidate_names = list(
    map(extract_candidate_name_from_filename, resume_files))
candidate_names

In [None]:
# Store each candidate's display name on the matching processed resume
# (the two lists are paired by position).
for i, resume_record in enumerate(resumes_processed):
    resume_record['name'] = candidate_names[i]

In [None]:
# Persist the processed resumes.
# NOTE(review): nothing visible clears the collection first, so each full run
# presumably adds another copy of every resume - confirm.
candidates_collection.insert_many(resumes_processed)

In [None]:
def extract_jobdes_name_from_filename(filename: str) -> str:
    """Derive a display title from an underscore-separated job file name.

    Everything before the last underscore is kept; the final chunk (plus the
    extension) is dropped.

    Args:
        filename: File name such as ``"job_desc_full_stack_engineer.pdf"``.

    Returns:
        The kept part with underscores turned into spaces, title-cased
        (``"Job Desc Full Stack"``). When the file name contains no
        underscore, the file stem is used instead of returning an empty title.
    """
    title_part, separator, _ = filename.rpartition('_')
    if not separator:
        # Nothing to strip: fall back to the stem rather than an empty title.
        title_part = os.path.splitext(filename)[0]
    return title_part.replace('_', ' ').title()

In [None]:
# Titles are paired by position with job_processed, so derive them from
# job_files - the listing job_processed was built from - rather than from
# job_desc_files, which is filtered differently (PDF suffix only).
job_des = [extract_jobdes_name_from_filename(
    job_file_name) for job_file_name in job_files]
# Drop the common 'Job Desc ' prefix, leaving e.g. 'Full Stack'
job_des = [item.replace('Job Desc ', '') for item in job_des]

In [None]:
# Store each title on the matching processed job description (paired by position)
for i, job_record in enumerate(job_processed):
    job_record['job_title'] = job_des[i]

In [None]:
# Persist the processed job descriptions.
# NOTE(review): nothing visible clears the collection first, so each full run
# presumably adds another copy of every job description - confirm.
job_collection.insert_many(job_processed)

In [None]:
# Read every job document back, projecting only the fields used below
# ('_id' is excluded explicitly).
job_des_parse_data = job_collection.find({}, {'unique_id':1,'clean_data':1,'extracted_keywords':1,'_id':0,'job_title':1})

In [None]:
# Materialise the query result so it can be indexed and iterated more than once
job_des_parse_data = list(job_des_parse_data)

In [None]:
# Map each job title, written as a constant-style name ('Full Stack' -> 'FULL_STACK'),
# to the position of its document in job_des_parse_data.
job_title_to_index = {
    job_doc['job_title'].upper().replace(' ', '_'): position
    for position, job_doc in enumerate(job_des_parse_data)
}
# unique_id of every job document, in the same order
job_unique_id = [job_doc['unique_id'] for job_doc in job_des_parse_data]

# Example lookup:
# index_for_full_stack = job_title_to_index['FULL_STACK']

In [None]:
# Get all the candidates from the MongoDB collection, projecting only the
# fields used below ('_id' is excluded explicitly).
candidates_parse_data = candidates_collection.find(
    {}, {"name": 1, "unique_id": 1, "_id": 0, "clean_data": 1, "extracted_keywords": 1})

In [None]:
# Materialise the query result so it can be iterated more than once
candidates_parse_data = list(candidates_parse_data)

In [None]:
# Second name for the same list object (an alias, not a copy)
candidates_parse_data_holder = candidates_parse_data

In [None]:
# unique_id of every candidate document, in the order the documents were returned
candidates_unique_id = [
    candidate_doc['unique_id'] for candidate_doc in candidates_parse_data
]

In [None]:
# One space-separated keyword string per candidate document, in document order
resumes_keywords_str_list = [
    ' '.join(candidate_doc['extracted_keywords'])
    for candidate_doc in candidates_parse_data_holder
]

In [None]:
# Space-join the 'clean_data' field of every candidate document.
# NOTE(review): if 'clean_data' is a plain string rather than a list of
# tokens, this join puts a space between every character - confirm the type.
resumes_clean_data_str_list = []
for document in candidates_parse_data:
    resumes_clean_data_str_list.append(' '.join(document['clean_data']))

In [None]:
# One space-separated keyword string per job description, in document order.
# Fixed: the original loop joined `document['extracted_keywords']`, where
# `document` was the variable left over from an earlier loop over candidates,
# so every job description received the last resume's keywords.
job_desc_keywords_str_list = [
    ' '.join(job_doc['extracted_keywords'])
    for job_doc in job_des_parse_data
]
    

In [None]:
# Take the names from the same database records that supply the unique ids
# and the embedded keywords, so the three stay aligned position by position.
# Names rebuilt from the directory listing can differ in order or count from
# what the collection returns.
candidate_names = [candidate_doc['name']
                   for candidate_doc in candidates_parse_data]

In [None]:
# Embedding model wrapper, created with the library's default settings
embeddings = HuggingFaceEmbeddings()

In [None]:
# Embed the job description keyword strings: one vector per string,
# in the same order as job_desc_keywords_str_list.
job_desc_embeddings = embeddings.embed_documents(job_desc_keywords_str_list)

In [None]:
# Embed the resume keyword strings: one vector per string,
# in the same order as resumes_keywords_str_list.
resumes_embeddings = embeddings.embed_documents(resumes_keywords_str_list)

## Connect your Pinecone database

In [None]:
import pinecone

# Configure the Pinecone client; the API key is read from config.yml,
# the environment name is fixed here.
pinecone.init(
	api_key= config['pinecone']['api_key'],
	environment='gcp-starter'
)

# Handle to the "jobmatcher" index.
# NOTE(review): presumably the index must already exist - confirm.
index = pinecone.Index(index_name="jobmatcher")

In [None]:
# Position of the 'FULL_STACK' job document within job_des_parse_data
index_for_full_stack = job_title_to_index['FULL_STACK']

In [None]:
# Vector ids for the job descriptions ('Full Stack' -> 'Full_Stack').
# Built from the database records themselves so that ids, unique ids and
# embeddings - all paired by position - come from the same documents in the
# same order, instead of from the directory listing.
job_des_underscore = [job_doc['job_title'].replace(' ', '_')
                      for job_doc in job_des_parse_data]

In [None]:
# Vector ids for the candidates: the display name with spaces turned into underscores
candidate_names_underscore = [
    '_'.join(name.split(' ')) for name in candidate_names
]

In [None]:
# Sanity check: these three lists are combined position by position,
# so their lengths are expected to match.
print(len(job_desc_embeddings), len(job_des_underscore), len(job_unique_id))

In [None]:
# Container for the job description vectors, each with its id and metadata
job_vectors_with_metadata = []

In [None]:
# The three inputs are paired by position; a length mismatch means they no
# longer describe the same documents, so report it instead of hiding it.
if not (len(job_desc_embeddings) == len(job_des_underscore) == len(job_unique_id)):
    logger.warning(
        "Job vector inputs differ in length (embeddings=%d, ids=%d, unique_ids=%d); "
        "extra entries are ignored",
        len(job_desc_embeddings), len(job_des_underscore), len(job_unique_id))

# Rebuild the list from scratch so that re-running this cell cannot append a
# second copy of every vector. zip stops at the shortest input, which matches
# the previous "break when any list runs out" behaviour.
job_vectors_with_metadata = [
    {
        "id": vector_id,
        "values": embedding,
        "metadata": {"unique_id": unique_id}
    }
    for vector_id, embedding, unique_id in zip(
        job_des_underscore, job_desc_embeddings, job_unique_id)
]

In [None]:
# Display the assembled job vectors (the output includes every embedding value)
job_vectors_with_metadata

In [None]:
# One record per candidate: vector id, embedding values and the unique_id as
# metadata. The inputs are paired by position; zip stops at the shortest one,
# so surplus entries in a longer list are ignored.
candidates_vectors_with_metadata = [
    {
        "id": vector_id,
        "values": embedding,
        "metadata": {"unique_id": unique_id}
    }
    for embedding, vector_id, unique_id in zip(
        resumes_embeddings, candidate_names_underscore, candidates_unique_id)
]

In [None]:
# Send job and candidate vectors to the index in a single upsert call
all_vectors_with_metadata = job_vectors_with_metadata + candidates_vectors_with_metadata
index.upsert(vectors=all_vectors_with_metadata)

In [None]:
# Read the candidate vectors back from the index by id
candidates_fetched_vectors = index.fetch(ids=candidate_names_underscore)

In [None]:
# Read the job description vectors back from the index by id
jobs_fetched_vectors = index.fetch(ids=job_des_underscore)

In [None]:
# Vector id -> embedding values for every fetched job description
all_job_vectors = {
    vector_id: record['values']
    for vector_id, record in jobs_fetched_vectors['vectors'].items()
}

In [None]:
# Vector id -> embedding values for every fetched candidate
all_candidate_vectors = {
    vector_id: record['values']
    for vector_id, record in candidates_fetched_vectors['vectors'].items()
}

In [None]:
# Display the stored vector for the 'Full_Stack' job (prints every value)
all_job_vectors['Full_Stack']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    """Return the cosine similarity between two embedding vectors.

    Each vector is wrapped as a one-row matrix for ``cosine_similarity``;
    the single cell of the resulting 1x1 matrix is the score.
    """
    return cosine_similarity([embedding1], [embedding2])[0][0]

In [None]:

# Reference vector: the stored embedding of the 'Full_Stack' job
full_stack_vector = all_job_vectors['Full_Stack']

# Score every candidate against the reference vector
similarities_with_names = [
    {
        "name": candidate_name,
        "similarity": cosine_similarity(
            [full_stack_vector], [candidate_vector])[0][0]
    }
    for candidate_name, candidate_vector in all_candidate_vectors.items()
]

In [None]:
# Best match first
ranked_candidates = sorted(similarities_with_names,
                           key=lambda x: x["similarity"], reverse=True)

# Print the ranked candidates, numbering from 1.
# The counter is named `rank`, not `index`: `index` holds the Pinecone index
# handle, and reusing the name here would replace it with an integer.
for rank, candidate in enumerate(ranked_candidates, 1):
    print(
        f"Candidate {rank}: {candidate['name']}, Similarity Score: {candidate['similarity']:.4f}")

In [None]:
import csv

# Destination of the ranking export
file_path = 'Data/Result/ranked_candidates_hugging.csv'

# Make sure the output folder exists; open() does not create directories
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Header row followed by one row per candidate, best match first.
# The counter is named `rank`, not `index`, so the Pinecone index handle
# bound to `index` is left intact.
rows = [["Rank", "Candidate Name", "Similarity Score"]]
rows.extend(
    [rank, candidate['name'], candidate['similarity']]
    for rank, candidate in enumerate(ranked_candidates, 1)
)

# newline='' lets the csv module control line endings; UTF-8 keeps names with
# non-ASCII characters writable regardless of the platform default encoding.
with open(file_path, "w", newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(rows)