In [1]:
import json
import os
from typing import List
import networkx as nx
import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from annotated_text import annotated_text, parameters
from streamlit_extras import add_vertical_space as avs
from streamlit_extras.badges import badge
from scripts.similarity import get_similarity_score, find_path, read_config
from scripts.utils import get_filenames_from_dir
from scripts import ReadPdf, JobDescriptionProcessor, ResumeProcessor, KeytermsExtraction
import cohere
from scripts.KeytermsExtraction import KeytermExtractor
from scripts.similarity.get_similarity_score import get_similarity_score
import uuid
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
FULL_STACK = 0

In [4]:
import yaml
import logging
logging.basicConfig(
    filename='app_similarity_score.log',
    filemode='w',
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

console_handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.DEBUG)

file_handler = logging.FileHandler("app_similarity_score.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)
cwd = find_path('Resume-Matcher')
config_path = os.path.join(cwd, "scripts", "similarity")
def read_config(filepath):
    try:
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return config
    except FileNotFoundError as e:
        logger.error(f"Configuration file {filepath} not found: {e}")
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML in configuration file {filepath}: {e}", exc_info=True)
    except Exception as e:
        logger.error(f"Error reading configuration file {filepath}: {e}")
    return None
config = read_config(config_path + "/config.yml")
PROJECT_ID = config['vertex']['api_key']
REGION = config['vertex']['REGION']

In [5]:
import os

job_desc_directory = "Data/JobDescription/"
resumes_directory = "Data/Resumes/"

# Automatically get all job description and resume files
job_desc_files = [file for file in os.listdir(job_desc_directory) if file.endswith('.pdf')]
resume_files = [file for file in os.listdir(resumes_directory) if file.endswith('.pdf')]

job_desc_file = job_desc_files[FULL_STACK]  # This will select the first job description file


In [6]:

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

uri = "mongodb+srv://liuchuning831:VradLfR8Obkv8YWR@cluster0.l9wytxq.mongodb.net/?retryWrites=true&w=majority"

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)
    
    

Pinged your deployment. You successfully connected to MongoDB!


In [7]:
# Choose or create a database named 'resume_db'
db = client['resume_db']

# Choose or create a collection named 'candidates'
candidates_collection = db['candidates']


In [8]:
# Choose or create a database named 'job_db'
db = client['job_db']
job_collection = db['job']

In [9]:
# Define paths
# Read raw job description
#job_desc_text = ReadPdf.read_single_pdf("Data/JobDescription/" + job_desc_file)
job_desc_text = [ReadPdf.read_single_pdf(os.path.join("Data/JobDescription/", job_desc_file)) for job_desc_file in job_desc_files]
# Process job description
job_desc_processor = JobDescriptionProcessor(job_desc_file)
job_desc_processed = job_desc_processor._read_job_desc()

job_files = [f for f in os.listdir("Data/JobDescription/") if os.path.isfile(os.path.join("Data/JobDescription/", f))]


job_processed = []
for job_file in job_files:
    job_processor = JobDescriptionProcessor(job_file)
    job_data = job_processor._read_job_desc()
    job_processor._write_json_file(job_data)
    job_processed.append(job_data)

In [12]:

resumes_text = [ReadPdf.read_single_pdf(os.path.join("Data/Resumes/", resume_file)) for resume_file in resume_files]


# Process resumes
resume_files = [f for f in os.listdir("Data/Resumes/") if os.path.isfile(os.path.join("Data/Resumes/", f))]

resumes_processed = []
for resume_file in resume_files:
    resume_processor = ResumeProcessor(resume_file)
    resume_data = resume_processor._read_resumes()
    resume_processor._write_json_file(resume_data)
    resumes_processed.append(resume_data)





In [70]:
import openai
import time
# Set up the OpenAI API key 
openai.api_key = config['openai']['api_key']





In [71]:
def extract_resume_keywords(resume_text):
    # Your old version code here
    prompt = (f"Given the following resume content:\n"
              f"'{resume_text}'\n\n"
              "Identify and list key skills, experiences, qualifications, 50 keywors based on the work qulification and any other relevant keywords. "
              "Please provide the output as a comma-separated list.")
    
    # Call the OpenAI Chat API
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=[
            {"role": "system", "content": "You are a helpful assistant to help me get the keyword from resume."},
            {"role": "user", "content": prompt}
        ]
    )

    # Check and handle response format
    if 'choices' in response and len(response.choices) > 0 and 'message' in response.choices[0]:
        keywords = response.choices[0].message['content'].strip().split(', ')
        return keywords
    else:
        raise ValueError("Unexpected response format from OpenAI")

In [13]:
def extract_candidate_name_from_filename(filename: str) -> str:
    name_parts = filename.split('_')[:-1]  # Exclude the last part which is the position
    return ' '.join(name_parts).title()

In [14]:
candidate_names = [extract_candidate_name_from_filename(resume_file) for resume_file in resume_files]
candidate_names

['Yiching Liu',
 'Angela Zhu',
 'Maria Chinkan',
 'Arpi Melik Parsadanyan',
 'Zihui Lin',
 'Jose Felix Villasenor',
 'Anna Gasparyan',
 'Amitesh Rathore',
 'Yunrui Shao',
 'Jaykumar',
 'Minyue Yao',
 'John',
 'Zane Rouguine',
 'Brandon Penner',
 'Zhe Wang',
 'Tsubasa Lin',
 'Danny Mai',
 'David Boutwell',
 'Alexandra',
 'Sarah Sherman',
 'David Botbol',
 'Ryan Pintar',
 'Eloise Yu',
 'Vasil Klimovich',
 'Ming Jin',
 'Robert Scozzari',
 'Cody Romero',
 'Carnell Brame',
 'Timothy Wang',
 'Nico Santoso',
 'Shirley Zhao',
 'Yuan Wang',
 'Divya Harshini',
 'Deekshitha Pullaiah',
 'Meredith Cheng',
 'Grace Li',
 'Anya Hsu',
 'Nandini Seth',
 'Andrew Knuppel',
 'John Hinnegan',
 'Sharad Dangol',
 'Salvador Campos',
 'Lauren Aubrey Lee',
 'Xiao Li',
 'Bruce Wayne',
 'Barry Allen',
 'Balraj Rai',
 'Annie Zhou',
 'Federico De Marines',
 'Ray Lee',
 'Jagriti Sharma',
 'Michelle Wang',
 'Galen Fink',
 'Dennis Mo',
 'Mengyao Zhang',
 'Johann C',
 'Yixin-Ying',
 'Peggy Lai',
 'Yuanhuang Lo',
 'Serle

In [15]:
# resumes_processed[0]['name']

for i in range(len(resumes_processed)):
    resumes_processed[i]['name'] = candidate_names[i]

In [None]:
# Insert all the resumes with UID into the MongoDB collection
candidates_collection.insert_many(resumes_processed)



<pymongo.results.InsertManyResult at 0x2cf7f9360>

In [16]:
def extract_jobdes_name_from_filename(filename: str) -> str:
    name_parts = filename.split('_')[:-1]  # Exclude the last part which is the position
    return ' '.join(name_parts).title()

In [17]:
job_des = [extract_jobdes_name_from_filename(job_desc_file) for job_desc_file in job_desc_files]
job_des = [item.replace('Job Desc ', '') for item in job_des]


In [18]:

for i in range(len(job_processed)):
    job_processed[i].update({'job_title':job_des[i]})

In [431]:
# Insert all the job descriptions with UID into the MongoDB collection
job_collection.insert_many(job_processed)

<pymongo.results.InsertManyResult at 0x2cf03b190>

In [19]:
job_des_parse_data = job_collection.find({},{'unique_id':1,'clean_data':1,'extracted_keywords':1,'_id':0,'job_title':1})

In [20]:
job_des_parse_data = list(job_des_parse_data)

In [21]:
job_title_to_index = {}
job_unique_id = []

for index, data in enumerate(job_des_parse_data):
    # Convert the job_title to uppercase and replace spaces with underscores
    constant_name = data['job_title'].upper().replace(' ', '_')
    job_unique_id.append(data['unique_id'])
    # Set the constant name as a key in the dictionary with its index as the value
    job_title_to_index[constant_name] = index

# To access a particular index:
# index_for_full_stack = job_title_to_index['FULL_STACK']


In [22]:
# Get all the candidates from the MongoDB collection
candidates_parse_data = candidates_collection.find({}, {"name": 1, "unique_id": 1, "_id": 0,"clean_data":1, "extracted_keywords": 1})



In [23]:
candidates_parse_data = list(candidates_parse_data)

In [24]:
candidates_parse_data_holder = candidates_parse_data

In [25]:
candidates_unique_id = []

for index, data in enumerate(candidates_parse_data):
    # Convert the job_title to uppercase and replace spaces with underscores
    candidates_unique_id.append(data['unique_id'])
    # Set the constant name as a key in the dictionary with its index as the value

In [26]:
# Initialize an empty list for resumes_keywords if it doesn't exist yet


# Iterate over the cursor and update resumes_keywords
# Initialize an empty list for resumes_keywords if it doesn't exist yet
# Initialize an empty list for resumes_keywords if it doesn't exist yet
resumes_keywords_str_list = []

# Iterate over the cursor and update resumes_keywords
for document in candidates_parse_data_holder:
    keyword_string = ' '.join(document['extracted_keywords'])
    resumes_keywords_str_list.append(keyword_string)
    #resumes_keywords.append(document['extracted_keywords'])



# for index in range(len(candidates_parse_data)):
#     candidates_parse_data[index]['extracted_keywords'] = resumes_keywords[index]

In [27]:
resumes_clean_data_str_list = []

# Iterate over the cursor and update resumes_keywords
for document in candidates_parse_data:
    keyword_string = ' '.join(document['clean_data'])
    resumes_clean_data_str_list.append(keyword_string)

In [74]:
# resume_keywords_dict = {}

# for index, resume_text in enumerate(resumes_clean_data_str_list, 1):
#     keywords = extract_resume_keywords(resume_text)
#     resume_keywords_dict[f"Resume_{index}"] = keywords

# Now, resume_keywords_dict contains the extracted keywords for each resume
   
    
BATCH_SIZE = 1  # number of resumes to process in one batch
BATCH_DELAY = 3  # time to wait between batches, in seconds

resume_keywords_dict = {}

# Iterate over the resumes in batches
for i in range(0, len(resumes_clean_data_str_list), BATCH_SIZE):
    batch = resumes_clean_data_str_list[i:i + BATCH_SIZE]
    
    for index, resume_text in enumerate(batch, start=i+1):
        try:
            keywords = extract_resume_keywords(resume_text)
            resume_keywords_dict[f"Resume_{index}"] = keywords
        except Exception as e:
            print(f"Error processing Resume_{index}: {e}")
    
    # Sleep for some time before processing the next batch (except for the last batch)
    if i + BATCH_SIZE < len(resumes_clean_data_str_list):
        time.sleep(BATCH_DELAY)


for key, value in resume_keywords_dict.items():
    print(f"{key}: {value}")

Error processing Resume_13: The server is overloaded or not ready yet.
Error processing Resume_23: HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
)
Error processing Resume_70: The server is overloaded or not ready yet.
Error processing Resume_74: The server is overloaded or not ready yet.
Error processing Resume_79: The server is overloaded or not ready yet.
Resume_1: ['key skills', 'interviewing', 'UI', 'UX', 'UX research', 'UXUI', 'UX designer', 'digital frontdoor', '3D web design\nexperiences', 'Taelor\nqualifications', '大同大學\nlanguages', '台灣\nother relevant keywords', 'LinkedIn', 'San Francisco Bay Area', 'Jubo Health']
Resume_2: ['Key skills', 'experiences', 'qualifications', 'languages', 'and other relevant keywords extracted from the resume are:\n\n- Collaborative Problem Solver\n- Translating Technical Data for Non-Technical Audiences\n- Excellent Communi

In [75]:
gpt_keywords_list = list(resume_keywords_dict.values())

In [28]:
# Convert the extracted keywords into strings
#job_desc_keywords_str = ' '.join([keyword[0] for keyword in job_desc_keywords])
job_desc_keywords_str_list = []
for index in range(len(job_des_parse_data)):
    job_desc_keywords_str_list.append(job_des_parse_data[index]['clean_data'])
#job_desc_keywords_str = job_des_parse_data[0]['clean_data']

In [29]:
candidate_names = [extract_candidate_name_from_filename(resume_file) for resume_file in resume_files]

In [78]:
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials

In [79]:
# Path to your service account key file
key_path = '../Resume-Matcher/vertexai-401621-937aefae595d.json' #Path to the json key associated with your service account from google cloud

In [80]:
# Create credentials object

credentials = Credentials.from_service_account_file(
    key_path,
    scopes=['https://www.googleapis.com/auth/cloud-platform'])

if credentials.expired:
    credentials.refresh(Request())

In [81]:
import vertexai
import numpy as np
# initialize vertex
vertexai.init(project = PROJECT_ID, location = REGION, credentials = credentials)

In [82]:
from vertexai.language_models import TextEmbeddingModel
from sklearn.metrics.pairwise import cosine_similarity
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [86]:
gpt_keywords_string_list = [' '.join(keywords) for keywords in gpt_keywords_list]


In [None]:
# Get embeddings using the TextEmbeddingModel
# def get_vertex_embeddings(texts: List[str]) -> List[List[float]]:
#     embeddings = embedding_model.get_embeddings(texts)
#     return embeddings
# def get_vertex_embeddings(texts: List[str]) -> List[List[float]]:
#     embedding_objects = embedding_model.get_embeddings(texts)
#     embeddings = [obj.values for obj in embedding_objects]
#     return embeddings
def get_vertex_embeddings(texts: List[str]) -> List[List[float]]:
    CHUNK_SIZE = 5  # Maximum number of instances allowed per prediction
    embeddings = []

    # Split the texts into chunks of size CHUNK_SIZE
    for i in range(0, len(texts), CHUNK_SIZE):
        chunk = texts[i:i + CHUNK_SIZE]
        chunk_embeddings = embedding_model.get_embeddings(chunk)
        embeddings.extend(chunk_embeddings)
        embedding_value = [obj.values for obj in embeddings]
    #print(embeddings)  # Add this line to inspect the embeddings
    return embedding_value

# Compute cosine similarity
def compute_cosine_similarity(embedding1: List[float], embedding2: List[float]) -> float:
    similarity = cosine_similarity([embedding1], [embedding2])[0][0]
    return similarity

# Get embeddings for job description and resumes
job_desc_embeddings = get_vertex_embeddings(job_desc_keywords_str_list)
#resumes_embeddings = get_vertex_embeddings(resumes_keywords_str_list)
resumes_embeddings_whole = get_vertex_embeddings(resumes_clean_data_str_list)
#resumes_embeddings_gpt = get_vertex_embeddings(gpt_keywords_list)

In [88]:
resumes_embeddings_gpt = get_vertex_embeddings(gpt_keywords_string_list)

In [89]:
import pinecone      

pinecone.init(      
	api_key='75a74122-c7d4-470d-a892-df788c806fb7',      
	environment='gcp-starter'      
)    

index = pinecone.Index(index_name="jobmatcher")

In [90]:
index_for_full_stack = job_title_to_index['FULL_STACK']

In [91]:
job_des_underscore = [name.replace(' ', '_') for name in job_des]

In [92]:
candidate_names_underscore = [name.replace(' ', '_') for name in candidate_names]

In [93]:
job_vectors_with_metadata = []

for i, embedding in enumerate(job_desc_embeddings):
    vector_data = {
        "id": job_des_underscore[i],
        "values": embedding,
        "metadata": {"unique_id": job_unique_id[i]}
    }
    job_vectors_with_metadata.append(vector_data)



In [468]:
candidates_vectors_with_metadata = []

for i, embedding in enumerate(resumes_embeddings):
    vector_data = {
        "id": candidate_names_underscore[i],
        "values": embedding,
        "metadata": {"unique_id": candidates_unique_id[i]}
    }
    candidates_vectors_with_metadata.append(vector_data)


In [94]:
candidates_vectors_with_metadata_whole = []

for i, embedding in enumerate(resumes_embeddings_whole):
    vector_data = {
        "id": candidate_names_underscore[i],
        "values": embedding,
        "metadata": {"unique_id": candidates_unique_id[i]}
    }
    candidates_vectors_with_metadata_whole.append(vector_data)

In [98]:
candidates_vectors_with_metadata_gpt = []

for i, embedding in enumerate(resumes_embeddings_gpt):
    vector_data = {
        "id": candidate_names_underscore[i],
        "values": embedding,
        "metadata": {"unique_id": candidates_unique_id[i]}
    }
    candidates_vectors_with_metadata_gpt.append(vector_data)

In [469]:
all_vectors_with_metadata = job_vectors_with_metadata + candidates_vectors_with_metadata
index.upsert(vectors=all_vectors_with_metadata)

{'upserted_count': 97}

In [47]:
all_vectors_with_metadata_whole = job_vectors_with_metadata + candidates_vectors_with_metadata_whole
index.upsert(vectors=all_vectors_with_metadata_whole)

{'upserted_count': 97}

In [97]:
all_vectors_with_metadata_gpt = job_vectors_with_metadata + candidates_vectors_with_metadata_gpt
index.upsert(vectors=all_vectors_with_metadata_gpt)

{'upserted_count': 92}

In [470]:

# List of candidate IDs associated with a specific job
candidates_fetched_vectors = index.fetch(ids=candidate_names_underscore)


In [49]:
candidates_fetched_vectors_whole = index.fetch(ids=candidate_names_underscore)

In [100]:
candidates_fetched_vectors_gpt = index.fetch(ids=candidate_names_underscore)

In [101]:
jobs_fetched_vectors = index.fetch(ids=job_des_underscore)

In [51]:
all_job_vectors = {}

for name, data in jobs_fetched_vectors['vectors'].items():
    all_job_vectors[name] = data['values']

In [473]:
all_candidate_vectors = {}

for name, data in candidates_fetched_vectors['vectors'].items():
    all_candidate_vectors[name] = data['values']


In [53]:
all_candidate_vectors_whole = {}

for name, data in candidates_fetched_vectors_whole['vectors'].items():
    all_candidate_vectors_whole[name] = data['values']


In [102]:
all_candidate_vectors_gpt = {}

for name, data in candidates_fetched_vectors_gpt['vectors'].items():
    all_candidate_vectors_gpt[name] = data['values']

In [294]:
#IN THEN FUTURE IF WE USE TWO INDEX



# job_id_to_query = job_unique_id[index_for_full_stack]
# job_embedding_to_query = [vector_data["values"] for vector_data in job_vectors_with_metadata if vector_data["metadata"]["unique_id"] == job_id_to_query][0]

# number_of_candidates = len(candidates_vectors_with_metadata)  # retrieve all candidates
# results = index.query(
#     vector=job_embedding_to_query,
#     # filter=filter_criteria,  # Uncomment if using filtering
#     top_k=number_of_candidates,
#     include_metadata=True
# )


In [54]:
all_job_vectors['Full_Stack']

[-0.00600681966,
 -0.0602209456,
 0.00730065349,
 0.00343397679,
 0.0369784907,
 -0.024647234,
 0.0337498747,
 0.0280820262,
 -0.00348442188,
 0.0497242,
 0.0335967094,
 0.0138770649,
 0.00179614243,
 0.0163046811,
 0.00875881,
 -1.93673659e-05,
 -0.0325206108,
 -0.0457260609,
 0.0238750316,
 0.0153263081,
 -0.0843152627,
 -0.0332679041,
 0.0292509608,
 -0.0211599767,
 0.00532217044,
 -0.0366266966,
 -0.0147755314,
 0.00305338739,
 0.00274453871,
 0.0494503453,
 -0.0096470518,
 0.0234567169,
 -0.0499196388,
 0.00721335178,
 0.012653674,
 0.017226208,
 -0.0176005196,
 0.0368479304,
 -0.0388800129,
 0.0188576169,
 0.0325558521,
 -0.0422664806,
 0.0303553157,
 -0.00696607959,
 -0.0294357128,
 -0.0121896444,
 -0.0388545059,
 0.0257240552,
 0.00547146611,
 -0.0778098777,
 -0.00292169303,
 -0.0179530513,
 0.00719748344,
 0.0270367954,
 -0.0209934581,
 0.0191249941,
 -0.058459878,
 0.039463032,
 -0.0542196408,
 -0.0192588419,
 0.00980851147,
 -0.03212687,
 0.020884769,
 -0.0739424154,
 0.0329

In [475]:

similarities_with_names = []

# Assuming all_job_vectors['Full_Stack'] gives you the vector for the 'Full_Stack' job.
full_stack_vector = all_job_vectors['Full_Stack']

for candidate_name, candidate_vector in all_candidate_vectors.items():
    similarity_score = cosine_similarity([full_stack_vector], [candidate_vector])[0][0]
    similarities_with_names.append({
        "name": candidate_name,
        "similarity": similarity_score
    })


In [56]:
similarities_with_names_whole = []

# Assuming all_job_vectors['Full_Stack'] gives you the vector for the 'Full_Stack' job.
full_stack_vector = all_job_vectors['Full_Stack']

for candidate_name, candidate_vector in all_candidate_vectors_whole.items():
    similarity_score = cosine_similarity([full_stack_vector], [candidate_vector])[0][0]
    similarities_with_names_whole.append({
        "name": candidate_name,
        "similarity": similarity_score
    })

In [103]:
similarities_with_names_gpt = []

# Assuming all_job_vectors['Full_Stack'] gives you the vector for the 'Full_Stack' job.
full_stack_vector = all_job_vectors['Full_Stack']

for candidate_name, candidate_vector in all_candidate_vectors_gpt.items():
    similarity_score = cosine_similarity([full_stack_vector], [candidate_vector])[0][0]
    similarities_with_names_gpt.append({
        "name": candidate_name,
        "similarity": similarity_score
    })

In [None]:
ranked_candidates = sorted(similarities_with_names, key=lambda x: x["similarity"], reverse=True)

# Print the ranked candidates
for index, candidate in enumerate(ranked_candidates, 1):  # Starting the index from 1
    print(f"Candidate {index}: {candidate['name']}, Similarity Score: {candidate['similarity']:.4f}")


In [57]:
ranked_candidates_whole = sorted(similarities_with_names_whole, key=lambda x: x["similarity"], reverse=True)

# Print the ranked candidates
for index, candidate in enumerate(ranked_candidates_whole, 1):  # Starting the index from 1
    print(f"Candidate {index}: {candidate['name']}, Similarity Score: {candidate['similarity']:.4f}")

Candidate 1: Arif_Demirkan, Similarity Score: 0.6576
Candidate 2: John, Similarity Score: 0.6535
Candidate 3: Xinyi_Yu, Similarity Score: 0.6535
Candidate 4: Sheen_Huang, Similarity Score: 0.6505
Candidate 5: Nico_Santoso, Similarity Score: 0.6494
Candidate 6: Chenjie_Wu, Similarity Score: 0.6487
Candidate 7: Xiaoda_Li, Similarity Score: 0.6486
Candidate 8: Jeffrey_Chen, Similarity Score: 0.6482
Candidate 9: Yunrui_Shao, Similarity Score: 0.6469
Candidate 10: Galen_Fink, Similarity Score: 0.6461
Candidate 11: Jose_Felix_Villasenor, Similarity Score: 0.6461
Candidate 12: Danny_Mai, Similarity Score: 0.6461
Candidate 13: Lisa_Udechukwu, Similarity Score: 0.6459
Candidate 14: Vasil_Klimovich, Similarity Score: 0.6458
Candidate 15: Alexandra, Similarity Score: 0.6458
Candidate 16: Barry_Allen, Similarity Score: 0.6458
Candidate 17: Tsubasa_Lin, Similarity Score: 0.6456
Candidate 18: Yiwen_Ding, Similarity Score: 0.6450
Candidate 19: Dennis_Mo, Similarity Score: 0.6449
Candidate 20: Meredit

In [104]:
ranked_candidates_gpt = sorted(similarities_with_names_gpt, key=lambda x: x["similarity"], reverse=True)

# Print the ranked candidates
for index, candidate in enumerate(ranked_candidates_gpt, 1):  # Starting the index from 1
    print(f"Candidate {index}: {candidate['name']}, Similarity Score: {candidate['similarity']:.4f}")

Candidate 1: Arif_Demirkan, Similarity Score: 0.8733
Candidate 2: Chenjie_Wu, Similarity Score: 0.8536
Candidate 3: Harvey_Dent, Similarity Score: 0.8487
Candidate 4: Lauren_Aubrey_Lee, Similarity Score: 0.8467
Candidate 5: Federico_De_Marines, Similarity Score: 0.8321
Candidate 6: Xiao_Li, Similarity Score: 0.8297
Candidate 7: Adrian_Velasco, Similarity Score: 0.8250
Candidate 8: Jaykumar, Similarity Score: 0.8210
Candidate 9: Xinyi_Yu, Similarity Score: 0.8168
Candidate 10: John, Similarity Score: 0.8156
Candidate 11: Ray_Lee, Similarity Score: 0.8152
Candidate 12: Dennis_Tou, Similarity Score: 0.8121
Candidate 13: Mckenna_Bass, Similarity Score: 0.8098
Candidate 14: Timothy_Wang, Similarity Score: 0.8071
Candidate 15: Austin_Zuo, Similarity Score: 0.8037
Candidate 16: Sheen_Huang, Similarity Score: 0.8022
Candidate 17: Nandini_Seth, Similarity Score: 0.8022
Candidate 18: Alfred_Pennyworth, Similarity Score: 0.8020
Candidate 19: Yihao_Xu, Similarity Score: 0.7988
Candidate 20: Carnel

In [477]:
import csv

# Set the file path
file_path = 'Data/Result/ranked_candidates.csv'

# Prepare the data for CSV
rows = [["Rank", "Candidate Name", "Similarity Score"]]
for index, candidate in enumerate(ranked_candidates, 1):
    rows.append([index, candidate['name'], candidate['similarity']])

# Save to CSV
with open(file_path, "w", newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)



In [58]:
import csv

# Set the file path
file_path = 'Data/Result/ranked_candidates_whole.csv'

# Prepare the data for CSV
rows = [["Rank", "Candidate Name", "Similarity Score"]]
for index, candidate in enumerate(ranked_candidates_whole, 1):
    rows.append([index, candidate['name'], candidate['similarity']])

# Save to CSV
with open(file_path, "w", newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)


In [105]:
import csv

# Set the file path
file_path = 'Data/Result/ranked_candidates_gpt.csv'

# Prepare the data for CSV
rows = [["Rank", "Candidate Name", "Similarity Score"]]
for index, candidate in enumerate(ranked_candidates_gpt, 1):
    rows.append([index, candidate['name'], candidate['similarity']])

# Save to CSV
with open(file_path, "w", newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)


In [117]:
import pandas as pd
from scipy.stats import kendalltau

# Step 1: Read CSV files into pandas dataframes
# df1 = pd.read_csv('Data/Result/ranked_candidates_gpt.csv')
# df2 = pd.read_csv('Data/Result/ranked_candidates_whole.csv')
# df1 = pd.read_csv('Data/Result/ranked_candidates.csv')
# df2 = pd.read_csv('Data/Result/ranked_candidates_whole.csv')

df1 = pd.read_csv('Data/Result/ranked_candidates.csv')
df2 = pd.read_csv('Data/Result/ranked_candidates_gpt.csv')

# You might need to modify this based on your CSV structure
ranks_df1 = df1['Candidate Name'].tolist()
ranks_df2 = df2['Candidate Name'].tolist()

# Step 2: Calculate Kendall's tau distance
tau, _ = kendalltau(ranks_df1, ranks_df2)
distance = (len(ranks_df1) * (len(ranks_df1) - 1) / 2) - (tau * len(ranks_df1) * (len(ranks_df1) - 1) / 2)

print(f"Kendall's tau distance between file1 and file2: {distance}")


Kendall's tau distance between file1 and file2: 3982.0
