In [1]:
import json
import os
from typing import List
import networkx as nx
import nltk
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from annotated_text import annotated_text, parameters
from streamlit_extras import add_vertical_space as avs
from streamlit_extras.badges import badge
from scripts.similarity import get_similarity_score, find_path, read_config
from scripts.utils import get_filenames_from_dir
from scripts import ReadPdf, JobDescriptionProcessor, ResumeProcessor, KeytermsExtraction
import cohere
from scripts.KeytermsExtraction import KeytermExtractor
from scripts.similarity.get_similarity_score import get_similarity_score
import uuid
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
# Index of the job description to work with; used below to pick an entry
# from job_desc_files and from job_processed.
FULL_STACK = 0

In [3]:
import yaml
import logging
# Logging setup.
#
# Records are written to app_similarity_score.log through ONE FileHandler
# attached to the root logger; records from this notebook's `logger` reach it
# by propagation and are also echoed to the console.
#
# The file used to be opened twice (by basicConfig with mode 'w' and by a
# second FileHandler in append mode attached to `logger`), so every record
# from `logger` was written through two independent handles to the same file.
# Re-running the cell also stacked additional handlers. Existing handlers are
# now reused instead.
LOG_FILE = 'app_similarity_score.log'

formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")

root_logger = logging.getLogger()

# Reuse the handler installed by a previous run of this cell, if any.
# FileHandler stores the absolute path of its file in `baseFilename`.
file_handler = next(
    (handler for handler in root_logger.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename == os.path.abspath(LOG_FILE)),
    None,
)
if file_handler is None:
    # mode='w' starts a fresh log, as basicConfig(filemode='w') did.
    file_handler = logging.FileHandler(LOG_FILE, mode='w')
    if not root_logger.handlers:
        # Same rule as basicConfig: only configure the root level when the
        # root logger has not been set up by someone else already.
        root_logger.setLevel(logging.INFO)
    root_logger.addHandler(file_handler)
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Reuse the console handler from a previous run so output is not duplicated.
# `type(...) is` is used because FileHandler subclasses StreamHandler.
console_handler = next(
    (handler for handler in logger.handlers
     if type(handler) is logging.StreamHandler),
    None,
)
if console_handler is None:
    console_handler = logging.StreamHandler()
    logger.addHandler(console_handler)
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.DEBUG)
# Build the path of the directory that holds the similarity configuration.
# NOTE(review): find_path is imported from scripts.similarity; presumably it
# returns the location of the 'Resume-Matcher' directory rather than the
# process working directory, despite the variable name - confirm.
cwd = find_path('Resume-Matcher')
config_path = os.path.join(cwd, "scripts", "similarity")


def read_config(filepath):
    """Load a YAML configuration file.

    NOTE(review): this definition shadows the ``read_config`` imported from
    ``scripts.similarity`` in the import cell - confirm which one is wanted.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file, or ``None``
        when the file is missing, is not valid YAML, or cannot be read for
        another reason. Such failures are logged rather than raised, so
        callers must check for ``None``.
    """
    try:
        # Explicit encoding: without it open() decodes with the locale's
        # preferred encoding, so the same file could load differently (or
        # fail) from one machine to another.
        with open(filepath, encoding="utf-8") as f:
            return yaml.safe_load(f)
    except FileNotFoundError as e:
        logger.error(f"Configuration file {filepath} not found: {e}")
    except yaml.YAMLError as e:
        logger.error(
            f"Error parsing YAML in configuration file {filepath}: {e}", exc_info=True)
    except Exception as e:
        # Unexpected failure: keep the traceback, as the YAML branch does.
        logger.error(
            f"Error reading configuration file {filepath}: {e}", exc_info=True)
    return None


# Load the similarity configuration and extract the Vertex settings.
config_file = os.path.join(config_path, "config.yml")
config = read_config(config_file)
if config is None:
    # read_config logs the cause and returns None on failure; stop here with
    # a clear message instead of the opaque
    # "TypeError: 'NoneType' object is not subscriptable" on the next line.
    raise RuntimeError(
        f"Could not load configuration from {config_file}; "
        "see app_similarity_score.log for details.")
# NOTE(review): PROJECT_ID is filled from the 'api_key' entry - confirm that
# this key really holds the project id.
PROJECT_ID = config['vertex']['api_key']
REGION = config['vertex']['REGION']

In [4]:
import os

job_desc_directory = "Data/JobDescription/"
resumes_directory = "Data/Resumes/"


def _pdf_names(directory):
    """Return the names of the entries of ``directory`` ending in '.pdf'."""
    return [name for name in os.listdir(directory) if name.endswith('.pdf')]


# Collect every job description and every resume PDF found on disk
job_desc_files = _pdf_names(job_desc_directory)
resume_files = _pdf_names(resumes_directory)

# The job description selected through FULL_STACK
job_desc_file = job_desc_files[FULL_STACK]

In [5]:
# Read and process the job descriptions.
#
# Both job_desc_text and job_processed are built from job_desc_files (the
# '.pdf' listing created earlier). job_processed used to come from a second,
# unfiltered os.listdir call, so a non-PDF file in the directory could shift
# its indices relative to job_desc_files / job_desc_text and make FULL_STACK
# point at different files in each list.
job_files = list(job_desc_files)

# Raw text of every job description, in job_files order
job_desc_text = [ReadPdf.read_single_pdf(os.path.join(
    job_desc_directory, job_file)) for job_file in job_files]

# Processed form of the single job description selected by FULL_STACK
job_desc_processor = JobDescriptionProcessor(job_desc_file)
job_desc_processed = job_desc_processor._read_job_desc()

# Process every job description and persist each result as JSON
job_processed = []
for job_file in job_files:
    job_processor = JobDescriptionProcessor(job_file)
    job_data = job_processor._read_job_desc()
    job_processor._write_json_file(job_data)
    job_processed.append(job_data)

## Check job keywords by index (0 to n - 1 for n job descriptions)

In [12]:
# Show the processed record stored at index FULL_STACK of job_processed
print(job_processed[FULL_STACK])

{'unique_id': 'e026c88a-ac28-4e7d-a75d-0cc25793b18c', 'job_desc_data': 'Job Description: Senior Full Stack Engineer (5+ Years of\nExperience)\nTech Solutions, San Francisco, CA, USA\nAbout Us\nTech Solutions is a leading technology company that creates innovative solutions across a variety of industries.\nOur mission is to improve lives through advanced technology. We’re currently seeking a Senior Full Stack\nEngineer to join our dynamic team.\nJob Description\nWe’re looking for a Senior Full Stack Engineer with 5+ years of experience in developing web applications.\nThe successful candidate will have experience working with both front-end and back-end technologies, and\nwill be capable of overseeing projects from conception to production deployment.\nResponsibilities\n•Developing front end website architecture.\n•Designing user interactions on web pages.\n•Developing back end website applications.\n•Creating servers and databases for functionality.\n•Ensuring cross-platform optimizati

In [6]:

# Raw text of every resume PDF, in resume_files order
resumes_text = [ReadPdf.read_single_pdf(os.path.join(
    resumes_directory, resume_file)) for resume_file in resume_files]


# Process resumes.
# resume_files (the '.pdf' listing created earlier) is deliberately not
# re-listed here: it used to be replaced by an unfiltered os.listdir result
# after resumes_text had been built, so a non-PDF file in the directory could
# leave resumes_text out of step with resumes_processed and with everything
# later derived from resume_files.
resumes_processed = []
for resume_file in resume_files:
    resume_processor = ResumeProcessor(resume_file)
    resume_data = resume_processor._read_resumes()
    resume_processor._write_json_file(resume_data)
    resumes_processed.append(resume_data)

In [16]:
def extract_candidate_name_from_filename(filename: str) -> str:
    """Derive a title-cased candidate name from a resume filename.

    The filename is split on underscores and the last part (the position,
    together with the file extension) is dropped; the remaining parts are
    joined with spaces.

    Args:
        filename: Resume filename, e.g. ``"jane_doe_engineer.pdf"``.

    Returns:
        The title-cased name, e.g. ``"Jane Doe"``. When the filename has no
        usable part before the last underscore (for instance it contains no
        underscore at all), the filename without its extension is used
        instead of returning an empty name.
    """
    # Exclude the last part which is the position
    name = ' '.join(filename.split('_')[:-1])
    if not name.strip():
        # Nothing left after dropping the last part: fall back to the stem
        name = os.path.splitext(filename)[0]
    return name.title()

In [17]:
# One display name per resume file, in resume_files order
candidate_names = list(
    map(extract_candidate_name_from_filename, resume_files))
candidate_names

['Yiching Liu',
 'Angela Zhu',
 'Maria Chinkan',
 'Arpi Melik Parsadanyan',
 'Zihui Lin',
 'Jose Felix Villasenor',
 'Anna Gasparyan',
 'Amitesh Rathore',
 'Yunrui Shao',
 'Jaykumar',
 'Minyue Yao',
 'John',
 'Zane Rouguine',
 'Brandon Penner',
 'Zhe Wang',
 'Tsubasa Lin',
 'Danny Mai',
 'David Boutwell',
 'Alexandra',
 'Sarah Sherman',
 'David Botbol',
 'Ryan Pintar',
 'Eloise Yu',
 'Vasil Klimovich',
 'Ming Jin',
 'Robert Scozzari',
 'Cody Romero',
 'Carnell Brame',
 'Timothy Wang',
 'Nico Santoso',
 'Shirley Zhao',
 'Yuan Wang',
 'Divya Harshini',
 'Deekshitha Pullaiah',
 'Meredith Cheng',
 'Grace Li',
 'Anya Hsu',
 'Nandini Seth',
 'Andrew Knuppel',
 'John Hinnegan',
 'Sharad Dangol',
 'Salvador Campos',
 'Lauren Aubrey Lee',
 'Xiao Li',
 'Bruce Wayne',
 'Barry Allen',
 'Balraj Rai',
 'Annie Zhou',
 'Federico De Marines',
 'Ray Lee',
 'Jagriti Sharma',
 'Michelle Wang',
 'Galen Fink',
 'Dennis Mo',
 'Mengyao Zhang',
 'Johann C',
 'Yixin-Ying',
 'Peggy Lai',
 'Yuanhuang Lo',
 'Serle

In [18]:

# Store each candidate's name on the processed resume at the same position
for position, processed_resume in enumerate(resumes_processed):
    processed_resume['name'] = candidate_names[position]

## Check candidate keywords by index (0 to n - 1 for n resumes)


In [20]:
# Show the processed record of the first resume (index 0)
print(resumes_processed[0])

{'unique_id': '12d1e8aa-d41c-482f-b8e5-6a34a1b29ed1', 'resume_data': '\xa0 \xa0\n聯絡\nwww.linkedin.com/in/yi-ching-liu-\na73276216  (LinkedIn)\n熱門技能\nInterviewing\nUIUX\nUX ResearchYI CHING LIU\nUXUI\n台灣\n經歷\nTaelor\nUIUX\n2023 年 9 月\xa0-\xa0Present\xa0 (2 個月)\nSan Francisco Bay Area\nJubo Health\nUX Designer\n2023 年 3 月\xa0-\xa02023 年 9 月\xa0 (7 個月)\n台灣\n研究並產出數位醫療digital frontdoor 潛在用戶輪廓，以加值服務，提供使用者\n更喜愛的體驗內容。\n學歷\n大同大學\n媒體設計,\xa0UI/UX  3D web design \xa0·\xa0(2020\xa0-\xa02023)\n\xa0 Page 1 of 1', 'clean_data': '\xa0 \xa0\n聯絡\n\na73276216  LinkedIn\n熱門技能\nInterviewing\nUIUX\nUX ResearchYI CHING LIU\nUXUI\n台灣\n經歷\nTaelor\nUIUX\n2023 年 9 月\xa0\xa0Present\xa0 2 個月\nSan Francisco Bay Area\nJubo Health\nUX Designer\n2023 年 3 月\xa0\xa02023 年 9 月\xa0 7 個月\n台灣\n研究並產出數位醫療digital frontdoor 潛在用戶輪廓，以加值服務，提供使用者\n更喜愛的體驗內容。\n學歷\n大同大學\n媒體設計\xa0UI/UX  3D web design \xa0\xa02020\xa0\xa02023\n\xa0 Page 1 of 1', 'entities': ['UIUX', '更喜愛的體驗內容', 'UI', 'LinkedIn', '潛在用戶輪廓，以加值服務，提供使用者', 'UXUI'], 'extracted_