In [1]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.20-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.2-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.28.2-py3

In [154]:
# imports
import os
import abc
import numpy as np
import pandas as pd
import chromadb
import json
from sentence_transformers import SentenceTransformer
from collections import defaultdict
import pickle
import re
import similarity_functions as sim


In [155]:
model_id = "jjzha/jobbert_skill_extraction"

#use sentence transformer to get the embeddings
model = SentenceTransformer(model_id)

Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert_skill_extraction and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [157]:
#read candidates.json
DATA_DIR = "data/"
def create_candidate_embeddings():
  with open(f"{DATA_DIR}candidates.json", "r") as f:
      candidates = json.load(f)

  #encode the education and work history of each candidate
  candidate_embeddings = defaultdict(dict)
  for candidate_id in candidates:
      candidate = candidates[candidate_id]
      candidate_embeddings[candidate_id]["embedding"] = (3*model.encode(candidate["education"]) + 5*model.encode(candidate["work_history"]) - 2*model.encode(f'Years of Experience = {candidate["yrs_of_experience"]}')).tolist()

  #write to a new json
  with open(f"{DATA_DIR}candidate_embeddings.json", "w") as f:
      json.dump(candidate_embeddings, f)
  return candidate_embeddings

In [158]:
def evaluate(ranks, target, verbose=True):
  new_target = []
  for r in target:
    new_target.append(str(r))
  target = new_target
  indices = []
  s = 0

  for i, c in enumerate(ranks):
      if c in target:
          indices.append(i)
          if i < 200:
              s += 1

  #calculate MAP for the target candidates
  def calculate_map(similarity):
      # Sort ranks to ensure they are in ascending order
      ranks = [x + 1 for x in similarity]
      # print(f"{ranks=}")
      # Calculate precision at each relevant rank
      precisions = []
      for i, rank in enumerate(ranks, start=1):
          precision_at_rank = i / rank
          precisions.append(precision_at_rank)

      # print(f"{precisions=}")
      # Calculate Mean Average Precision
      map_score = sum(precisions) / len(ranks)
      return map_score

  # Example usage:
  map_score = calculate_map(indices)
  recall = s/len(target)
  if verbose:
    print(f"Mean Average Precision (MAP): {round(map_score,4)}")
    print(f"Recall@200 = {round(recall,4)}")
    print(f"Ground Truth Indices: {indices}")
  return map_score, recall

def random_partition(target, n):
  np.random.seed(42)
  np.random.shuffle(target)
  return target[:n], target[n:]

In [159]:
class IRSystem(metaclass=abc.ABCMeta):
    """
    IRSystem class to be use for TalentRank
    """

    def __init__(self, data_dir):
        self.text_embedding = json.load(open(os.path.join(data_dir, "candidate_embeddings.json"), "r"))

        self.create_index()
        self.create_parser_searcher()

    def create_index(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        """
        chroma_client = chromadb.Client()

        name = 'talentrank'

        if name in [collection.name for collection in
                               chroma_client.list_collections()]:
            chroma_client.delete_collection(name)

        self.index_sys = chroma_client.create_collection(name=name, metadata={"hnsw:space": "ip", "hnsw:M": 400, "hnsw:construction_ef": 400, "hnsw:search_ef": 200}) #HNSW parameters explained better https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md and https://www.pinecone.io/learn/series/faiss/hnsw/

    def add_files(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        """
        for candidate in self.text_embedding:
            self.index_sys.add(ids = candidate,
                               embeddings = self.text_embedding[candidate]["embedding"])
            if int(candidate) % 100 == 0:
                print(f"Already indexed: {candidate} candidates")
        print("Done indexing.")

    def create_parser_searcher(self):
        """
        INPUT:
            None
        OUTPUT:
            None
        """
        model_name = "jjzha/jobbert_skill_extraction"

        self.query_parser = SentenceTransformer(model_name)

        self.searcher = self.index_sys

    def perform_search(self, topic_phrase=None, target_vector=None, n_results=1000):
        """
        INPUT:
            topic_phrase: string
            target_vector: list
            n_results: int
        OUTPUT:
            topicResults: dict

        Utilize self.query_parser and self.searcher to calculate the result for topic_phrase
        """
        if topic_phrase is not None:
            query_embeddings = self.query_parser.encode(topic_phrase).tolist()
        else:
            query_embeddings = target_vector
        topicResults = self.searcher.query(query_embeddings=query_embeddings, n_results=n_results)
        return topicResults


In [166]:
# candidate_embeddings = create_candidate_embeddings() #uncomment this to create new embeddings instead of using the pre-saved ones
candidate_embeddings = json.load(open(f"{DATA_DIR}candidate_embeddings.json", "r"))

In [160]:
target = pd.read_csv(DATA_DIR + "target.csv")
target = target["target"].tolist()
n_results = 1000

#open restructured_job_details.txt, read it
with open(f"{DATA_DIR}restructured_job_details.txt", "r") as f:
    job_details = f.read()

talentrank = IRSystem(DATA_DIR)
talentrank.add_files()
r1_ranking_list = talentrank.perform_search(topic_phrase=job_details, n_results=n_results)["ids"][0]
print("=====================\nR1 ranking\n=====================\n")
evaluate(r1_ranking_list, target)

Some weights of BertModel were not initialized from the model checkpoint at jjzha/jobbert_skill_extraction and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Already indexed: 100 candidates
Already indexed: 200 candidates
Already indexed: 300 candidates
Already indexed: 400 candidates
Already indexed: 500 candidates
Already indexed: 600 candidates
Already indexed: 700 candidates
Already indexed: 800 candidates
Already indexed: 1000 candidates
Already indexed: 1200 candidates
Already indexed: 1300 candidates
Already indexed: 1400 candidates
Done indexing.
Mean Average Precision (MAP): 0.0162
Recall@200 = 0.3636
Ground Truth Indices: [30, 121, 145, 175, 349, 783, 892, 964]


(0.016249300227890316, 0.36363636363636365)

In [161]:
#average of embeddings for candidates in target
target_vector = np.mean([candidate_embeddings[str(i)]["embedding"] for i in target], axis=0)

r2_ranking_list = talentrank.perform_search(target_vector=target_vector, n_results=n_results)["ids"][0]
print("=====================\nR2 ranking\n=====================\n")
evaluate(r2_ranking_list, target)

Mean Average Precision (MAP): 0.0148
Recall@200 = 0.1818
Ground Truth Indices: [27, 116, 220, 382, 387, 500, 688, 731, 888]


(0.014766869051643419, 0.18181818181818182)

In [162]:
with open(DATA_DIR + "vectors.pkl", "rb") as f:
    vectors = pickle.load(f)

train, test = random_partition(target, 8)

#mean and std of target vectors dict while target holds candidate ids
target_vectors = [vectors[int(candidate)] for candidate in train]
# print(f"{target_vectors=}")
mean = np.mean(target_vectors, axis=0)

similarity = []
for candidate in vectors:
    vector = vectors[candidate]
    similarity.append((candidate, sim.cosine_similarity(mean, vector)))
    # similarity.append((candidate, sim.euclidean_similarity(mean, vector)))
    # similarity.append((candidate, sim.manhattan_similarity(mean, vector)))
    # similarity.append((candidate, sim.inner_product_similarity(mean, vector)))
    # similarity.append((candidate, sim.minkowski_similarity(mean, vector)))

ranks = sorted(similarity, key=lambda x: x[1], reverse=True)
r3_ranking_list = [str(r[0]) for r in ranks]
print("=====================\nR3 ranking\n=====================\n")
evaluate(r3_ranking_list, target)


Mean Average Precision (MAP): 0.0268
Recall@200 = 0.4545
Ground Truth Indices: [35, 53, 66, 72, 147, 227, 327, 603, 604, 788, 1300]


(0.026825169402405768, 0.45454545454545453)

In [163]:
def fill_missing_ranks(consolidated_scoring, lenght):
  for candidate in consolidated_scoring:
    for _ in  range(lenght - len(consolidated_scoring[candidate])):
        consolidated_scoring[candidate].append(1e-10)
  return consolidated_scoring

consolidated_scoring = {}

for i, r in enumerate(r1_ranking_list):
  consolidated_scoring[int(r)] = [1/(i + 1)]

for i, r in enumerate(r2_ranking_list):
  if int(r) in consolidated_scoring:
    consolidated_scoring[int(r)].append(1/(i + 1))
  else:
    consolidated_scoring[int(r)] = [1e-10, 1/(i + 1)]

consolidated_scoring = fill_missing_ranks(consolidated_scoring, 2)

for i, r in enumerate(r3_ranking_list):
  if int(r) in consolidated_scoring:
    consolidated_scoring[int(r)].append(1/(i + 1))
  else:
    consolidated_scoring[int(r)] = [1e-10, 1e-10, 1/(i + 1)]

consolidated_scoring = fill_missing_ranks(consolidated_scoring, 3)


In [165]:
#tuning rank marging hyperparamters
def tuning():
  all_feature_names = {}
  for a in np.arange(0, 10, 0.2):
    for b in np.arange(0, 10, 0.2):
      for c in np.arange(0, 10, 0.2):
        final_score = {}
        for candidate in consolidated_scoring:
          final_score[candidate] = a*consolidated_scoring[candidate][0] + b*consolidated_scoring[candidate][1] + c*consolidated_scoring[candidate][2]
        ranks = sorted(final_score.items(), key=lambda x: x[1], reverse=True)
        r4_ranking_list = [str(r[0]) for r in ranks]
        map_score, recall = evaluate(r4_ranking_list, target, verbose=False)
        all_feature_names[(a, b, c)] = (map_score, recall)
        print(f"{a=}, {b=}, {c=}")
  return all_feature_names

In [167]:
# # find key with highest value[0] in all_feature_names
# all_feature_names = tuning()
# max_key = max(all_feature_names, key=lambda k: all_feature_names[k][0])
# print(f"{max_key=}")
# print(f"{all_feature_names[max_key]=}")