In [52]:
import os
import sys


# Resolve the project root (`root_dir`) for the two supported environments.
try:
    # Colab: mount Google Drive and use the fixed checkout location on it.
    from google.colab import drive
    drive.mount('/content/drive')
    root_dir = "/content/drive/MyDrive/wdir/repos/Apziva/3-potential_talents/"
    # NOTE(review): the working directory is not changed on this branch, so any
    # relative path used later resolves against Colab's cwd, not root_dir -- confirm.

except ImportError:
    # Not on Colab: climb the directory tree until the current directory
    # contains an entry named 'potential_talents'.
    while 'potential_talents' not in os.listdir('.'):
        previous_dir = os.getcwd()
        os.chdir('..')
        if os.getcwd() == previous_dir:
            # '..' no longer moves us: we hit the filesystem root. Stop instead
            # of looping forever.
            raise FileNotFoundError(
                "Could not find a parent directory containing 'potential_talents'"
            ) from None

    # Assigned after the loop so root_dir is defined even when the kernel
    # already starts in the project root (loop body never runs).
    root_dir = os.getcwd()

    # Make the project root importable so custom functions can be imported.
    sys.path.append('.')
    
%pwd

'/workspaces/3-potential_talents'

In [53]:
import pandas as pd
from pathlib import Path
import toml
import json
import requests
import numpy as np


# Anchor the data directory on root_dir, like the config paths below, so the
# read does not depend on the kernel's current working directory.
data_path = Path(root_dir) / "data"
# Only the job-title column is loaded; it is the text sent to the similarity API.
data = pd.read_parquet(data_path / "interim" / "encoded.parquet", columns=['job_title'])

# The Hugging Face token is read from a local TOML file rather than hard-coded.
credentials_path = Path(root_dir) / "config" / ".credentials"
credentials = toml.load(credentials_path)

# API and credentials setup
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/msmarco-distilbert-base-tas-b"
headers = {"Authorization": f"Bearer {credentials['hf_api_token']}"}

In [54]:
def query(payload, timeout=60):
    """Send a POST request to the Hugging Face inference API.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.post`` so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        RuntimeError: If the API answers with a non-200 status code. The
            message carries the status code and the response text.
        requests.exceptions.Timeout: If no response arrives within ``timeout``.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"API Error: {response.status_code} {response.text}")
    return response.json()

def compute_similarities(data, phrases):
    """Compute similarities between multiple phrases and job titles.

    One request is sent through ``query`` per phrase, with the phrase as the
    source sentence and every job title as a candidate sentence.

    Args:
        data: Object whose ``'job_title'`` entry exposes ``tolist()``
            (a pandas DataFrame in this notebook).
        phrases: Iterable of search phrases.

    Returns:
        ``np.ndarray`` built from one row of scores per phrase, each row
        holding one score per job title.

    Raises:
        TypeError: If a response is neither a list nor a dict with a
            ``'similarities'`` key.
        ValueError: If a response does not contain exactly one score per
            job title.
    """
    # The candidate list is the same for every phrase, so build it once.
    job_titles = data['job_title'].tolist()
    similarity_matrix = []

    for phrase in phrases:
        payload = {
            "inputs": {
                "source_sentence": phrase,
                "sentences": job_titles
            }
        }
        response = query(payload)

        # Accept both response shapes: scores wrapped in a dict, or a bare list.
        if isinstance(response, dict) and 'similarities' in response:
            scores = response['similarities']
        elif isinstance(response, list):
            scores = response
        else:
            raise TypeError(f"Unexpected response format: {response}")

        # Fail at the source instead of returning a ragged matrix that only
        # breaks later when the scores are assigned as DataFrame columns.
        if len(scores) != len(job_titles):
            raise ValueError(
                f"Expected {len(job_titles)} scores for phrase {phrase!r}, "
                f"got {len(scores)}"
            )

        similarity_matrix.append(scores)

    return np.array(similarity_matrix)

In [55]:
# The phrases to compare against are kept in a TOML config file under
# root_dir/config, in its 'search_phrases' entry.
phrases_path = Path(root_dir, "config", "search_phrases.toml")
phrases = toml.load(phrases_path)['search_phrases']

In [56]:
# One API call per phrase; row k of the matrix holds the scores for phrases[k].
similarity_matrix = compute_similarities(data, phrases)

# Store each phrase's scores on `data` as its own "similarity_to_<phrase>" column.
for phrase, scores in zip(phrases, similarity_matrix):
    data[f"similarity_to_{phrase}"] = scores

# Build one ranked copy of the table per phrase. No score threshold is applied:
# every title is kept, ordered best match first, and tagged with the phrase.
filtered_results = []
for phrase in phrases:
    score_column = f"similarity_to_{phrase}"
    filtered = (
        data
        .sort_values(score_column, ascending=False)
        .assign(matching_phrase=phrase)
    )
    filtered_results.append(filtered)

# Stack the per-phrase rankings, drop fully identical rows, renumber the index.
final_result = (
    pd.concat(filtered_results)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [58]:
# Display the combined table: the per-phrase rankings stacked one after another.
final_result

Unnamed: 0,job_title,similarity_to_aspiring human resources,similarity_to_human resources assistant,similarity_to_hr coordinator,similarity_to_hr generalist (entry-level),similarity_to_talent acquisition assistant,similarity_to_recruitment coordinator,similarity_to_hr intern,similarity_to_hr trainee,similarity_to_junior hr specialist,similarity_to_hr associate,similarity_to_people operations assistant,matching_phrase
0,aspiring human resource professional,0.960846,0.862141,0.733101,0.764335,0.704718,0.692377,0.755420,0.794812,0.763064,0.746892,0.695747,aspiring human resources
1,aspiring human resource specialist,0.930391,0.858606,0.743931,0.770218,0.710703,0.695650,0.741314,0.774672,0.816259,0.738898,0.692015,aspiring human resources
2,aspiring human resource manager seeking intern...,0.905984,0.851768,0.766588,0.773088,0.753021,0.735469,0.825225,0.804605,0.751556,0.761361,0.697133,aspiring human resources
3,human resource professional,0.897843,0.905179,0.762691,0.743502,0.675241,0.681468,0.760710,0.768687,0.756842,0.751947,0.718334,aspiring human resources
4,aspiring human resource management student see...,0.893726,0.821763,0.748655,0.774614,0.753225,0.730220,0.821586,0.812231,0.748860,0.747272,0.685812,aspiring human resources
...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,lead official western illinois university,0.669256,0.671879,0.662737,0.706091,0.634491,0.659648,0.646991,0.652434,0.668253,0.661374,0.625837,people operations assistant
568,student chapman university,0.650180,0.649625,0.609013,0.649209,0.660825,0.637602,0.641431,0.661083,0.639836,0.602648,0.620907,people operations assistant
569,bachelor science biology victoria university w...,0.648028,0.639748,0.620809,0.681726,0.622945,0.652597,0.631934,0.630300,0.654003,0.595737,0.598758,people operations assistant
570,student westfield state university,0.637752,0.614611,0.616293,0.664882,0.636979,0.614762,0.620246,0.655822,0.645827,0.620608,0.597020,people operations assistant


In [None]:
# # Compute similarity scores
# similarity_matrix = compute_similarities(data, phrases)

# # Add scores for each phrase to the DataFrame
# for i, phrase in enumerate(phrases):
#     data[f"similarity_to_{phrase}"] = similarity_matrix[i]

# # Ensure all rows are preserved during filtering and ranking
# filtered_results = []
# for phrase in phrases:
#     filtered = (
#         data
#         .copy()  # Ensure no inplace modification of the original DataFrame
#         .sort_values(f"similarity_to_{phrase}", ascending=False)
#     )
#     filtered['matching_phrase'] = phrase
#     filtered_results.append(filtered)

# # Combine all results, keeping duplicates across phrases
# final_result = pd.concat(filtered_results).reset_index(drop=True)

# # Ensure no data loss
# print(f"Original rows: {data.shape[0]}")
# print(f"Final combined rows (including duplicates): {final_result.shape[0]}")

In [None]:
# Collapse the per-phrase similarity columns into a single robust score:
# med_std = median of the scores minus their standard deviation, so a title
# ranks high only when it scores well consistently across all phrases.
#
# The score columns are selected by name rather than by position, and the
# reduction is skipped when they are already gone. That makes the cell safe to
# re-run: it rebinds `final_result`, so a second run sees the reduced frame.
score_columns = [
    column for column in final_result.columns
    if str(column).startswith("similarity_to_")
]
if score_columns:
    phrase_scores = final_result[score_columns]
    final_result = (
        final_result
        .assign(med_std=phrase_scores.median(axis=1) - phrase_scores.std(axis=1))
        .sort_values('med_std', ascending=False)
        [['job_title', 'matching_phrase', 'med_std']]
    )
final_result.sample(4, random_state=27)

Unnamed: 0,job_title,matching_phrase,med_std
101,junior me not techer information system,human resources assistant,0.619228
21,not tech seeking human resource payroll admini...,aspiring human resources,0.711246
396,human resource generalist scottmadden inc,hr trainee,0.693768
195,junior me not techer information system,hr generalist (entry-level),0.619228


In [150]:
# Average the med_std score per job title and list the best titles first.
grouped_results = (
    final_result
    .groupby('job_title')["med_std"]
    .mean()
    .sort_values(ascending=False)
)

In [154]:
# Summarise the per-title scores (entry count, dtype, memory) as a one-column frame.
grouped_results.to_frame().info()

<class 'pandas.core.frame.DataFrame'>
Index: 52 entries, human resource staffing recruiting professional to always set success
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   med_std  52 non-null     float64
dtypes: float64(1)
memory usage: 2.9+ KB
