In [1]:
import os
import sys


try:
    # Running on Google Colab: mount Drive and point at the project folder there.
    from google.colab import drive
    drive.mount('/content/drive')
    root_dir = "/content/drive/MyDrive/wdir/repos/Apziva/3-potential_talents/"
    # NOTE(review): the return value is discarded, so this line has no effect.
    # If the intent was to work from inside root_dir, it should probably be
    # os.chdir(root_dir) -- confirm before changing.
    os.getcwd()

except ImportError:
    # Not on Colab: walk up from the current directory until we reach the
    # directory whose listing contains 'potential_talents'.
    while 'potential_talents' not in os.listdir('.'):
        current_dir = os.getcwd()
        parent_dir = os.path.dirname(current_dir)
        if parent_dir == current_dir:
            # Reached the filesystem root; chdir('..') would loop forever here.
            raise FileNotFoundError(
                "Could not find a directory containing 'potential_talents' "
                "above the starting working directory"
            )
        os.chdir(parent_dir)

    # Assigned after the loop so root_dir is defined even when the kernel
    # already starts in the project root (zero loop iterations).
    root_dir = os.getcwd()

    # make the project root importable so custom functions can be imported
    sys.path.append('.')
    
%pwd

'/workspaces/3-potential_talents'

In [None]:
import pandas as pd
from pathlib import Path
import toml
import json
import requests
import numpy as np
import time


# Only the job_title column is read from the interim parquet file.
data_path = Path("data")
data = pd.read_parquet(
    data_path.joinpath("interim", "encoded.parquet"),
    columns=["job_title"],
)

# The Hugging Face token is read from a TOML credentials file under <root_dir>/config.
credentials_path = Path(root_dir, "config", ".credentials")
credentials = toml.load(credentials_path)

# Inference API endpoint and the bearer-token header sent with every request
API_URL = (
    "https://api-inference.huggingface.co/models/"
    "sentence-transformers/msmarco-distilbert-base-tas-b"
)
headers = {"Authorization": f"Bearer {credentials['hf_api_token']}"}

In [3]:
def query(payload, timeout=120):
    """Send a POST request to the Hugging Face inference API.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        Exception: If the API answers with any status code other than 200.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"API Error: {response.status_code} {response.text}")
    return response.json()

def compute_similarities(data, phrases):
    """Compute similarities between multiple phrases and job titles.

    Args:
        data: DataFrame with a 'job_title' column; its values are sent as the
            candidate sentences.
        phrases: Iterable of source phrases. One API request is made per phrase.

    Returns:
        np.ndarray built from the per-phrase score lists, one row per phrase
        in the order of ``phrases`` (presumably one score per job title --
        the response length is not checked here).

    Raises:
        TypeError: If a response is neither a list nor a dict with a
            'similarities' key.
    """
    # The candidate sentences are identical for every phrase, so build the
    # list once instead of once per request.
    sentences = data['job_title'].tolist()
    similarity_matrix = []

    for phrase in phrases:
        payload = {
            "inputs": {
                "source_sentence": phrase,
                "sentences": sentences
            }
        }
        response = query(payload)

        # Two response layouts are accepted: {'similarities': [...]} or a bare list.
        if isinstance(response, dict) and 'similarities' in response:
            scores = response['similarities']
        elif isinstance(response, list):
            scores = response
        else:
            raise TypeError(f"Unexpected response format: {response}")

        similarity_matrix.append(scores)

    return np.array(similarity_matrix)

In [4]:
# Search phrases to compare against are kept in a TOML file under <root_dir>/config.
phrases_path = Path(root_dir, "config", "search_phrases.toml")
phrases = toml.load(phrases_path)["search_phrases"]

In [5]:
def query_with_retry(payload, retries=5, delay=20, timeout=120):
    """Send a POST request to Hugging Face inference API with retry mechanism.

    A 503 response is treated as "model is loading" and retried; any other
    non-200 status fails immediately.

    Args:
        payload: JSON-serialisable request body.
        retries: Maximum number of requests to send.
        delay: Seconds to wait between two attempts after a 503.
        timeout: Seconds to wait for each individual request. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the first 200 response.

    Raises:
        Exception: On a non-200/non-503 status, or when every attempt
            returned 503 ("Max retries exceeded").
        requests.exceptions.Timeout: If a request gets no answer within
            ``timeout`` seconds.
    """
    for attempt in range(retries):
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        if response.status_code != 503:
            raise Exception(f"API Error: {response.status_code} {response.text}")

        # Only wait when another attempt will actually follow; sleeping after
        # the last attempt just delays the failure.
        if attempt < retries - 1:
            print(f"Model is loading, retrying in {delay} seconds...")
            time.sleep(delay)
    raise Exception("Max retries exceeded")

def compute_similarities_with_retry(data, phrases):
    """Compute similarities between multiple phrases and job titles with retry mechanism.

    Args:
        data: DataFrame with a 'job_title' column; its values are sent as the
            candidate sentences.
        phrases: Iterable of source phrases. Each phrase is sent through
            ``query_with_retry``.

    Returns:
        np.ndarray built from the per-phrase score lists, one row per phrase
        in the order of ``phrases`` (presumably one score per job title --
        the response length is not checked here).

    Raises:
        TypeError: If a response is neither a list nor a dict with a
            'similarities' key.
    """
    # The candidate sentences are identical for every phrase, so build the
    # list once instead of once per request.
    sentences = data['job_title'].tolist()
    similarity_matrix = []

    for phrase in phrases:
        payload = {
            "inputs": {
                "source_sentence": phrase,
                "sentences": sentences
            }
        }
        response = query_with_retry(payload)

        # Two response layouts are accepted: {'similarities': [...]} or a bare list.
        if isinstance(response, dict) and 'similarities' in response:
            scores = response['similarities']
        elif isinstance(response, list):
            scores = response
        else:
            raise TypeError(f"Unexpected response format: {response}")

        similarity_matrix.append(scores)

    return np.array(similarity_matrix)

# One request per phrase; row k of the matrix holds the scores for phrases[k].
similarity_matrix = compute_similarities_with_retry(data, phrases)

# Store every phrase's scores on `data` as its own "similarity_to_<phrase>" column.
for idx, term in enumerate(phrases):
    data[f"similarity_to_{term}"] = similarity_matrix[idx]

# For each phrase: order all rows by that phrase's score (best first) and tag
# them with the phrase. The 0.7 score threshold is currently not applied.
filtered_results = [
    data
    .sort_values(f"similarity_to_{term}", ascending=False)
    .assign(matching_phrase=term)
    for term in phrases
]

# Stack the per-phrase frames, drop exact duplicate rows and renumber the index.
stacked = pd.concat(filtered_results)
deduplicated = stacked.drop_duplicates()
final_result = deduplicated.reset_index(drop=True)

Model is loading, retrying in 20 seconds...


In [6]:
# # Compute similarity scores
# similarity_matrix = compute_similarities(data, phrases)

# # Add scores for each phrase to the DataFrame
# for i, phrase in enumerate(phrases):
#     data[f"similarity_to_{phrase}"] = similarity_matrix[i]

# # Ensure all rows are preserved during filtering and ranking
# filtered_results = []
# for phrase in phrases:
#     filtered = (
#         data
#         .copy()  # Ensure no inplace modification of the original DataFrame
#         .sort_values(f"similarity_to_{phrase}", ascending=False)
#     )
#     filtered['matching_phrase'] = phrase
#     filtered_results.append(filtered)

# # Combine all results, keeping duplicates across phrases
# final_result = pd.concat(filtered_results).reset_index(drop=True)

# # Ensure no data loss
# print(f"Original rows: {data.shape[0]}")
# print(f"Final combined rows (including duplicates): {final_result.shape[0]}")

In [9]:
# Pick the score columns by name rather than by position, so the cell does not
# silently depend on the column order produced by earlier cells.
score_columns = [col for col in final_result.columns if col.startswith("similarity_to_")]

# Guard against re-running this cell: after the first run `final_result` is
# already reduced to job_title / matching_phrase / fit and the score columns
# are gone, so recomputing would fail.
if score_columns:
    phrase_scores = final_result[score_columns]
    final_result = (
        final_result
        # fit = median similarity across all phrases minus its standard deviation
        .assign(fit=phrase_scores.median(axis=1) - phrase_scores.std(axis=1))
        .sort_values('fit', ascending=False)
        .loc[:, ['job_title', 'matching_phrase', 'fit']]
    )
final_result.sample(4, random_state=27)

Unnamed: 0,job_title,matching_phrase,fit
101,junior me not techer information system,human resources assistant,0.619228
21,not tech seeking human resource payroll admini...,aspiring human resources,0.711246
396,human resource generalist scottmadden inc,hr trainee,0.693768
195,junior me not techer information system,hr generalist (entry-level),0.619228


In [None]:
# Mean fit per job title over all rows it appears in, best-fitting titles first.
grouped_results = (
    final_result
    .groupby('job_title')["fit"]
    .mean()
    .sort_values(ascending=False)
    .to_frame()
)

# Create the output folder if needed; to_parquet does not create missing directories.
processed_dir = data_path / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)
grouped_results.to_parquet(processed_dir / "grouped_results.parquet")

# Show the top of the ranking as the cell output.
grouped_results.head(10)

Unnamed: 0_level_0,fit
job_title,Unnamed: 1_level_1
human resource staffing recruiting professional,0.748748
retired army national guard recruiter office manager seeking position human resource,0.729468
aspiring human resource professional energetic teamfocused leader,0.72529
not tech seeking human resource payroll administrative professional,0.711246
experienced retail manager aspiring human resource professional,0.709471
human resource coordinator intercontinental not tech,0.709192
aspiring human resource professional passionate helping create inclusive engatechng work environment,0.708697
aspiring human resource manager seeking internship human resource,0.707899
ct bauer college business graduate magna cum laude aspiring human resource professional,0.70561
director human resource north america groupe not tech,0.701097
