In [9]:
import pandas as pd
import numpy as np
import scoring
from ast import literal_eval
import pyarrow.parquet as pq

In [10]:
def _clean_items(items):
    """Stringify and strip every item, dropping those that end up empty."""
    stripped = (str(item).strip() for item in items)
    return [s for s in stripped if s]


def parse(x):
    """Normalise a raw skills cell into a list of non-empty, stripped strings.

    Parameters
    ----------
    x : None, float, list, tuple, numpy.ndarray or str
        - ``None`` / float NaN          -> ``[]``
        - list / tuple                  -> each item stringified and stripped
        - ndarray                       -> flattened, then treated like a list
        - str holding a Python literal
          list/tuple (``"['a', 'b']"``) -> evaluated with ``literal_eval``
        - any other str                 -> brackets removed, split on commas,
                                           surrounding quotes stripped
        - anything else                 -> ``[]``

    Returns
    -------
    list of str
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, (list, tuple)):
        return _clean_items(x)
    if isinstance(x, np.ndarray):
        return _clean_items(x.flatten())
    if isinstance(x, str):
        try:
            parsed = literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal; fall through to the comma split.
            # Only the errors literal_eval is documented to raise are caught,
            # so KeyboardInterrupt and genuine bugs are no longer swallowed.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return _clean_items(parsed)
        # Best-effort fallback for strings such as "[python, 'sql']" or "a, b".
        # Quotes left on a token by a half-quoted list are removed so that
        # "'sql'" and "sql" compare equal downstream.
        tokens = x.replace('[', '').replace(']', '').split(',')
        return _clean_items(tok.strip().strip('\'"') for tok in tokens)
    return []

In [11]:
# Load just the identifier and matched-skill columns from each parquet file.
resumes = pd.read_parquet(
    'processed/resume_matched.parquet',
    columns=['ID', 'matched_skills'],
)
jobs = pd.read_parquet(
    'processed/dice_job_descriptions_matched.parquet',
    columns=['jobid', 'matched_skills'],
)

In [12]:
# Run every matched_skills cell through parse() so both frames hold
# plain lists of stripped strings.
resumes['matched_skills'] = resumes['matched_skills'].map(parse)
jobs['matched_skills'] = jobs['matched_skills'].map(parse)

In [13]:
# Threshold label -> parquet file that the evaluation loop reads for it.
thresholds = {"0.85": "matches/ind_skills_scores_085.parquet"}

In [16]:
# Metrics per threshold label; thresholds with no gap-bearing pairs are absent.
results = {}

# Loop-invariant lookup tables: rename once so each threshold reuses them
# rather than rebuilding the renamed frames on every iteration.
resume_lookup = resumes.rename(columns={'ID': 'resume_id', 'matched_skills': 'resume_matched'})
job_lookup = jobs.rename(columns={'jobid': 'job_id', 'matched_skills': 'job_matched'})

for name, path in thresholds.items():
    print(f"\nEvaluating threshold {name}...")

    df = pd.read_parquet(path)
    df['gaps'] = df['gaps'].apply(parse)
    df['has_gap'] = df['gaps'].str.len() > 0

    # Only pairs with at least one predicted gap are scored.
    with_gaps = df[df['has_gap']].copy()

    if with_gaps.empty:
        print("  No pairs with gaps found at this threshold!")
        continue

    # For each resume keep its 10 pairs with the highest pct_job_covered.
    top10_with_gaps = (with_gaps
                       .sort_values(['resume_id', 'pct_job_covered'], ascending=[True, False])
                       .groupby('resume_id')
                       .head(10))

    # Attach the matched-skill lists of the resume and the job to every pair.
    top10 = (top10_with_gaps
             .merge(resume_lookup, on='resume_id', how='left')
             .merge(job_lookup, on='job_id', how='left'))

    # A left merge leaves NaN where an id has no match; normalise those to [].
    top10['resume_matched'] = top10['resume_matched'].apply(lambda x: x if isinstance(x, list) else [])
    top10['job_matched'] = top10['job_matched'].apply(lambda x: x if isinstance(x, list) else [])

    # Reference gaps: skills matched in the job but not in the resume.
    true_missing = [
        list(set(job_skills) - set(resume_skills))
        for job_skills, resume_skills in zip(top10['job_matched'], top10['resume_matched'])
    ]
    pred_missing = top10['gaps'].tolist()
    present_skills = top10['resume_matched'].tolist()

    # The four scores come from the project-local `scoring` module; the
    # remaining entries are descriptive statistics of the evaluated pairs.
    results[name] = {
        "found"       : scoring.found_score(true_missing, pred_missing),
        "unnecessary" : scoring.unnecessary_score(true_missing, pred_missing),
        "redundant"   : scoring.redundant_score(pred_missing, present_skills),
        "presence"    : scoring.presence_score(true_missing, pred_missing),
        "median_coverage": top10['pct_job_covered'].median(),
        "median_gaps"    : top10['gaps'].str.len().median(),
        "n_pairs"        : len(top10),
        "n_resumes"      : top10['resume_id'].nunique(),
    }


Evaluating threshold 0.85...


In [17]:
# One row per threshold, one column per metric. Building with orient='index'
# lets every metric keep its own dtype, so the integer counts (n_pairs,
# n_resumes) are not coerced to float the way DataFrame(results).T does.
results_df = pd.DataFrame.from_dict(results, orient='index').round(4)
print("\n" + "="*90)
print("FINAL SCORES")
print("="*90)
if results_df.empty:
    # Every threshold was skipped, so there are no metric columns to select.
    print("No threshold produced any scored pairs.")
else:
    print(results_df[['found', 'unnecessary', 'redundant', 'presence', 'n_resumes']])


FINAL SCORES
       found  unnecessary  redundant  presence  n_resumes
0.85  0.0561        0.849        0.0    0.1588      119.0
