In [1]:
import pandas as pd
import numpy as np
import scoring
from ast import literal_eval
import pyarrow.parquet as pq

In [2]:
def _clean_items(items):
    """Stringify each item, strip whitespace, and drop items that end up empty."""
    return [text for text in (str(item).strip() for item in items) if text]


def parse(x):
    """Normalise a skills cell into a list of non-empty, stripped strings.

    Parameters
    ----------
    x : object
        The raw cell value. Handled shapes:
        * ``None`` or a float NaN           -> ``[]``
        * ``list`` / ``tuple``              -> each item stringified and stripped
        * ``numpy.ndarray`` (any rank)      -> flattened, then treated like a list
        * ``str``                           -> first tried as a Python literal
          (``"['a', 'b']"``); if that does not yield a list/tuple, the text is
          split on commas after removing square brackets.
        * anything else                     -> ``[]``

    Returns
    -------
    list[str]
        Stripped, non-empty strings in their original order.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, (list, tuple)):
        return _clean_items(x)
    if isinstance(x, np.ndarray):
        return _clean_items(x.flatten())
    if isinstance(x, str):
        try:
            parsed = literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the failures literal_eval is documented to raise are treated
            # as "not a literal"; anything else (e.g. KeyboardInterrupt) propagates.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return _clean_items(parsed)
        # Fallback for text that is not a list literal (plain "a, b", a
        # truncated "['a', 'b'", or a lone quoted scalar). Surrounding quote
        # characters are removed so the tokens compare equal to clean names.
        pieces = x.replace('[', '').replace(']', '').split(',')
        return _clean_items(piece.strip().strip('\'"') for piece in pieces)
    return []

In [3]:
# Lookup tables: only the id column and the matched_skills column are read.
resumes = pd.read_parquet(
    'processed/resume_matched.parquet',
    columns=['ID', 'matched_skills'],
)
jobs = pd.read_parquet(
    'processed/dice_job_descriptions_matched.parquet',
    columns=['uniq_id', 'matched_skills'],
)

In [4]:
# Convert the raw matched_skills cells of both lookup tables into lists of
# strings, resumes first and then jobs.
for frame in (resumes, jobs):
    frame['matched_skills'] = frame['matched_skills'].apply(parse)

In [5]:
# Table of scored matches; later cells read its resume_id, job_id, gaps and
# pct_job_covered columns.
df = pd.read_parquet(
    'matches/ind_skills_scores_085.parquet'
)

In [6]:
# Turn every raw 'gaps' cell into a list of strings before writing it back.
parsed_gaps = df['gaps'].apply(parse)
df['gaps'] = parsed_gaps

In [7]:
# Columns whose name contains 'matched' are dropped first -- presumably so the
# cell can be re-run without the merges producing suffixed duplicate columns.
stale_cols = [name for name in df.columns if 'matched' in name]

# Left-join the resume skills, then the job skills, onto every pair row.
# NOTE(review): assumes 'ID' and 'uniq_id' are unique in their lookup tables;
# duplicate keys would multiply pair rows -- consider validate='many_to_one'.
df = (
    df.drop(columns=stale_cols, errors='ignore')
    .merge(
        resumes[['ID', 'matched_skills']].rename(
            columns={'ID': 'resume_id', 'matched_skills': 'resume_matched'}
        ),
        on='resume_id',
        how='left',
    )
    .merge(
        jobs[['uniq_id', 'matched_skills']].rename(
            columns={'uniq_id': 'job_id', 'matched_skills': 'job_matched'}
        ),
        on='job_id',
        how='left',
    )
)

In [8]:
# Any cell that is not a list (e.g. the NaN a left merge leaves for an
# unmatched key) is replaced by an empty list.
for skills_col in ('resume_matched', 'job_matched'):
    df[skills_col] = df[skills_col].apply(
        lambda cell: cell if isinstance(cell, list) else []
    )

In [9]:
# Reference gaps: skills in the job's matched list that are absent from the
# resume's matched list (one set per pair row).
true_missing = [
    set(job_skills).difference(resume_skills)
    for resume_skills, job_skills in zip(df['resume_matched'], df['job_matched'])
]
# Predicted gaps and the resume's own skills, as plain Python lists.
pred_missing = df['gaps'].tolist()
present = df['resume_matched'].tolist()

In [10]:
def _score_each(metric, first, second):
    """Call ``metric`` once per aligned pair, wrapping each side in a one-element list."""
    return [metric([a], [b]) for a, b in zip(first, second)]


# Each metric is evaluated over all pairs before the next one starts.
found_per_pair = _score_each(scoring.found_score, true_missing, pred_missing)
unnecessary_per_pair = _score_each(scoring.unnecessary_score, true_missing, pred_missing)
# The redundant score compares predicted gaps against the resume's own skills.
redundant_per_pair = _score_each(scoring.redundant_score, pred_missing, present)
presence_per_pair = _score_each(scoring.presence_score, true_missing, pred_missing)

In [11]:
# Store the four per-pair score lists as columns, in this order.
score_columns = {
    'found': found_per_pair,
    'unnecessary': unnecessary_per_pair,
    'redundant': redundant_per_pair,
    'presence': presence_per_pair,
}
for column_name, column_values in score_columns.items():
    df[column_name] = column_values

In [12]:
# Number of predicted gaps per pair. Computed once and reused below instead of
# re-running .str.len() over the whole frame for each statistic.
gap_counts = df['gaps'].str.len()

# Summary statistics over all pair rows.
avg_scores = {
    # Means of the four per-pair score columns.
    "found": df['found'].mean(),
    "unnecessary": df['unnecessary'].mean(),
    "redundant": df['redundant'].mean(),
    "presence": df['presence'].mean(),
    # Row count and number of distinct resumes.
    "total_pairs": len(df),
    "n_resumes": df['resume_id'].nunique(),
    # Distribution of the pct_job_covered column.
    "median_coverage": df['pct_job_covered'].median(),
    "mean_coverage": df['pct_job_covered'].mean(),
    # Distribution of the gap-list lengths.
    "median_gaps": gap_counts.median(),
    "mean_gaps": gap_counts.mean(),
    # Share of pairs with an empty gap list. Despite the key name this is a
    # fraction in [0, 1], not a percentage.
    "%_perfect_match": (gap_counts == 0).mean(),
}

In [13]:
# Mean of each score column within every resume_id group.
per_resume = (
    df.groupby('resume_id')[['found', 'unnecessary', 'redundant', 'presence']]
    .mean()
)

In [16]:
# Banner, then one "name : value" line per summary metric.
rule = "="*90
print("\n" + rule)
print(f"AVERAGE METRICS ACROSS ALL {len(df):,} PAIRS")
print(rule)
for metric, value in avg_scores.items():
    # Floats are rounded to four places first; every value (integer counts
    # included) then goes through the same '.4f' spec, so counts also print
    # with four decimals.
    shown = round(value, 4) if isinstance(value, float) else value
    print(f"{metric:20} : {shown:.4f}")


AVERAGE METRICS ACROSS ALL 2,553,383 PAIRS
found                : 0.9863
unnecessary          : 0.0000
redundant            : 0.0000
presence             : 0.9913
total_pairs          : 2553383.0000
n_resumes            : 119.0000
median_coverage      : 0.0000
mean_coverage        : 0.0936
median_gaps          : 5.0000
mean_gaps            : 5.3252
%_perfect_match      : 0.0087
