In [16]:
import pandas as pd
import numpy as np
import scoring
from ast import literal_eval
import pyarrow.parquet as pq

In [17]:
resumes = pd.read_parquet('processed/resume_matched.parquet')
jobs = pd.read_parquet('processed/dice_job_descriptions_matched.parquet')

In [18]:
def parse(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, (list, tuple)):
        return [str(item).strip() for item in x if str(item).strip()]
    if isinstance(x, np.ndarray):
        return [str(item).strip() for item in x.flatten() if str(item).strip()]
    if isinstance(x, str):
        try:
            parsed = literal_eval(x)
            if isinstance(parsed, (list, tuple, np.ndarray)):
                return [str(item).strip() for item in parsed if str(item).strip()]
        except:
            pass
        return [s.strip() for s in x.replace('[','').replace(']','').split(',') if s.strip()]
    return []

In [19]:
resumes = pd.read_parquet('processed/resume_matched.parquet', columns=['ID', 'matched_skills'])
jobs = pd.read_parquet('processed/dice_job_descriptions_matched.parquet', columns=['jobid', 'matched_skills'])

In [20]:
resumes['matched_skills'] = resumes['matched_skills'].apply(parse)
jobs['matched_skills'] = jobs['matched_skills'].apply(parse)

In [21]:
thresholds = {
    "0.45": "matches/ind_skills_scores_045.parquet",
    "0.55": "matches/ind_skills_scores_055.parquet"
}

In [24]:
thresholds_tech = {
    "0.65": "matches/ind_skills_scores_065.parquet",
    "0.75": "matches/ind_skills_scores_075.parquet",
    "0.85": "matches/ind_skills_scores_085.parquet"
}

In [25]:
results = {}

for name, path in thresholds_tech.items():
    print(f"\nEvaluating threshold {name}...")
    
    parquet_file = pq.ParquetFile(path)
    best_pct = {}

    for batch in parquet_file.iter_batches(batch_size=200_000, columns=['resume_id', 'job_id', 'pct_covered', 'gaps']):
        df_chunk = batch.to_pandas()
        df_chunk['gaps_parsed'] = df_chunk['gaps'].apply(parse)

        for row in df_chunk.itertuples(index=False):
            rid = row.resume_id
            pct = row.pct_covered
            row_dict = {
                'resume_id'   : rid,
                'job_id'      : row.job_id,
                'pct_covered' : pct,
                'gaps'        : row.gaps_parsed
            }
            if rid not in best_pct or pct > best_pct[rid][0]:
                best_pct[rid] = (pct, row_dict)

    best_list = [data for _, data in best_pct.values()]
    best_df = pd.DataFrame(best_list)
    print(f"  → {len(best_df):,} resumes have a best match")

    best_df = best_df.merge(resumes.rename(columns={'ID':'resume_id','matched_skills':'resume_matched'}), 
                            on='resume_id', how='left')
    best_df = best_df.merge(jobs.rename(columns={'jobid':'job_id','matched_skills':'job_matched'}), 
                            on='job_id', how='left')

    best_df['resume_matched'] = best_df['resume_matched'].apply(lambda x: x if isinstance(x, list) else [])
    best_df['job_matched'] = best_df['job_matched'].apply(lambda x: x if isinstance(x, list) else [])

    true_missing = [list(set(row.job_matched) - set(row.resume_matched)) 
                    for row in best_df.itertuples()]
    pred_missing = best_df['gaps'].tolist()
    present_skills = best_df['resume_matched'].tolist()

    results[name] = {
        "found"       : scoring.found_score(true_missing, pred_missing),
        "unnecessary" : scoring.unnecessary_score(true_missing, pred_missing),
        "redundant"   : scoring.redundant_score(pred_missing, present_skills),
        "presence"    : scoring.presence_score(true_missing, pred_missing),
        "n_resumes"   : len(best_df)
    }


Evaluating threshold 0.65...
  → 119 resumes have a best match

Evaluating threshold 0.75...
  → 119 resumes have a best match

Evaluating threshold 0.85...
  → 119 resumes have a best match


In [23]:
results_df = pd.DataFrame(results).T.round(4)
print("\n" + "="*90)
print("FINAL SCORES")
print("="*90)
print(results_df[['found', 'unnecessary', 'redundant', 'presence', 'n_resumes']])
print("="*90)
print("BEST THRESHOLD →", results_df['found'].idxmax())


FINAL SCORES
       found  unnecessary  redundant  presence  n_resumes
0.45  0.0342       0.5702        0.0    0.1481    54389.0
0.55  0.1525       0.8062        0.0    0.5162    75192.0
BEST THRESHOLD → 0.55


In [26]:
results_df = pd.DataFrame(results).T.round(4)
print("\n" + "="*90)
print("FINAL SCORES")
print("="*90)
print(results_df[['found', 'unnecessary', 'redundant', 'presence', 'n_resumes']])
print("="*90)
print("BEST THRESHOLD →", results_df['found'].idxmax())


FINAL SCORES
       found  unnecessary  redundant  presence  n_resumes
0.65  0.1773       0.8373        0.0    0.5291     3871.0
0.75  0.1486       0.8588        0.0    0.4089     4678.0
0.85  0.1774       0.8426        0.0    0.4725     5111.0
BEST THRESHOLD → 0.85
