In [1]:
import pandas as pd
import plotly.graph_objects as go
from pathlib import Path

In [2]:
# Threshold label -> parquet file holding the scores computed at that
# threshold. The file suffix is the threshold expressed in hundredths,
# zero-padded to three digits (0.45 -> "045").
files = {
    f"{pct / 100:.2f}": f"matches/ind_skills_scores_{pct:03d}.parquet"
    for pct in (45, 55, 65, 75, 85)
}

In [3]:
# Bar colors for the first figure; paired positionally with the entries of
# `files` via zip() in the plotting loop, so the order here matters.
colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#EAD814"]

In [4]:
# Empty figure; the plotting loop adds one bar trace to it per entry in `files`.
fig = go.Figure()

In [7]:
# Clear any traces left by an earlier run of this cell; without this, every
# re-run appends another full set of bars to the same figure.
fig.data = []

for (label, path), color in zip(files.items(), colors):
    # Only the coverage column is needed, so skip loading the rest of the file.
    coverage = pd.read_parquet(path, columns=["pct_job_covered"])["pct_job_covered"]

    # 70 intervals, kept in interval order (sort=False) rather than by count.
    hist = coverage.value_counts(bins=70, sort=False)
    intervals = hist.index
    counts = hist.values
    # Share of this file's rows per interval, so files of different sizes
    # can be compared on the same y axis.
    percentages = counts / counts.sum() * 100

    fig.add_trace(go.Bar(
        x=intervals.mid,
        y=percentages,
        name=label,
        marker_color=color,
        # Per-interval widths so each bar spans exactly its own interval.
        width=(intervals.right - intervals.left),
        opacity=0.75,
        hovertemplate=
            f"<b>{label}</b><br>" +
            "Range: %{x:.3f}<br>" +
            "Pairs: %{customdata:,}<br>" +
            "Percent: %{y:.2f}%<extra></extra>",
        # Raw counts are carried along only for the hover text.
        customdata=counts
    ))

In [8]:
# Render the figure built from the per-file histograms.
fig.show()

In [9]:
# Load the two columns needed for the per-resume analysis from each
# threshold's score file; the tuple order matches the unpacking order.
score_columns = ["resume_id", "pct_job_covered"]
df_045, df_055, df_065, df_075, df_085 = [
    pd.read_parquet(parquet_path, columns=score_columns)
    for parquet_path in (
        "matches/ind_skills_scores_045.parquet",
        "matches/ind_skills_scores_055.parquet",
        "matches/ind_skills_scores_065.parquet",
        "matches/ind_skills_scores_075.parquet",
        "matches/ind_skills_scores_085.parquet",
    )
]

In [11]:
def _best_row_per_resume(scores):
    """Return, for each ``resume_id``, the row with the highest ``pct_job_covered``.

    The row labels are computed from the same frame that is then indexed,
    so the labels always refer to rows of ``scores``.
    """
    top_row_labels = scores.groupby('resume_id')['pct_job_covered'].idxmax()
    return scores.loc[top_row_labels]


# Each threshold's best rows must come from that threshold's own frame.
# The previous version selected rows of df_075 / df_085 using idxmax labels
# computed on df_055 / df_065, which picks unrelated rows (or raises KeyError
# when a label is missing).
best_045 = _best_row_per_resume(df_045)
best_055 = _best_row_per_resume(df_055)
best_065 = _best_row_per_resume(df_065)
best_075 = _best_row_per_resume(df_075)
best_085 = _best_row_per_resume(df_085)

In [12]:
# Fresh figure for the best-match histograms; rebinds `fig`, which
# add_best_histogram adds its traces to.
fig = go.Figure()

def add_best_histogram(df, name, color, opacity=0.78, bins=60, figure=None):
    """Add a percentage histogram of ``df['pct_job_covered']`` as a bar trace.

    The column is binned with ``Series.value_counts(bins=bins, sort=False)``.
    Each bar sits at its interval's midpoint, and its height is that
    interval's share (in percent) of all rows in ``df``. The raw counts are
    attached as ``customdata`` so the hover text can show them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``pct_job_covered`` column.
    name : str
        Trace name; also used as the bold heading of the hover text.
    color : str
        Bar color, passed as ``marker_color``.
    opacity : float, default 0.78
        Bar opacity.
    bins : int, default 60
        Number of intervals passed to ``value_counts``.
    figure : optional
        Object exposing plotly's ``add_bar`` method. When omitted, the
        notebook-level ``fig`` is used, as before.

    Returns
    -------
    None
    """
    # Resolve the global only when no figure was passed, so the function can
    # be used (and re-used) without relying on notebook state.
    target = fig if figure is None else figure

    hist = df['pct_job_covered'].value_counts(bins=bins, sort=False)
    intervals = hist.index
    counts = hist.values
    percentages = counts / counts.sum() * 100

    target.add_bar(
        x=intervals.mid,
        y=percentages,
        # Per-interval widths: pandas widens the first interval so that it
        # includes the minimum, so reusing the first width for every bar
        # would draw all bars too wide and make neighbours overlap.
        width=intervals.right - intervals.left,
        name=name,
        marker_color=color,
        opacity=opacity,
        hovertemplate=
            f"<b>{name}</b><br>" +
            "Coverage: %{x:.3f}<br>" +
            "Candidates: %{customdata:,}<br>" +
            "Percent: %{y:.2f}%<extra></extra>",
        customdata=counts
    )

In [13]:
# Start from an empty trace list so re-running this cell does not stack a
# second copy of every histogram on the figure.
fig.data = []

# NOTE(review): these colors differ from the `colors` palette used for the
# same thresholds in the first figure (e.g. 0.45 is #636EFA there but
# #EF553B here) - confirm whether that is intentional.
for best_frame, trace_name, trace_color in (
    (best_045, "Threshold 0.45", "#EF553B"),
    (best_055, "Threshold 0.55", "#00CC96"),
    (best_065, "Threshold 0.65", "#2105F4"),
    (best_075, "Threshold 0.75", "#CC00A0"),
    (best_085, "Threshold 0.85", "#F3EB10"),
):
    add_best_histogram(best_frame, trace_name, trace_color)

In [17]:
# Layout for the best-match figure. Bars from the different thresholds are
# overlaid (not grouped or stacked), and hovering shows all traces at the
# same x position together.
best_match_layout = {
    "barmode": 'overlay',
    "bargap": 0.01,
    "height": 650,
    "title": "Best-match coverage per resume",
    "xaxis_title": "pct_covered of the #1 job",
    "yaxis_title": "Percentage of candidates",
    "legend_title": "Threshold",
    "hovermode": "x unified",
}
fig.update_layout(**best_match_layout)

In [18]:
# Render the best-match coverage figure.
fig.show()