In [11]:
import pandas as pd

def load_parquet_with_filter(parquet_path, columns_to_keep):
    """
    Load a Parquet file, filter rows by model_prob between 0 and 1, and keep only specified columns in order.

    Args:
        parquet_path (str): Path to the Parquet file.
        columns_to_keep (list of str): List of column names to keep in the given order.

    Returns:
        pd.DataFrame: Filtered DataFrame with specified columns.
    """
    df = pd.read_parquet(parquet_path)
    # Filter rows where match_prob is between 0 and 1
    df_filtered = df[(df['match_prob'] >= 0.0) & (df['match_prob'] <= 0.50)]
    # Filter only the columns requested, if they exist in the DataFrame
    cols_to_use = [col for col in columns_to_keep if col in df_filtered.columns]
    return df_filtered[cols_to_use]

# Example usage:
df = load_parquet_with_filter('/Users/borismartinez/Documents/GitHub/engage/predicted_best_matches_prototype.parquet', ['registration_form_id', 'first_name_att', 'first_name_vf', 'last_name_att', 'last_name_vf', 'match_prob'])
print(df.head())


   registration_form_id first_name_att         first_name_vf  \
0            6001285689          pedro               michael   
2            6001287712           rora                  rosa   
5         1010000005010   rosana david        rosaura daniel   
7         1010000008304          mareo                 marco   
8         1010000010099        micaela  gloria aurora agusti   

         last_name_att last_name_vf  match_prob  
0  hernandez rodriguez        hodge    0.000162  
2                nunez        nunez    0.004387  
5                 dome         dome    0.004387  
7               vargas       vargas    0.004387  
8        quentes elias      quevedo    0.000112  


# Load labeled dataset

In [2]:
import pandas as pd

# Assume df is your DataFrame loaded from Parquet or CSV containing 'match_prob' and the categorical columns

# Define thresholds for high and low certainty
high_threshold = 0.8
low_threshold = 0.4

# Create a new column categorizing match_prob
def certainty_category(p):
    if p >= high_threshold:
        return 'High Certainty'
    elif p <= low_threshold:
        return 'Low Certainty'
    else:
        return 'Medium Certainty'

df_preview_all['certainty_group'] = df_preview_all['match_prob'].apply(certainty_category)

# Columns for breakdown
categorical_cols = ['party_vf', 'dob_year', 'gender_att', 'voting_city']

# Print count breakdowns for each categorical column by certainty group
for col in categorical_cols:
    print(f"\nBreakdown of {col} by certainty group:")
    breakdown = pd.crosstab(df['certainty_group'], df[col], margins=True, normalize='index') * 100
    print(breakdown.round(2))

NameError: name 'df_preview_all' is not defined

# Threshold-based classification

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]

for thresh in thresholds:
    preds = (labeled_matches['match_prob'] >= thresh).astype(int)
    
    cm = confusion_matrix(labeled_matches['true_label'], preds)
    acc = accuracy_score(labeled_matches['true_label'], preds)
    prec = precision_score(labeled_matches['true_label'], preds)
    rec = recall_score(labeled_matches['true_label'], preds)
    f1 = f1_score(labeled_matches['true_label'], preds)
    
    print(f"Threshold: {thresh}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-score: {f1:.4f}\n")

# ROC Curve and AUC


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, roc_thresholds = roc_curve(labeled_matches['true_label'], labeled_matches['match_prob'])
auc_score = roc_auc_score(labeled_matches['true_label'], labeled_matches['match_prob'])

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {auc_score:.4f})")
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC)")
plt.legend(loc="lower right")
plt.show()

# Precision-Recall Curve


In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

precision, recall, pr_thresholds = precision_recall_curve(labeled_matches['true_label'], labeled_matches['match_prob'])
avg_precision = average_precision_score(labeled_matches['true_label'], labeled_matches['match_prob'])

plt.figure(figsize=(8,6))
plt.plot(recall, precision, label=f"Precision-Recall curve (AP = {avg_precision:.4f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend(loc="upper right")
plt.show()

# Calibration check


In [None]:
from sklearn.calibration import calibration_curve
import numpy as np

prob_true, prob_pred = calibration_curve(labeled_matches['true_label'], labeled_matches['match_prob'], n_bins=10)

plt.figure(figsize=(8,6))
plt.plot(prob_pred, prob_true, marker='o', label='Calibration curve')
plt.plot([0,1], [0,1], 'k--', label='Perfectly calibrated')
plt.xlabel("Mean Predicted Probability")
plt.ylabel("Fraction of Positives")
plt.title("Calibration Curve")
plt.legend()
plt.show()

from sklearn.metrics import brier_score_loss
brier_score = brier_score_loss(labeled_matches['true_label'], labeled_matches['match_prob'])
print(f"Brier score: {brier_score:.4f}")

# Threshold selection


In [None]:
import numpy as np

# Find threshold that maximizes F1 score
best_thresh = None
best_f1 = 0

for thresh in np.linspace(0,1,101):
    preds = (labeled_matches['match_prob'] >= thresh).astype(int)
    f1 = f1_score(labeled_matches['true_label'], preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print(f"Best threshold by F1 score: {best_thresh:.2f} with F1 = {best_f1:.4f}")

# Summary and insights

