In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_recall_fscore_support, roc_auc_score
)

In [30]:
# This dataset is from huggingface.co and can be found here: https://huggingface.co/datasets/mstz/speeddating
df = pd.read_csv("../data/raw/train.csv")

# Print the shape of the dataframe
df.shape

(1048, 65)

In [31]:
# Remove rows with missing values
df = df.dropna(subset=["is_match"])

# If some columns are text (e.g. race, gender), convert them to numerical dummy variables
df = pd.get_dummies(df, drop_first=True)

In [32]:
# Change the column names to don't use underscores and to use upper first letters
df.columns = [col.replace('_', ' ').title() for col in df.columns]
df.head()

Unnamed: 0,Is Dater Male,Dater Age,Dated Age,Age Difference,Are Same Race,Same Race Importance For Dater,Same Religion Importance For Dater,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,...,Dated Wants To Date,Is Match,Dater Race 'Black/African American',Dater Race 'Latino/Hispanic American',Dater Race Caucasian,Dater Race Other,Dated Race 'Black/African American',Dated Race 'Latino/Hispanic American',Dated Race Caucasian,Dated Race Other
0,False,21,27,6,False,2.0,4.0,35.0,20.0,20.0,...,False,0,False,False,False,False,False,False,True,False
1,False,21,22,1,False,2.0,4.0,60.0,0.0,0.0,...,False,0,False,False,False,False,False,False,True,False
2,False,21,23,2,False,2.0,4.0,30.0,5.0,15.0,...,True,1,False,False,False,False,False,False,True,False
3,False,21,24,3,False,2.0,4.0,30.0,10.0,20.0,...,True,1,False,False,False,False,False,True,False,False
4,False,21,25,4,False,2.0,4.0,50.0,0.0,30.0,...,True,0,False,False,False,False,False,False,True,False


In [33]:
# Remove columns that are not needed for the analysis
df = df.drop(columns=["Is Dater Male", "Dated Wants To Date", "Dater Wants To Date"])

In [34]:
# Separate features and target variable
X = df.drop("Is Match", axis=1)
y = df["Is Match"]

# First, split the data into training+validation (85%) and test sets (15%)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# Next, use 15% of train for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.15, stratify=y_train_full, random_state=42
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 756, Validation: 134, Test: 158


In [35]:
# Standardize the features
scaler = StandardScaler()

# Fit the scaler on the training data and transform train, val, and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

In [36]:
# Train a logistic regression model
model = LogisticRegression(
    solver="liblinear",
    random_state=42,
    max_iter=500
)
model.fit(X_train_scaled, y_train)

In [37]:
# Evaluate on validation set
y_val_pred = model.predict(X_val_scaled)
print("\n--- Validation Performance ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=3))


--- Validation Performance ---
[[105   5]
 [ 11  13]]
              precision    recall  f1-score   support

           0      0.905     0.955     0.929       110
           1      0.722     0.542     0.619        24

    accuracy                          0.881       134
   macro avg      0.814     0.748     0.774       134
weighted avg      0.872     0.881     0.874       134



In [38]:
# --- Imports ---
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    precision_recall_fscore_support, accuracy_score, roc_auc_score,
    balanced_accuracy_score, confusion_matrix, classification_report, f1_score
)

# --- Utility Functions ---
def eval_with_threshold(y_true, proba, thr):
    """Evaluate predictions at a given threshold."""
    y_pred = (proba >= thr).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    acc  = accuracy_score(y_true, y_pred)
    bacc = balanced_accuracy_score(y_true, y_pred)
    auc  = roc_auc_score(y_true, proba)
    return {"p":p, "r":r, "f1":f1, "acc":acc, "bacc":bacc, "auc":auc, "y_pred":y_pred}

# --- Model Variants and Results Collection ---
results = []

# 1. Baseline Logistic Regression (thr=0.5)
base = LogisticRegression(solver="liblinear", random_state=42, max_iter=500)
base.fit(X_train_scaled, y_train)
val_proba = base.predict_proba(X_val_scaled)[:, 1]
test_proba = base.predict_proba(X_test_scaled)[:, 1]
val_res = eval_with_threshold(y_val, val_proba, 0.5)
test_res = eval_with_threshold(y_test, test_proba, 0.5)
results.append(["Baseline LogReg", val_res["acc"], val_res["r"], val_res["f1"], test_res["acc"], test_res["r"], test_res["f1"]])

# 2. Threshold-tuned Logistic Regression (best recall >= 0.55)
THRS = np.linspace(0.05, 0.95, 37)
MIN_RECALL_POS = 0.55
best = None
for thr in THRS:
    res = eval_with_threshold(y_val, val_proba, thr)
    if res["r"] >= MIN_RECALL_POS:
        score = res["acc"]
        if best is None or score > best[0]:
            best = (score, thr, res)
thr_sel = best[1] if best else 0.50
val_res_thr = best[2] if best else val_res
test_res_thr = eval_with_threshold(y_test, test_proba, thr_sel)
results.append([f"Threshold={thr_sel:.2f}", val_res_thr["acc"], val_res_thr["r"], val_res_thr["f1"], test_res_thr["acc"], test_res_thr["r"], test_res_thr["f1"]])

# 3. Logistic Regression Hyperparameter Tuning (C=0.1, best F1)
Cs = [0.1, 1.0, 10.0]
CWs = [None, "balanced"]
best = None
for C in Cs:
    for cw in CWs:
        mdl = LogisticRegression(solver="liblinear", random_state=42, max_iter=500, C=C, class_weight=cw)
        mdl.fit(X_train_scaled, y_train)
        res = eval_with_threshold(y_val, mdl.predict_proba(X_val_scaled)[:,1], 0.50)
        score = res["f1"]
        if best is None or score > best[0]:
            best = (score, C, cw, mdl, res)
_, C_sel, cw_sel, mdl_sel, val_res_tuned = best
test_res_tuned = eval_with_threshold(y_test, mdl_sel.predict_proba(X_test_scaled)[:,1], 0.50)
results.append([f"C={C_sel} / cost-tuned", val_res_tuned["acc"], val_res_tuned["r"], val_res_tuned["f1"], test_res_tuned["acc"], test_res_tuned["r"], test_res_tuned["f1"]])

# 4. SMOTE + Logistic Regression (if imblearn available)
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42, sampling_strategy=0.5)),
        ("clf", LogisticRegression(solver="liblinear", random_state=42, max_iter=500))
    ])
    pipe.fit(X_train, y_train)
    val_proba_smote = pipe.predict_proba(X_val)[:,1]
    test_proba_smote = pipe.predict_proba(X_test)[:,1]
    val_res_smote = eval_with_threshold(y_val, val_proba_smote, 0.5)
    test_res_smote = eval_with_threshold(y_test, test_proba_smote, 0.5)
    results.append(["SMOTE", val_res_smote["acc"], val_res_smote["r"], val_res_smote["f1"], test_res_smote["acc"], test_res_smote["r"], test_res_smote["f1"]])
except Exception as e:
    results.append(["SMOTE", None, None, None, None, None, None])

# 5. Random Forest (balanced)
rf = RandomForestClassifier(
    n_estimators=400, max_depth=None, min_samples_leaf=2,
    class_weight="balanced", random_state=42, n_jobs=-1
).fit(X_train, y_train)
val_proba_rf = rf.predict_proba(X_val)[:,1]
test_proba_rf = rf.predict_proba(X_test)[:,1]
val_res_rf = eval_with_threshold(y_val, val_proba_rf, 0.5)
test_res_rf = eval_with_threshold(y_test, test_proba_rf, 0.5)
results.append(["RF (balanced)", val_res_rf["acc"], val_res_rf["r"], val_res_rf["f1"], test_res_rf["acc"], test_res_rf["r"], test_res_rf["f1"]])

# 6. Calibrated Logistic Regression
base = LogisticRegression(solver="liblinear", random_state=42, max_iter=500).fit(X_train_scaled, y_train)
cal = CalibratedClassifierCV(base, method="isotonic", cv="prefit").fit(X_val_scaled, y_val)
val_proba_cal = cal.predict_proba(X_val_scaled)[:,1]
val_res_cal = eval_with_threshold(y_val, val_proba_cal, 0.5)
results.append(["Calibrated LogReg", val_res_cal["acc"], val_res_cal["r"], val_res_cal["f1"], None, None, None])

# --- Results Overview Table ---
results_df = pd.DataFrame(results, columns=["Model", "Val_Acc", "Val_Recall(1)", "Val_F1(1)", "Test_Acc", "Test_Recall(1)", "Test_F1(1)"])
print("\n--- Model Comparison Overview ---")
print(results_df)


--- Model Comparison Overview ---
                Model   Val_Acc  Val_Recall(1)  Val_F1(1)  Test_Acc  \
0     Baseline LogReg  0.880597       0.541667   0.619048  0.810127   
1      Threshold=0.40  0.858209       0.583333   0.595745  0.778481   
2  C=0.1 / cost-tuned  0.902985       0.541667   0.666667  0.829114   
3               SMOTE  0.798507       0.583333   0.509091  0.734177   
4       RF (balanced)  0.858209       0.375000   0.486486  0.860759   
5   Calibrated LogReg  0.895522       0.416667   0.588235       NaN   

   Test_Recall(1)  Test_F1(1)  
0        0.321429    0.375000  
1        0.392857    0.385965  
2        0.321429    0.400000  
3        0.464286    0.382353  
4        0.321429    0.450000  
5             NaN         NaN  




# Model Comparison: What Each Variant Changes

This notebook compares several classification model variants and their effects on performance:


- **Baseline Logistic Regression**: Standard logistic regression with default parameters and a threshold of 0.5.
- **Threshold-tuned Logistic Regression**: Adjusts the decision threshold to increase recall for the positive class (e.g., to catch more '1's), possibly at the cost of accuracy.
- **Cost-tuned Logistic Regression**: Tunes the regularization parameter `C` and class weights to improve F1 score, which balances precision and recall.
- **SMOTE + Logistic Regression**: Uses SMOTE (Synthetic Minority Over-sampling Technique) to balance the dataset by generating synthetic samples for the minority class before fitting logistic regression. This can help when the classes are imbalanced.
- **Random Forest (balanced)**: Uses a random forest classifier with class weights set to 'balanced' to handle class imbalance. Random forests can capture nonlinear relationships and interactions between features.
- **Calibrated Logistic Regression**: Applies probability calibration (isotonic regression) to logistic regression outputs, which can improve the reliability of predicted probabilities, especially when used for decision-making or further threshold tuning.


The table below summarizes the validation and test results for each model variant, making it easy to compare their impact on accuracy, recall, and F1 score.