## Part 1: Data preparation

### 1. Import libraries

In [223]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.ensemble import RandomForestClassifier  # For random forest model
from sklearn.calibration import CalibratedClassifierCV  # For probability calibration
from sklearn.metrics import (
    precision_recall_fscore_support,  # For precision, recall, F1-score
    accuracy_score,                  # For accuracy
    roc_auc_score,                   
    balanced_accuracy_score,         
    f1_score                         
 )

# Add a test print to confirm imports and output are working
print("Imports successful. If you see this message, output is working.")

Imports successful. If you see this message, output is working.


### 2. Load dataset and inspect it

In [224]:
df = pd.read_csv("../../data/raw/train.csv")

# Print the shape of the dataframe
df.shape

(1048, 65)

In [225]:
# Print the first few rows of the dataframe
df.head()

Unnamed: 0,is_dater_male,dater_age,dated_age,age_difference,dater_race,dated_race,are_same_race,same_race_importance_for_dater,same_religion_importance_for_dater,attractiveness_importance_for_dated,...,interests_correlation,expected_satisfaction_of_dater,expected_number_of_likes_of_dater_from_20_people,expected_number_of_dates_for_dater,dater_liked_dated,probability_dated_wants_to_date,already_met_before,dater_wants_to_date,dated_wants_to_date,is_match
0,False,21,27,6,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,35.0,...,0.14,3.0,2,4,7.0,6.0,True,True,False,0
1,False,21,22,1,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,60.0,...,0.54,3.0,2,4,7.0,5.0,True,True,False,0
2,False,21,23,2,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,30.0,...,0.61,3.0,2,4,7.0,6.0,True,True,True,1
3,False,21,24,3,'Asian/Pacific Islander/Asian-American','Latino/Hispanic American',False,2.0,4.0,30.0,...,0.21,3.0,2,4,6.0,6.0,True,True,True,1
4,False,21,25,4,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,50.0,...,0.25,3.0,2,4,6.0,5.0,True,False,True,0


In [226]:
# Print the data types of each column
df.dtypes

is_dater_male                         bool
dater_age                            int64
dated_age                            int64
age_difference                       int64
dater_race                          object
                                    ...   
probability_dated_wants_to_date    float64
already_met_before                    bool
dater_wants_to_date                   bool
dated_wants_to_date                   bool
is_match                             int64
Length: 65, dtype: object

### 3. Data cleaning and preprocessing

In [227]:
# Remove rows with missing values
df = df.dropna(subset=["is_match"])

In [228]:
# Change the column names to don't use underscores and to use upper first letters
df.columns = [col.replace('_', ' ').title() for col in df.columns]

In [229]:
# print all the column names
print(df.columns.tolist())

['Is Dater Male', 'Dater Age', 'Dated Age', 'Age Difference', 'Dater Race', 'Dated Race', 'Are Same Race', 'Same Race Importance For Dater', 'Same Religion Importance For Dater', 'Attractiveness Importance For Dated', 'Sincerity Importance For Dated', 'Intelligence Importance For Dated', 'Humor Importance For Dated', 'Ambition Importance For Dated', 'Shared Interests Importance For Dated', 'Attractiveness Score Of Dater From Dated', 'Sincerity Score Of Dater From Dated', 'Intelligence Score Of Dater From Dated', 'Humor Score Of Dater From Dated', 'Ambition Score Of Dater From Dated', 'Shared Interests Score Of Dater From Dated', 'Attractiveness Importance For Dater', 'Sincerity Importance For Dater', 'Intelligence Importance For Dater', 'Humor Importance For Dater', 'Ambition Importance For Dater', 'Shared Interests Importance For Dater', 'Self Reported Attractiveness Of Dater', 'Self Reported Sincerity Of Dater', 'Self Reported Intelligence Of Dater', 'Self Reported Humor Of Dater', '

In [230]:
# Store all column names before making changes
cols_before = set(df.columns)

# Drop unwanted columns (gender, religion, etc.)
df = df.drop(
    columns=[
        "Is Dater Male",
        "Dated Wants To Date",
        "Dater Wants To Date",
        "Same Race Importance For Dater",
        "Same Religion Importance For Dater"
    ],
    errors="ignore"  # ignore errors if any columns are missing
)

# Drop all columns containing the word 'Race' (handles one-hot encoded versions as well)
race_cols = [col for col in df.columns if "Race" in col]
df = df.drop(columns=race_cols, errors="ignore")

# Compare column sets before and after to see which columns were removed
cols_after = set(df.columns)
removed_cols = cols_before - cols_after

# Print removed columns for verification
print("Removed columns:", removed_cols)

Removed columns: {'Dater Race', 'Same Religion Importance For Dater', 'Same Race Importance For Dater', 'Dated Wants To Date', 'Is Dater Male', 'Dated Race', 'Dater Wants To Date', 'Are Same Race'}


In [231]:
# If some columns are text (e.g. gender), convert them to numerical dummy variables
df = pd.get_dummies(df, drop_first=True)

In [232]:
# Print the shape of the dataframe after cleaning
df.shape

(1048, 57)

In [233]:
df.head()

Unnamed: 0,Dater Age,Dated Age,Age Difference,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,Humor Importance For Dated,Ambition Importance For Dated,Shared Interests Importance For Dated,Attractiveness Score Of Dater From Dated,...,Dater Interest In Shopping,Dater Interest In Yoga,Interests Correlation,Expected Satisfaction Of Dater,Expected Number Of Likes Of Dater From 20 People,Expected Number Of Dates For Dater,Dater Liked Dated,Probability Dated Wants To Date,Already Met Before,Is Match
0,21,27,6,35.0,20.0,20.0,20.0,0.0,5.0,6.0,...,8.0,1.0,0.14,3.0,2,4,7.0,6.0,True,0
1,21,22,1,60.0,0.0,0.0,40.0,0.0,0.0,7.0,...,8.0,1.0,0.54,3.0,2,4,7.0,5.0,True,0
2,21,23,2,30.0,5.0,15.0,40.0,5.0,5.0,7.0,...,8.0,1.0,0.61,3.0,2,4,7.0,6.0,True,1
3,21,24,3,30.0,10.0,20.0,10.0,10.0,20.0,8.0,...,8.0,1.0,0.21,3.0,2,4,6.0,6.0,True,1
4,21,25,4,50.0,0.0,30.0,10.0,0.0,10.0,7.0,...,8.0,1.0,0.25,3.0,2,4,6.0,5.0,True,0


In [234]:
# Get summary statistics of the dataframe
df.describe()

Unnamed: 0,Dater Age,Dated Age,Age Difference,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,Humor Importance For Dated,Ambition Importance For Dated,Shared Interests Importance For Dated,Attractiveness Score Of Dater From Dated,...,Dater Interest In Music,Dater Interest In Shopping,Dater Interest In Yoga,Interests Correlation,Expected Satisfaction Of Dater,Expected Number Of Likes Of Dater From 20 People,Expected Number Of Dates For Dater,Dater Liked Dated,Probability Dated Wants To Date,Is Match
count,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,...,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0
mean,25.005725,24.818702,3.032443,23.728235,16.971021,22.255887,17.325029,9.725792,10.333626,6.211355,...,7.710878,5.51145,4.133588,0.15499,5.378817,5.760496,2.844466,6.218034,4.978053,0.177481
std,3.270365,3.180581,2.427732,12.660571,7.450629,7.352106,6.666005,7.07342,6.763784,1.964935,...,1.899931,2.597821,2.696578,0.335816,1.630245,4.954703,2.370152,1.858517,2.269876,0.382258
min,18.0,18.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,-0.63,1.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,22.0,1.0,15.0,10.0,20.0,10.8325,5.0,5.0,5.0,...,7.0,4.0,2.0,-0.11,5.0,2.0,1.0,5.0,3.0,0.0
50%,25.0,25.0,2.0,20.0,18.0,20.0,18.18,10.0,10.0,6.0,...,8.0,5.0,3.0,0.15,5.0,4.0,2.0,6.0,5.0,0.0
75%,27.0,27.0,4.0,30.0,20.0,25.0,20.0,15.0,15.0,8.0,...,9.0,8.0,7.0,0.42,7.0,8.0,4.0,7.0,7.0,0.0
max,35.0,35.0,14.0,100.0,40.0,50.0,40.0,53.0,30.0,10.0,...,10.0,10.0,10.0,0.9,9.0,20.0,10.0,10.0,10.0,1.0


### 4. Split features and target variable

In [235]:
# Split the data into features and target variable
# "Is Match" is the target variable and is binary (0 or 1) indicating if there was a match or not
# Features are all other columns and these are in X where y is the target variable
X = df.drop("Is Match", axis=1)
y = df["Is Match"]

# 70% train, 30% for validation + test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Divide the 30% into two equal parts = 15% each
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Print the sizes of the datasets
print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 733, Validation: 157, Test: 158


### 5. Scaling

In [236]:
# Standardize the features
scaler = StandardScaler()

# Fit the scaler on the training data and transform train, val, and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

### 6. Logisitic Regression

In [237]:
# Train a logistic regression model
model = LogisticRegression(
    solver="liblinear",
    random_state=42,
    max_iter=500
)

# Train the model
model.fit(X_train_scaled, y_train)

In [238]:
# Check model accuracy on training and validation data to see how well it generalizes

#Training accuracy → to see if the model fits at all
train_score = model.score(X_train_scaled, y_train)
#Validation accuracy → to see if it generalizes
val_score   = model.score(X_val_scaled, y_val)

print(f"Training accuracy: {train_score:.3f}")
print(f"Validation accuracy: {val_score:.3f}")

Training accuracy: 0.900
Validation accuracy: 0.815


### 7. Validation Performance

In [239]:
# Evaluate on validation set
y_val_pred = model.predict(X_val_scaled)
print("\n--- Validation Performance ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=3))


--- Validation Performance ---
[[116  13]
 [ 16  12]]
              precision    recall  f1-score   support

           0      0.879     0.899     0.889       129
           1      0.480     0.429     0.453        28

    accuracy                          0.815       157
   macro avg      0.679     0.664     0.671       157
weighted avg      0.808     0.815     0.811       157



In [240]:

# --- Utility Function ---
def eval_with_threshold(y_true, proba, thr):
    """
    Evaluate predictions at a given threshold and return key metrics.
    y_true: True labels (0 or 1)
    proba: Predicted probabilities for class 1
    thr: Threshold for classifying as 1 (e.g., 0.5 means >=0.5 is class 1)
    Returns: Dictionary with precision, recall, F1, accuracy, balanced accuracy, and AUC.
    """
    y_pred = (proba >= thr).astype(int)  # Convert probabilities to 0/1 predictions using threshold
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    acc  = accuracy_score(y_true, y_pred)
    bacc = balanced_accuracy_score(y_true, y_pred)
    auc  = roc_auc_score(y_true, proba)
    return {"p":p, "r":r, "f1":f1, "acc":acc, "bacc":bacc, "auc":auc}

# --- Collect results for each model variant ---
results = []  # This will store the results for each model in a table

# 1. Baseline Logistic Regression (threshold=0.5)
# This is a simple linear model for binary classification.
base = LogisticRegression(solver="liblinear", random_state=42, max_iter=500)  # Create the model
base.fit(X_train_scaled, y_train)  # Train the model on the training data
val_proba = base.predict_proba(X_val_scaled)[:, 1]  # Get predicted probabilities for validation set
test_proba = base.predict_proba(X_test_scaled)[:, 1]  # Get predicted probabilities for test set
val_res = eval_with_threshold(y_val, val_proba, 0.5)  # Evaluate on validation set with threshold 0.5
test_res = eval_with_threshold(y_test, test_proba, 0.5)  # Evaluate on test set with threshold 0.5
results.append([
    "Baseline LogReg",
    val_res["acc"], val_res["r"], val_res["f1"],
    test_res["acc"], test_res["r"], test_res["f1"]
])

# 2. Threshold-tuned Logistic Regression (finds threshold for higher recall)
# Here we try different thresholds to find one that gives higher recall (catches more positives).
THRS = np.linspace(0.05, 0.95, 37)  # Try thresholds from 0.05 to 0.95
MIN_RECALL_POS = 0.55  # We want at least 0.55 recall for class 1
best = None  # Will store the best result
for thr in THRS:
    res = eval_with_threshold(y_val, val_proba, thr)
    # Only consider thresholds that give enough recall, and pick the one with best accuracy
    if res["r"] >= MIN_RECALL_POS and (best is None or res["acc"] > best[0]):
        best = (res["acc"], thr, res)
thr_sel = best[1] if best else 0.50  # Use best threshold found, or 0.5 if none found
val_res_thr = best[2] if best else val_res
test_res_thr = eval_with_threshold(y_test, test_proba, thr_sel)
results.append([
    f"Threshold={thr_sel:.2f}",
    val_res_thr["acc"], val_res_thr["r"], val_res_thr["f1"],
    test_res_thr["acc"], test_res_thr["r"], test_res_thr["f1"]
])

# 3. Logistic Regression Hyperparameter Tuning (best F1 for C/class_weight)
# Try different regularization strengths (C) and class weights to improve F1 score.
Cs = [0.1, 1.0, 10.0]  # Regularization strengths to try
CWs = [None, "balanced"]  # Try with and without class balancing
best = None
for C in Cs:
    for cw in CWs:
        mdl = LogisticRegression(solver="liblinear", random_state=42, max_iter=500, C=C, class_weight=cw)
        mdl.fit(X_train_scaled, y_train)
        res = eval_with_threshold(y_val, mdl.predict_proba(X_val_scaled)[:,1], 0.50)
        # Keep the model with the highest F1 score
        if best is None or res["f1"] > best[0]:
            best = (res["f1"], C, cw, mdl, res)
_, C_sel, cw_sel, mdl_sel, val_res_tuned = best
test_res_tuned = eval_with_threshold(y_test, mdl_sel.predict_proba(X_test_scaled)[:,1], 0.50)
results.append([
    f"C={C_sel} / cost-tuned",
    val_res_tuned["acc"], val_res_tuned["r"], val_res_tuned["f1"],
    test_res_tuned["acc"], test_res_tuned["r"], test_res_tuned["f1"]
])

# 4. SMOTE + Logistic Regression (only if imblearn is available)
# SMOTE creates synthetic samples for the minority class to balance the dataset.
try:
    from imblearn.over_sampling import SMOTE  # For synthetic oversampling
    from imblearn.pipeline import Pipeline  # For chaining preprocessing and modeling
    pipe = Pipeline([
        ("scaler", StandardScaler()),  # Standardize features
        ("smote", SMOTE(random_state=42, sampling_strategy=0.5)),  # Oversample minority class
        ("clf", LogisticRegression(solver="liblinear", random_state=42, max_iter=500))  # Logistic regression
    ])
    pipe.fit(X_train, y_train)  # Fit pipeline on unscaled data (scaling is inside pipeline)
    val_proba_smote = pipe.predict_proba(X_val)[:,1]  # Predict on validation set
    test_proba_smote = pipe.predict_proba(X_test)[:,1]  # Predict on test set
    val_res_smote = eval_with_threshold(y_val, val_proba_smote, 0.5)
    test_res_smote = eval_with_threshold(y_test, test_proba_smote, 0.5)
    results.append([
        "SMOTE",
        val_res_smote["acc"], val_res_smote["r"], val_res_smote["f1"],
        test_res_smote["acc"], test_res_smote["r"], test_res_smote["f1"]
    ])
except Exception:
    # If imblearn is not installed, fill with None (so table still works)
    results.append(["SMOTE", None, None, None, None, None, None])

# 5. Random Forest (balanced class weights)
# Random forest is an ensemble of decision trees. 'balanced' weights help with class imbalance.
rf = RandomForestClassifier(
    n_estimators=400,  # Number of trees in the forest
    min_samples_leaf=2,  # Minimum samples per leaf node
    class_weight="balanced",  # Adjust weights inversely to class frequencies
    random_state=42,  # For reproducibility
    n_jobs=-1  # Use all CPU cores
 )
rf.fit(X_train, y_train)  # Train on unscaled data (trees don't need scaling)
val_proba_rf = rf.predict_proba(X_val)[:,1]  # Predict probabilities for validation set
test_proba_rf = rf.predict_proba(X_test)[:,1]  # Predict probabilities for test set
val_res_rf = eval_with_threshold(y_val, val_proba_rf, 0.5)
test_res_rf = eval_with_threshold(y_test, test_proba_rf, 0.5)
results.append([
    "RF (balanced)",
    val_res_rf["acc"], val_res_rf["r"], val_res_rf["f1"],
    test_res_rf["acc"], test_res_rf["r"], test_res_rf["f1"]
])

# 6. Calibrated Logistic Regression (isotonic calibration)
# Calibrates the predicted probabilities to make them more reliable.
base = LogisticRegression(solver="liblinear", random_state=42, max_iter=500).fit(X_train_scaled, y_train)
cal = CalibratedClassifierCV(base, method="isotonic", cv="prefit").fit(X_val_scaled, y_val)
val_proba_cal = cal.predict_proba(X_val_scaled)[:,1]  # Calibrated probabilities for validation set
val_res_cal = eval_with_threshold(y_val, val_proba_cal, 0.5)
results.append([
    "Calibrated LogReg",
    val_res_cal["acc"], val_res_cal["r"], val_res_cal["f1"],
    None, None, None  # No test set calibration here
 ])

# --- Results Overview Table ---
# Create a table (DataFrame) with all results for easy comparison
results_df = pd.DataFrame(
    results,
    columns=["Model", "Val_Acc", "Val_Recall(1)", "Val_F1(1)", "Test_Acc", "Test_Recall(1)", "Test_F1(1)"]
)
# Show all results in one table
print("\n--- Model Comparison Overview ---")
print(results_df)
print("After results_df")


--- Model Comparison Overview ---
                Model   Val_Acc  Val_Recall(1)  Val_F1(1)  Test_Acc  \
0     Baseline LogReg  0.815287       0.428571   0.452830  0.829114   
1      Threshold=0.37  0.796178       0.571429   0.500000  0.829114   
2  C=0.1 / cost-tuned  0.840764       0.428571   0.489796  0.810127   
3               SMOTE  0.764331       0.535714   0.447761  0.772152   
4       RF (balanced)  0.853503       0.357143   0.465116  0.835443   
5   Calibrated LogReg  0.840764       0.142857   0.242424       NaN   

   Test_Recall(1)  Test_F1(1)  
0        0.428571    0.470588  
1        0.571429    0.542373  
2        0.357143    0.400000  
3        0.500000    0.437500  
4        0.285714    0.380952  
5             NaN         NaN  
After results_df




### 8. Test Performance

In [241]:
# Evaluate on test set
# y_test_pred = model.predict(X_test_scaled)

# print("\n--- Test Performance ---")
# print(confusion_matrix(y_test, y_test_pred))
# print(classification_report(y_test, y_test_pred, digits=3))

### 9. Feature Importance (Koefficienter)