## Part 1: Data preparation

### 1. Import libraries

In [261]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np 
from sklearn.ensemble import RandomForestClassifier  # For random forest model
from sklearn.calibration import CalibratedClassifierCV  # For probability calibration
from sklearn.metrics import (
    precision_recall_fscore_support,  # For precision, recall, F1-score
    accuracy_score,                  # For accuracy
    roc_auc_score,                   
    balanced_accuracy_score,         
    f1_score                         
 )

In [262]:
# If some columns are text (e.g. gender), convert them to dummy (indicator) variables.
# drop_first=True drops the first level of each encoded column, so a column
# with k categories becomes k-1 indicator columns.
# NOTE(review): `df` is not created in any cell visible above this one — confirm
# that the data-loading / cleaning cells still exist so Restart & Run All works.
df = pd.get_dummies(df, drop_first=True)

In [263]:
# Show the (rows, columns) shape of the dataframe after cleaning and dummy encoding
df.shape

(1048, 57)

In [264]:
# Preview the first five rows to sanity-check column names and values
df.head()

Unnamed: 0,Dater Age,Dated Age,Age Difference,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,Humor Importance For Dated,Ambition Importance For Dated,Shared Interests Importance For Dated,Attractiveness Score Of Dater From Dated,...,Dater Interest In Shopping,Dater Interest In Yoga,Interests Correlation,Expected Satisfaction Of Dater,Expected Number Of Likes Of Dater From 20 People,Expected Number Of Dates For Dater,Dater Liked Dated,Probability Dated Wants To Date,Already Met Before,Is Match
0,21,27,6,35.0,20.0,20.0,20.0,0.0,5.0,6.0,...,8.0,1.0,0.14,3.0,2,4,7.0,6.0,True,0
1,21,22,1,60.0,0.0,0.0,40.0,0.0,0.0,7.0,...,8.0,1.0,0.54,3.0,2,4,7.0,5.0,True,0
2,21,23,2,30.0,5.0,15.0,40.0,5.0,5.0,7.0,...,8.0,1.0,0.61,3.0,2,4,7.0,6.0,True,1
3,21,24,3,30.0,10.0,20.0,10.0,10.0,20.0,8.0,...,8.0,1.0,0.21,3.0,2,4,6.0,6.0,True,1
4,21,25,4,50.0,0.0,30.0,10.0,0.0,10.0,7.0,...,8.0,1.0,0.25,3.0,2,4,6.0,5.0,True,0


In [265]:
# Get summary statistics of the dataframe:
# count, mean, std, min, quartiles and max for each numeric column
df.describe()

Unnamed: 0,Dater Age,Dated Age,Age Difference,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,Humor Importance For Dated,Ambition Importance For Dated,Shared Interests Importance For Dated,Attractiveness Score Of Dater From Dated,...,Dater Interest In Music,Dater Interest In Shopping,Dater Interest In Yoga,Interests Correlation,Expected Satisfaction Of Dater,Expected Number Of Likes Of Dater From 20 People,Expected Number Of Dates For Dater,Dater Liked Dated,Probability Dated Wants To Date,Is Match
count,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,...,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0
mean,25.005725,24.818702,3.032443,23.728235,16.971021,22.255887,17.325029,9.725792,10.333626,6.211355,...,7.710878,5.51145,4.133588,0.15499,5.378817,5.760496,2.844466,6.218034,4.978053,0.177481
std,3.270365,3.180581,2.427732,12.660571,7.450629,7.352106,6.666005,7.07342,6.763784,1.964935,...,1.899931,2.597821,2.696578,0.335816,1.630245,4.954703,2.370152,1.858517,2.269876,0.382258
min,18.0,18.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,-0.63,1.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,22.0,1.0,15.0,10.0,20.0,10.8325,5.0,5.0,5.0,...,7.0,4.0,2.0,-0.11,5.0,2.0,1.0,5.0,3.0,0.0
50%,25.0,25.0,2.0,20.0,18.0,20.0,18.18,10.0,10.0,6.0,...,8.0,5.0,3.0,0.15,5.0,4.0,2.0,6.0,5.0,0.0
75%,27.0,27.0,4.0,30.0,20.0,25.0,20.0,15.0,15.0,8.0,...,9.0,8.0,7.0,0.42,7.0,8.0,4.0,7.0,7.0,0.0
max,35.0,35.0,14.0,100.0,40.0,50.0,40.0,53.0,30.0,10.0,...,10.0,10.0,10.0,0.9,9.0,20.0,10.0,10.0,10.0,1.0


### 4. Split features and target variable

In [266]:
# Target: "Is Match" (binary, 0 = no match, 1 = match).
# Features: every remaining column of df.
y = df["Is Match"]
X = df.drop(columns="Is Match")

# Stage 1 — hold out 30% of the rows; the remaining 70% form the training set.
# Stratifying on y keeps the class proportions comparable across both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

# Stage 2 — cut the held-out 30% in half: 15% validation, 15% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report how many rows ended up in each partition
print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 733, Validation: 157, Test: 158


### 5. Scaling

In [267]:
# Standardize the features.
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to train, validation and test, so no
# information from the held-out rows enters the preprocessing step.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

### 6. Logistic Regression

In [268]:
# Logistic-regression classifier: liblinear solver, fixed seed for
# reproducibility, at most 500 solver iterations.
model = LogisticRegression(solver="liblinear", max_iter=500, random_state=42)

# Fit on the standardized training split.
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X_train_scaled, y_train)

In [269]:
# Compare accuracy on the training split with accuracy on the validation split.
# A large gap between the two suggests the model does not generalize well.
# NOTE(review): in the recorded run the validation accuracy (0.815) is slightly
# below the share of class 0 in the validation split (129/157 ≈ 0.822), so
# accuracy alone says little here — read it together with the per-class report.

# Accuracy on the data the model was fitted on
train_score = model.score(X_train_scaled, y_train)
print(f"Training accuracy: {train_score:.3f}")

# Accuracy on rows the model has not seen during fitting
val_score = model.score(X_val_scaled, y_val)
print(f"Validation accuracy: {val_score:.3f}")

Training accuracy: 0.900
Validation accuracy: 0.815


### 7. Validation Performance

In [270]:
# Per-class validation metrics: predict the validation labels, then show the
# confusion matrix (rows = true class, columns = predicted class) followed by
# precision / recall / F1 with three decimals.
y_val_pred = model.predict(X_val_scaled)

print("\n--- Validation Performance ---")
print(
    confusion_matrix(y_val, y_val_pred),
    classification_report(y_val, y_val_pred, digits=3),
    sep="\n",
)


--- Validation Performance ---
[[116  13]
 [ 16  12]]
              precision    recall  f1-score   support

           0      0.879     0.899     0.889       129
           1      0.480     0.429     0.453        28

    accuracy                          0.815       157
   macro avg      0.679     0.664     0.671       157
weighted avg      0.808     0.815     0.811       157



In [271]:
# ---------------------------------------------------------------------------
# DISABLED EXPERIMENTS — every line in this cell is commented out, so running
# the cell does nothing. It sketches a model comparison (SMOTE pipeline,
# balanced random forest, isotonic-calibrated logistic regression) collected
# into a results table.
# NOTE(review): the snippet cannot be re-enabled as-is:
#   * it starts part-way through a try/except (the `except Exception:` below
#     has no matching `try:` in this cell, and the step numbering starts at 5);
#   * `results`, `eval_with_threshold` and `SMOTE` are not defined or imported
#     in any cell visible in this part of the notebook — confirm where they live;
#   * the calibrated model is fitted on the validation split and then scored on
#     that same split, so its validation numbers would be optimistic;
#   * `cv="prefit"` is reportedly deprecated in recent scikit-learn releases —
#     verify against the installed version.
# Either restore the missing pieces or delete the cell before sharing.
# ---------------------------------------------------------------------------
#    from imblearn.pipeline import Pipeline  # For chaining preprocessing and modeling
#    pipe = Pipeline([
#        ("scaler", StandardScaler()),  # Standardize features
#        ("smote", SMOTE(random_state=42, sampling_strategy=0.5)),  # Oversample minority class
#        ("clf", LogisticRegression(solver="liblinear", random_state=42, max_iter=500))  # Logistic regression
#    ])
#    pipe.fit(X_train, y_train)  # Fit pipeline on unscaled data (scaling is inside pipeline)
#    val_proba_smote = pipe.predict_proba(X_val)[:,1]  # Predict on validation set
#    test_proba_smote = pipe.predict_proba(X_test)[:,1]  # Predict on test set
#    val_res_smote = eval_with_threshold(y_val, val_proba_smote, 0.5)
#    test_res_smote = eval_with_threshold(y_test, test_proba_smote, 0.5)
#    results.append([
#        "SMOTE",
#        val_res_smote["acc"], val_res_smote["r"], val_res_smote["f1"],
#        test_res_smote["acc"], test_res_smote["r"], test_res_smote["f1"]
#    ])
#except Exception:
#    # If imblearn is not installed, fill with None (so table still works)
#    results.append(["SMOTE", None, None, None, None, None, None])
#
# 5. Random Forest (balanced class weights)
# Random forest is an ensemble of decision trees. 'balanced' weights help with class imbalance.
#rf = RandomForestClassifier(
#    n_estimators=400,  # Number of trees in the forest
#    min_samples_leaf=2,  # Minimum samples per leaf node
#    class_weight="balanced",  # Adjust weights inversely to class frequencies
#    random_state=42,  # For reproducibility
#    n_jobs=-1  # Use all CPU cores
# )
#rf.fit(X_train, y_train)  # Train on unscaled data (trees don't need scaling)
#val_proba_rf = rf.predict_proba(X_val)[:,1]  # Predict probabilities for validation set
#test_proba_rf = rf.predict_proba(X_test)[:,1]  # Predict probabilities for test set
#val_res_rf = eval_with_threshold(y_val, val_proba_rf, 0.5)
#test_res_rf = eval_with_threshold(y_test, test_proba_rf, 0.5)
#results.append([
#    "RF (balanced)",
#    val_res_rf["acc"], val_res_rf["r"], val_res_rf["f1"],
#    test_res_rf["acc"], test_res_rf["r"], test_res_rf["f1"]
#])
#
# 6. Calibrated Logistic Regression (isotonic calibration)
# Calibrates the predicted probabilities to make them more reliable.
#base = LogisticRegression(solver="liblinear", random_state=42, max_iter=500).fit(X_train_scaled, y_train)
#cal = CalibratedClassifierCV(base, method="isotonic", cv="prefit").fit(X_val_scaled, y_val)
#val_proba_cal = cal.predict_proba(X_val_scaled)[:,1]  # Calibrated probabilities for validation set
#val_res_cal = eval_with_threshold(y_val, val_proba_cal, 0.5)
#results.append([
#    "Calibrated LogReg",
#    val_res_cal["acc"], val_res_cal["r"], val_res_cal["f1"],
#    None, None, None  # No test set calibration here
# ])
#
# --- Results Overview Table ---
# Create a table (DataFrame) with all results for easy comparison
#results_df = pd.DataFrame(
#    results,
#    columns=["Model", "Val_Acc", "Val_Recall(1)", "Val_F1(1)", "Test_Acc", "Test_Recall(1)", "Test_F1(1)"]
#)
# Show all results in one table
#print("\n--- Model Comparison Overview ---")
#print(results_df)
#print("After results_df")

### 8. Test Performance

In [272]:
# Final evaluation of the logistic-regression model on the held-out test split.
# NOTE(review): the whole cell is commented out, so this section currently
# produces no output — presumably to keep the test split untouched until model
# selection on the validation split is finished; confirm before enabling.
# Evaluate on test set
# y_test_pred = model.predict(X_test_scaled)

# print("\n--- Test Performance ---")
# print(confusion_matrix(y_test, y_test_pred))
# print(classification_report(y_test, y_test_pred, digits=3))

### 9. Feature Importance (Coefficients)