## Part 1: Data preparation

### 1. Import libraries

In [None]:
import pandas as pd
import numpy as np 

# Sklearn modules for data splitting, preprocessing, model building and evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Models to be used for classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix

In [26]:
# One-hot encode any remaining text/categorical columns (e.g. gender) with
# pd.get_dummies. drop_first=True drops the first level of every encoded
# column, so k categories become k-1 dummy columns.
# NOTE(review): `df` is rebound to the encoded frame, so the pre-encoding
# frame is no longer reachable under this name. Which columns actually get
# encoded is not visible from this cell — confirm against the loading cells.
df = pd.get_dummies(df, drop_first=True)

In [27]:
# Display the (rows, columns) shape of the dataframe after cleaning and dummy encoding
df.shape

(1048, 57)

In [28]:
# Preview the first five rows to sanity-check the encoded columns and the target
df.head()

Unnamed: 0,Dater Age,Dated Age,Age Difference,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,Humor Importance For Dated,Ambition Importance For Dated,Shared Interests Importance For Dated,Attractiveness Score Of Dater From Dated,...,Dater Interest In Shopping,Dater Interest In Yoga,Interests Correlation,Expected Satisfaction Of Dater,Expected Number Of Likes Of Dater From 20 People,Expected Number Of Dates For Dater,Dater Liked Dated,Probability Dated Wants To Date,Already Met Before,Is Match
0,21,27,6,35.0,20.0,20.0,20.0,0.0,5.0,6.0,...,8.0,1.0,0.14,3.0,2,4,7.0,6.0,True,0
1,21,22,1,60.0,0.0,0.0,40.0,0.0,0.0,7.0,...,8.0,1.0,0.54,3.0,2,4,7.0,5.0,True,0
2,21,23,2,30.0,5.0,15.0,40.0,5.0,5.0,7.0,...,8.0,1.0,0.61,3.0,2,4,7.0,6.0,True,1
3,21,24,3,30.0,10.0,20.0,10.0,10.0,20.0,8.0,...,8.0,1.0,0.21,3.0,2,4,6.0,6.0,True,1
4,21,25,4,50.0,0.0,30.0,10.0,0.0,10.0,7.0,...,8.0,1.0,0.25,3.0,2,4,6.0,5.0,True,0


In [29]:
# Get summary statistics (count, mean, std, min, quartiles, max) of the dataframe.
# describe() only summarizes numeric columns by default, which is why the
# boolean "Already Met Before" column is absent from the output.
df.describe()

Unnamed: 0,Dater Age,Dated Age,Age Difference,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,Humor Importance For Dated,Ambition Importance For Dated,Shared Interests Importance For Dated,Attractiveness Score Of Dater From Dated,...,Dater Interest In Music,Dater Interest In Shopping,Dater Interest In Yoga,Interests Correlation,Expected Satisfaction Of Dater,Expected Number Of Likes Of Dater From 20 People,Expected Number Of Dates For Dater,Dater Liked Dated,Probability Dated Wants To Date,Is Match
count,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,...,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0,1048.0
mean,25.005725,24.818702,3.032443,23.728235,16.971021,22.255887,17.325029,9.725792,10.333626,6.211355,...,7.710878,5.51145,4.133588,0.15499,5.378817,5.760496,2.844466,6.218034,4.978053,0.177481
std,3.270365,3.180581,2.427732,12.660571,7.450629,7.352106,6.666005,7.07342,6.763784,1.964935,...,1.899931,2.597821,2.696578,0.335816,1.630245,4.954703,2.370152,1.858517,2.269876,0.382258
min,18.0,18.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,-0.63,1.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,22.0,1.0,15.0,10.0,20.0,10.8325,5.0,5.0,5.0,...,7.0,4.0,2.0,-0.11,5.0,2.0,1.0,5.0,3.0,0.0
50%,25.0,25.0,2.0,20.0,18.0,20.0,18.18,10.0,10.0,6.0,...,8.0,5.0,3.0,0.15,5.0,4.0,2.0,6.0,5.0,0.0
75%,27.0,27.0,4.0,30.0,20.0,25.0,20.0,15.0,15.0,8.0,...,9.0,8.0,7.0,0.42,7.0,8.0,4.0,7.0,7.0,0.0
max,35.0,35.0,14.0,100.0,40.0,50.0,40.0,53.0,30.0,10.0,...,10.0,10.0,10.0,0.9,9.0,20.0,10.0,10.0,10.0,1.0


### 4. Split features and target variable

In [30]:
# Separate the label from the predictors.
# "Is Match" is the binary target (0 = no match, 1 = match); every other
# column of df is used as a feature.
y = df["Is Match"]
X = df.drop(columns=["Is Match"])

# Hold out 20% of the rows as a test set (the dataset is small).
# stratify=y keeps the match / no-match ratio the same in both parts, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Cross-validation (CV) splitter shared by all model cells below:
# 5 stratified folds, shuffled once with a fixed seed so every model is
# scored on exactly the same folds of the training data.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

Train: 838, Test: 210


### 5. Scaling 
We apply feature scaling with `StandardScaler` inside a scikit-learn `Pipeline`.
This ensures that, in each cross-validation fold, the scaler is fitted only on the training portion and then applied to the validation portion, which avoids data leakage.
Tree-based models (`DecisionTreeClassifier`, `RandomForestClassifier`) do not need scaling, but linear models such as `LogisticRegression` benefit from standardized features.

### 6. Models
#### 6.1. Logistic Regression

In [38]:
# Logistic Regression pipeline: standardize the features, then fit the model.
# The scaler lives inside the pipeline so that cross_val_score re-fits it on
# the training part of every fold; validation rows never influence the
# scaling statistics.
log_reg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("log_reg", LogisticRegression(max_iter=1000))  # explicit iteration cap for the solver
])

# Run cross-validation on the training data only (the test set stays untouched).
# NOTE: the result keeps the name `lt_cv_scores` because the results cell
# further down reads it under that name.
lt_cv_scores = cross_val_score(
    log_reg_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring="accuracy"
)

# Labels use the same spelling and casing as the Decision Tree and
# Random Forest cells so the printed results line up.
print("Logistic Regression CV scores:", lt_cv_scores)
print("Logistic Regression mean CV score:", lt_cv_scores.mean())

Logisitic Regression CV Scores: [0.85119048 0.8452381  0.83928571 0.85628743 0.8742515 ]
Logisitic Regression mean CV Score: 0.8532506415739949


#### 6.2. Decision Tree

In [None]:
# Single decision tree, fitted on the raw (unscaled) features: tree splits
# depend only on feature ordering, so standardization is unnecessary.
# random_state is fixed so repeated runs build the same tree.
dt_model = DecisionTreeClassifier(random_state=42)

# Score the tree with the shared CV splitter on the training data
dt_cv_scores = cross_val_score(
    estimator=dt_model,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=cv,
)

# Report the per-fold accuracies followed by their average
print("Decision Tree CV scores:", dt_cv_scores)
print("Decision Tree mean CV score:", np.mean(dt_cv_scores))

Decision Tree CV scores: [0.81547619 0.80952381 0.78571429 0.75449102 0.78443114]
Decision Tree mean CV score: 0.7899272882805817


#### 6.3. Random Forest

In [35]:
# Random Forest on the raw (unscaled) features:
#   n_estimators=200 -> number of trees in the ensemble
#   n_jobs=-1        -> use all available CPU cores
#   random_state=42  -> reproducible results
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=200)

# Score the forest with the shared CV splitter on the training data
rf_cv_scores = cross_val_score(
    estimator=rf_model,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=cv,
)

# Report the per-fold accuracies followed by their average
print("Random Forest CV scores:", rf_cv_scores)
print("Random Forest mean CV score:", np.mean(rf_cv_scores))

Random Forest CV scores: [0.85119048 0.86904762 0.8452381  0.84431138 0.86227545]
Random Forest mean CV score: 0.8544126033646992


#### 6.4. Results From the Three Models

In [41]:
# Mean cross-validated accuracy of each candidate model, keyed by display name
results = {
    "Logistic Regression": lt_cv_scores.mean(),
    "Decision Tree": dt_cv_scores.mean(),
    "Random Forest": rf_cv_scores.mean(),
}

# One row per model, with the score rounded to 4 decimals for display
results_table = pd.DataFrame(
    [(model_name, round(mean_score, 4)) for model_name, mean_score in results.items()],
    columns=["Model", "Mean CV Accuracy"],
)

# Best model first; the index is reset so the displayed rank starts at 0.
# NOTE(review): only about 18% of rows have "Is Match" == 1 (see the describe()
# output), so always predicting 0 would already score roughly 0.82 accuracy.
# Consider reporting F1 or ROC-AUC next to accuracy.
results_table.sort_values(by="Mean CV Accuracy", ascending=False).reset_index(drop=True)

Unnamed: 0,Model,Mean CV Accuracy
0,Random Forest,0.8544
1,Logistic Regression,0.8533
2,Decision Tree,0.7899


### 7. Test Performance

In [272]:
# Evaluate on test set
# TODO(review): this cell is still disabled. `model` and `X_test_scaled` are
# not defined in the cells shown above. The logistic-regression pipeline does
# its own scaling, so a fitted pipeline would presumably take the raw `X_test`
# — confirm which model is selected (and fit it on X_train) before enabling.
# y_test_pred = model.predict(X_test_scaled)

# print("\n--- Test Performance ---")
# print(confusion_matrix(y_test, y_test_pred))
# print(classification_report(y_test, y_test_pred, digits=3))