## Part 1: Introduction

### 1.1. Import libraries

In [None]:
import pandas as pd
import numpy as np 

# Sklearn modules for data splitting, preprocessing, model building and evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

# Models to be used for classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Visualization
import matplotlib.pyplot as plt
from sklearn import tree

# Evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix

### 1.2. Load Dataset

In [None]:
# Load the raw training data; the relative path depends on which notebook runs this cell.
# df = pd.read_csv("../data/raw/train.csv") # path to the data file when this runs from IdaData.ipynb
df = pd.read_csv("../../data/raw/train.csv") # path to the data file when this runs from data.ipynb

## Part 2: Data Preparation

### 2.1. Data Description and Inspection

In [None]:
# Show the dataframe dimensions as (rows, columns); as the last expression
# in the cell the value is displayed by the notebook
df.shape

In [None]:
# Show the first rows of the dataframe (head() returns 5 rows by default)
df.head()

In [None]:
# Show the data type of each column
df.dtypes

### 2.2. Data Preprocessing and Cleaning 

In [None]:
# Remove only the rows whose target ("is_match") is missing;
# missing values in any other column are left untouched by this step
df = df.dropna(subset=["is_match"])

In [None]:
# Make the column names human-readable: replace underscores with spaces
# and title-case every word (e.g. "is_match" becomes "Is Match")
df.columns = [col.replace('_', ' ').title() for col in df.columns]

In [None]:
# Print all column names as a plain list to verify the renaming
print(df.columns.tolist())

In [None]:
# Snapshot the current column names so the removed ones can be reported below
cols_before = set(df.columns)

# Discard the explicitly listed columns. errors="ignore" skips any name that is
# not present in the dataframe instead of raising.
# NOTE(review): the two "Wants To Date" columns presumably leak the target — confirm.
df = df.drop(
    columns=[
        "Dated Wants To Date",
        "Dater Wants To Date",
        "Same Race Importance For Dater",
        "Same Religion Importance For Dater",
    ],
    errors="ignore",
)

# Discard every remaining column whose name contains the substring "Race"
# (case-sensitive match on the title-cased names)
race_cols = [name for name in df.columns if "Race" in name]
df = df.drop(columns=race_cols, errors="ignore")

# The set difference between the old and new column names is what was removed
cols_after = set(df.columns)
removed_cols = cols_before.difference(cols_after)

# Report the removed columns for verification
print("Removed columns:", removed_cols)

In [None]:
# One-hot encode the categorical columns into numeric dummy columns.
# drop_first=True drops the first level of each encoded column, so a column
# with k categories becomes k-1 indicator columns.
df = pd.get_dummies(df, drop_first=True)
# Preview the encoded dataframe
df.head()

In [None]:
# Show the dataframe dimensions (rows, columns) after cleaning and encoding
df.shape

In [None]:
# Show summary statistics of the dataframe columns
df.describe()

### 2.3. Split Features and Target Variable

In [None]:
# Separate the target from the features.
# "Is Match" is the target: binary (0 or 1), indicating whether there was a match.
y = df["Is Match"]
X = df.drop(columns="Is Match")

# Hold out 20% of the rows as a test set. stratify=y keeps the class
# proportions of the target the same in both parts; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# Shared cross-validation splitter: 5 stratified, shuffled folds with a fixed
# seed. The model cells below pass this same 'cv' object when evaluating on
# the training data, so every model is scored on identical folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Part 3: Modelling

### 3.1. Baseline Model

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Baseline: a classifier that ignores the features and always predicts the
# most frequent class of y_train. Fitted on the original training data.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Baseline predictions for the held-out test set
y_pred_dummy = dummy_clf.predict(X_test)

# Test-set metrics. zero_division=0 makes a metric evaluate to 0 (instead of
# warning) when its denominator is zero, e.g. when a class is never predicted.
accuracy_dummy = accuracy_score(y_test, y_pred_dummy)
precision_dummy, recall_dummy, f1_dummy = (
    metric(y_test, y_pred_dummy, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

print("=== DummyClassifier (Most Frequent) ===")
print(f"Accuracy:  {accuracy_dummy:.3f}")
print(f"Precision: {precision_dummy:.3f}")
print(f"Recall:    {recall_dummy:.3f}")
print(f"F1-score:  {f1_dummy:.3f}\n")

print("Classification report:")
print(classification_report(y_test, y_pred_dummy, zero_division=0))

# Confusion matrix: printed as raw counts, then drawn as an annotated heatmap
cm_dummy = confusion_matrix(y_test, y_pred_dummy)
print("Confusion matrix:")
print(cm_dummy)

sns.heatmap(cm_dummy, annot=True, fmt="d")
plt.title("Dummy Classifier – Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

### 3.2. Logistic Regression
#### 3.2.1. Logistic Regression with Cross-Validation

In [None]:
# Pipeline: standardise the features, then fit logistic regression.
# Keeping the scaler inside the pipeline means it is re-fitted on the training
# part of every CV fold, so no statistics leak from the validation part.
log_reg_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("log_reg", LogisticRegression(max_iter=1000))
])

# Cross-validated accuracy on the training data, using the shared 'cv' folds
lt_cv_scores = cross_val_score(
    log_reg_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring="accuracy"
)

# Print the per-fold scores and their mean
# (labels previously misspelled the model name as "Logisitic")
print("Logistic Regression CV Scores:", lt_cv_scores)
print("Logistic Regression mean CV Score:", lt_cv_scores.mean())

#### 3.2.2. Fine-Tune Logistic Regression with GridSearchCV

In [None]:
# Search space for the "log_reg" step of the pipeline (the "log_reg__" prefix
# routes each parameter to that step): 6 * 2 * 2 * 2 = 48 combinations.
param_grid = {
    "log_reg__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "log_reg__penalty": ["l1", "l2"],
    "log_reg__solver": ["liblinear", "saga"],
    "log_reg__class_weight": [None, "balanced"],
}

# Exhaustive search over the grid, scored by accuracy on the shared 'cv' folds;
# n_jobs=-1 uses all available CPU cores
grid = GridSearchCV(
    log_reg_pipeline,
    param_grid,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
)

# Run the search on the training data only
grid.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters:", grid.best_params_)
print("Best CV score:", grid.best_score_)

### 3.3. Decision Tree

In [None]:
# Plain decision tree with default settings and a fixed seed.
# No scaler is used: tree models are scale-invariant.
dt_model = DecisionTreeClassifier(random_state=42)

# Cross-validated accuracy on the training data, using the shared 'cv' folds
dt_cv_scores = cross_val_score(dt_model, X_train, y_train, cv=cv, scoring="accuracy")

# Per-fold scores and their mean
print("Decision Tree CV scores:", dt_cv_scores)
print("Decision Tree mean CV score:", dt_cv_scores.mean())

##### 3.3.1. Reduced Decision Tree for Visualization

In [None]:
# Shallow tree (at most 3 levels) that stays small enough to be drawn legibly
dtree_small = DecisionTreeClassifier(max_depth=3, random_state=42)

# Fit the shallow tree on the full training data
dtree_small.fit(X_train, y_train)

In [None]:
# Draw the shallow decision tree on a 13x8 inch figure
plt.figure(figsize=(13, 8))

# filled=True colours the nodes; feature_names labels the split conditions.
# NOTE(review): class_names are applied in ascending order of the class labels,
# so this assumes 0 = unsuccessful and 1 = successful — confirm.
tree.plot_tree(
    dtree_small,
    feature_names=X_train.columns.tolist(),
    class_names=["Unsuccessful Match", "Successful Match"],
    filled=True,
    fontsize=10,
)

# Write the figure to disk before showing it
plt.savefig("reduced_tree.png", bbox_inches="tight")

# Render the figure in the notebook
plt.show()

### 3.4. Random Forest

In [None]:
# Random forest of 200 trees with a fixed seed; no scaling step is used.
# n_jobs=-1 builds the trees on all available CPU cores.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Cross-validated accuracy on the training data, using the shared 'cv' folds
rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=cv, scoring="accuracy")

# Per-fold scores and their mean
print("Random Forest CV scores:", rf_cv_scores)
print("Random Forest mean CV score:", rf_cv_scores.mean())

#### 3.4.1. Random Forest Tuning


In [None]:

# Search space for the random forest: 2 * 4 * 3 * 3 * 2 * 1 = 144 combinations
rf_param_grid = {
    "n_estimators": [200, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True],
}

# Exhaustive search scored by accuracy on the shared 'cv' folds.
# return_train_score=True also records training-fold scores in cv_results_;
# verbose=2 prints progress for each fit.
# NOTE(review): rf_model itself was created with n_jobs=-1, so parallelism is
# requested at both levels — confirm this does not oversubscribe the CPU.
rf_grid = GridSearchCV(
    rf_model,
    rf_param_grid,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

# Run the search on the training data only
rf_grid.fit(X_train, y_train)

print("RF best params:", rf_grid.best_params_)
print("RF best CV accuracy:", rf_grid.best_score_)

### 3.5. K-Nearest Neighbors

In [None]:
# Pipeline: standardise the features, then classify with k-nearest neighbours.
# 15 neighbours, votes weighted by inverse distance, Euclidean metric (p=2).
knn_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=15, weights="distance", p=2)),
])

# Cross-validated accuracy on the training data, using the shared 'cv' folds
knn_cv_scores = cross_val_score(knn_pipeline, X_train, y_train, cv=cv, scoring="accuracy")

# Per-fold scores and their mean
print("KNN CV scores (scaled):", knn_cv_scores)
print("KNN mean CV score (scaled):", knn_cv_scores.mean())


#### 3.5.1. KNN Tuning


In [None]:
# Search space for the KNN pipeline: 3 * 9 * 2 * 2 = 108 combinations.
# The "scaler" entry swaps the whole scaling step; "passthrough" disables it.
# NOTE: this rebinds 'param_grid', replacing the logistic regression grid
# defined in the logistic regression tuning cell.
param_grid = {
    "scaler": [StandardScaler(), RobustScaler(), "passthrough"],
    "knn__n_neighbors": [3, 6, 9, 12, 15, 19, 24, 48, 96],
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
}

# Exhaustive search scored by accuracy on the shared 'cv' folds.
# return_train_score=True also records training-fold scores in cv_results_;
# verbose=3 prints detailed progress for each fit.
KNN_grid = GridSearchCV(
    knn_pipeline,
    param_grid,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
    verbose=3,
)

# Run the search on the training data only
KNN_grid.fit(X_train, y_train)

print("Best params:", KNN_grid.best_params_)
print("Best CV score:", KNN_grid.best_score_)

## Part 4: Results and Evaluation
### 4.1. Results From the Models

In [None]:
# Collect the mean cross-validated accuracy of the four models.
# The KNN entry reads from KNN_grid, the fitted KNN grid search object; the
# previous name (CV_grid_search) is never defined and raised a NameError.
# NOTE(review): the KNN figure is the best score of a tuned grid search, while
# the other three are scores of the untuned models — confirm this mix is
# intended before comparing them directly.
results = {
    "Logistic Regression": lt_cv_scores.mean(),
    "Decision Tree": dt_cv_scores.mean(),
    "Random Forest": rf_cv_scores.mean(),
    "KNN": KNN_grid.best_score_,
}

# Convert to a two-column table with accuracies rounded to 4 decimals
results_table = pd.DataFrame({
    "Model": list(results.keys()),
    "CV Accuracy": [round(v, 4) for v in results.values()]
})

# Display the results, best accuracy first, with a fresh 0..n-1 index
results_table.sort_values("CV Accuracy", ascending=False).reset_index(drop=True)

### 4.2. Test Performance