## Part 1: Data preparation

### 1. Import libraries

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

### 2. Load dataset and inspect it

In [19]:
df = pd.read_csv("train.csv")

# Print the shape of the dataframe
df.shape

(1048, 65)

In [None]:
# Print the first few rows of the dataframe
df.head()

Unnamed: 0,is_dater_male,dater_age,dated_age,age_difference,are_same_race,same_race_importance_for_dater,same_religion_importance_for_dater,attractiveness_importance_for_dated,sincerity_importance_for_dated,intelligence_importance_for_dated,...,dated_wants_to_date,is_match,dater_race_'Black/African American',dater_race_'Latino/Hispanic American',dater_race_caucasian,dater_race_other,dated_race_'Black/African American',dated_race_'Latino/Hispanic American',dated_race_caucasian,dated_race_other
0,False,21,27,6,False,2.0,4.0,35.0,20.0,20.0,...,False,0,False,False,False,False,False,False,True,False
1,False,21,22,1,False,2.0,4.0,60.0,0.0,0.0,...,False,0,False,False,False,False,False,False,True,False
2,False,21,23,2,False,2.0,4.0,30.0,5.0,15.0,...,True,1,False,False,False,False,False,False,True,False
3,False,21,24,3,False,2.0,4.0,30.0,10.0,20.0,...,True,1,False,False,False,False,False,True,False,False
4,False,21,25,4,False,2.0,4.0,50.0,0.0,30.0,...,True,0,False,False,False,False,False,False,True,False


In [24]:
# Change the column names to don't use underscores and to use upper first letters
df.columns = [col.replace('_', ' ').title() for col in df.columns]
df.head()

Unnamed: 0,Is Dater Male,Dater Age,Dated Age,Age Difference,Dater Race,Dated Race,Are Same Race,Same Race Importance For Dater,Same Religion Importance For Dater,Attractiveness Importance For Dated,...,Interests Correlation,Expected Satisfaction Of Dater,Expected Number Of Likes Of Dater From 20 People,Expected Number Of Dates For Dater,Dater Liked Dated,Probability Dated Wants To Date,Already Met Before,Dater Wants To Date,Dated Wants To Date,Is Match
0,False,21,27,6,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,35.0,...,0.14,3.0,2,4,7.0,6.0,True,True,False,0
1,False,21,22,1,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,60.0,...,0.54,3.0,2,4,7.0,5.0,True,True,False,0
2,False,21,23,2,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,30.0,...,0.61,3.0,2,4,7.0,6.0,True,True,True,1
3,False,21,24,3,'Asian/Pacific Islander/Asian-American','Latino/Hispanic American',False,2.0,4.0,30.0,...,0.21,3.0,2,4,6.0,6.0,True,True,True,1
4,False,21,25,4,'Asian/Pacific Islander/Asian-American',caucasian,False,2.0,4.0,50.0,...,0.25,3.0,2,4,6.0,5.0,True,False,True,0


In [26]:
# Print the data types of each column
df.dtypes

Is Dater Male                         bool
Dater Age                            int64
Dated Age                            int64
Age Difference                       int64
Dater Race                          object
                                    ...   
Probability Dated Wants To Date    float64
Already Met Before                    bool
Dater Wants To Date                   bool
Dated Wants To Date                   bool
Is Match                             int64
Length: 65, dtype: object

### 3. Data cleaning and preprocessing

In [28]:
# Remove rows with missing values
df = df.dropna(subset=["Is Match"])

# If some columns are text (e.g. race, gender), convert them to numerical dummy variables
df = pd.get_dummies(df, drop_first=True)

In [None]:
# Print the shape of the dataframe after cleaning
df.shape

(1048, 71)

### 4. Split features and target variable

In [6]:

X = df.drop("is_match", axis=1)
y = df["is_match"]

# Først: 70% train, 30% midlertidig (val + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Dernæst: del de 30% i 15% val og 15% test (halv-halv)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 733, Validation: 157, Test: 158


### 5. Scaling

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)


### 6. Logisitic Regression

In [15]:

model = LogisticRegression(
    solver="liblinear",        # robust til små datasæt
    random_state=42,
    max_iter=500
)
model.fit(X_train_scaled, y_train)

### 7. Validation Performance

In [16]:

y_val_pred = model.predict(X_val_scaled)
print("\n--- Validation Performance ---")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=3))


--- Validation Performance ---
[[110   0]
 [  0  24]]
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       110
           1      1.000     1.000     1.000        24

    accuracy                          1.000       134
   macro avg      1.000     1.000     1.000       134
weighted avg      1.000     1.000     1.000       134



### 8. Test Performance

In [17]:

y_test_pred = model.predict(X_test_scaled)
print("\n--- Test Performance ---")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=3))


--- Test Performance ---
[[130   0]
 [  0  28]]
              precision    recall  f1-score   support

           0      1.000     1.000     1.000       130
           1      1.000     1.000     1.000        28

    accuracy                          1.000       158
   macro avg      1.000     1.000     1.000       158
weighted avg      1.000     1.000     1.000       158



### 9. Feature Importance (Koefficienter)