Test of which variables to remove from dataset

## Part 1: Introduction

### 1.1. Import libraries

In [14]:
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

### 1.2. Load Dataset

In [15]:
# Load the training data; the relative path means train.csv must be in the working directory
df = pd.read_csv("train.csv")

## Part 2: Data Preparation

In [16]:
# Show the shape of the freshly loaded dataframe as (rows, columns)
df.shape

(1048, 65)

In [17]:
# Remove rows where the target column "is_match" is missing
# (missing values in any other column are not handled here)
df = df.dropna(subset=["is_match"])

In [18]:
# Make the column names readable: replace underscores with spaces and capitalise each word
df.columns = df.columns.map(lambda name: name.replace("_", " ").title())

In [19]:
# Convert categorical variables to numerical dummy variables using one-hot encoding.
# drop_first=True omits the first level of every encoded variable, so a row whose
# dummies for a variable are all False belongs to that omitted level.
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,Is Dater Male,Dater Age,Dated Age,Age Difference,Are Same Race,Same Race Importance For Dater,Same Religion Importance For Dater,Attractiveness Importance For Dated,Sincerity Importance For Dated,Intelligence Importance For Dated,...,Dated Wants To Date,Is Match,Dater Race_'Black/African American',Dater Race_'Latino/Hispanic American',Dater Race_caucasian,Dater Race_other,Dated Race_'Black/African American',Dated Race_'Latino/Hispanic American',Dated Race_caucasian,Dated Race_other
0,False,21,27,6,False,2.0,4.0,35.0,20.0,20.0,...,False,0,False,False,False,False,False,False,True,False
1,False,21,22,1,False,2.0,4.0,60.0,0.0,0.0,...,False,0,False,False,False,False,False,False,True,False
2,False,21,23,2,False,2.0,4.0,30.0,5.0,15.0,...,True,1,False,False,False,False,False,False,True,False
3,False,21,24,3,False,2.0,4.0,30.0,10.0,20.0,...,True,1,False,False,False,False,False,True,False,False
4,False,21,25,4,False,2.0,4.0,50.0,0.0,30.0,...,True,0,False,False,False,False,False,False,True,False


In [20]:
# Show the shape of the dataframe after one-hot encoding (the dummy columns change the column count)
df.shape

(1048, 71)

In [21]:
# Define K-fold cross-validation: 5 stratified folds, shuffled with a fixed seed
# so the splits are reproducible between runs
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

## Part 3: Modelling: testing which variables to remove from the dataset

In [22]:
# 8 fairness scenarios: every include/exclude combination of gender, race and religion.
# A True flag keeps that attribute's columns as features; a False flag removes them.
fairness_scenarios = {
    "1: include gender, race, religion":         {"gender": True,  "race": True,  "religion": True},
    "2: include gender, race, exclude religion": {"gender": True,  "race": True,  "religion": False},
    "3: include gender, exclude race, religion": {"gender": True,  "race": False, "religion": False},
    "4: include gender, religion, exclude race": {"gender": True,  "race": False, "religion": True},
    "5: exclude gender, race, religion":         {"gender": False, "race": False, "religion": False},
    "6: include race, religion, exclude gender": {"gender": False, "race": True,  "religion": True},
    "7: include race, exclude gender, religion": {"gender": False, "race": True,  "religion": False},
    "8: include religion, exclude gender, race": {"gender": False, "race": False, "religion": True},
}

In [23]:
# Function to prepare data for a given fairness scenario
def prepare_data_for_scenario(df, scenario):
    """Return a copy of ``df`` limited to the columns a fairness scenario allows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    scenario : dict
        Mapping with the keys "gender", "race" and "religion". A falsy value
        removes every column whose name contains one of that attribute's
        keywords (case-sensitive substring match).

    Returns
    -------
    pandas.DataFrame
        The filtered copy. All other columns, including the target, are kept.

    Raises
    ------
    KeyError
        If ``scenario`` lacks one of the three keys.
    """
    # Columns that directly indicate dating preferences are always removed;
    # errors="ignore" tolerates frames that do not contain them.
    prepared = df.copy().drop(
        columns=["Dater Wants To Date", "Dated Wants To Date"],
        errors="ignore",
    )

    # Scenario flag -> substrings that identify the attribute's columns.
    attribute_keywords = (
        ("gender", ("Gender", "Male", "Female")),
        ("race", ("Race",)),
        ("religion", ("Religion",)),
    )

    for flag, keywords in attribute_keywords:
        if scenario[flag]:
            continue  # attribute is included in this scenario
        matching = [
            name
            for name in prepared.columns
            if any(keyword in name for keyword in keywords)
        ]
        prepared = prepared.drop(columns=matching, errors="ignore")

    return prepared

In [24]:
# Function to evaluate model using cross-validation and return mean F1 score
def evaluate_model_f1(model, X, y, cv):
    """Cross-validate ``model`` on ``(X, y)`` and return the mean F1 score.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline passed to ``cross_val_score``.
    X, y : features and target
        Forwarded unchanged to ``cross_val_score``.
    cv : cross-validation splitter
        Defines the folds the model is scored on.

    Returns
    -------
    float
        Mean of the per-fold scores obtained with ``scoring="f1"``.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring="f1")
    return fold_scores.mean()

In [25]:
# Define models to evaluate
models = {
    # Reference point: always predicts the most frequent class
    "Baseline": DummyClassifier(strategy="most_frequent"),

    # Scaling lives inside the pipeline so it is fitted on the training folds only
    "Logistic Regression": Pipeline([
        ("scaler", StandardScaler()),
        ("log_reg", LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            C=1,
            penalty="l1",
            solver="liblinear",
            # liblinear shuffles the data internally; seed it like the other
            # models so repeated runs give the same scores
            random_state=42
        ))
    ]),

    "Decision Tree": DecisionTreeClassifier(
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=1,
        random_state=42
    ),

    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ),

    # KNN is distance-based, so features are standardised first
    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(
            n_neighbors=15,
            weights="distance",
            p=2
        ))
    ])
}

In [26]:
# Evaluate models across fairness scenarios
results = []

for scenario_name, scenario in fairness_scenarios.items():
    # Build the feature set this scenario allows, then split off the target
    df_scenario = prepare_data_for_scenario(df, scenario)
    y_s = df_scenario["Is Match"]
    X_s = df_scenario.drop(columns=["Is Match"])

    # One result row per (scenario, model) pair
    for model_name, model in models.items():
        f1 = evaluate_model_f1(model, X_s, y_s, cv)
        row = {"Scenario": scenario_name, "Model": model_name, "F1-score": round(f1, 4)}
        results.append(row)

# Reshape the long result list into one row per scenario and one column per model
results_df = pd.DataFrame(results)
pivot_table = results_df.pivot(index="Scenario", columns="Model", values="F1-score")

pivot_table

Model,Baseline,Decision Tree,KNN,Logistic Regression,Random Forest
Scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"1: include gender, race, religion",0.0,0.3692,0.167,0.5032,0.3878
"2: include gender, race, exclude religion",0.0,0.4055,0.1582,0.4997,0.3744
"3: include gender, exclude race, religion",0.0,0.4581,0.2408,0.5233,0.3918
"4: include gender, religion, exclude race",0.0,0.4303,0.2454,0.5212,0.3816
"5: exclude gender, race, religion",0.0,0.4538,0.2456,0.5213,0.3879
"6: include race, religion, exclude gender",0.0,0.3802,0.1652,0.5012,0.3763
"7: include race, exclude gender, religion",0.0,0.397,0.1646,0.4975,0.3839
"8: include religion, exclude gender, race",0.0,0.4323,0.2402,0.528,0.3664
