In [1]:
# Calling the Necessary Libararies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns
from scipy import stats
from sklearn.ensemble import IsolationForest

warnings.filterwarnings('ignore')
print("Ignoring all the warnings -- True")



In [6]:
label_encoder = LabelEncoder()
for feature in features:
    df[feature] = label_encoder.fit_transform(df[feature].astype(str))

In [7]:
male_data = df[df['gender'] == 0]
female_data = df[df['gender'] == 1]

In [12]:
X_male = male_data.drop(columns=[target])
y_male = male_data[target]
X_female = female_data.drop(columns=[target])
y_female = female_data[target]

X_train_male, X_test_male, y_train_male, y_test_male = train_test_split(X_male, y_male, test_size=0.10)
X_train_female, X_test_female, y_train_female, y_test_female = train_test_split(X_female, y_female, test_size=0.10)

In [13]:
svm_model = SVC()
rf_model = RandomForestClassifier()
gb_model = GradientBoostingClassifier()

In [14]:
# Initialize models with default hyperparameters
svm_model_male = SVC()
rf_model_male = RandomForestClassifier()
gb_model_male = GradientBoostingClassifier()

In [15]:
# Initialize models with default hyperparameters
svm_model_female = SVC()
rf_model_female = RandomForestClassifier()
gb_model_female = GradientBoostingClassifier()

In [16]:
# Fit models on training data (Male)
svm_model_male.fit(X_train_male, y_train_male)

SVC()

In [17]:
rf_model_male.fit(X_train_male, y_train_male)

RandomForestClassifier()

In [18]:
gb_model_male.fit(X_train_male, y_train_male)

GradientBoostingClassifier()

In [19]:
# Fit models on training data (Male)
svm_model_female.fit(X_train_female, y_train_female)

SVC()

In [20]:
rf_model_female.fit(X_train_female, y_train_female)

RandomForestClassifier()

In [21]:
gb_model_female.fit(X_train_female, y_train_female)

GradientBoostingClassifier()

In [22]:
# Predicting on the test set for Males dataset
svm_male_pred = svm_model_male.predict(X_test_male)
rf_male_pred = rf_model_male.predict(X_test_male)
gb_male_pred = gb_model_male.predict(X_test_male)

# Predicting on the test set for Females dataset
svm_female_pred = svm_model_female.predict(X_test_female)
rf_female_pred = rf_model_female.predict(X_test_female)
gb_female_pred = gb_model_female.predict(X_test_female)

In [23]:
# Calculating accuracy for Males dset
svm_male_accuracy = accuracy_score(y_test_male, svm_male_pred)
rf_male_accuracy = accuracy_score(y_test_male, rf_male_pred)
gb_male_accuracy = accuracy_score(y_test_male, gb_male_pred)

# Calculating accuracy for Females dset
svm_female_accuracy = accuracy_score(y_test_female, svm_female_pred)
rf_female_accuracy = accuracy_score(y_test_female, rf_female_pred)
gb_female_accuracy = accuracy_score(y_test_female, gb_female_pred)

In [24]:
# Calculating and formatting accuracy for Males dset
svm_male_accuracy_percentage = round(svm_male_accuracy * 100, 2)
rf_male_accuracy_percentage = round(rf_male_accuracy * 100, 2)
gb_male_accuracy_percentage = round(gb_male_accuracy * 100, 2)

# Calculating and formatting accuracy for Females dset
svm_female_accuracy_percentage = round(svm_female_accuracy * 100, 2)
rf_female_accuracy_percentage = round(rf_female_accuracy * 100, 2)
gb_female_accuracy_percentage = round(gb_female_accuracy * 100, 2)

# Creating a table for comparison for better understanding
table = [["Model", "Males", "Females"],
         ["SVM", svm_male_accuracy_percentage, svm_female_accuracy_percentage],
         ["Random Forest", rf_male_accuracy_percentage, rf_female_accuracy_percentage],
         ["Gradient Boosting", gb_male_accuracy_percentage, gb_female_accuracy_percentage]]

In [25]:
# Printing the table for showing the results
print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))

╒═══════════════════╤═════════╤═══════════╕
│ Model             │   Males │   Females │
╞═══════════════════╪═════════╪═══════════╡
│ SVM               │   84.94 │     83.58 │
├───────────────────┼─────────┼───────────┤
│ Random Forest     │   93.17 │     92.78 │
├───────────────────┼─────────┼───────────┤
│ Gradient Boosting │   94.09 │     93.42 │
╘═══════════════════╧═════════╧═══════════╛


In [26]:
# Function to calculate FPR and FNR for further analysis
def calculate_fpr_fnr(y_true, y_pred, group_label):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_negatives = tn + fp
    total_positives = fn + tp

    fpr = fp / total_negatives
    fnr = fn / total_positives

    print(f"FPR for {group_label}: {fpr}")
    print(f"FNR for {group_label}: {fnr}")
    return fpr, fnr

In [27]:
# Calculating FPR and FNR for SVC in both groups at once
fpr_male_svm, fnr_male_svm = calculate_fpr_fnr(y_test_male, svm_male_pred, "Males")
fpr_female_svm, fnr_female_svm = calculate_fpr_fnr(y_test_female, svm_female_pred, "Females")

# Calculating FPR and FNR for Random Forest in both groups at once
fpr_male_rf, fnr_male_rf = calculate_fpr_fnr(y_test_male, rf_male_pred, "Males")
fpr_female_rf, fnr_female_rf = calculate_fpr_fnr(y_test_female, rf_female_pred, "Females")

# Calculating FPR and FNR for Gradient Boosting in both groups at once
fpr_male_gb, fnr_male_gb = calculate_fpr_fnr(y_test_male, gb_male_pred, "Males")
fpr_female_gb, fnr_female_gb = calculate_fpr_fnr(y_test_female, gb_female_pred, "Females")

# Creating a table for comparison for better understanding
table = [
    ["Model", "Group", "FPR", "FNR"],
    ["SVM", "Males", round(fpr_male_svm, 2), round(fnr_male_svm, 2)],
    ["SVM", "Females", round(fpr_female_svm, 2), round(fnr_female_svm, 2)],
    ["Random Forest", "Males", round(fpr_male_rf, 2), round(fnr_male_rf, 2)],
    ["Random Forest", "Females", round(fpr_female_rf, 2), round(fnr_female_rf, 2)],
    ["Gradient Boosting", "Males", round(fpr_male_gb, 2), round(fnr_male_gb, 2)],
    ["Gradient Boosting", "Females", round(fpr_female_gb, 2), round(fnr_female_gb, 2)],
]

FPR for Males: 0.2648809523809524
FNR for Males: 0.04806408544726302
FPR for Females: 0.27244094488188975
FNR for Females: 0.07583547557840617
FPR for Males: 0.09523809523809523
FNR for Males: 0.044058744993324434
FPR for Females: 0.08976377952755905
FNR for Females: 0.05784061696658098
FPR for Males: 0.07291666666666667
FNR for Males: 0.04672897196261682
FPR for Females: 0.06929133858267716
FNR for Females: 0.06298200514138817


In [28]:
# Printing the table for showing the results
print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))

╒═══════════════════╤═════════╤═══════╤═══════╕
│ Model             │ Group   │   FPR │   FNR │
╞═══════════════════╪═════════╪═══════╪═══════╡
│ SVM               │ Males   │  0.26 │  0.05 │
├───────────────────┼─────────┼───────┼───────┤
│ SVM               │ Females │  0.27 │  0.08 │
├───────────────────┼─────────┼───────┼───────┤
│ Random Forest     │ Males   │  0.1  │  0.04 │
├───────────────────┼─────────┼───────┼───────┤
│ Random Forest     │ Females │  0.09 │  0.06 │
├───────────────────┼─────────┼───────┼───────┤
│ Gradient Boosting │ Males   │  0.07 │  0.05 │
├───────────────────┼─────────┼───────┼───────┤
│ Gradient Boosting │ Females │  0.07 │  0.06 │
╘═══════════════════╧═════════╧═══════╧═══════╛


In [29]:
# Calculating the Disparate Mistreatment in both groups for each algorithm at once
disparate_mistreatment_svm = {
    "SVM": abs(fpr_male_svm - fpr_female_svm) + abs(fnr_male_svm - fnr_female_svm)
}

disparate_mistreatment_rf = {
    "Random Forest": abs(fpr_male_rf - fpr_female_rf) + abs(fnr_male_rf - fnr_female_rf)
}

disparate_mistreatment_gb = {
    "Gradient Boosting": abs(fpr_male_gb - fpr_female_gb) + abs(fnr_male_gb - fnr_female_gb)
}

table_data = []
table_data.append(["Model", "Disparate Mistreatment"])

# Adding the values for SVC for better analysis
table_data.append(["SVM", round(disparate_mistreatment_svm['SVM'], 2)])

# Adding the values for Random Forest Classifier for better analysis
table_data.append(["Random Forest", round(disparate_mistreatment_rf['Random Forest'], 2)])

# Adding the values for Gradient Boosting Classifier for better analysis
table_data.append(["Gradient Boosting", round(disparate_mistreatment_gb['Gradient Boosting'], 2)])

In [30]:
# Printing the table for showing the results
table = tabulate(table_data, headers="firstrow", tablefmt="fancy_grid")
print(table)

╒═══════════════════╤══════════════════════════╕
│ Model             │   Disparate Mistreatment │
╞═══════════════════╪══════════════════════════╡
│ SVM               │                     0.04 │
├───────────────────┼──────────────────────────┤
│ Random Forest     │                     0.02 │
├───────────────────┼──────────────────────────┤
│ Gradient Boosting │                     0.02 │
╘═══════════════════╧══════════════════════════╛
