In [1]:
# Calling the Necessary Libararies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns
from scipy import stats
from sklearn.ensemble import IsolationForest

warnings.filterwarnings('ignore')
print("Ignoring all the warnings -- True")



In [5]:
label_encoder = LabelEncoder()
for feature in features:
    df[feature] = label_encoder.fit_transform(df[feature].astype(str))

In [6]:
male_data = df[df['Gender'] == 0]
female_data = df[df['Gender'] == 1]

In [9]:
X_male = male_data.drop(columns=[target])
y_male = male_data[target]
X_female = female_data.drop(columns=[target])
y_female = female_data[target]

X_train_male, X_test_male, y_train_male, y_test_male = train_test_split(X_male, y_male, test_size=0.10)
X_train_female, X_test_female, y_train_female, y_test_female = train_test_split(X_female, y_female, test_size=0.10)

In [10]:
# Loading the dataset once again
data = df

# Checking the status for normalization before applying the MinMaxScaler technique
def is_normalized(data, tolerance=1e-6):
    return np.all(np.abs(data.mean(axis=0)) < tolerance) and np.all(np.abs(data.std(axis=0) - 1) < tolerance)

is_male_data_normalized = is_normalized(male_data[features])
is_female_data_normalized = is_normalized(female_data[features])

if is_male_data_normalized:
    print("Male data is already normalized.")
else:
    print("Male data is not normalized.")

if is_female_data_normalized:
    print("Female data is already normalized.")
else:
    print("Female data is not normalized.")

# Applying Min-Max-Scaler for outlier treatment, so that further analysis can be performed smoothly
min_max_scaler = MinMaxScaler()

male_data[features] = min_max_scaler.fit_transform(male_data[features])
female_data[features] = min_max_scaler.fit_transform(female_data[features])

# Applying the PowerTransformer for final normalization with 'yeo-johnson' method, according to the google scholar, it is the best method for data normalisation
power_transformer = PowerTransformer(method='yeo-johnson', standardize=False)

male_data[features] = power_transformer.fit_transform(male_data[features])
female_data[features] = power_transformer.fit_transform(female_data[features])

# Checking the final status for normalization after PowerTransformer
is_male_data_normalized = is_normalized(male_data[features])
is_female_data_normalized = is_normalized(female_data[features])

if is_male_data_normalized:
    print("Male data is normalized after PowerTransformer.")
else:
    print("Male data is not normalized after PowerTransformer.")
    print('This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.')

if is_female_data_normalized:
    print("Female data is normalized after PowerTransformer.")
else:
    print("Female data is not normalized after PowerTransformer.")
    print('This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.')

Male data is not normalized.
Female data is not normalized.
Male data is not normalized after PowerTransformer.
This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.
Female data is not normalized after PowerTransformer.
This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.


In [11]:
# Loading the dataset once again
data = data

# Using the Label encode categorical features once again
label_encoder = LabelEncoder()
for feature in features:
    male_data[feature] = label_encoder.fit_transform(male_data[feature])
    female_data[feature] = label_encoder.fit_transform(female_data[feature])

# Extracting the relevent features and target for male and female dset for further analysis
X_train_male = male_data[features].values
y_train_male = male_data[target].values
X_train_female = female_data[features].values
y_train_female = female_data[target].values

# Creating a function to calculate FPR and FNR for further analysis
def calculate_fpr_fnr(y_true, y_pred, group_label):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_negatives = tn + fp
    total_positives = fn + tp

    fpr = fp / total_negatives
    fnr = fn / total_positives

    print(f"FPR for {group_label}: {fpr}")
    print(f"FNR for {group_label}: {fnr}")
    return fpr, fnr

# Checking the normalization status after implementing Isolation Forest
def is_normalized(data, tolerance=1e-6):
    return np.all(np.abs(data.mean(axis=0)) < tolerance) and np.all(np.abs(data.std(axis=0) - 1) < tolerance)

if is_male_data_normalized:
    print("Male data is normalized after Isolation Forest and PowerTransformer.")
else:
    print("Male data is not normalized after Isolation Forest and PowerTransformer.")
    print('This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.')

if is_female_data_normalized:
    print("Female data is normalized after Isolation Forest and PowerTransformer.")
else:
    print("Female data is not normalized after Isolation Forest and PowerTransformer.")
    print('This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.')

Male data is not normalized after Isolation Forest and PowerTransformer.
This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.
Female data is not normalized after Isolation Forest and PowerTransformer.
This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.


In [12]:
# Initialising the Isolation Forest instance
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fitting the Isolation Forest to the male dset
iso_forest.fit(X_train_male)

# Predicting the outliers for male dset
outliers_male = iso_forest.predict(X_train_male)

# Filtering the outliers from the train environment
X_train_male = X_train_male[outliers_male != -1]
y_train_male = y_train_male[outliers_male != -1]

# Fitting the Isolation Forest to the female dset
iso_forest.fit(X_train_female)

# Predicting the outliers for female dset
outliers_female = iso_forest.predict(X_train_female)

# Filtering the outliers in train environment
X_train_female = X_train_female[outliers_female != -1]
y_train_female = y_train_female[outliers_female != -1]

# Applying the PowerTransformer for data normalization process
power_transformer = PowerTransformer()
X_train_male = power_transformer.fit_transform(X_train_male)
X_train_female = power_transformer.fit_transform(X_train_female)

# Checking the normalization status after implementing Isolation Forest and PowerTransformer all together
is_male_data_normalized = is_normalized(X_train_male)
is_female_data_normalized = is_normalized(X_train_female)

if is_male_data_normalized:
    print("Male data is normalized after Isolation Forest and PowerTransformer.")
else:
    print("Male data is not normalized after Isolation Forest and PowerTransformer.")
    print('This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.')

if is_female_data_normalized:
    print("Female data is normalized after Isolation Forest and PowerTransformer.")
else:
    print("Female data is not normalized after Isolation Forest and PowerTransformer.")
    print('This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.')

Male data is not normalized after Isolation Forest and PowerTransformer.
This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.
Female data is not normalized after Isolation Forest and PowerTransformer.
This issue can occur when there is no variation (i.e., the standard deviation is zero) in one or more of the columns.


In [13]:
# Function to calculate FPR and FNR for further analysis
def calculate_fpr_fnr(y_true, y_pred, group_label):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_negatives = tn + fp
    total_positives = fn + tp

    fpr = fp / total_negatives
    fnr = fn / total_positives

    print(f"FPR for {group_label}: {fpr}")
    print(f"FNR for {group_label}: {fnr}")
    return fpr, fnr

# Loading the entire dataset once again
data = data

for feature in features:
    male_data[feature] = label_encoder.fit_transform(male_data[feature])
    female_data[feature] = label_encoder.fit_transform(female_data[feature])

# Splitting the data into training and testing sets (90% training, 10% testing)
X_train_male, X_test_male, y_train_male, y_test_male = train_test_split(male_data[features], male_data[target], test_size=0.10, random_state=42)
X_train_female, X_test_female, y_train_female, y_test_female = train_test_split(female_data[features], female_data[target], test_size=0.10, random_state=42)

# Initialising the Isolation Forest for outlier detection
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fitting the Isolation Forest to the male data for further analysis
iso_forest.fit(X_train_male)

# Predicting the Outliers for the male dset
outliers_male = iso_forest.predict(X_train_male)

# Filtering the outliers from the training dset
X_train_male = X_train_male[outliers_male != -1]
y_train_male = y_train_male[outliers_male != -1]

# Fitting the Isolation Forest algorithm to the female data for further analysis
iso_forest.fit(X_train_female)

# Predicting outliers for female data for further analysis
outliers_female = iso_forest.predict(X_train_female)

# Filtering the outliers form the traiing data
X_train_female = X_train_female[outliers_female != -1]
y_train_female = y_train_female[outliers_female != -1]

# Applying PowerTransformer to normalize the data
power_transformer = PowerTransformer()
X_train_male = power_transformer.fit_transform(X_train_male)
X_train_female = power_transformer.fit_transform(X_train_female)

# Model training (SVC, Random Forest Classifier, Gradient Boosting Classifier) with default parameters for Males because as mentioend in the report that default showed the best accuracy
svm_model_male = SVC()
rf_model_male = RandomForestClassifier()
gb_model_male = GradientBoostingClassifier()

svm_model_male.fit(X_train_male, y_train_male)
rf_model_male.fit(X_train_male, y_train_male)
gb_model_male.fit(X_train_male, y_train_male)

# Predicting on the test set for Males for furhter analysis
svm_male_pred = svm_model_male.predict(X_test_male)
rf_male_pred = rf_model_male.predict(X_test_male)
gb_male_pred = gb_model_male.predict(X_test_male)

# Calculating the accuracy for Males
svm_male_accuracy = accuracy_score(y_test_male, svm_male_pred)
rf_male_accuracy = accuracy_score(y_test_male, rf_male_pred)
gb_male_accuracy = accuracy_score(y_test_male, gb_male_pred)

# Model training (SVC, Random Forest Classifier, Gradient Boosting Classifier) with default parameters for Females (As explained in the report default parameters were the best)
svm_model_female = SVC()
rf_model_female = RandomForestClassifier()
gb_model_female = GradientBoostingClassifier()

svm_model_female.fit(X_train_female, y_train_female)
rf_model_female.fit(X_train_female, y_train_female)
gb_model_female.fit(X_train_female, y_train_female)

# Predicting on the test set for Females for further analysis
svm_female_pred = svm_model_female.predict(X_test_female)
rf_female_pred = rf_model_female.predict(X_test_female)
gb_female_pred = gb_model_female.predict(X_test_female)

# Calculating the accuracy for Females
svm_female_accuracy = accuracy_score(y_test_female, svm_female_pred)
rf_female_accuracy = accuracy_score(y_test_female, rf_female_pred)
gb_female_accuracy = accuracy_score(y_test_female, gb_female_pred)

# Formating accuracy to percentages with 2 decimal places for better representation
svm_male_accuracy_percentage = round(svm_male_accuracy * 100, 2)
rf_male_accuracy_percentage = round(rf_male_accuracy * 100, 2)
gb_male_accuracy_percentage = round(gb_male_accuracy * 100, 2)

svm_female_accuracy_percentage = round(svm_female_accuracy * 100, 2)
rf_female_accuracy_percentage = round(rf_female_accuracy * 100, 2)
gb_female_accuracy_percentage = round(gb_female_accuracy * 100, 2)

# Calculating FPR (false Positive rate) and FNR (Flase negative Rate) for SVM in both groups
fpr_male_svm, fnr_male_svm = calculate_fpr_fnr(y_test_male, svm_male_pred, "Males")
fpr_female_svm, fnr_female_svm = calculate_fpr_fnr(y_test_female, svm_female_pred, "Females")

# Calculating FPR (false Positive rate) and FNR (Flase negative Rate) for Random Forest in both groups
fpr_male_rf, fnr_male_rf = calculate_fpr_fnr(y_test_male, rf_male_pred, "Males")
fpr_female_rf, fnr_female_rf = calculate_fpr_fnr(y_test_female, rf_female_pred, "Females")

# Calculating the FPR (false Positive rate) and FNR (Flase negative Rate) for Gradient Boosting in both groups
fpr_male_gb, fnr_male_gb = calculate_fpr_fnr(y_test_male, gb_male_pred, "Males")
fpr_female_gb, fnr_female_gb = calculate_fpr_fnr(y_test_female, gb_female_pred, "Females")

# Table 1: Storing Model Accuracies for Males and Females in variable table4
table4 = [["Model", "Males", "Females"],
         ["SVM", svm_male_accuracy_percentage, svm_female_accuracy_percentage],
         ["Random Forest", rf_male_accuracy_percentage, rf_female_accuracy_percentage],
         ["Gradient Boosting", gb_male_accuracy_percentage, gb_female_accuracy_percentage]]

# Printing the 1st table
print('')
print("Table 1: Model Accuracies for Males and Females")
print('')
print(tabulate(table4, headers="firstrow", tablefmt="fancy_grid"))

# Table 2: Storing False Positive Rate (FPR) and False Negative Rate (FNR) for Different Models and Groups all together
table5 = [
    ["Model", "Group", "FPR", "FNR"],
    ["SVM", "Males", round(fpr_male_svm, 2), round(fnr_male_svm, 2)],
    ["SVM", "Females", round(fpr_female_svm, 2), round(fnr_female_svm, 2)],
    ["Random Forest", "Males", round(fpr_male_rf, 2), round(fnr_male_rf, 2)],
    ["Random Forest", "Females", round(fpr_female_rf, 2), round(fnr_female_rf, 2)],
    ["Gradient Boosting", "Males", round(fpr_male_gb, 2), round(fnr_male_gb, 2)],
    ["Gradient Boosting", "Females", round(fpr_female_gb, 2), round(fnr_female_gb, 2)],
]

# Printing the 2nd table
print('')
print("Table 2: False Positive Rate (FPR) and False Negative Rate (FNR) for Different Models and Groups")
print('')
print(tabulate(table5, headers="firstrow", tablefmt="fancy_grid"))

# Calculating the Disparate Mistreatment in both groups for each of the algorithms
disparate_mistreatment_svm = {
    "SVM": abs(fpr_male_svm - fpr_female_svm) + abs(fnr_male_svm - fnr_female_svm)
}

disparate_mistreatment_rf = {
    "Random Forest": abs(fpr_male_rf - fpr_female_rf) + abs(fnr_male_rf - fnr_female_rf)
}

disparate_mistreatment_gb = {
    "Gradient Boosting": abs(fpr_male_gb - fpr_female_gb) + abs(fnr_male_gb - fnr_female_gb)
}

# Table 3: Displaying the Disparate Mistreatment Values for Different Models
table6 = []
table6.append(["Model", "Disparate Mistreatment"])

# Adding the values to the SVM/SVC
table6.append(["SVM", round(disparate_mistreatment_svm['SVM'], 2)])

# Adding the values to Random Forest Classifier Algorithm
table6.append(["Random Forest", round(disparate_mistreatment_rf['Random Forest'], 2)])

# Adding the values to the Gradient Boosting Classifier algorith
table6.append(["Gradient Boosting", round(disparate_mistreatment_gb['Gradient Boosting'], 2)])

# Printing the 3rd table
print('')
print("Table 3: Disparate Mistreatment Values for Different Models")
print('')
print(tabulate(table6, headers="firstrow", tablefmt="fancy_grid"))

FPR for Males: 0.0
FNR for Males: 1.0
FPR for Females: 0.0
FNR for Females: 1.0
FPR for Males: 0.0
FNR for Males: 1.0
FPR for Females: 0.013333333333333334
FNR for Females: 0.8571428571428571
FPR for Males: 0.12
FNR for Males: 1.0
FPR for Females: 0.92
FNR for Females: 0.07142857142857142

Table 1: Model Accuracies for Males and Females

╒═══════════════════╤═════════╤═══════════╕
│ Model             │   Males │   Females │
╞═══════════════════╪═════════╪═══════════╡
│ SVM               │   84.75 │     84.27 │
├───────────────────┼─────────┼───────────┤
│ Random Forest     │   84.75 │     85.39 │
├───────────────────┼─────────┼───────────┤
│ Gradient Boosting │   74.58 │     21.35 │
╘═══════════════════╧═════════╧═══════════╛

Table 2: False Positive Rate (FPR) and False Negative Rate (FNR) for Different Models and Groups

╒═══════════════════╤═════════╤═══════╤═══════╕
│ Model             │ Group   │   FPR │   FNR │
╞═══════════════════╪═════════╪═══════╪═══════╡
│ SVM               │