In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import ttest_ind
from sklearn.preprocessing import LabelEncoder


### Analyzing Bias Towards Women in Banking Data

This notebook analyzes a CSV of banking data to detect potential gender bias in the decisions made about women. It follows these steps:

1. **Preprocessing**: Reads the data, sets 'CustomerID' as the index, and encodes categorical columns.
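2. **Sampling**: Holds out 10% of the records as a test set and creates a copy of it with the genders swapped (males to females and vice versa).
3. **Prediction**: Trains KNN on the remaining 90% of the records and uses it to predict the decision (label) for the gender-swapped test records.
4. **Grouping**: Groups the decisions by gender, combining the recorded decisions of the original test records with the predicted decisions of the gender-swapped records.
5. **T-test**: Compares the mean decision label of the male and female groups; a statistically significant difference indicates potential bias towards males.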


# Parameters

In [2]:
# Number of neighbours consulted by the KNN classifier.
k = 5

# Name of the CSV column that holds the decision label.
label_column = 'Decision'

# Input file: keep exactly one of the two assignments below active.
# Unbiased input:
# csv_path = 'unbiased_input.csv'
# Biased input:
csv_path = 'data_biased_output.csv'


# Pre-process

In [3]:


def preprocess_data(csv_path, label_column):
    """Load the banking CSV and split it into encoded features and labels.

    The file is read with pandas, 'CustomerID' becomes the index, and every
    non-numeric column (including the label column, if it holds text) is
    replaced by the integer codes of its own ``LabelEncoder``.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the CSV file; it must contain a 'CustomerID' column and
        the column named by ``label_column``.
    label_column : str
        Name of the column holding the decision label.

    Returns
    -------
    X : pandas.DataFrame
        All columns except ``label_column``, indexed by 'CustomerID'.
    y : pandas.Series
        The (possibly encoded) ``label_column``.
    label_encoders : dict
        Maps each encoded column name to its fitted ``LabelEncoder`` so the
        integer codes can be translated back to the original values.

    Raises
    ------
    KeyError
        If 'CustomerID' or ``label_column`` is missing from the file.
    """
    # Step 1: Read CSV into DataFrame
    df = pd.read_csv(csv_path)

    # Step 2: Set 'CustomerID' as index
    df = df.set_index('CustomerID')

    # Step 3: Encode categorical text columns.
    # Testing for "not numeric" instead of dtype == 'object' also catches the
    # dedicated string and categorical dtypes, which would otherwise slip
    # through unencoded.
    label_encoders = {}
    for column in df.columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            label_encoders[column] = encoder

    # Step 4: Separate label and features
    y = df[label_column]
    X = df.drop(columns=[label_column])

    return X, y, label_encoders
    

# Load the configured CSV and split it into features, labels and fitted encoders.
X, y, label_encoders = preprocess_data(csv_path, label_column)


# Knn algorithm

Train KNN, hold out the test data, swap the gender in the test data and predict the new labels.
Then divide the decision labels into separate distributions for males and females.

In [4]:
def knn_algorithm(X, y, label_encoders, label_column, k, test_size=0.1, random_state=42):
    """Build male/female decision samples from a gender-swapped KNN experiment.

    A hold-out test set is split off, a KNN classifier is fitted on the
    remaining rows, and the classifier predicts decisions for a copy of the
    test set whose 'Gender' code has been flipped (0 <-> 1). Each returned
    group concatenates the recorded decisions of the test rows that have that
    gender with the predicted decisions of the rows flipped into that gender.

    The function treats 'Gender' code 1 as male and code 0 as female; rows
    with any other code end up in neither group. NOTE(review): this matches
    an alphabetical encoding such as 'Female' -> 0, 'Male' -> 1 -- confirm
    against the input data.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing a 'Gender' column of 0/1 codes.
    y : pandas.Series
        Decision labels aligned with ``X``.
    label_encoders : dict
        Unused; kept so existing callers keep working.
    label_column : str
        Unused; kept so existing callers keep working.
    k : int
        Number of neighbours for ``KNeighborsClassifier``.
    test_size : float or int, optional
        Share (or count) of rows held out for the experiment; passed to
        ``train_test_split``. Defaults to 0.1.
    random_state : int or None, optional
        Seed for the train/test split. Defaults to 42.

    Returns
    -------
    y_male : numpy.ndarray
        Decisions for the male group (recorded followed by predicted).
    y_female : numpy.ndarray
        Decisions for the female group (recorded followed by predicted).
    """
    # Step 1: Hold out part of the data for testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Step 2: Reverse the 'Gender' column in a copy of the test set
    X_test_modified = X_test.copy()
    X_test_modified['Gender'] = 1 - X_test_modified['Gender']

    # Step 3: Fit KNN on the untouched training rows
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Step 4: Predict the modified test set
    y_modified_pred = knn.predict(X_test_modified)

    # Step 5: Collect the decision distribution per gender. Plain NumPy masks
    # keep the selection purely positional, so the pandas index never takes
    # part in filtering the prediction array.
    original_gender = X_test['Gender'].to_numpy()
    swapped_gender = X_test_modified['Gender'].to_numpy()
    recorded = y_test.to_numpy()

    y_male = np.concatenate(
        (recorded[original_gender == 1], y_modified_pred[swapped_gender == 1])
    )
    y_female = np.concatenate(
        (recorded[original_gender == 0], y_modified_pred[swapped_gender == 0])
    )
    return y_male, y_female


# Run the gender-swap experiment and collect the decision samples per gender.
y_male, y_female = knn_algorithm(X, y, label_encoders, label_column, k)


# Analyze Results and perform T-test

In [5]:

# Average decision label of each gender group.
mean_male, mean_female = np.mean(y_male), np.mean(y_female)
print(f"male mean:  {mean_male:.3f}", f"female mean: {mean_female:.3f}", sep="\n")

# Two-sample t-test comparing the decision labels of the two groups.
t_stat, p_value = ttest_ind(y_male, y_female)
print(f"T-statistic: {t_stat:.3f}, P-value: {p_value:.3f}")


male mean:  0.454
female mean: 0.300
T-statistic: 2.582, P-value: 0.010
