# German Credit Dataset analysis using Aequitas

In [1]:
pip install aequitas

Collecting aequitas
  Using cached aequitas-1.0.0-3-py3-none-any.whl (3.1 MB)
[31mERROR: Package 'aequitas' requires a different Python: 3.7.6 not in '<3.12,>=3.8'[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install aif360

Collecting aif360
  Using cached aif360-0.6.1-py3-none-any.whl.metadata (5.0 kB)
Collecting scipy>=1.2.0 (from aif360)
  Using cached scipy-1.14.1-cp312-cp312-macosx_14_0_x86_64.whl.metadata (60 kB)
Collecting scikit-learn>=1.0 (from aif360)
  Downloading scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn>=1.0->aif360)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn>=1.0->aif360)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading aif360-0.6.1-py3-none-any.whl (259 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.14.1-cp312-cp312-macosx_14_0_x86_64.whl (25.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.6/25.6 MB[0m 

In [3]:
import pandas as pd
import seaborn as sns
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot
import aequitas.plot as ap

# import warnings; warnings.simplefilter('ignore')

%matplotlib inline

In [4]:
## Load the German credit data through the AIF360 dataset API
from aif360.datasets import GermanDataset

german_dataset = GermanDataset()

# Take the feature frame (first item of convert_to_dataframe()), remove the
# dataset's own label column(s), then append the labels as a single 'label' column.
dfgerman = (
    pd.DataFrame(german_dataset.convert_to_dataframe()[0])
    .drop(columns=german_dataset.label_names)
    .assign(label=tuple(german_dataset.labels[:, 0]))
)

dfgerman

IOError: [Errno 2] No such file or directory: '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/aif360/datasets/../data/raw/german/german.data'
To use this class, please download the following files:

	https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data
	https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc

and place them, as-is, in the folder:

	/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/aif360/data/raw/german



SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
## Load the raw German credit data with pandas
## `filepath` must be defined before read_csv is called (it used to exist only
## inside a commented-out line, so this cell failed with NameError on a fresh kernel).
## The default reads the file straight from the UCI repository; to work offline,
## point it at a local copy instead, e.g. filepath = "data/german.data".
filepath = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"

# The file has no header row, so the column names are supplied here.
column_names = ['status', 'month', 'credit_history',
    'purpose', 'credit_amount', 'savings', 'employment',
    'investment_as_income_percentage', 'personal_status',
    'other_debtors', 'residence_since', 'property', 'age',
    'installment_plans', 'housing', 'number_of_credits',
    'skill_level', 'people_liable_for', 'telephone',
    'foreign_worker', 'credit']
# No extra strings are treated as missing values.
na_values = []

# Fields are separated by single spaces.
german_df = pd.read_csv(filepath, sep=' ', header=None, names=column_names,
                        na_values=na_values)
german_df.shape

In [None]:
# Peek at the first five rows of the loaded data
german_df.head(n=5)


## Exploring the German Dataset

In [None]:
## Bucket age into two groups: 25 and over -> 'older', everything else -> 'younger'
german_df['age_mapped'] = german_df['age'].ge(25).map({True: 'older', False: 'younger'})
german_df.head()

In [None]:
# Row counts per age group, split by the 'credit' column
by_age = sns.countplot(data=german_df, x="age_mapped", hue="credit", palette="bright")


In [None]:
# personal_status codes (sex : marital status)
#   A91 : male   : divorced/separated
#   A92 : female : divorced/separated/married
#   A93 : male   : single
#   A94 : male   : married/widowed
#   A95 : female : single
# Any code that is not one of the male codes below is labelled "female".
malecat = ['A91','A93','A94']
german_df['sex_mapped'] = german_df['personal_status'].isin(malecat).map({True: "male", False: "female"})

# Row counts per sex, split by the 'credit' column
by_sex = sns.countplot(data=german_df, x="sex_mapped", hue="credit", palette="bright")

## Using Aequitas to audit the dataset

Before the data can be used with Aequitas it requires preprocessing. The input dataframe must contain `score` and `label_value` columns, and categorical attributes must be strings. For detailed requirements, see https://github.com/dssg/aequitas#input-data

In [None]:
## The original 'credit' label column is renamed to 'label_value'
german_df = german_df.rename({'credit': 'label_value'}, axis='columns')


In [None]:
# Train an SVM on one-hot encoded features, score every row, and report test accuracy.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Fixed seed so the train/test split (and therefore the fitted model, the
# predictions and the reported accuracy) is the same on every run.
RANDOM_STATE = 42

# with_mean=False: scaling without centering (presumably because the one-hot
# encoded matrix is sparse -- confirm against the StandardScaler docs).
svcClf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))

# Features are every column except the target. 'score' is also dropped when it
# exists, so re-running this cell after predictions were stored in german_df
# does not feed the previous predictions back in as a feature.
german_x = german_df.drop(columns=['label_value', 'score'], errors='ignore')
german_y = german_df['label_value']

# Every feature column is one-hot encoded.
enc = OneHotEncoder(handle_unknown='ignore')
german_x = enc.fit_transform(german_x)

german_x_train, german_x_test, german_y_train, german_y_test = train_test_split(
    german_x, german_y, test_size=0.33, random_state=RANDOM_STATE)

svcClf = svcClf.fit(german_x_train, german_y_train)

# NOTE(review): predictions are made for ALL rows, including the training rows.
svc_score = svcClf.predict(german_x)
# Accuracy on the held-out test split (displayed as the cell output).
svcClf.score(german_x_test, german_y_test)

In [None]:
## Store the classifier's predictions in the 'score' column
german_df["score"] = svc_score


In [None]:
# Inspect the column dtypes of german_df
german_df.dtypes


In [None]:
import aequitas.preprocessing 
# preprocess_input_df returns two values; only the first is kept
# (presumably the processed dataframe -- confirm against the Aequitas docs).
processed_german_df, _ = aequitas.preprocessing.preprocess_input_df(german_df)

In [None]:
## Attributes to audit, each mapped to its reference group
attributes_and_reference_groups = dict(age_mapped='older', sex_mapped='male')
# The audited attribute names are the mapping's keys, in insertion order.
attributes_to_audit = [attribute for attribute in attributes_and_reference_groups]

In [None]:
# Metric names to audit ('fpr' is presumably the false positive rate) and the disparity tolerance
metrics = ["fpr"]
disparity_tolerance = 1.3

In [None]:
# Display the preprocessed dataframe
processed_german_df

In [None]:
# Recode 'label_value' and 'score' from {1, 2} to {0, 1} (1 -> 0, 2 -> 1).
#
# Work on a copy: the previous version aliased processed_german_df and recoded it
# in place with two sequential replace() calls, so running the cell a second time
# turned every value into 0. With a copy, the cell always starts from the
# un-recoded frame and is safe to re-run; processed_german_df is left untouched.
new_preprocessed_german = processed_german_df.copy()

# A dict-based replace maps both codes in a single pass, so a value that has just
# been recoded is never recoded again.
new_preprocessed_german[['label_value', 'score']] = (
    new_preprocessed_german[['label_value', 'score']].replace({1: 0, 2: 1})
)

In [None]:
## Running Aequitas

# Create the Aequitas Bias and Group objects used by this and later cells
b = Bias()
g = Group()

# get_crosstabs returns a dataframe of the group counts and group value bias metrics;
# its second return value is discarded.
xtab, _ = g.get_crosstabs(
    new_preprocessed_german,
    attr_cols=attributes_to_audit,
)

# Disparities computed against the reference group chosen for each attribute.
bdf = b.get_disparity_predefined_groups(
    xtab,
    original_df=new_preprocessed_german,
    ref_groups_dict=attributes_and_reference_groups,
)


In [None]:
# Column names that Aequitas reports as absolute metrics for xtab
# (used below to split xtab into metric and non-metric columns).
absolute_metrics = g.list_absolute_metrics(xtab)

In [None]:
# Every xtab column that is not one of the absolute metrics
xtab.loc[:, ~xtab.columns.isin(absolute_metrics)]

In [None]:
# Group identifiers plus the absolute metrics, rounded to two decimals
xtab[['attribute_name', 'attribute_value', *absolute_metrics]].round(decimals=2)

In [None]:
# Display the disparity dataframe returned by get_disparity_predefined_groups
bdf

In [None]:
# Same dataframe, rendered through its pandas Styler
bdf.style

In [None]:
# Names of the disparity columns present in bdf, as reported by Aequitas
calculated_disparities = b.list_disparities(bdf)

In [None]:
# View disparity metrics added to dataframe
bdf[['attribute_name', 'attribute_value'] +  calculated_disparities]

In [None]:
# Group identifiers plus the disparity columns, rendered through the pandas Styler
bdf[['attribute_name', 'attribute_value', *b.list_disparities(bdf)]].style

In [None]:
# Create the Aequitas Fairness object (imported from aequitas.fairness)
f = Fairness()