<a href="https://colab.research.google.com/github/Ali-Alameer/AI_fairness/blob/main/datasets_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### This notebook breaks down the datasets used in the fairness framework.
#### It also shows the original datasets before preprocessing.

In [None]:
!pip install 'aif360[all]'
!wget https://raw.githubusercontent.com/Trusted-AI/AIF360/master/examples/common_utils.py


In [None]:
import os
import urllib.request

import aif360

# AIF360 reads raw files from <package dir>/data/raw/<dataset>/. Resolve that directory
# from the installed package instead of hard-coding a python3.9 dist-packages path,
# which breaks as soon as the runtime uses a different Python version or install prefix.
AIF360_RAW_DIR = os.path.join(os.path.dirname(aif360.__file__), "data", "raw")

# Raw files to download, grouped by the AIF360 sub-directory they belong in.
RAW_DATA_URLS = {
    # For Adult dataset
    "adult": [
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
    ],
    # For German Dataset
    "german": [
        "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.doc",
    ],
    # For Compas Dataset
    "compas": [
        "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv",
    ],
}

for dataset_name, urls in RAW_DATA_URLS.items():
    target_dir = os.path.join(AIF360_RAW_DIR, dataset_name)
    # Harmless when the directory already exists.
    os.makedirs(target_dir, exist_ok=True)
    for url in urls:
        # Each file keeps its remote name (last URL path component).
        urllib.request.urlretrieve(url, os.path.join(target_dir, os.path.basename(url)))



In [3]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.metrics import ClassificationMetric
from aif360.algorithms.preprocessing.reweighing import Reweighing
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions\
        import load_preproc_data_adult, load_preproc_data_german, load_preproc_data_compas
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

from common_utils import compute_metrics

#### Load dataset and set options

In [4]:
## Choose the benchmark dataset and which of its two protected attributes to study
dataset_used = "adult" # "adult", "german", "compas"
protected_attribute_used = 1 # 1, 2

# For every supported dataset: its pre-processing loader and the names of its
# (first, second) protected attributes.
_dataset_options = {
    "adult": (load_preproc_data_adult, ('sex', 'race')),
    "german": (load_preproc_data_german, ('sex', 'age')),
    "compas": (load_preproc_data_compas, ('sex', 'race')),
}

# An unrecognised dataset name defines nothing here.
if dataset_used in _dataset_options:
    _loader, (_first_attr, _second_attr) = _dataset_options[dataset_used]
    # Any value other than 1 selects the second attribute.
    _chosen_attr = _first_attr if protected_attribute_used == 1 else _second_attr
    # The privileged group is encoded as 1 and the unprivileged group as 0.
    privileged_groups = [{_chosen_attr: 1}]
    unprivileged_groups = [{_chosen_attr: 0}]
    dataset_orig = _loader([_chosen_attr])

# Names of the fairness metrics of interest
all_metrics = [
    "Statistical parity difference",
    "Average odds difference",
    "Equal opportunity difference",
]

# Seed NumPy's global random generator so later random draws are repeatable
np.random.seed(1)

## Exploratory Data Analysis (EDA)

In [None]:
# Load the original (not pre-processed) Adult dataset from the AIF360 library and
# turn it into a DataFrame; element [0] of convert_to_dataframe()'s result is the frame.
dataset_orig_adult= AdultDataset()
dataset_orig_adult1=dataset_orig_adult.convert_to_dataframe()[0]
# Bare last expression so the notebook renders the frame
dataset_orig_adult1

In [None]:
# Load the pre-processed AIF360 Adult dataset (loader called with its default arguments)
# and turn it into a DataFrame; element [0] of convert_to_dataframe()'s result is the frame.
dataset_processed_adult = load_preproc_data_adult()
dataset_processed_adult1=dataset_processed_adult.convert_to_dataframe()[0]
# Bare last expression so the notebook renders the frame
dataset_processed_adult1

In [None]:
# Shape (rows, columns) of the pre-processed Adult DataFrame
dataset_processed_adult1.shape

In [None]:
# Shape (rows, columns) of the original Adult DataFrame from the AIF360 library.
# N.B. (author's note) 3620 rows with missing data were removed from AdultDataset.
dataset_orig_adult1.shape

In [None]:
# List the column (feature) names of the original Adult DataFrame
dataset_orig_adult1.columns.tolist()

In [None]:
# List the column (feature) names of the pre-processed Adult DataFrame
dataset_processed_adult1.columns.tolist()

In [None]:
# Count missing (null) values per column in the original Adult DataFrame
dataset_orig_adult1.isnull().sum()

In [None]:
# Count missing (null) values per column in the pre-processed Adult DataFrame
dataset_processed_adult1.isnull().sum()

In [None]:
# Box plot of 'education-num' in the original Adult data, used to look for outliers.
# seaborn is imported here; the plotting cells further down rely on `sns`.
import seaborn as sns
education_ax = sns.boxplot(data=dataset_orig_adult1, x='education-num')
education_ax.set_title('Education number Distribution for Adult Dataset')

In [None]:
# Histogram of 'education-num' with the two income groups drawn side by side.
# The numeric income label is first replaced by readable text, in place, so the
# later income plots show the same wording.
income_labels = {0.0:'<= $50k income', 1.0:'> $50K income'}
dataset_orig_adult1['income-per-year'] = dataset_orig_adult1['income-per-year'].replace(income_labels)
education_income_ax = sns.histplot(
    data=dataset_orig_adult1,
    x='education-num',
    hue='income-per-year',
    multiple="dodge",
)
education_income_ax.set_title('Education number by Income Distribution for Adult Dataset')

In [None]:
# Stacked histogram of age in the original Adult data, coloured by income label
age_income_ax = sns.histplot(
    data=dataset_orig_adult1,
    x='age',
    hue='income-per-year',
    multiple="stack",
)
age_income_ax.set_title('Age Distribution by Income for Adult Dataset')

In [None]:
## Income class counts split by sex in the pre-processed Adult data, to check
## class imbalance and distribution.
## Encoding: privileged = 1, unprivileged = 0
##   sex (privileged: Male, unprivileged: Female)
##   Income binary (privileged: > $50K income, unprivileged: <= $50k income)
import seaborn as sns
dataset_processed_adult1['Income Binary']= dataset_processed_adult1['Income Binary'].replace({0.0:'<= $50k income', 1.0:'> $50K income'})
ax = sns.countplot(x="Income Binary", hue= 'sex', data = dataset_processed_adult1)
ax.set_title('Income Distribution by Sex for Adult Dataset')
# Relabel the legend using seaborn's own hue handles. A labels-only legend call pairs
# the labels with the first artists on the axes, which are not guaranteed to be one
# bar per hue level, so both entries can end up with the same colour.
# Numeric hue levels are ordered 0.0 then 1.0, i.e. Female then Male.
hue_handles = ax.get_legend_handles_labels()[0]
ax.legend(handles=hue_handles, labels=["Female", "Male"], title='sex')
plt.show()

In [None]:
## explore  sex as a protected attribute in adult dataset to check class imbalance and distribution
## Privileged = 1, unprivileged = 0
##sex (privileged: Male, unprivileged: Female) 
dataset_processed_adult1['sex']= dataset_processed_adult1['sex'].replace({0.0:'Female', 1.0:'Male'})
sns.countplot(x="sex", data = dataset_processed_adult1)
plt.title('Sex Distribution for Adult Dataset')
plt.show()

In [None]:
## explore race as aprotected attribute in adult dataset to check class imbalance and distribution
## Privileged = 1, unprivileged = 0
##race (privileged: White, unprivileged: Non-white).
dataset_processed_adult1['race']= dataset_processed_adult1['race'].replace({0.0:'Non-white', 1.0:'White'})
sns.countplot(x="race", data = dataset_processed_adult1)
plt.title('Race Distribution for Adult Dataset')
plt.show()

In [None]:
# Count of records per income label in the original Adult data, with each bar
# annotated by its percentage of all rows
tips = dataset_orig_adult1
ax = sns.countplot(data=tips, x='income-per-year')
ax.set_title('Income Distribution for Adult Dataset')
total_rows = len(tips)
for bar in ax.patches:
    share = 100 * bar.get_height() / total_rows
    # Place the text slightly right of the bar's left edge and just above its top
    ax.annotate(f'{share:.1f}%', (bar.get_x() + 0.2, bar.get_height() + 5))

In [None]:
# Load the pre-processed German dataset with 'sex' as the protected attribute and
# turn it into a DataFrame; element [0] of convert_to_dataframe()'s result is the frame.
dataset_processed_german = load_preproc_data_german(['sex'])
dataset_processed_german2=dataset_processed_german.convert_to_dataframe()[0]
# Bare last expression so the notebook renders the frame
dataset_processed_german2

In [None]:
# Load the original (not pre-processed) German dataset from the AIF360 library and
# turn it into a DataFrame; element [0] of convert_to_dataframe()'s result is the frame.
dataset_orig_german=GermanDataset()
dataset_orig_german1= dataset_orig_german.convert_to_dataframe()[0]
# Bare last expression so the notebook renders the frame
dataset_orig_german1

In [None]:
# Shape (rows, columns) of the pre-processed German DataFrame
dataset_processed_german2.shape

In [None]:
# Shape (rows, columns) of the original German DataFrame from the AIF360 library
dataset_orig_german1.shape

In [None]:
# List the column (feature) names of the pre-processed German DataFrame
dataset_processed_german2.columns.tolist()

In [None]:
# List the column (feature) names of the original German DataFrame from AIF360
dataset_orig_german1.columns.tolist()

In [None]:
# Count missing (null) values per column in the pre-processed German DataFrame
dataset_processed_german2.isnull().sum()

In [None]:
# Explore credit risk based on credit duration in months (original German data).
plt.figure(figsize=(15,5))
ax = sns.countplot(x="month", hue= 'credit', data = dataset_orig_german1,palette=['#432371',"#FAAE7B"])
ax.set_title('Duration of Credits(Month) Distribution for German Dataset')
# Relabel the legend using seaborn's own hue handles. A labels-only legend call pairs
# the labels with the first artists on the axes, which are not guaranteed to be one
# bar per hue level, so both entries can end up with the same colour.
# Numeric hue levels are ordered 1.0 then 2.0, i.e. good credit then bad credit.
hue_handles = ax.get_legend_handles_labels()[0]
ax.legend(handles=hue_handles, labels=['good credit', 'bad credit'], title='credit')
plt.show()

In [None]:
## Credit outcome counts split by sex in the pre-processed German data, to check
## class imbalance and distribution.
## Encoding: privileged = 1, unprivileged = 0
##   sex (privileged: Male, unprivileged: Female)
##   credit (good credit = 1, bad credit = 2)
dataset_processed_german2['sex']= dataset_processed_german2['sex'].replace({0.0:'Female', 1.0:'Male'})
ax = sns.countplot(x="sex", hue= 'credit', data = dataset_processed_german2,palette=['#432371',"#FAAE7B"])
ax.set_title('Sex Distribution for German Dataset')
# Relabel the legend using seaborn's own hue handles. A labels-only legend call pairs
# the labels with the first artists on the axes, which are not guaranteed to be one
# bar per hue level, so both entries can end up with the same colour.
# Numeric hue levels are ordered 1.0 then 2.0, i.e. good credit then bad credit.
hue_handles = ax.get_legend_handles_labels()[0]
ax.legend(handles=hue_handles, labels=['good credit', 'bad credit'], title='credit')
plt.show()

In [None]:
## Credit outcome counts split by age group in the pre-processed German data, to
## check class imbalance and distribution.
## Encoding: privileged = 1, unprivileged = 0
##   age (privileged: older than or equal to 25 years, unprivileged: younger than 25 years)
##   credit (good credit = 1, bad credit = 2)
## NOTE(review): confirm the exact age cut-off applied by load_preproc_data_german
## matches the 25-year wording used in the labels below.
dataset_processed_german2['age']= dataset_processed_german2['age'].replace({0.0:'< 25 years', 1.0:'>=25 years'})
ax = sns.countplot(x="age", hue ='credit',data = dataset_processed_german2,palette=['#432371',"#FAAE7B"])
ax.set_title('Age Distribution for German Dataset')
# Relabel the legend using seaborn's own hue handles. A labels-only legend call pairs
# the labels with the first artists on the axes, which are not guaranteed to be one
# bar per hue level, so both entries can end up with the same colour.
# Numeric hue levels are ordered 1.0 then 2.0, i.e. good credit then bad credit.
hue_handles = ax.get_legend_handles_labels()[0]
ax.legend(handles=hue_handles, labels=['good credit', 'bad credit'], title='credit')
plt.show()

In [None]:
# Count of records per credit outcome in the pre-processed German data, with each
# bar annotated by its percentage of all rows
tips = dataset_processed_german2
## Credit encoding: good credit = 1, bad credit = 2 (replaced by text in place)
credit_labels = {1.0:'Good credit', 2.0:'Bad credit'}
dataset_processed_german2['credit'] = dataset_processed_german2['credit'].replace(credit_labels)
ax = sns.countplot(data=tips, x='credit', palette=['#432371',"#FAAE7B"])
ax.set_title('Credit Distribution for German Dataset')
total_rows = len(tips)
for bar in ax.patches:
    share = 100 * bar.get_height() / total_rows
    # Place the text slightly right of the bar's left edge and just above its top
    ax.annotate(f'{share:.1f}%', (bar.get_x() + 0.2, bar.get_height() + 5))

In [None]:
# Load the pre-processed Compas dataset (loader called with its default arguments)
# and turn it into a DataFrame; element [0] of convert_to_dataframe()'s result is the frame.
dataset_processed_compas = load_preproc_data_compas()
dataset_processed_compas2=dataset_processed_compas.convert_to_dataframe()[0]
# Bare last expression so the notebook renders the frame
dataset_processed_compas2

In [None]:
# Load the original (not pre-processed) Compas dataset from the AIF360 library and
# turn it into a DataFrame; element [0] of convert_to_dataframe()'s result is the frame.
dataset_orig_compas = CompasDataset()
dataset_orig_compas2 = dataset_orig_compas.convert_to_dataframe()[0]
# Bare last expression so the notebook renders the frame
dataset_orig_compas2

In [None]:
# Shape (rows, columns) of the pre-processed Compas DataFrame
dataset_processed_compas2.shape

In [None]:
# Shape (rows, columns) of the original Compas DataFrame from the AIF360 library
dataset_orig_compas2.shape

In [None]:
# List the column (feature) names of the pre-processed Compas DataFrame
dataset_processed_compas2.columns.tolist()

In [None]:
# List the column (feature) names of the original Compas DataFrame from AIF360
dataset_orig_compas2.columns.tolist()

In [None]:
# Count missing (null) values per column in the pre-processed Compas DataFrame
dataset_processed_compas2.isnull().sum()

In [None]:
# Explore the age distribution of the original Compas data in relation to two-year recidivism.
# `two_year_recid` flags recidivism, so 1 means the person re-offended within two years
# and 0 means they did not. The earlier mapping had these two labels swapped.
# The labels are written back in place; the prior-count plot reuses this column.
dataset_orig_compas2['two_year_recid']= dataset_orig_compas2['two_year_recid'].replace({0.0:'did not re-offend',1.0:'re-offended'})
sns.histplot(x= 'age', hue= 'two_year_recid', data= dataset_orig_compas2,multiple="stack")
plt.title('Age Distribution for Compas Dataset')
plt.show()

In [None]:
# Stacked histogram of prior offence counts in the original Compas data,
# coloured by the two-year recidivism column
priors_ax = sns.histplot(
    data=dataset_orig_compas2,
    x='priors_count',
    hue='two_year_recid',
    multiple="stack",
)
priors_ax.set_title('Prior Crime count Distribution for Compas Dataset')
plt.show()