In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Adult

In [2]:
# Load the raw Adult dataset from 'adult.csv' (path is relative to the working directory).
adult = pd.read_csv('adult.csv')

In [3]:
# Preprocess a copy so the raw `adult` frame stays available unchanged.
df = adult.copy()

In [4]:
# Spot-check: display the row(s) whose age is 39 and whose fnlwgt is 77516.
age_is_39 = adult['age'] == 39
fnlwgt_matches = adult['fnlwgt'] == 77516
adult[age_is_39 & fnlwgt_matches]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
16281,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


In [5]:
# Column-name groups for the Adult data: numeric features, the label column
# and the protected attribute.
# NOTE(review): none of these lists is referenced by the other visible cells — confirm they are still needed.
numerics = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
target_class = ['income']
protected = ['gender']

In [6]:
# Encode the label and the protected attribute as integers.
# The result is assigned back instead of calling
# `df[col].replace(..., inplace=True)`: that form is a chained in-place
# modification, which pandas deprecates and which leaves `df` unchanged once
# Copy-on-Write is enabled.
# The explicit integer cast keeps both columns numeric (so the later
# pd.get_dummies call does not one-hot encode them) and fails loudly if a
# label is not covered by the mapping.
df['income'] = df['income'].replace({'<=50K': 0,
                                     '>50K': 1}).astype(int)

df['gender'] = df['gender'].replace({'Male': 0,
                                     'Female': 1}).astype(int)

# Treat '?' as a missing value and drop every row that contains one.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace('?', np.nan).dropna()

In [7]:
# One-hot encode the remaining categorical columns (pd.get_dummies with default arguments).
df = pd.get_dummies(df)

In [8]:
# Move the label and the protected attribute to the front of the frame.
# 'gender' is inserted first and 'income' second, both at position 0, so the
# final column order starts with 'income', 'gender'.
target_column = df.pop('income')
protected_column = df.pop('gender')

df.insert(loc=0, column='gender', value=protected_column)
df.insert(loc=0, column='income', value=target_column)

In [9]:
# Preview the encoded frame; 'income' and 'gender' now lead the columns.
df.head()

Unnamed: 0,income,gender,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0,0,25,226802,7,0,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,38,89814,9,0,0,50,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,28,336951,12,0,0,40,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,0,44,160323,10,7688,0,40,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,34,198693,6,0,0,30,0,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Scaler for the Adult columns (MinMaxScaler with its default settings).
scaler = MinMaxScaler()

In [11]:
# Fit on every column of df, which includes the 'income' and 'gender' columns.
scaler.fit(df)

In [12]:
# Replace df with its scaled values, keeping the original column names.
# No index is passed, so the original row labels are not carried over.
scaled_values = scaler.transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)

In [13]:
# Write the scaled Adult frame to disk.
# NOTE(review): the index is written as well (no index=False is passed), and
# the 'preprocessed/' directory is assumed to exist already — confirm.
df.to_csv('preprocessed/preprocessed_adult.csv')

# German Credit

In [15]:
# Load the German Credit data from 'german_data.csv' (path is relative to the working directory).
german = pd.read_csv('german_data.csv')

In [16]:
# Remove the 'Unnamed: 0' column — presumably a row index that was saved into the CSV; confirm.
del german['Unnamed: 0']

In [17]:
# Preview the raw German Credit columns before encoding.
german.head()

Unnamed: 0,checking_acc,credit_historic,saving_acc,atual_employ_since,installment_rate,sex,housing,credits_at_bank,classification
0,2,Existing credits paid,1,2,2,Female,Own,1,Bad
1,0,Critical acc/Other cr existing,1,3,2,Male,Own,1,Good
2,1,Existing credits paid,1,3,2,Male,For free,1,Good
3,1,Delay in paying,1,2,3,Male,For free,2,Bad
4,0,Existing credits paid,0,2,2,Male,For free,1,Good


In [18]:
# Take the label and the protected attribute out of `german` and encode them
# as integers.
# Encoding the popped Series (instead of calling
# `german[col].replace(..., inplace=True)`) avoids a chained in-place
# modification, which pandas deprecates and which leaves the frame unchanged
# once Copy-on-Write is enabled.
# The explicit integer cast keeps both columns numeric after they are inserted
# back (so the later pd.get_dummies call does not one-hot encode them) and
# fails loudly if a label is not covered by the mapping.
classification = german.pop('classification').replace({'Bad': 0,
                                                       'Good': 1}).astype(int)

# NOTE: this rebinds `protected`, which held a list of column names in the Adult section.
protected = german.pop('sex').replace({'Male': 0,
                                       'Female': 1}).astype(int)

In [19]:
# Put the encoded columns back at the front; inserting 'sex' first and
# 'classification' second (both at position 0) yields the column order
# 'classification', 'sex', ...
for column_name, column_values in (('sex', protected),
                                   ('classification', classification)):
    german.insert(0, column_name, column_values)

In [20]:
# One-hot encode the remaining categorical columns (pd.get_dummies with default arguments).
german = pd.get_dummies(german)

In [21]:
# Fit a fresh scaler on all German Credit columns and build the scaled frame
# with the same column names. No index is passed to the new DataFrame.
scaler = MinMaxScaler().fit(german)
german_scaled = scaler.transform(german)
df_german = pd.DataFrame(german_scaled, columns=german.columns)

In [22]:
# Write the scaled German Credit frame to disk.
# NOTE(review): the index is written as well (no index=False is passed), and
# the 'preprocessed/' directory is assumed to exist already — confirm.
df_german.to_csv('preprocessed/preprocessed_german.csv')

In [23]:
# Preview the scaled German Credit frame.
df_german.head()

Unnamed: 0,classification,sex,checking_acc,saving_acc,atual_employ_since,installment_rate,credits_at_bank,credit_historic_All cr at this bank paid,credit_historic_Critical acc/Other cr existing,credit_historic_Delay in paying,credit_historic_Existing credits paid,credit_historic_No cr taken/all cr paid,housing_For free,housing_Own,housing_Rent
0,0.0,1.0,0.666667,0.25,0.5,0.333333,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.25,0.75,0.333333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.333333,0.25,0.75,0.333333,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.333333,0.25,0.5,0.666667,0.333333,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.5,0.333333,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


# COMPAS

In [24]:
# Load the COMPAS data from 'compas_two_years_violent.csv' (path is relative to the working directory).
compas = pd.read_csv('compas_two_years_violent.csv')

In [25]:
# Remove the 'Unnamed: 0' column — presumably a row index that was saved into the CSV; confirm.
del compas['Unnamed: 0']

In [26]:
# Preview the raw COMPAS columns before encoding.
compas.head()

Unnamed: 0,sex,age,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_days_from_compas,...,is_recid,is_violent_recid,type_of_assessment,score_text,v_type_of_assessment,v_decile_score,start,end,event,two_year_recid
0,Male,69,Greater than 45,Other,0,1,0,0,0,1.0,...,0,0,Risk of Recidivism,Low,Risk of Violence,1,0,327,0,No
1,Male,34,25 - 45,African-American,0,3,0,0,0,1.0,...,1,1,Risk of Recidivism,Low,Risk of Violence,1,9,159,1,Yes
2,Male,23,Less than 25,African-American,0,8,1,0,1,1.0,...,0,0,Risk of Recidivism,High,Risk of Violence,6,0,1174,0,No
3,Male,43,25 - 45,Other,0,1,0,0,2,76.0,...,0,0,Risk of Recidivism,Low,Risk of Violence,1,0,1102,0,No
4,Male,44,25 - 45,Other,0,1,0,0,0,0.0,...,0,0,Risk of Recidivism,Low,Risk of Violence,1,1,853,0,No


In [27]:
# Take the label column 'two_year_recid' out of `compas`; it is encoded and re-inserted in the next cell.
classification = compas.pop('two_year_recid')

In [28]:
# Encode the label as an integer (No -> 0, Yes -> 1). The explicit cast keeps
# the column numeric, so the later pd.get_dummies call does not one-hot encode
# it, and fails loudly if a label is not covered by the mapping.
classification = classification.replace({'No': 0,
                                         'Yes': 1}).astype(int)

# Collapse `race` to a binary protected attribute: 'Caucasian' -> 1 and every
# other listed group -> 0.
# The mapping is applied to the popped Series instead of calling
# `compas['race'].replace(..., inplace=True)`: that form is a chained in-place
# modification, which pandas deprecates and which leaves the frame unchanged
# once Copy-on-Write is enabled.
race = compas.pop('race').replace({'Caucasian': 1,
                                   'Other': 0,
                                   'African-American': 0,
                                   'Hispanic': 0,
                                   'Asian': 0,
                                   'Native American': 0}).astype(int)

# Re-insert both columns at the front; the final column order starts with
# 'two_year_recid', 'race'.
compas.insert(loc=0, column='race', value=race)
compas.insert(loc=0, column='two_year_recid', value=classification)

In [29]:
# One-hot encode the remaining categorical columns, then fit a fresh
# MinMaxScaler on every column of the encoded frame (label and race included).
compas = pd.get_dummies(compas)
scaler = MinMaxScaler()
scaler.fit(compas)

In [30]:
# Build the scaled COMPAS frame (same column names, no index passed), save it
# to disk and preview the first rows.
compas_scaled = scaler.transform(compas)
df_compas = pd.DataFrame(compas_scaled, columns=compas.columns)
df_compas.to_csv('preprocessed/preprocessed_compas.csv')
df_compas.head()

Unnamed: 0,two_year_recid,race,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,c_days_from_compas,is_recid,...,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,c_charge_degree_F,c_charge_degree_M,type_of_assessment_Risk of Recidivism,score_text_High,score_text_Low,score_text_Medium,v_type_of_assessment_Risk of Violence
0,0.0,0.0,0.784615,0.0,0.181818,0.0,0.0,0.0,0.000105,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.246154,0.0,0.363636,0.0,0.0,0.0,0.000105,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.076923,0.0,0.818182,0.125,0.0,0.026316,0.000105,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.384615,0.0,0.181818,0.0,0.0,0.052632,0.008013,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.4,0.0,0.181818,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
