In [79]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [80]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

In [81]:
# Read german.csv into a DataFrame; the path is relative to the notebook's working directory.
german_csv_path = '../../data/german/german.csv'
german_data = pd.read_csv(german_csv_path)

In [82]:
# Preview the first five rows of the loaded frame.
german_data.head(n=5)

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,present_residence,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,2,6,1,4,1169,1,5,4,5,1,4,4,67,3,3,2,3,1,2,1,1
1,3,48,3,4,5951,2,3,2,2,1,2,4,22,3,3,1,3,1,1,1,0
2,1,12,1,7,2096,2,4,2,5,1,3,4,49,3,3,1,2,2,1,1,1
3,2,42,3,3,7882,2,4,2,5,3,4,3,45,3,1,1,3,2,1,1,1
4,2,24,0,1,4870,2,3,3,5,1,4,1,53,3,1,2,3,2,1,1,0


In [83]:
# Categorical features: each is expanded into one dummy column per level.
cats = [
    'status',
    'credit_history',
    'purpose',
    'savings',
    'employment_duration',
    'installment_rate',
    'personal_status_sex',
    'other_debtors',
    'present_residence',
    'property',
    'other_installment_plans',
    'housing',
    'number_credits',
    'job',
]

# Binary features: encoded separately from the categorical ones.
bins = [
    'people_liable',
    'telephone',
    'foreign_worker',
]

# Not encoded: duration and amount are continuous (age is likewise absent from both lists).

In [84]:
# Expand every feature listed in `cats` into one indicator column per level.
# Note: german_data is rebound, so this cell cannot be re-run without reloading the CSV.
german_data = pd.get_dummies(
    data=german_data,
    columns=cats,
)

In [85]:
# Encode the features listed in `bins`; drop_first=True removes the indicator
# for the first level so each two-level feature keeps a single column.
german_data = pd.get_dummies(
    data=german_data,
    columns=bins,
    drop_first=True,
)

In [86]:
# Fixed feature layout shared by every exported dataset: the three continuous
# columns, then one dummy per level of each categorical feature, then the
# three binary dummies.
continuous_cols = ['duration', 'amount', 'age']

# (feature, first level, last level) -- integer level codes, both ends inclusive.
level_ranges = [
    ('status', 1, 4),
    ('credit_history', 0, 4),
    ('purpose', 0, 10),
    ('savings', 1, 5),
    ('employment_duration', 1, 5),
    ('installment_rate', 1, 4),
    ('personal_status_sex', 1, 5),
    ('other_debtors', 1, 3),
    ('present_residence', 1, 4),
    ('property', 1, 4),
    ('other_installment_plans', 1, 3),
    ('housing', 1, 3),
    ('number_credits', 1, 4),
    ('job', 1, 4),
]
categorical_cols = [
    '{}_{}'.format(feature, level)
    for feature, first, last in level_ranges
    for level in range(first, last + 1)
]

binary_cols = ['people_liable_2', 'telephone_2', 'foreign_worker_2']

columns = continuous_cols + categorical_cols + binary_cols

In [87]:
# Target vector.
y = german_data['credit_risk']

# Feature matrix in the fixed `columns` layout. reindex drops anything not in
# `columns` and, with fill_value=0, creates an all-zero indicator for any dummy
# level that does not occur in this file. Unlike assigning hard-coded zero
# columns, this never overwrites a level that is present in the data and never
# leaves NaN columns for levels that were not anticipated.
X = german_data.drop('credit_risk', axis=1).reindex(columns=columns, fill_value=0)


In [88]:
# Standardize every feature column. The fitted scaler is kept so the same
# statistics can be applied to other data later.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [89]:

# Wrap the scaled array back into a labelled frame and attach the target as 'label'.
data = pd.DataFrame(X, columns=columns).assign(label=y)

In [90]:
# Write ten 80/20 train/test splits of `data`; the loop index doubles as the
# random seed and as the number embedded in the file names.
train_path = '../../data/german/german{}_train.csv'
test_path = '../../data/german/german{}_test.csv'

for i in range(10):
    train, test = train_test_split(data, test_size=0.2, random_state=i)
    train.to_csv(train_path.format(i), index=False)
    test.to_csv(test_path.format(i), index=False)

In [91]:
# Read corrected_german.csv and preview its first five rows.
corrected_csv_path = "../../data/german/corrected_german.csv"
updated_data = pd.read_csv(corrected_csv_path)
updated_data.head(n=5)

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,present_residence,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,1,18,4,2,1049,1,2,4,2,1,4,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,2,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,4,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,2,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,4,2,38,1,2,2,2,2,1,1,1


In [92]:
# Encode the corrected data with the same two get_dummies passes used for
# german_data: all levels for `cats`, first level dropped for `bins`.
updated_data = pd.get_dummies(updated_data, columns=cats)
updated_data = pd.get_dummies(updated_data, columns=bins, drop_first=True)

# Target vector.
y_up = updated_data['credit_risk']

# Feature matrix in the fixed `columns` layout. reindex drops anything not in
# `columns` and, with fill_value=0, creates an all-zero indicator for any dummy
# level that does not occur in this file. Unlike assigning hard-coded zero
# columns, this never overwrites a level that is present in the data and never
# leaves NaN columns for levels that were not anticipated.
X_up = updated_data.drop('credit_risk', axis=1).reindex(columns=columns, fill_value=0)

In [93]:
# Apply the already-fitted scaler (no refit) to the corrected features, then
# rebuild a labelled frame with the target attached as 'label'.
X_up = scaler.transform(X_up)
data_up = pd.DataFrame(X_up, columns=columns).assign(label=y_up)


In [94]:
# Write ten seeded 80/20 train/test splits of the corrected dataset.
# The split must be taken from data_up: splitting `data` here would simply
# re-export the german.csv splits under the german_cor file names.
for i in range(10):
    train, test = train_test_split(data_up, test_size=0.2, random_state=i)
    train.to_csv('../../data/german/german_cor{}_train.csv'.format(i), index=False)
    test.to_csv('../../data/german/german_cor{}_test.csv'.format(i), index=False)

In [76]:
# Variant: standardize the corrected data with a scaler fitted on the corrected
# data itself instead of reusing a previously fitted one.
# X_up may already have been passed through an earlier scaler; that is a
# per-column affine map, so re-standardizing yields the same values (up to
# floating-point error) as fitting on the unscaled features.
# NOTE: this rebinds `scaler`, `X_up` and `data_up`; re-run the earlier cells
# before reusing those names for the shared-scaler outputs.
scaler = StandardScaler()
X_up = scaler.fit_transform(X_up)
data_up = pd.DataFrame(X_up, columns=columns)
data_up['label'] = y_up

# Split data_up (not `data`, which holds the german.csv features), and seed each
# split with the loop index like the other exports so the files are reproducible.
for i in range(10):
    train, test = train_test_split(data_up, test_size=0.2, random_state=i)
    train.to_csv('../../data/german/german_cor_diff{}_train.csv'.format(i), index=False)
    test.to_csv('../../data/german/german_cor_diff{}_test.csv'.format(i), index=False)

In [78]:
# Sanity check: number of feature columns in the shared layout.
n_features = len(columns)
n_features

70