In [19]:
import numpy as np
import pandas as pd

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [9]:
random_state = 1729

In [32]:
df = pd.read_csv("SouthGermanCredit.csv", delimiter = '\s+')
print(df.head())

   laufkont  laufzeit  moral  verw  hoehe  sparkont  beszeit  rate  famges  \
0         1        18      4     2   1049         1        2     4       2   
1         1         9      4     0   2799         1        3     2       3   
2         2        12      2     9    841         2        4     2       2   
3         1        12      4     0   2122         1        3     3       3   
4         1        12      4     0   2171         1        3     4       3   

   buerge  ...  verm  alter  weitkred  wohn  bishkred  beruf  pers  telef  \
0       1  ...     2     21         3     1         1      3     2      1   
1       1  ...     1     36         3     1         2      3     1      1   
2       1  ...     1     23         3     1         1      2     2      1   
3       1  ...     1     39         3     1         2      2     1      1   
4       1  ...     2     38         1     2         2      2     2      1   

   gastarb  kredit  
0        2       1  
1        2       1  
2    

In [3]:
columns = ['status', 'duration', 'credit_history', 'purpose', 'amount', 'savings', 'employment_duration', 'installment_rate', 'personal_status_sex', 'other_debtors', 'present_residence', 'property', 'age', 'other_installment_plans', 'housing', 'number_credits', 'job', 'people_liable', 'telephone', 'foreign_worker', 'credit']

In [6]:
len(columns)

21

In [37]:
df.columns = columns
df.head()

Unnamed: 0,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,personal_status_sex,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,2,1,2,1
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,1,1,2,1
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,2,1,2,1
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,1,1,1,1
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,2,1,1,1


In [16]:
y = df['credit_risk'] 
X = df.loc[:, df.columns != 'credit_risk']

0    1
1    1
2    1
3    1
4    1
Name: credit_risk, dtype: int64

In [34]:
# first split to separate out the training set
X_train, X_other, y_train, y_other = train_test_split(X,y,train_size = 0.6,random_state=random_state)


# second split to separate out the validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_other,y_other,train_size = 0.5,random_state=random_state)


# Kfold for when I start using a ML model
# kf = KFold(n_splits=5,shuffle=True,random_state=random_state)
# for train_index, val_index in kf.split(X_other,y_other):
#     X_train = X_other.iloc[train_index]
#     y_train = y_other.iloc[train_index]
#     X_val = X_other.iloc[val_index]
#     y_val = y_other.iloc[val_index]
#     print('   training set:',X_train.shape, y_train.shape) 
#     print('   validation set:',X_val.shape, y_val.shape) 
#     # the validation set contains different points in each iteration
#     print(X_val[['age','workclass','education']].head())

In [36]:
# collect which encoder to use on each feature
# needs to be done manually
ordinal_ftrs = [''] 
ordinal_cats = [[]]
onehot_ftrs = ['status','credit_history','purpose','savings','personal_status_sex','other_debtors','housing', 'people_liable', 'telephone', 'foreign_worker']
std_ftrs = ['capital-gain','capital-loss']

# collect all the encoders
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories = ordinal_cats), ordinal_ftrs),
        ('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'), onehot_ftrs),
        ('minmax', MinMaxScaler(), minmax_ftrs),
        ('std', StandardScaler(), std_ftrs)])

clf = Pipeline(steps=[('preprocessor', preprocessor)]) # for now we only preprocess 
                                                       # later on we will add other steps here

X_train_prep = clf.fit_transform(X_train)
X_val_prep = clf.transform(X_val)
X_test_prep = clf.transform(X_test)

print(X_train.shape)
print(X_train_prep.shape)
print(X_train_prep)

# the target variable still needs to be preprocessed separately
le = LabelEncoder()
print(y_train.ndim)
y_train_prep = le.fit_transform(y_train)
y_val_prep = le.transform(y_val)
y_test_prep = le.transform(y_test)

(600, 20)
(600, 2)
[[ 0.17857143 -0.4017071 ]
 [ 0.375      -0.58414017]
 [ 0.03571429 -0.86069709]
 ...
 [ 0.23214286 -0.9540941 ]
 [ 0.26785714  0.52971915]
 [ 0.60714286 -0.30176866]]
1
