In [259]:
# Importing the necessary libraries

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [260]:
# Storing the data into a dataframe

df = pd.read_csv('Credit_risk/german_credit_data.csv')

Performing EDA and Preprocessing to prepare the data to be fed into the model

In [261]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [262]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [263]:
df['Saving accounts'].value_counts()

Saving accounts
little        603
moderate      103
quite rich     63
rich           48
Name: count, dtype: int64

In [264]:
# Mapping the data to integer values
# This is a naive method of encoding but it is simple and straight forward for features with less number of unique values 

save_mapping = {'little':0, 'moderate':1, 'quite rich':2, 'rich':3}
df['Saving accounts'] = df['Saving accounts'].map(save_mapping)

check_mapping = {'little':0, 'moderate':1, 'rich':2}
df['Checking account'] = df['Checking account'].map(check_mapping)

In [265]:
print(df['Checking account'].value_counts(), df['Saving accounts'].value_counts())

Checking account
0.0    274
1.0    269
2.0     63
Name: count, dtype: int64 Saving accounts
0.0    603
1.0    103
2.0     63
3.0     48
Name: count, dtype: int64


In [266]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1000 non-null   int64  
 1   Age               1000 non-null   int64  
 2   Sex               1000 non-null   object 
 3   Job               1000 non-null   int64  
 4   Housing           1000 non-null   object 
 5   Saving accounts   817 non-null    float64
 6   Checking account  606 non-null    float64
 7   Credit amount     1000 non-null   int64  
 8   Duration          1000 non-null   int64  
 9   Purpose           1000 non-null   object 
 10  Risk              1000 non-null   object 
dtypes: float64(2), int64(5), object(4)
memory usage: 86.1+ KB


In [267]:
# Replacing the unknown values with the median

df['Checking account'].fillna(df['Checking account'].median(), inplace = True)
df['Saving accounts'].fillna(df['Saving accounts'].median(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Checking account'].fillna(df['Checking account'].median(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Saving accounts'].fillna(df['Saving accounts'].median(), inplace = True)


In [268]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1000 non-null   int64  
 1   Age               1000 non-null   int64  
 2   Sex               1000 non-null   object 
 3   Job               1000 non-null   int64  
 4   Housing           1000 non-null   object 
 5   Saving accounts   1000 non-null   float64
 6   Checking account  1000 non-null   float64
 7   Credit amount     1000 non-null   int64  
 8   Duration          1000 non-null   int64  
 9   Purpose           1000 non-null   object 
 10  Risk              1000 non-null   object 
dtypes: float64(2), int64(5), object(4)
memory usage: 86.1+ KB


In [269]:
# Checking for the imbalance in the data
# This imbalance needs to be handled as it will induce a bias in the model and the model's accuracy when predicting the minority class will be low

df.Risk.value_counts()

Risk
good    700
bad     300
Name: count, dtype: int64

In [270]:
df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [271]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,0.0,0.0,1169,6,radio/TV,good
1,22,female,2,own,0.0,1.0,5951,48,radio/TV,bad
2,49,male,1,own,0.0,1.0,2096,12,education,good
3,45,male,2,free,0.0,0.0,7882,42,furniture/equipment,good
4,53,male,2,free,0.0,0.0,4870,24,car,bad


In [272]:
# Checking the number of unique values in each column
# Doing this helps us in deciding which encoding method we need to apply to each categorical column

cols = df.columns

for col in cols:
    print(f"{col} : {len(df[col].unique())}")

Age : 53
Sex : 2
Job : 4
Housing : 3
Saving accounts : 4
Checking account : 3
Credit amount : 921
Duration : 33
Purpose : 8
Risk : 2


In [273]:
# We manually map the target feature so that it is easy to understand in the future

risk_val = {'good':1, 'bad':0}
df.Risk = df.Risk.map(risk_val)

In [274]:
# Performing label encoding on the features
# Label encoding being applied on features with less cardinality.

def label_encoder(df, columns):
    for cols in columns:
        unique_values = df[cols].unique()
        name_to_num = {name: i for i, name in enumerate(unique_values)}
        df[cols] = df[cols].map(name_to_num).astype(int)
cols = ['Sex', 'Housing']
label_encoder(df, cols)

In [275]:
# Performing one hot encoding on the features
# This should be applied on featuers with categorical data as it helps preserve the information

def onehotencode(df, columns):
#     df = df.copy()
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column).astype(int)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)        
    return df
cols = ['Purpose']
df = onehotencode(df, cols)

In [276]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Age                          1000 non-null   int64  
 1   Sex                          1000 non-null   int32  
 2   Job                          1000 non-null   int64  
 3   Housing                      1000 non-null   int32  
 4   Saving accounts              1000 non-null   float64
 5   Checking account             1000 non-null   float64
 6   Credit amount                1000 non-null   int64  
 7   Duration                     1000 non-null   int64  
 8   Risk                         1000 non-null   int64  
 9   Purpose_business             1000 non-null   int32  
 10  Purpose_car                  1000 non-null   int32  
 11  Purpose_domestic appliances  1000 non-null   int32  
 12  Purpose_education            1000 non-null   int32  
 13  Purpose_furniture/e

In [277]:
df.sample(10)

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Risk,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
524,26,1,1,0,0.0,1.0,1113,18,1,0,0,0,0,0,1,0,0
493,38,0,2,0,0.0,1.0,368,6,1,0,0,0,0,0,1,0,0
559,31,0,1,0,0.0,1.0,1928,18,0,0,0,0,0,1,0,0,0
44,58,1,1,1,0.0,0.0,6143,48,0,0,1,0,0,0,0,0,0
596,23,1,2,2,0.0,0.0,1442,24,0,0,1,0,0,0,0,0,0
965,26,1,2,0,0.0,1.0,1715,30,1,0,0,0,0,0,1,0,0
235,30,0,3,0,0.0,0.0,1823,24,0,0,0,0,0,0,1,0,0
169,31,0,2,0,0.0,1.0,1935,24,0,1,0,0,0,0,0,0,0
785,35,0,1,0,3.0,1.0,1941,18,1,1,0,0,0,0,0,0,0
962,29,0,2,0,0.0,1.0,3556,15,1,0,1,0,0,0,0,0,0


In [278]:
# Scaling the data
# Scaling the data makes model training efficient and prevents exploding gradients

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['Credit amount'] = scaler.fit_transform(df[['Credit amount']])
df['Age'] = scaler.fit_transform(df[['Age']])
df['Duration'] = scaler.fit_transform(df[['Duration']])

In [279]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Risk,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,0.857143,0,2,0,0.0,0.0,0.050567,0.029412,1,0,0,0,0,0,1,0,0
1,0.053571,1,2,0,0.0,1.0,0.31369,0.647059,0,0,0,0,0,0,1,0,0
2,0.535714,0,1,0,0.0,1.0,0.101574,0.117647,1,0,0,0,1,0,0,0,0
3,0.464286,0,2,1,0.0,0.0,0.419941,0.558824,1,0,0,0,0,1,0,0,0
4,0.607143,0,2,1,0.0,0.0,0.254209,0.294118,0,0,1,0,0,0,0,0,0


In [280]:
# Splitting the data into training, validation and testing data

X = df.drop(['Risk'], axis = 1)
y = df.Risk
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.3, random_state = 123, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 123, stratify = y_temp)

In [281]:
print(X_train.shape, X_val.shape, X_test.shape)

(700, 16) (150, 16) (150, 16)


In [282]:
print(y_train.value_counts(), y_val.value_counts(), y_test.value_counts())

Risk
1    490
0    210
Name: count, dtype: int64 Risk
1    105
0     45
Name: count, dtype: int64 Risk
1    105
0     45
Name: count, dtype: int64


In [283]:
# Handling unbalanced data
# SMOTE is a method to oversample the minority class by synthetically creating new data for the minority class

smote = SMOTE()

X_train, y_train = smote.fit_resample(X_train, y_train)
X_test, y_test = smote.fit_resample(X_test, y_test)
X_val, y_val = smote.fit_resample(X_val, y_val)

In [284]:
print(y_train.value_counts(), y_val.value_counts(), y_test.value_counts())

Risk
1    490
0    490
Name: count, dtype: int64 Risk
0    105
1    105
Name: count, dtype: int64 Risk
1    105
0    105
Name: count, dtype: int64


Building the model

In [286]:
# Building the model
# We train a deep learning model, with stacked LSTM layers
# We have set the learning rate to be 1e-2 as it provided the best results

from keras.callbacks import LearningRateScheduler

lr = LearningRateScheduler(lambda epoch : 1e-2 * (0.75 ** np.floor(epoch / 2)))

model = keras.Sequential([
    keras.layers.Dense(50, input_shape = (16, )),
    keras.layers.RepeatVector(1),
    keras.layers.LSTM(50,  return_sequences = True),
    keras.layers.LSTM(50, return_sequences = True),
    keras.layers.LSTM(50),
    keras.layers.Dense(50, activation = 'relu'),
    keras.layers.Dense(50, activation = 'relu'),
    keras.layers.Dense(50, activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid')
])

model.compile(optimizer = 'adam',
             loss = 'binary_crossentropy',
             metrics = ['accuracy'])

model.summary()

Model: "sequential_45"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_208 (Dense)           (None, 50)                850       
                                                                 
 repeat_vector_22 (RepeatVec  (None, 1, 50)            0         
 tor)                                                            
                                                                 
 lstm_50 (LSTM)              (None, 1, 50)             20200     
                                                                 
 lstm_51 (LSTM)              (None, 1, 50)             20200     
                                                                 
 lstm_52 (LSTM)              (None, 50)                20200     
                                                                 
 dense_209 (Dense)           (None, 50)                2550      
                                                     

In [287]:
model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 100, callbacks = [lr])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1dc685ddbe0>

In [288]:
model.evaluate(X_test, y_test)



[0.7671228051185608, 0.6428571343421936]