In [10]:
#import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import copy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [11]:
#Setting the random seed 
import random
seed = 42
np.random.seed(seed) 
random.seed(seed)

In [12]:
#Importing data
data = pd.read_excel('default of credit card clients.xls',header=1)
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [13]:
#replace values 0, 5, 6 with 4 (others category) since they are not mentioned in the data description
data['EDUCATION'].replace([0,5,6],4,inplace=True)
data.EDUCATION.value_counts()

2    14030
1    10585
3     4917
4      468
Name: EDUCATION, dtype: int64

In [14]:
#initial preprocessing: drop 'ID' column, rename target column to DEFAULT, rename PAY_0 to PAY_1 for consistency 
data.drop(columns='ID',inplace=True)
data.rename(columns={"default payment next month": "DEFAULT","PAY_0": "PAY_1"},inplace=True)

In [15]:
#train test split (30%) before scaling and encoding to prevent data leakage 
X_train, X_test, y_train, y_test = train_test_split(data.drop('DEFAULT',axis=1),data['DEFAULT'],test_size=0.3,stratify=data['DEFAULT'])

In [16]:
X_train.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [17]:
#temporal features only
X_train_temp = X_train.drop(['LIMIT_BAL','SEX','EDUCATION','MARRIAGE','AGE'],axis=1)
X_train_temp

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
11018,0,0,0,0,0,0,156098,156259,155404,153234,156731,155818,6000,6000,5600,6000,6000,5500
1710,0,0,0,0,2,0,28383,32903,42283,51539,50583,52588,5000,10000,10000,0,2846,1936
4618,0,0,0,0,0,0,73722,75323,64277,45455,42231,42804,3300,2500,1556,1600,1600,1600
5482,0,0,-1,-1,-2,-2,7286,2160,780,0,0,0,1006,780,0,0,0,0
26187,-2,-2,-2,-2,-2,-2,0,2070,0,0,0,0,2070,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25780,0,0,-2,-2,-2,-2,10701,0,0,0,0,0,0,0,0,0,0,0
13921,0,0,0,0,0,2,112336,113351,115515,113948,122127,121962,4200,4100,4100,10000,4560,0
3794,0,0,0,0,0,0,75796,76004,67187,49924,33188,19826,3700,2023,2016,2000,1200,1000
27565,1,-2,-1,-1,-1,-2,0,0,860,246,-46,-46,0,860,246,0,0,0


In [18]:
#3 temporal features in reverse column order (April to September)
PAY = X_train_temp.iloc[:,5::-1]
BILL_AMT = X_train_temp.iloc[:,11:5:-1]
PAY_AMT = X_train_temp.iloc[:,:11:-1]

In [19]:
PAY = PAY.to_numpy()
BILL_AMT = BILL_AMT.to_numpy()
PAY_AMT = PAY_AMT.to_numpy()

In [20]:
#3D array with 21000 customers on y axis, 6 months on x axis, 3 features per month on z axis 
stacked = np.dstack((PAY,BILL_AMT,PAY_AMT))

In [21]:
stacked.shape

(21000, 6, 3)

In [22]:
y_train_array = y_train.to_numpy()

In [23]:
X_test_temp = X_test.drop(['LIMIT_BAL','SEX','EDUCATION','MARRIAGE','AGE'],axis=1)

In [24]:
X_test_temp

Unnamed: 0,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
8941,0,0,0,0,0,0,101832,92396,84297,49681,46224,44800,5000,4000,2029,2000,2500,1300
17618,-1,3,2,-1,0,0,4927,4693,3342,4844,3728,2612,0,0,4844,0,0,0
27895,0,0,0,0,0,0,167410,85536,77303,60178,56436,51773,70001,3769,2008,1950,2001,18000
16103,-1,4,3,2,2,2,3305,2870,2440,2510,2641,2222,0,0,500,400,0,36
4422,0,0,0,0,0,0,24147,25750,27012,27762,32126,37779,2000,2000,1500,5000,6400,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27933,0,0,0,0,0,0,177607,177498,176081,175540,142015,139162,6500,7000,7200,5018,5200,5300
26623,0,0,0,0,0,0,11669,12665,12905,13498,13993,13854,1502,1522,1505,1000,1000,1000
21750,0,0,-1,-1,-1,-1,6071,0,3308,2527,1076,1164,0,3308,2527,1079,1167,1974
14888,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
#numerical columns
numerical_cols = ['BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6',
'PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

In [None]:
#Scaling numerical features
scaler = RobustScaler()

# Fit and transform the scaler on the training data
X_train_scaled_encoded = copy.deepcopy(X_train_encoded)
X_train_scaled_encoded[numerical_cols] = scaler.fit_transform(X_train_encoded[numerical_cols])

# Transform the test data using the fitted scaler
X_test_scaled_encoded = copy.deepcopy(X_test_encoded)
X_test_scaled_encoded[numerical_cols] = scaler.transform(X_test_encoded[numerical_cols])

In [25]:
#3 temporal features in reverse column order (April to September)
PAY_test = X_test_temp.iloc[:,5::-1]
BILL_AMT_test = X_test_temp.iloc[:,11:5:-1]
PAY_AMT_test = X_test_temp.iloc[:,:11:-1]

In [26]:
PAY_test = PAY_test.to_numpy()
BILL_AMT_test = BILL_AMT_test.to_numpy()
PAY_AMT_test = PAY_AMT_test.to_numpy()

In [27]:
#3D array with 21000 customers on y axis, 6 months on x axis, 3 features per month on z axis 
stacked_test = np.dstack((PAY_test,BILL_AMT_test,PAY_AMT_test))

In [28]:
y_test_array = y_test.to_numpy()

SO X_train = stacked, X_test = stacked_test, y_train = y_train_array, y_test = y_test_array

In [29]:
num_time_steps, num_features = stacked.shape[1], stacked.shape[2]

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define the LSTM model
model = Sequential()
model.add(LSTM(64, input_shape=(num_time_steps, num_features)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 32

model.fit(stacked, y_train_array, epochs=epochs, batch_size=batch_size, validation_data=(stacked_test, y_test_array))

# Evaluate the model
loss, accuracy = model.evaluate(stacked_test, y_test_array)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.5156
Test Accuracy: 0.7789


In [44]:
# Make predictions on the test set
y_pred_probs = model.predict(stacked_test)
y_pred = (y_pred_probs>=0.5).astype(int)




In [47]:
print('Accuracy:', accuracy_score(y_test_array,y_pred))
print('Precision:', precision_score(y_test_array,y_pred))
print('Recall:', recall_score(y_test_array,y_pred))
print('F1 score:', f1_score(y_test_array,y_pred))

Accuracy: 0.7788888888888889
Precision: 0.6
Recall: 0.0015067805123053742
F1 score: 0.0030060120240480966
