# ***SETTING UP THE ENVIRONMENT***

In [30]:
import numpy as np
import pandas as pd
from google.colab import files
import zipfile
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ***STEP:1 _ LOADING THE DATASET***

In [None]:
uploaded = files.upload()

Saving credit_risk_dataset.zip to credit_risk_dataset.zip


UNZIPPING THE DATASET

In [None]:
with zipfile.ZipFile('credit_risk.zip', 'r') as zip_ref:
  zip_ref.extractall('credit_risk_data')

In [None]:
df = pd.read_csv('credit_risk_data/credit_data.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


# ***STEP:2 _ DATA PREPROCESSING***

(a) CHECKING THE DATATYPES AND MISSING VALUES

In [None]:
print(df.dtypes)

total_missing = df.isnull().sum()
print("\n\n",total_missing)

print("\n\n",total_missing[total_missing > 0].sort_values(ascending=False))

ID                              int64
LIMIT_BAL                     float64
SEX                             int64
EDUCATION                       int64
MARRIAGE                        int64
AGE                             int64
PAY_0                           int64
PAY_2                           int64
PAY_3                           int64
PAY_4                           int64
PAY_5                           int64
PAY_6                           int64
BILL_AMT1                     float64
BILL_AMT2                     float64
BILL_AMT3                     float64
BILL_AMT4                     float64
BILL_AMT5                     float64
BILL_AMT6                     float64
PAY_AMT1                      float64
PAY_AMT2                      float64
PAY_AMT3                      float64
PAY_AMT4                      float64
PAY_AMT5                      float64
PAY_AMT6                      float64
default.payment.next.month      int64
dtype: object


 ID                            0
L

`there are not any missing values and also all the columns are in numeric form`

(b) Dropping the Unnecessary columns

In [None]:
df = df.drop('ID', axis=1)

(c) Feature Scaling

In [None]:
bill_cols = ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]
pay_cols = ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]

scaler = StandardScaler()
df[bill_cols + pay_cols] = scaler.fit_transform(df[bill_cols + pay_cols])

In [None]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,-0.672497,-0.663059,-0.652724,-0.341942,-0.227086,-0.296801,-0.308063,-0.314136,-0.293382,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,-0.621636,-0.606229,-0.597966,-0.341942,-0.213588,-0.240005,-0.24423,-0.314136,-0.180878,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,-0.44973,-0.417188,-0.39163,-0.250292,-0.191887,-0.240005,-0.24423,-0.248683,-0.012122,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,-0.232373,-0.186729,-0.156579,-0.221191,-0.169361,-0.228645,-0.237846,-0.244166,-0.23713,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,-0.346997,-0.348137,-0.331482,-0.221191,1.335034,0.271165,0.266434,-0.269039,-0.255187,0


# ***STEP:3 _ FEATURE ENGINEERING***

(A) Total Bill Amount → Sum of all 6 bill amounts.

In [None]:
df["TOTAL_BILL_AMT"] = df[["BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6"]].sum(axis=1)

(b) Total Payment Amount → Sum of all 6 payment amounts.

In [None]:
df["TOTAL_PAY_AMT"] = df[["PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6"]].sum(axis=1)

(C) Payment to Bill Ratio → How much the customer has paid relative to total bills.

In [None]:
df["PAY_BILL_RATIO"] = df["TOTAL_PAY_AMT"] / (df["TOTAL_BILL_AMT"] + 1)

(D) Average Delay in Payments → Mean of the PAY_0 to PAY_6 columns

In [None]:
pay_status_cols = ["PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6"]

df["AVG_PAY_DELAY"] = df[pay_status_cols].mean(axis=1)

In [None]:
df = df.drop(columns=["BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6",
                      "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6",
                      "PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6"])


In [23]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,default.payment.next.month,TOTAL_BILL_AMT,TOTAL_PAY_AMT,PAY_BILL_RATIO,AVG_PAY_DELAY
0,20000.0,2,2,1,24,1,-3.946174,-1.781409,0.604652,-0.333333
1,120000.0,2,2,2,26,1,-3.791051,-1.534778,0.549892,0.5
2,90000.0,2,2,2,34,0,-2.533414,-1.187218,0.774231,0.0
3,50000.0,2,2,1,37,0,-0.613618,-1.33834,-3.463772,0.0
4,50000.0,1,2,1,57,0,-2.377741,1.127217,-0.818163,-0.333333


# ***STEP:4 _ SPLITTING THE DATASET***

In [26]:
X = df.drop("default.payment.next.month", axis=1)
y = df["default.payment.next.month"]

In [29]:
print("X shape : ", X.shape)
print("y shape : ", y.shape)

print("\nFEATURE COLUMNS : ", X.columns.tolist())

X shape :  (30000, 9)
y shape :  (30000,)

FEATURE COLUMNS :  ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'TOTAL_BILL_AMT', 'TOTAL_PAY_AMT', 'PAY_BILL_RATIO', 'AVG_PAY_DELAY']


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y , test_size = 0.2, random_state=42, stratify=y
)

print("Train features shape:", X_train.shape)
print("\nTest features shape:", X_test.shape)
print("\nTrain target shape:", y_train.shape)
print("\nTest target shape:", y_test.shape)

Train features shape: (24000, 9)

Test features shape: (6000, 9)

Train target shape: (24000,)

Test target shape: (6000,)
