In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,f1_score,precision_score

In [3]:
# Load the credit-risk dataset from the CSV file one directory above the notebook.
df = pd.read_csv("../credit_risk_data.csv")

In [5]:
# Preview the first five rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,LIMIT_BAL,AGE,SEX,EDUCATION,MARRIAGE,UTILIZATION_RATE,PAY_DELAY_MONTHS,default
0,174835.707651,60,1,1,1,0.086712,1,0
1,143086.784941,21,2,1,1,0.17823,1,1
2,182384.426905,24,1,1,2,0.35853,0,1
3,226151.49282,50,1,1,2,0.523811,0,0
4,138292.331264,32,2,3,3,0.322966,3,1


In [7]:
# Show the dtype pandas inferred for each column.
df.dtypes

LIMIT_BAL           float64
AGE                   int64
SEX                   int64
EDUCATION             int64
MARRIAGE              int64
UTILIZATION_RATE    float64
PAY_DELAY_MONTHS      int64
default               int64
dtype: object

In [9]:
# Print row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   LIMIT_BAL         5000 non-null   float64
 1   AGE               5000 non-null   int64  
 2   SEX               5000 non-null   int64  
 3   EDUCATION         5000 non-null   int64  
 4   MARRIAGE          5000 non-null   int64  
 5   UTILIZATION_RATE  5000 non-null   float64
 6   PAY_DELAY_MONTHS  5000 non-null   int64  
 7   default           5000 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 312.6 KB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,LIMIT_BAL,AGE,SEX,EDUCATION,MARRIAGE,UTILIZATION_RATE,PAY_DELAY_MONTHS,default
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,150352.628895,44.8214,1.399,1.9192,1.6718,0.285483,1.21,0.5474
std,49620.509995,14.06722,0.489742,0.885228,0.66013,0.157736,1.093499,0.497798
min,20000.0,21.0,1.0,1.0,1.0,0.000723,0.0,0.0
25%,117104.748117,33.0,1.0,1.0,1.0,0.162971,0.0,0.0
50%,150673.279595,45.0,1.0,2.0,2.0,0.267933,1.0,1.0
75%,183300.530353,57.0,2.0,3.0,2.0,0.389612,2.0,1.0
max,346311.885322,69.0,2.0,4.0,3.0,0.835919,8.0,1.0


In [15]:
# Class balance of the target: normalize=True returns proportions instead of raw counts.
df["default"].value_counts(normalize=True)

default
1    0.5474
0    0.4526
Name: proportion, dtype: float64

In [17]:
# Keep LIMIT_BAL at two decimal places; the raw values carry many more digits.
df["LIMIT_BAL"] = df["LIMIT_BAL"].round(2)

In [19]:
# Re-display the first rows to confirm LIMIT_BAL now shows two decimal places.
df.head()

Unnamed: 0,LIMIT_BAL,AGE,SEX,EDUCATION,MARRIAGE,UTILIZATION_RATE,PAY_DELAY_MONTHS,default
0,174835.71,60,1,1,1,0.086712,1,0
1,143086.78,21,2,1,1,0.17823,1,1
2,182384.43,24,1,1,2,0.35853,0,1
3,226151.49,50,1,1,2,0.523811,0,0
4,138292.33,32,2,3,3,0.322966,3,1


In [21]:
categorical_cols = ["SEX", "EDUCATION", "MARRIAGE"]

# These columns are loaded as int64 codes; cast them all to pandas' category
# dtype in a single astype call using a column -> dtype mapping.
df = df.astype({name: "category" for name in categorical_cols})


In [23]:
# Separate the target column from the predictor columns.
y = df["default"]
X = df.drop(columns="default")

# Hold out 25% of the rows for testing. The split is stratified on the target
# so both partitions keep the same class proportions, and the fixed seed makes
# it reproducible across runs.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [27]:
# Re-attach the target as the last column of each partition. assign() returns
# a new frame, so X_train / X_test themselves are left untouched.
train_df = X_train.assign(default=y_train.values)
test_df = X_test.assign(default=y_test.values)

# Persist both partitions next to the source data, without the row index.
# NOTE: CSV stores no dtype information, so the category dtype applied to
# SEX / EDUCATION / MARRIAGE must be re-applied when these files are read back.
train_df.to_csv("../train.csv", index=False)
test_df.to_csv("../test.csv", index=False)


In [29]:
# Display the training partition; the rendering is truncated to the first and last rows.
train_df

Unnamed: 0,LIMIT_BAL,AGE,SEX,EDUCATION,MARRIAGE,UTILIZATION_RATE,PAY_DELAY_MONTHS,default
1981,81019.09,25,1,2,2,0.117058,3,1
410,185580.74,40,2,1,1,0.664605,2,1
1641,220611.03,39,2,2,1,0.148998,1,0
2390,57690.61,33,1,1,2,0.075212,2,0
287,179415.86,27,2,1,2,0.280862,1,0
...,...,...,...,...,...,...,...,...
2033,175273.51,33,2,1,1,0.134559,2,0
2874,148447.05,40,2,1,2,0.268539,0,1
4534,187122.95,51,2,1,1,0.274860,4,0
696,166306.65,68,2,3,2,0.109983,4,1


In [31]:
# Display the test partition; the rendering is truncated to the first and last rows.
test_df

Unnamed: 0,LIMIT_BAL,AGE,SEX,EDUCATION,MARRIAGE,UTILIZATION_RATE,PAY_DELAY_MONTHS,default
465,138951.52,35,2,2,1,0.038645,0,0
4481,164290.16,68,2,3,1,0.441526,0,1
2287,137423.06,51,1,2,2,0.116202,0,0
3434,263621.74,48,1,3,1,0.614774,1,0
554,166880.13,62,2,2,1,0.082229,0,0
...,...,...,...,...,...,...,...,...
3907,79659.25,52,2,2,2,0.407685,0,0
436,231430.78,68,2,1,2,0.464276,0,1
945,96495.76,42,1,2,2,0.229093,0,0
3947,171409.31,45,1,2,3,0.214445,0,0


In [33]:
# Report (rows, columns) for each partition as a quick check on the split sizes.
for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)


Train shape: (3750, 8)
Test shape: (1250, 8)
