In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Install a blanket "ignore" filter: no message/category/module restriction is
# given, so every warning raised after this point in the session is suppressed.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning output
# from later cells — consider narrowing it to the specific warnings intended.
warnings.filterwarnings("ignore")

In [2]:
# Load the labelled dataset from disk, then display its (rows, columns) shape.
label_csv = "../data/label_data.csv"
data = pd.read_csv(label_csv)
data.shape

(41734, 26)

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Preview the first five rows to sanity-check column names and value formats.
data.head(n=5)

Unnamed: 0,credit_amt,sex,education,marital_status,age,repay_sep,repay_aug,repay_jul,repay_jun,repay_may,...,state_apr,paid_sep,paid_aug,paid_jul,paid_jun,paid_may,paid_apr,repay_hist,due,default
0,20000,female,university,married,24,2,2,-1,-1,-2,...,0,0,689,0,0,0,0,-2,7015,yes
1,120000,female,university,single,26,-1,2,0,0,0,...,3261,0,1000,1000,1000,0,2000,3,12077,yes
2,90000,female,university,single,34,0,0,0,0,0,...,15549,1518,1500,1000,1000,1000,5000,0,90635,no
3,50000,female,university,married,37,0,0,0,0,0,...,29547,2000,2019,1200,1100,1069,1000,0,222946,no
4,50000,male,university,married,57,-1,0,-1,0,0,...,19131,2000,36681,10000,9000,689,679,-2,50290,no


In [5]:
# Distinct codes present in the `repay_apr` column, in order of first appearance.
data['repay_apr'].unique()

array([-2,  2,  0, -1,  3,  6,  4,  7,  8,  5,  1], dtype=int64)

In [6]:
# Frequency table for each categorical column and for the target,
# displayed together as a single tuple.
tuple(
    data[column].value_counts()
    for column in ('education', 'sex', 'marital_status', 'default')
)

(education
 university         20302
 graduate_school    15772
 high_school         5269
 others               391
 Name: count, dtype: int64,
 sex
 female    21391
 male      20343
 Name: count, dtype: int64,
 marital_status
 married    22780
 single     18592
 others       362
 Name: count, dtype: int64,
 default
 yes    20867
 no     20867
 Name: count, dtype: int64)

In [7]:
# Partition the raw frame into feature groups plus the target column.
demographic_columns = ['credit_amt', 'sex', 'education', 'marital_status', 'age']
demo_data = data[demographic_columns]

# Label-based slices include both endpoints; which columns fall inside each
# span depends on the column order of `data`.
repay_status_span = slice('repay_sep', 'repay_apr')
financial_span = slice('state_sep', 'paid_apr')
repay_status_data = data.loc[:, repay_status_span]
financial_data = data.loc[:, financial_span]

label = data['default']

In [8]:
# One-hot encode the three categorical demographic columns as int8 indicators;
# the remaining columns of `demo_data` are carried over as they are.
categorical_columns = ['sex', 'education', 'marital_status']
demo_data_encoded = pd.get_dummies(
    data=demo_data,
    columns=categorical_columns,
    dtype='int8',
)

In [9]:
# Encode the target as integers: 'yes' -> 1, 'no' -> 0.
# The result is derived from `data['default']` with `map` instead of an
# in-place `replace` on a Series taken out of `data`, so that:
#   * re-running this cell always gives the same result and never mutates `data`;
#   * the integer dtype is explicit rather than relying on `replace` silently
#     downcasting object -> int, which pandas 2.2 deprecates (the FutureWarning
#     is hidden in this notebook by the global "ignore" warnings filter).
label = data['default'].map({'yes': 1, 'no': 0})
# `map` turns any value missing from the mapping into NaN; fail loudly here
# rather than carry missing labels into the train/test split.
assert label.notna().all(), "unexpected value in 'default' column (expected 'yes'/'no')"
label = label.astype('int64')

In [10]:
# Remove one indicator column per one-hot group (presumably to avoid perfectly
# collinear dummies — the dropped level becomes the implicit baseline).
#
# The frame is rebound instead of mutated with `inplace=True`, and
# `errors='ignore'` makes the cell safe to re-run: the original raised KeyError
# on a second execution because the columns had already been removed.
# Trade-off: a misspelled name in `reference_levels` is skipped silently, so
# check the column listing displayed below.
reference_levels = ['sex_female', 'education_others', 'marital_status_others']
demo_data_encoded = demo_data_encoded.drop(
    columns=reference_levels, errors='ignore')
demo_data_encoded.columns

Index(['credit_amt', 'age', 'sex_male', 'education_graduate_school',
       'education_high_school', 'education_university',
       'marital_status_married', 'marital_status_single'],
      dtype='object')

In [11]:
# Stitch the feature groups and the target back together side by side
# (rows are aligned on the index), then display the resulting shape.
model_frame_parts = [demo_data_encoded, repay_status_data, financial_data, label]
final_data = pd.concat(model_frame_parts, axis='columns')
final_data.shape

(41734, 27)

In [12]:
# Persist the assembled table with a header row and without the row index.
final_data_csv = "../data/final_data.csv"
final_data.to_csv(final_data_csv, header=True, index=False)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Separate the target column (y) from the predictor columns (X).
target_column = 'default'
y = final_data[target_column]
X = final_data.drop(columns=[target_column])

In [15]:
# Hold out 30% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
split_options = dict(test_size=0.3, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [16]:
# Class counts in the training labels — a check on the stratified split.
train_class_counts = y_train.value_counts()
train_class_counts

default
1    14607
0    14606
Name: count, dtype: int64

In [17]:
# Re-attach each target Series to its feature matrix so that every split is a
# single table with the target as the last column.
train_data, test_data = (
    pd.concat([features, target], axis='columns')
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [18]:
# Write both splits to CSV with a header row and without the row index.
split_outputs = (
    (train_data, "../data/train.csv"),
    (test_data, "../data/test.csv"),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False, header=True)