## Default of Credit Card Clients

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
address = '/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
df_original = pd.read_csv(address)

In [2]:
df_original.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')

In [3]:
#df_original = pd.read_excel("input/default of credit card clients.xls",header=1)
df_original =df_original.rename({'PAY_0' : 'PAY_1','default.payment.next.month':'default'},axis=1)
df = df_original.copy()
N_labels = [column for column in df_original.columns if column !='default']
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_index , test_index in split.split(df,df.iloc[:,[2,-1]]):
    df_train, label_train= df[N_labels].loc[train_index],df.default[train_index]
    df_test, label_test= df[N_labels].loc[test_index], df.default[test_index]
    

In [4]:
from sklearn.base import TransformerMixin, BaseEstimator
class default_transfomer(BaseEstimator,TransformerMixin):
    def __init__(self):
        self.columns=['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 
       'USAGE_1', 'USAGE_2', 'USAGE_3', 'USAGE_4', 'USAGE_5',
       'DIFF_0', 'DIFF_1', 'DIFF_2', 'DIFF_3',
       'DIFF_4', 'log_LIMIT_BAL', 'log_BILL_AMT1', 'log_BILL_AMT2',
       'log_BILL_AMT3', 'log_BILL_AMT4', 'log_BILL_AMT5', 'log_BILL_AMT6',
       'log_PAY_AMT1', 'log_PAY_AMT2', 'log_PAY_AMT3', 'log_PAY_AMT4',
       'log_PAY_AMT5', 'log_PAY_AMT6', 'log_USAGE_1', 'log_USAGE_2',
       'log_USAGE_3', 'log_USAGE_4', 'log_USAGE_5', 'log_DIFF_0',
       'log_DIFF_1', 'log_DIFF_2', 'log_DIFF_3',
       'log_DIFF_4']
        self.log_columns=['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6','log_LIMIT_BAL',
       'log_BILL_AMT1','log_BILL_AMT2', 'log_BILL_AMT3', 'log_BILL_AMT4', 'log_BILL_AMT5',
       'log_BILL_AMT6', 'log_PAY_AMT1', 'log_PAY_AMT2', 'log_PAY_AMT3',
       'log_PAY_AMT4', 'log_PAY_AMT5', 'log_PAY_AMT6', 'log_USAGE_1',
       'log_USAGE_2', 'log_USAGE_3', 'log_USAGE_4', 'log_USAGE_5',
       'log_DIFF_0', 'log_DIFF_1', 'log_DIFF_2', 'log_DIFF_3', 'log_DIFF_4']
    
    def fit(self):
        pass
    def log_pre (self,x):
        x=x*2
        return np.log(1+abs(x))*np.sign(x)
    def log_pre_col (self,column):
        return column.map(lambda x:self.log_pre(x))
    
    def transform(self,df,y=None):
        #Usage
        df_usage = pd.DataFrame()
        df_usage['ID']=df.ID
        BILL = [column for column in df.columns if 'BILL' in column]
        PAY = [column for column in df.columns if 'PAY_AMT' in column]
        for i in range(5):
            df_usage[f"USAGE_{i+1}"] = df[BILL[i]]-(df[BILL[i+1]]-df[PAY[i]])
        #difference
        df_difference =pd.DataFrame()
        df_difference['ID'] = df.ID
        for i in range(5):
            df_difference[f"DIFF_{i}"] = df[BILL[i+1]]-df[PAY[i]]
        USAGE = [column for column in df_usage.columns if column !='ID']
        DIFF = [column for column in df_difference.columns if column !='ID']
        LIM =['LIMIT_BAL']
        ## Logarithmic Scaling attribute 
        df = pd.concat([df,df_usage.iloc[:,1:],df_difference.iloc[:,1:]],axis=1)
        df_log =pd.concat([df.ID,df[LIM+BILL+PAY+USAGE+DIFF].apply(self.log_pre_col,axis=0)],axis=1) #apply(function,axis=) map of those who use index/column as a index
        rename_dict = {}
        for i in df_log.iloc[:,1:].columns:
            rename_dict[i] = 'log_'+i
        df_log =df_log.rename(rename_dict,axis=1)
        self.BILL = BILL
        self.PAY =PAY
        self.DIFF = DIFF
        self.USAGE =USAGE
        return pd.concat([df,df_log[[column for column in df_log.columns if column !='ID']]],axis=1)

In [5]:
NAT = default_transfomer()

In [6]:
train_data = NAT.transform(df_train)

In [7]:
train_data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,log_USAGE_1,log_USAGE_2,log_USAGE_3,log_USAGE_4,log_USAGE_5,log_DIFF_0,log_DIFF_1,log_DIFF_2,log_DIFF_3,log_DIFF_4
20349,20350,80000.0,1,1,2,25,2,2,-2,-2,...,11.970357,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6983,6984,300000.0,2,1,1,51,-1,-1,-1,-1,...,6.440947,7.415175,6.111467,0.000000,0.000000,-1.609438,0.000000,0.000000,0.000000,0.000000
17548,17549,60000.0,2,2,1,48,0,0,0,0,...,8.203304,7.788626,7.983781,11.169519,7.164720,11.492325,11.508405,11.521062,10.394610,10.391791
954,955,110000.0,2,2,1,46,-1,-1,-1,-1,...,8.480737,8.367068,8.371705,8.142936,9.138092,0.000000,0.000000,0.000000,0.000000,12.235054
12866,12867,20000.0,2,2,2,25,0,0,0,0,...,8.260234,8.871084,8.826294,8.176673,9.253687,10.505752,10.378323,10.225100,10.124669,10.185240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22066,22067,160000.0,2,1,2,26,-1,-1,-1,0,...,8.579041,8.951440,8.710290,9.095939,8.579041,0.000000,0.000000,9.639196,9.040382,8.107419
19277,19278,80000.0,2,2,1,41,-1,-1,-1,-1,...,8.604288,8.119399,7.032624,8.819813,0.000000,0.000000,0.000000,0.000000,11.666187,11.666187
23975,23976,20000.0,1,1,2,25,0,0,0,0,...,10.468602,8.529319,8.689633,6.670766,8.865170,9.809012,9.650980,9.517604,9.729670,9.369820
12381,12382,500000.0,2,1,2,35,1,-1,-1,-1,...,0.000000,7.158514,11.309915,12.159730,8.502891,0.000000,-1.609438,-9.803833,-11.562411,0.000000


In [8]:
train_data.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'USAGE_1',
       'USAGE_2', 'USAGE_3', 'USAGE_4', 'USAGE_5', 'DIFF_0', 'DIFF_1',
       'DIFF_2', 'DIFF_3', 'DIFF_4', 'log_LIMIT_BAL', 'log_BILL_AMT1',
       'log_BILL_AMT2', 'log_BILL_AMT3', 'log_BILL_AMT4', 'log_BILL_AMT5',
       'log_BILL_AMT6', 'log_PAY_AMT1', 'log_PAY_AMT2', 'log_PAY_AMT3',
       'log_PAY_AMT4', 'log_PAY_AMT5', 'log_PAY_AMT6', 'log_USAGE_1',
       'log_USAGE_2', 'log_USAGE_3', 'log_USAGE_4', 'log_USAGE_5',
       'log_DIFF_0', 'log_DIFF_1', 'log_DIFF_2', 'log_DIFF_3', 'log_DIFF_4'],
      dtype='object')

In [9]:
#print(pd.concat([train_data.SEX,label_train],axis=1))

In [10]:
scaler = MinMaxScaler()
ATs = [column for column in NAT.log_columns if column !='ID' if column !='LIMIT_BAL']
scaler.fit(train_data[ATs])
train_data_scaled = scaler.transform(train_data[ATs])
for train_index , test_index in split.split(train_data_scaled,pd.concat([train_data.SEX,label_train],axis=1)):
    X_train, y_train= train_data_scaled[train_index],label_train.iloc[train_index]
    X_test, y_test = train_data_scaled[test_index], label_train.iloc[test_index]

In [11]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression

In [12]:
clf = LogisticRegression(
                        max_iter=100000,random_state=42)
clf.fit(X_train,y_train)
print((clf.predict(X_train)==y_train).mean())
print((clf.predict(X_test)==y_test).mean())

0.8064
0.8069333333333333


In [13]:
for penalty in ['l1', 'l2', 'elasticnet', 'none']:
    try:
        print(penalty)
        clf = LogisticRegressionCV(
                                Cs=20,
                                cv=5,
                                penalty=penalty,
                                max_iter=800,
                                random_state=42)
        clf.fit(X_train,y_train)
        print((clf.predict(X_train)==y_train).mean())
        print((clf.predict(X_test)==y_test).mean())
    except:
        print(f'error for {penalty}')

l1
error for l1
l2
0.8079407407407407
0.8064
elasticnet
error for elasticnet
none
error for none


In [14]:
for solver in {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}:
    try:
        print(solver)
        clf = LogisticRegressionCV(
                                Cs=20,
                                cv=5,
                                solver=solver,
                                max_iter=800,
                                random_state=42)
        clf.fit(X_train,y_train)
        print((clf.predict(X_train)==y_train).mean())
        print((clf.predict(X_test)==y_test).mean())
    except:

        print(f"error for {solver}")

sag
0.8083555555555556
0.8064
newton-cholesky
error for newton-cholesky
liblinear
0.8078814814814815
0.8071111111111111
lbfgs
0.8079407407407407
0.8064
saga
0.8082962962962963
0.8064
newton-cg
0.8082370370370371
0.8064


In [15]:
scaler = MinMaxScaler()
ATs = [column for column in NAT.columns if column !='ID' ]
scaler.fit(train_data[ATs])
train_data_scaled = scaler.transform(train_data[ATs])
for train_index , test_index in split.split(train_data_scaled,pd.concat([train_data.SEX,label_train],axis=1)):
    X_train, y_train= train_data_scaled[train_index],label_train.iloc[train_index]
    X_test, y_test = train_data_scaled[test_index], label_train.iloc[test_index]

In [16]:

clf = LogisticRegression(
                        max_iter=100000,random_state=42)
clf.fit(X_train,y_train)
print((clf.predict(X_train)==y_train).mean())
print((clf.predict(X_test)==y_test).mean())

0.8086518518518518
0.8092444444444444


In [17]:
for penalty in ['l1', 'l2', 'elasticnet', 'none']:
    try:
        print(penalty)
        clf = LogisticRegressionCV(
                                Cs=20,
                                cv=5,
                                penalty=penalty,
                                max_iter=800,
                                random_state=42)
        clf.fit(X_train,y_train)
        print((clf.predict(X_train)==y_train).mean())
        print((clf.predict(X_test)==y_test).mean())
    except:
        print(f'error for {penalty}')

l1
error for l1
l2
0.809362962962963
0.8090666666666667
elasticnet
error for elasticnet
none
error for none


In [18]:
for solver in {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}:
    try:
        print(solver)
        clf = LogisticRegressionCV(
                                Cs=20,
                                cv=5,
                                solver=solver,
                                max_iter=800,
                                random_state=42)
        clf.fit(X_train,y_train)
        print((clf.predict(X_train)==y_train).mean())
        print((clf.predict(X_test)==y_test).mean())
    except:

        print(f"error for {solver}")

sag
0.8094814814814815
0.8092444444444444
newton-cholesky
error for newton-cholesky
liblinear
0.809362962962963
0.8090666666666667
lbfgs
0.809362962962963
0.8090666666666667
saga
0.8094814814814815
0.8092444444444444
newton-cg
0.8094814814814815
0.8092444444444444
