In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [11]:
# Read the raw credit-card data and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column (presumably a saved
# index); this cell keeps it — confirm where it is removed.
df = pd.read_csv(filepath_or_buffer='credit_card.csv')
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [12]:
# Drop the 'ID' column.
# errors='ignore' keeps the cell idempotent: re-running it once 'ID' is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [13]:
# Decode the numeric SEX codes into readable labels.
# The dtype guard makes the cell safe to re-run: mapping a column that has
# already been decoded to strings would silently turn every value into NaN.
SEX_LABELS = {1: 'Male', 2: 'Female'}
if pd.api.types.is_numeric_dtype(df['SEX']):
    df['SEX'] = df['SEX'].map(SEX_LABELS)

In [14]:
# Decode the numeric EDUCATION codes; codes 0, 4, 5 and 6 are all folded
# into the single 'Others' label.
# The dtype guard makes the cell safe to re-run: mapping a column that has
# already been decoded to strings would silently turn every value into NaN.
EDUCATION_LABELS = {
    1: 'GraduateSchool',
    2: 'University',
    3: 'HighSchool',
    4: 'Others',
    0: 'Others',
    5: 'Others',
    6: 'Others',
}
if pd.api.types.is_numeric_dtype(df['EDUCATION']):
    df['EDUCATION'] = df['EDUCATION'].map(EDUCATION_LABELS)

In [15]:
# Decode the numeric MARRIAGE codes; codes 0 and 3 are both labelled 'Others'.
# The dtype guard makes the cell safe to re-run: mapping a column that has
# already been decoded to strings would silently turn every value into NaN.
MARRIAGE_LABELS = {1: 'Married', 2: 'Single', 3: 'Others', 0: 'Others'}
if pd.api.types.is_numeric_dtype(df['MARRIAGE']):
    df['MARRIAGE'] = df['MARRIAGE'].map(MARRIAGE_LABELS)

# Statistical Test

Since our dataset poses a **binary classification** problem, the baseline model should be **Logistic Regression**. Before proceeding to model building, we need to check the statistical significance of each independent variable with respect to the target.

In [17]:
from scipy.stats import ttest_ind,ttest_1samp,ttest_rel,shapiro,levene,mannwhitneyu,wilcoxon,anderson,chi2_contingency,chisquare

Before running any statistical test we need to check the following assumptions:
1. Normality
2. Equality of variance
3. The samples are randomly drawn (we assume this holds).

If any of these assumptions is violated we use a **non-parametric test**; otherwise we use a **parametric test**.

**Checking Normality**

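Because we have more than 5000 rows, the **Anderson-Darling test** is the more reliable normality check here: the p-value of the Shapiro-Wilk test may be inaccurate for samples larger than 5000.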

**Ho: Data is normally distributed.**

**H1: Data is not normally distributed.**

In [18]:
# Display the column labels currently present in df.
df.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'DEFAULT'],
      dtype='object')

In [19]:
# Monetary features to check for normality with the Anderson-Darling test.
columns = ['LIMIT_BAL','BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6',
           'PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']

for i in columns:
    print('For', i)
    # Read the result by attribute name rather than by tuple position.
    result = anderson(df[i])
    print('Statistic: %.3f' % result.statistic)
    # Each significance level comes with its own critical value; H0
    # (normality) is rejected when the statistic reaches that critical value.
    for level, critical in zip(result.significance_level, result.critical_values):
        if result.statistic < critical:
            verdict = 'looks normal (fail to reject H0)'
        else:
            verdict = 'does not look normal (reject H0)'
        print('For significance level = {a}, critical value = {b} {c} {d}'.format(
            a=level, b=critical, c=i, d=verdict))
    print('\n')

For LIMIT_BAL
Statistic: 774.381
For significance level = 15.0, critical value = 0.576 LIMIT_BAL does not looks normal (reject H0)
For significance level = 10.0, critical value = 0.656 LIMIT_BAL does not looks normal (reject H0)
For significance level = 5.0, critical value = 0.787 LIMIT_BAL does not looks normal (reject H0)
For significance level = 2.5, critical value = 0.918 LIMIT_BAL does not looks normal (reject H0)
For significance level = 1.0, critical value = 1.092 LIMIT_BAL does not looks normal (reject H0)


For BILL_AMT1
Statistic: 2678.700
For significance level = 15.0, critical value = 0.576 BILL_AMT1 does not looks normal (reject H0)
For significance level = 10.0, critical value = 0.656 BILL_AMT1 does not looks normal (reject H0)
For significance level = 5.0, critical value = 0.787 BILL_AMT1 does not looks normal (reject H0)
For significance level = 2.5, critical value = 0.918 BILL_AMT1 does not looks normal (reject H0)
For significance level = 1.0, critical value = 1.092 B

As the output shows, none of the tested continuous features is normally distributed, so we use **non-parametric tests**.

In [20]:
# Split AGE into two samples by the DEFAULT flag (1 vs 0).
is_defaulter = df['DEFAULT'].eq(1)
is_non_defaulter = df['DEFAULT'].eq(0)
def_age = df.loc[is_defaulter, 'AGE']
non_def_age = df.loc[is_non_defaulter, 'AGE']

In [21]:
# Mann-Whitney U test, H0: AGE has the same distribution in both groups.
# The alternative is stated explicitly because older SciPy releases default
# to a deprecated mode that reports half of the two-sided p-value, which
# makes the result depend on the installed version.
mannwhitneyu(def_age, non_def_age, alternative='two-sided')

MannwhitneyuResult(statistic=76966879.5, pvalue=0.1862517717449308)

In [22]:
# Split LIMIT_BAL into two samples by the DEFAULT flag (1 vs 0).
is_defaulter = df['DEFAULT'].eq(1)
is_non_defaulter = df['DEFAULT'].eq(0)
lb_def = df.loc[is_defaulter, 'LIMIT_BAL']
lb_non_def = df.loc[is_non_defaulter, 'LIMIT_BAL']

In [23]:
# Mann-Whitney U test, H0: LIMIT_BAL has the same distribution in both groups.
# The alternative is stated explicitly because older SciPy releases default
# to a deprecated mode that reports half of the two-sided p-value, which
# makes the result depend on the installed version.
mannwhitneyu(lb_def, lb_non_def, alternative='two-sided')

MannwhitneyuResult(statistic=59257217.5, pvalue=6.127742909111652e-190)

In [24]:
# Contingency table: DEFAULT on the rows, SEX on the columns.
tab_sex = pd.crosstab(index=df['DEFAULT'], columns=df['SEX'])
tab_sex

SEX,Female,Male
DEFAULT,Unnamed: 1_level_1,Unnamed: 2_level_1
0,14349,9015
1,3763,2873


In [25]:
# Chi-square test of independence between DEFAULT and SEX.
# correction=True is SciPy's default; it is spelled out here because the
# continuity correction only takes effect on tables with one degree of freedom.
chi2_contingency(observed=tab_sex, correction=True)

(47.708796890621116,
 4.944678999412026e-12,
 1,
 array([[14105.6256,  9258.3744],
        [ 4006.3744,  2629.6256]]))

In [26]:
# Contingency table: DEFAULT on the rows, EDUCATION on the columns.
tab_edu = pd.crosstab(index=df['DEFAULT'], columns=df['EDUCATION'])
tab_edu

EDUCATION,GraduateSchool,HighSchool,Others,University
DEFAULT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,8549,3680,435,10700
1,2036,1237,33,3330


In [27]:
# Chi-square test of independence between DEFAULT and EDUCATION.
chi2_contingency(observed=tab_edu)

(160.4099510722455,
 1.495064564810615e-34,
 3,
 array([[ 8243.598 ,  3829.3596,   364.4784, 10926.564 ],
        [ 2341.402 ,  1087.6404,   103.5216,  3103.436 ]]))

In [28]:
# Contingency table: DEFAULT on the rows, MARRIAGE on the columns.
tab_mar = pd.crosstab(index=df['DEFAULT'], columns=df['MARRIAGE'])
tab_mar

MARRIAGE,Married,Others,Single
DEFAULT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,10453,288,12623
1,3206,89,3341


In [29]:
# Chi-square test of independence between DEFAULT and MARRIAGE.
chi2_contingency(observed=tab_mar)

(28.130324644821993,
 7.7907203642028e-07,
 2,
 array([[10637.6292,   293.6076, 12432.7632],
        [ 3021.3708,    83.3924,  3531.2368]]))

In [30]:
# Chi-square test of independence between DEFAULT and each PAY_* column,
# reporting the decision at the 0.05 level.
repay = ['PAY_1','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']
for pay_col in repay:
    tab_pay = pd.crosstab(index=df['DEFAULT'], columns=df[pay_col])
    statistic, p_value, dof, expected = chi2_contingency(tab_pay)
    # Pick the message template first, then fill it in once.
    template = (
        'For {a}, p-value = {b} (Fail to reject Ho)'
        if p_value > 0.05
        else "For {a}, p-value = {b} (Reject Ho)"
    )
    print(template.format(a=pay_col, b=p_value))

For PAY_1, p-value = 0.0 (Reject Ho)
For PAY_2, p-value = 0.0 (Reject Ho)
For PAY_3, p-value = 0.0 (Reject Ho)
For PAY_4, p-value = 0.0 (Reject Ho)
For PAY_5, p-value = 0.0 (Reject Ho)
For PAY_6, p-value = 0.0 (Reject Ho)


In [31]:
# Mann-Whitney U test for each BILL_AMT* column, H0: the column has the same
# distribution for DEFAULT == 1 and DEFAULT == 0; decision at the 0.05 level.
bill_amt = ['BILL_AMT1', 'BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']

# The group masks do not depend on the column, so build them once.
is_defaulter = df['DEFAULT'] == 1
is_non_defaulter = df['DEFAULT'] == 0

for i in bill_amt:
    def_bm = df.loc[is_defaulter, i]
    non_def_bm = df.loc[is_non_defaulter, i]
    # State the alternative explicitly: older SciPy releases default to a
    # deprecated mode that reports half of the two-sided p-value, which can
    # flip borderline decisions at the 0.05 threshold.
    stat, p = mannwhitneyu(def_bm, non_def_bm, alternative='two-sided')
    if p > 0.05:
        print('For {a}, p-value = {b} (Fail to reject Ho)'.format(a=i, b=p))
    else:
        print("For {a}, p-value = {b} (Reject Ho)".format(a=i, b=p))

For BILL_AMT1, p-value = 5.755190912649123e-06 (Reject Ho)
For BILL_AMT2, p-value = 0.0035306025227855456 (Reject Ho)
For BILL_AMT3, p-value = 0.014101350680999257 (Reject Ho)
For BILL_AMT4, p-value = 0.07388390926897154 (Fail to reject Ho)
For BILL_AMT5, p-value = 0.11768351130808502 (Fail to reject Ho)
For BILL_AMT6, p-value = 0.4947404201391831 (Fail to reject Ho)


In [32]:
# Mann-Whitney U test for each PAY_AMT* column, H0: the column has the same
# distribution for DEFAULT == 1 and DEFAULT == 0; decision at the 0.05 level.
pay_amt = ['PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

# The group masks do not depend on the column, so build them once.
is_defaulter = df['DEFAULT'] == 1
is_non_defaulter = df['DEFAULT'] == 0

for i in pay_amt:
    def_pa = df.loc[is_defaulter, i]
    non_def_pa = df.loc[is_non_defaulter, i]
    # State the alternative explicitly: older SciPy releases default to a
    # deprecated mode that reports half of the two-sided p-value, which can
    # flip borderline decisions at the 0.05 threshold.
    stat, p = mannwhitneyu(def_pa, non_def_pa, alternative='two-sided')
    if p > 0.05:
        print('For {a}, p-value = {b} (Fail to reject Ho)'.format(a=i, b=p))
    else:
        print("For {a}, p-value = {b} (Reject Ho)".format(a=i, b=p))

For PAY_AMT1, p-value = 2.3083295995929906e-170 (Reject Ho)
For PAY_AMT2, p-value = 4.977487984626587e-151 (Reject Ho)
For PAY_AMT3, p-value = 4.4963542462153786e-129 (Reject Ho)
For PAY_AMT4, p-value = 3.642347332534603e-109 (Reject Ho)
For PAY_AMT5, p-value = 5.624349416331451e-91 (Reject Ho)
For PAY_AMT6, p-value = 1.5920582681736708e-98 (Reject Ho)
