In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy.stats import spearmanr
from scipy.stats import kendalltau
from scipy.stats import chi2_contingency

from IPython.display import display
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("Loan_Defaulters.csv")

# Feature Engineering :

**------- some columns contain 'nan' value**
-  home_ownership
-  dti
-  last_major_derog_none

In [5]:
df1 = df.copy()
df1.head()

Unnamed: 0,id,grade,annual_inc,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,od_ratio,loan_status
0,11454641,A,100000,1,1,RENT,26.27,credit_card,36 months,1,,43.2,0.0,0.160624,0
1,9604874,A,83000,0,4,OWN,5.39,credit_card,36 months,0,,21.5,0.0,0.810777,0
2,9684700,D,78000,0,11,MORTGAGE,18.45,debt_consolidation,60 months,1,,46.3,0.0,0.035147,1
3,9695736,D,37536,0,6,MORTGAGE,12.28,medical,60 months,0,,10.7,0.0,0.534887,1
4,9795013,D,65000,0,11,MORTGAGE,11.26,debt_consolidation,36 months,0,,15.2,0.0,0.1665,0


- **home_ownership**

In [6]:
df1['home_ownership'].unique()

array(['RENT', 'OWN', 'MORTGAGE', nan], dtype=object)

In [7]:
df1['home_ownership'].mode()[0]

'MORTGAGE'

In [8]:
df1['home_ownership'] = df1['home_ownership'].fillna(df1['home_ownership'].mode()[0])
df1['home_ownership'].unique()

array(['RENT', 'OWN', 'MORTGAGE'], dtype=object)

- **dti**

In [9]:
df1['dti'].unique()

array([26.27,  5.39, 18.45, ..., 33.23,  5.23, 32.53])

In [10]:
df1['dti'].mean()

16.58784137861534

In [11]:
df1['dti'].fillna(df1['dti'].mean(),inplace=True)
df1['dti'].unique()

array([26.27,  5.39, 18.45, ..., 33.23,  5.23, 32.53])

- **last_major_derog_none**

In [12]:
df1['last_major_derog_none'].unique()

array([nan,  0.,  1.])

In [13]:
df1['last_major_derog_none'].mode()[0]

1.0

In [14]:
df1['last_major_derog_none'].fillna(df1['last_major_derog_none'].mode()[0], inplace=True)
df1['last_major_derog_none'].unique()

array([1., 0.])

In [15]:
df1.isna().sum()                                    ## 'Nan'

id                       0
grade                    0
annual_inc               0
short_emp                0
emp_length_num           0
home_ownership           0
dti                      0
purpose                  0
term                     0
last_delinq_none         0
last_major_derog_none    0
revol_util               0
total_rec_late_fee       0
od_ratio                 0
loan_status              0
dtype: int64

In [16]:
df1.isnull().sum()                                  ## 'Null'

id                       0
grade                    0
annual_inc               0
short_emp                0
emp_length_num           0
home_ownership           0
dti                      0
purpose                  0
term                     0
last_delinq_none         0
last_major_derog_none    0
revol_util               0
total_rec_late_fee       0
od_ratio                 0
loan_status              0
dtype: int64

In [17]:
df1.head()

Unnamed: 0,id,grade,annual_inc,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,od_ratio,loan_status
0,11454641,A,100000,1,1,RENT,26.27,credit_card,36 months,1,1.0,43.2,0.0,0.160624,0
1,9604874,A,83000,0,4,OWN,5.39,credit_card,36 months,0,1.0,21.5,0.0,0.810777,0
2,9684700,D,78000,0,11,MORTGAGE,18.45,debt_consolidation,60 months,1,1.0,46.3,0.0,0.035147,1
3,9695736,D,37536,0,6,MORTGAGE,12.28,medical,60 months,0,1.0,10.7,0.0,0.534887,1
4,9795013,D,65000,0,11,MORTGAGE,11.26,debt_consolidation,36 months,0,1.0,15.2,0.0,0.1665,0


# Feature selection :

 - **Assumption 1 : Linearity**<br>
corr ()

In [19]:
df1.corr().loc["loan_status", :] 

id                      -0.037307
annual_inc              -0.092402
short_emp                0.037186
emp_length_num          -0.041420
dti                      0.141361
last_delinq_none         0.021494
last_major_derog_none   -0.002114
revol_util               0.053682
total_rec_late_fee       0.130233
od_ratio                 0.000403
loan_status              1.000000
Name: loan_status, dtype: float64

- **Assumption 2 : No Multicollinearity**<br>
vif

In [20]:
df1.shape

(20000, 15)

In [28]:
df1.to_csv('df1')

In [38]:
## H0 : samples are uncorrelated
## H1 : samples are correlated



coef,p = spearmanr(df1['term'],df1['bad_loan'])
print("Spearmans correlation coefficient : %3f" % coef)
alpha = 0.05
if p>alpha:
    print("samples are uncorrelated ( fail to reject H0)p=%.3f" %p)
else:
    print("samples are correlated (reject H0) p = %.3f" %p) 
    
    
    
## p-value > alpha (0.05) ------ fail to reject H0   (i.e.,accept H0)
## p-value < alpha (0.05) ------ reject H0           (i.e.,accept H1)

Spearmans correlation coefficient : 0.133146
samples are correlated (reject H0) p = 0.000


In [36]:
from scipy.stats import chi2_contingency

chi2_contingency(pd.crosstab(df1['home_ownership'],df1['bad_loan']))

(109.54607948262893,
 1.630688780916426e-24,
 2,
 array([[7871.9025339 , 1972.0974661 ],
        [1289.85968988,  323.14031012],
        [5639.23777622, 1412.76222378]]))

In [None]:
alpha(l.o.s) = 0.05
  p- value   = 1.63
    
p-value is greater than the significance level(0.05), therefore we conclude that there is 
no relationship between 2 variables

In [None]:
The test statistic - chi2         109.54607948262893

The p-value of the test - p       1.630688780916426e-24

Degrees of freedom - dof          2

The expected frequencies, based on the marginal sums of the table - expected       array([[7871.9025339 , 1972.0974661 ],
                                                                                          [1289.85968988,  323.14031012],
                                                                                          [5639.23777622, 1412.76222378]]))