In [1]:

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/hmeq-data/hmeq.csv


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [5]:
df=pd.read_csv('/kaggle/input/hmeq-data/hmeq.csv')

In [6]:
df.isna().sum()
totalmiss=pd.DataFrame(data=[df.columns,df.isna().sum()/df.shape[0]])
totalmiss


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
1,0,0,0.0869128,0.0187919,0.0422819,0.0468121,0.0864094,0.118792,0.0973154,0.0516779,0.0855705,0.0372483,0.212584


The data seems okay with no much missing features. 

In [7]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
oe=OrdinalEncoder()
si=SimpleImputer(strategy='most_frequent')
df=pd.DataFrame(si.fit_transform(df),columns=list(df))
df=pd.DataFrame(oe.fit_transform(df),columns=list(df))

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BAD      5960 non-null   float64
 1   LOAN     5960 non-null   float64
 2   MORTDUE  5960 non-null   float64
 3   VALUE    5960 non-null   float64
 4   REASON   5960 non-null   float64
 5   JOB      5960 non-null   float64
 6   YOJ      5960 non-null   float64
 7   DEROG    5960 non-null   float64
 8   DELINQ   5960 non-null   float64
 9   CLAGE    5960 non-null   float64
 10  NINQ     5960 non-null   float64
 11  CLNO     5960 non-null   float64
 12  DEBTINC  5960 non-null   float64
dtypes: float64(13)
memory usage: 605.4 KB


The problems of categorical data and nan values are solved by now.

# Analysis

In [9]:
df.corr()['BAD'].sort_values(ascending=False)

BAD        1.000000
DELINQ     0.348497
DEROG      0.269921
NINQ       0.173356
REASON     0.037517
JOB        0.022775
CLNO      -0.008228
YOJ       -0.019817
MORTDUE   -0.070820
LOAN      -0.080926
VALUE     -0.110090
CLAGE     -0.182655
DEBTINC   -0.270231
Name: BAD, dtype: float64

The correlation did not entertain much though. There is still a way!

 # Preparation

In [10]:
x_train=df.drop(columns=['BAD'],axis=1)
y_train=df['BAD']

In [12]:
q=pd.DataFrame(data=[y_train.value_counts(),y_train.value_counts()/y_train.shape[0]])
print(q)

             0.0          1.0
BAD  4771.000000  1189.000000
BAD     0.800503     0.199497


The data is enormously skewed. I will be using SMOTE to balance it.

In [13]:
from imblearn.over_sampling import SMOTE
smt=SMOTE()
x_train,y_train=smt.fit_resample(x_train,y_train)

Let's check the ratio now.

In [14]:
q=pd.DataFrame(data=[y_train.value_counts(),y_train.value_counts()/y_train.shape[0]])
print(q)

        0.0     1.0
BAD  4771.0  4771.0
BAD     0.5     0.5


The data seems balanced! Time for a little preprocessing!

In [15]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_train,y_train,test_size=0.2)

## LogisticRegression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,confusion_matrix
lr=LogisticRegression()
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)

print('Accuracy Score :',accuracy_score(y_test,y_pred))
print('F1_Score :',f1_score(y_test,y_pred))
print('ROC_AUC_Score',roc_auc_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


Accuracy Score : 0.7427972760607648
F1_Score : 0.7264623955431755
ROC_AUC_Score 0.741828536538225
[[766 210]
 [281 652]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.74 is not a good accuracy score !!!

## XGBoost

In [20]:
import xgboost
xgb=xgboost.XGBClassifier(max_depth=10,n_estimators=100)
xgb.fit(x_train,y_train)
y_pred=xgb.predict(x_test)


print('Accuracy Score :',accuracy_score(y_test,y_pred))
print('F1_Score :',f1_score(y_test,y_pred))
print('ROC_AUC_Score',roc_auc_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

Accuracy Score : 0.9549502357255107
F1_Score : 0.9536637931034482
ROC_AUC_Score 0.9548093142164358
[[938  38]
 [ 48 885]]


Wow! The accuracy seemed to have skyrocketed! We can trust this figure, as the f1_score also seems pretty good!

Let's try LightGBM as a last method!

## LightGBM


In [22]:
import lightgbm
lgbm=lightgbm.LGBMClassifier()
lgbm.fit(x_train,y_train)
y_pred=lgbm.predict(x_test)

print('Accuracy Score :',accuracy_score(y_test,y_pred))
print('F1_Score :',f1_score(y_test,y_pred))
print('ROC_AUC_Score',roc_auc_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

Accuracy Score : 0.9539025667888947
F1_Score : 0.952329360780065
ROC_AUC_Score 0.9536430604607032
[[942  34]
 [ 54 879]]


This gave a bit low score, but we can consider this amount of precision. 


## As a sidenote:
You can always try the problem without SMOTE and see the changes. In my experiment, the accuracy figures before applying SMOTE to the data were around 0.92 (LightGBM) with a very disappointing f1_score of 0.80 
