In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("Part2 - Data1_2_combined.csv")
df.shape

(5000, 14)

In [5]:
df.head()

Unnamed: 0,ID,Age,CustomerSince,HighestSpend,ZipCode,HiddenScore,MonthlyAverageSpend,Level,Mortgage,Security,FixedDepositAccount,InternetBanking,CreditCard,LoanOnCard
0,1,25,1,49,91107,4,1.6,1,0,1,0,0,0,
1,2,45,19,34,90089,3,1.5,1,0,1,0,0,0,
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,1,


Even though, it is not mentioned explicitly in the problem statement, examining the data reveals that "LoanOnCard" is the target variable here, rest are all IVs.

In [6]:
(df['LoanOnCard']==0).value_counts()

True     4500
False     500
Name: LoanOnCard, dtype: int64

Clear Class imbalance observed. 15X more liability customers than borrowers

We will use SMOTE to upsample the "Borrowers" class, but before resampling we should divide train and test data lest it leads to data leakage and hampers the generalization of the model

In [10]:
# check for null value
df.isnull().sum()

ID                      0
Age                     0
CustomerSince           0
HighestSpend            0
ZipCode                 0
HiddenScore             0
MonthlyAverageSpend     0
Level                   0
Mortgage                0
Security                0
FixedDepositAccount     0
InternetBanking         0
CreditCard              0
LoanOnCard             20
dtype: int64

In [11]:
df1 = df.copy()

In [12]:
#drop the null vaues from the new dataframe
df1.dropna(axis=0,inplace=True)

In [13]:
df1.shape

(4980, 14)

In [14]:
# Correcting the  negative numbers in attribute "experience"
df1['CustomerSince'] = df1['CustomerSince'].abs()

In [15]:
#Segregate predictors vs target attributes

X_df = df1.loc[:, df.columns != 'LoanOnCard']  # choose appropriate dataframe
y_df = df1['LoanOnCard']

In [16]:
X_train, x_test, y_train, y_test = train_test_split(X_df, y_df, test_size = 0.2, random_state=1)

In [17]:
#Let's apply SMOTE on Train data
# Import the SMOTE package
from imblearn.over_sampling import SMOTE
# Synthesize minority class datapoints using SMOTE
sm = SMOTE(random_state=42, sampling_strategy='minority')
smote_x_train, smote_y_train = sm.fit_resample(X_train, y_train)

In [18]:
X_train.shape

(3984, 13)

In [19]:
smote_x_train.shape

(7216, 13)

In [21]:
(y_train==0).value_counts()

True     3608
False     376
Name: LoanOnCard, dtype: int64

In [22]:
(smote_y_train==0).value_counts()

True     3608
False    3608
Name: LoanOnCard, dtype: int64

So, now both classes are balanced! Let's try modeling.

In [23]:
# Logistic regression

#from sklearn.linear_model import LogisticRegression
from sklearn import metrics

model = LogisticRegression()
model.fit(smote_x_train, smote_y_train)

model_score = model.score(smote_x_train, smote_y_train)
y_predict = model.predict(smote_x_train)
print("\nTrain accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(smote_y_train, y_predict))

model_score = model.score(x_test, y_test)
y_predict = model.predict(x_test)

print("\nTest accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(y_test, y_predict))


Train accuracy: 88.15 %

Confusion matrix:

 [[3068  540]
 [ 315 3293]]

Test accuracy: 84.44 %

Confusion matrix:

 [[751 141]
 [ 14  90]]


In [24]:
# Naive Bayes

#from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(smote_x_train, smote_y_train)

model_score = model.score(smote_x_train, smote_y_train)
y_predict = model.predict(smote_x_train)
print("\nTrain accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(smote_y_train, y_predict))

model_score = model.score(x_test, y_test)
y_predict = model.predict(x_test)

print("\nTest accuracy:",round(100*model_score,2),"%")
print("\nConfusion matrix:\n\n",metrics.confusion_matrix(y_test, y_predict))


Train accuracy: 89.94 %

Confusion matrix:

 [[3186  422]
 [ 304 3304]]

Test accuracy: 86.45 %

Confusion matrix:

 [[775 117]
 [ 18  86]]


Naive Bayes performs slightly better than Logistic Regression