Let's first load required libraries:

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline

### About dataset


This dataset is about past loans. The **Loan_train.csv** data set includes details of 346 customers whose loans have already been paid off or have defaulted. It includes the following fields:

| Field          | Description                                                                           |
| -------------- | ------------------------------------------------------------------------------------- |
| Loan_status    | Whether a loan is paid off or in collection                                           |
| Principal      | Basic principal loan amount at origination                                            |
| Terms          | Origination terms which can be weekly (7 days), biweekly, and monthly payoff schedule |
| Effective_date | When the loan got originated and took effects                                         |
| Due_date       | Since it’s one-time payoff schedule, each loan has one single due date                |
| Age            | Age of applicant                                                                      |
| Education      | Education of applicant                                                                |
| Gender         | The gender of applicant                                                               |


### Load Data From CSV File


In [None]:
# Location of the training CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of loan_train.csv before running.
path = 'C:/Users/adars/OneDrive/Desktop/Machine learning/IBM ML/Machine learning with Python/Python files from portal/loan_train.csv'

In [None]:
# Read the training CSV found at `path` into a DataFrame and preview its first rows.
df = pd.read_csv(path)
df.head()

In [None]:
# (number of rows, number of columns) of the training frame.
df.shape

### Convert to date time object


In [None]:
# Parse both date columns into pandas datetime values, one column at a time.
df[['due_date', 'effective_date']] = df[['due_date', 'effective_date']].apply(pd.to_datetime)
df.head()

# Data visualization and pre-processing


Let’s see how many of each class are in our data set


In [None]:
# Number of rows for each distinct loan_status value.
df['loan_status'].value_counts()

260 people have paid off the loan on time while 86 have gone into collection


Let's plot some columns to understand the data better:


In [None]:
# notice: installing seaborn might takes a few minutes
!conda install -c anaconda seaborn -y

In [None]:
import seaborn as sns

# Histogram of the loan principal: one panel per Gender value, bars coloured by loan_status.
# Ten evenly spaced bin edges span the observed principal range.
bins = np.linspace(df['Principal'].min(), df['Principal'].max(), num=10)
g = sns.FacetGrid(data=df, col="Gender", col_wrap=2, hue="loan_status", palette="Set1")
g.map(plt.hist, 'Principal', bins=bins, ec="k")

# Only the last panel carries the legend.
g.axes[-1].legend()
plt.show()

In [None]:
# Histogram of applicant age: one panel per Gender value, bars coloured by loan_status.
# Ten evenly spaced bin edges span the observed age range.
bins = np.linspace(df['age'].min(), df['age'].max(), num=10)
g = sns.FacetGrid(data=df, col="Gender", col_wrap=2, hue="loan_status", palette="Set1")
g.map(plt.hist, 'age', bins=bins, ec="k")

# Only the last panel carries the legend.
g.axes[-1].legend()
plt.show()

# Pre-processing:  Feature selection/extraction


### Let's look at the day of the week people get the loan


In [None]:
# Derive the weekday number of the loan's effective date (pandas counts Monday as 0).
df['dayofweek'] = df['effective_date'].dt.dayofweek

# Histogram of that weekday: one panel per Gender value, bars coloured by loan_status.
bins = np.linspace(df['dayofweek'].min(), df['dayofweek'].max(), num=10)
g = sns.FacetGrid(data=df, col="Gender", col_wrap=2, hue="loan_status", palette="Set1")
g.map(plt.hist, 'dayofweek', bins=bins, ec="k")
g.axes[-1].legend()
plt.show()


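We see that people who get the loan at the end of the week tend not to pay it off, so let's use feature binarization with a threshold at day 4: loans issued on days after day 3 (`dayofweek` > 3, i.e. Friday to Sunday) are flagged as 1 and all earlier days as 0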


In [None]:
# Binarise the weekday: 1 when dayofweek is greater than 3, otherwise 0.
df['weekend'] = (df['dayofweek'] > 3).astype('int64')
df.head(20)

## Convert Categorical features to numerical values


Let's look at gender:


In [None]:
# Within each Gender group, the fraction of rows per loan_status (normalize=True yields shares, not counts).
df.groupby(['Gender'])['loan_status'].value_counts(normalize=True)

86 % of females pay off their loans while only 73 % of males pay off their loans


Let's convert male to 0 and female to 1:


In [None]:
# Encode Gender numerically: 'male' -> 0, 'female' -> 1.
# The result is assigned back to the column instead of calling replace(..., inplace=True) on
# the df['Gender'] selection: with pandas copy-on-write that in-place call only modifies a
# temporary object and df itself would keep the original strings.
# infer_objects() turns an object-typed result holding only integers into an integer column.
df['Gender'] = df['Gender'].replace(to_replace=['male','female'], value=[0,1]).infer_objects()
# Preview a few rows only; dumping 100 rows buries the rest of the notebook.
df.head()

## One Hot Encoding

#### How about education?


In [None]:
# Within each education group, the fraction of rows per loan_status (normalize=True yields shares, not counts).
df.groupby(['education'])['loan_status'].value_counts(normalize=True)

#### Features before One Hot Encoding


In [None]:
# Preview the candidate feature columns; 'education' is still categorical at this point.
df[['Principal','terms','age','Gender','education']].head()

#### Use the one hot encoding technique to convert categorical variables to binary variables and append them to the feature DataFrame


In [None]:
# Feature frame = selected numeric columns plus one indicator column per education level,
# with the 'Master or Above' indicator removed afterwards.
Feature = pd.concat(
    [df[['Principal','terms','age','Gender','weekend']], pd.get_dummies(df['education'])],
    axis=1,
).drop(columns=['Master or Above'])
Feature.head()


### Feature Selection


Let's define feature sets, X:


In [None]:
# X is bound to the same object as Feature (no copy is made here).
X = Feature
X[0:5]

What are our labels?


In [None]:
# Target labels: the underlying array of the loan_status column.
y = df['loan_status'].values
y[0:5]

## Normalize Data


Data standardization gives the data zero mean and unit variance (technically, this should be done after the train/test split)


In [None]:
# Standardise every feature column; fit_transform learns the statistics from X and applies them in one call.
X = preprocessing.StandardScaler().fit_transform(X)
X[0:5]

# Classification


Use the training set to build an accurate model, then use the test set to report the accuracy of the model using the following algorithms:

*   K Nearest Neighbor(KNN)
*   Decision Tree
*   Support Vector Machine
*   Logistic Regression



# K Nearest Neighbor(KNN)

find the best k to build the model with the best accuracy.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Initial k-nearest-neighbours model with k = 7, trained on the training split.
k = 7
model_KNN = KNeighborsClassifier(n_neighbors=k)
model_KNN.fit(X_train, y_train)
model_KNN

In [None]:
# Predict labels for the held-out split and show the first five predictions.
yhat = model_KNN.predict(X_test)
yhat[0:5]

In [None]:
from sklearn import metrics

# Accuracy on the data the model was fitted on versus accuracy on the held-out split.
print(
    "Train set Accuracy: ",
    metrics.accuracy_score(y_true=y_train, y_pred=model_KNN.predict(X_train)),
)
print(
    "Test set Accuracy: ",
    metrics.accuracy_score(y_true=y_test, y_pred=yhat),
)

In [None]:
# Sweep k = 1 .. Ks-1 and record, for each k, the accuracy on the held-out split and the
# standard error of that accuracy.
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

for n in range(1,Ks):
    # Each candidate lives in its own variable so the sweep does not overwrite model_KNN;
    # previously model_KNN ended up as whichever k was tried last (k = Ks-1).
    candidate = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    candidate_pred = candidate.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, candidate_pred)
    std_acc[n-1]=np.std(candidate_pred==y_test)/np.sqrt(candidate_pred.shape[0])

# Refit model_KNN with the best-scoring k (argmax returns the smallest k on ties) so that
# later cells evaluate the selected model rather than the last one tried.
k = int(mean_acc.argmax()) + 1
model_KNN = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
yhat = model_KNN.predict(X_test)
print("Best k:", k)

mean_acc

# Decision Tree


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree limited to depth 4. random_state is fixed so that the choice
# between equally good splits is the same on every run of the notebook.
descTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4, random_state=4)
descTree


In [None]:
# Train the decision tree on the training split.
descTree.fit(X_train,y_train)

In [None]:
# Decision-tree predictions for the held-out split.
predTree = descTree.predict(X_test)

# Support Vector Machine


In [None]:
from sklearn import svm

# Support vector classifier with an RBF kernel, trained on the training split.
SVM = svm.SVC(kernel='rbf').fit(X_train, y_train)
SVM

In [None]:
# SVM predictions for the held-out split; show the first five.
yhat = SVM.predict(X_test)
yhat [0:5]

# Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Logistic regression with C=0.01 and the liblinear solver, trained on the training split.
LR = LogisticRegression(solver='liblinear', C=0.01)
LR.fit(X_train, y_train)
LR

In [None]:
# Logistic-regression class predictions for the held-out split.
yhat = LR.predict(X_test)
yhat

In [None]:
# Per-class probability estimates for the held-out split.
yhat_prob = LR.predict_proba(X_test)


# Model Evaluation using Test set


In [None]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss


First, download and load the test set:


In [14]:
# Location of the evaluation CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of loan_test.csv before running.
path_testing = 'C:/Users/adars/OneDrive/Desktop/Machine learning/IBM ML/Machine learning with Python/Python files from portal/loan_test.csv'

### Load Test set for evaluation


In [15]:
# Read the evaluation CSV found at `path_testing` and preview its first rows.
test_df = pd.read_csv(path_testing)
test_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,1,1,PAIDOFF,1000,30,9/8/2016,10/7/2016,50,Bechalor,female
1,5,5,PAIDOFF,300,7,9/9/2016,9/15/2016,35,Master or Above,male
2,21,21,PAIDOFF,1000,30,9/10/2016,10/9/2016,43,High School or Below,female
3,24,24,PAIDOFF,1000,30,9/10/2016,10/9/2016,26,college,male
4,35,35,PAIDOFF,800,15,9/11/2016,9/25/2016,29,Bechalor,male


In [None]:
# (number of rows, number of columns) of the evaluation frame.
test_df.shape

In [None]:
# Parse both date columns of the evaluation frame into pandas datetime values.
test_df[['due_date', 'effective_date']] = test_df[['due_date', 'effective_date']].apply(pd.to_datetime)


In [None]:
# Weekday number of the effective date (pandas counts Monday as 0), as done for the training frame.
test_df['dayofweek'] = test_df['effective_date'].dt.dayofweek


In [None]:
# Binarise the weekday: 1 when dayofweek is greater than 3, otherwise 0.
test_df['weekend'] = (test_df['dayofweek'] > 3).astype('int64')


In [None]:
# Encode Gender numerically: 'male' -> 0, 'female' -> 1.
# The result is assigned back to the column instead of calling replace(..., inplace=True) on
# the test_df['Gender'] selection: with pandas copy-on-write that in-place call only modifies
# a temporary object and test_df itself would keep the original strings.
# infer_objects() turns an object-typed result holding only integers into an integer column.
test_df['Gender'] = test_df['Gender'].replace(to_replace=['male','female'], value=[0,1]).infer_objects()


In [None]:
# Evaluation feature frame = selected numeric columns plus one indicator column per
# education level, with the 'Master or Above' indicator removed afterwards.
X1 = pd.concat(
    [test_df[['Principal','terms','age','Gender','weekend']], pd.get_dummies(test_df['education'])],
    axis=1,
).drop(columns=['Master or Above'])
X1.head()


In [None]:
# Scale the evaluation features with the statistics of the TRAINING features.
# The scaler is fitted on `Feature`, the unscaled training feature frame built earlier in this
# notebook. Fitting a fresh scaler on X1 instead would give the models inputs on a different
# scale from the one they were trained on.
# reindex() puts the evaluation columns in the training order, adds any indicator column the
# evaluation data lacks (filled with 0), and drops columns the models never saw.
test_df_X = preprocessing.StandardScaler().fit(Feature).transform(
    X1.reindex(columns=Feature.columns, fill_value=0)
)
test_df_X[0:5]


In [None]:
# True labels of the evaluation set: the underlying array of its loan_status column.
y_test_df = test_df['loan_status'].values
y_test_df[0:5]


In [None]:
# Score the KNN model on the evaluation set: Jaccard index for the "PAIDOFF" class and weighted F1.
yhat_KNN = model_KNN.predict(test_df_X)
print(
    "KNN Jaccard index: %.2f"
    % jaccard_score(y_true=y_test_df, y_pred=yhat_KNN, pos_label="PAIDOFF")
)
print(
    "KNN F1-score: %.2f"
    % f1_score(y_true=y_test_df, y_pred=yhat_KNN, average='weighted')
)

In [None]:
# Score the decision tree on the evaluation set: Jaccard index for the "PAIDOFF" class and weighted F1.
yhat_DT = descTree.predict(test_df_X)
print(
    "DECISION TREE Jaccard index: %.2f"
    % jaccard_score(y_true=y_test_df, y_pred=yhat_DT, pos_label="PAIDOFF")
)
print(
    "DECISION TREE F1-score: %.2f"
    % f1_score(y_true=y_test_df, y_pred=yhat_DT, average='weighted')
)

In [None]:
# Score the SVM on the evaluation set: Jaccard index for the "PAIDOFF" class and weighted F1.
svm_yhat = SVM.predict(test_df_X)
print(
    "SVM Jaccard index: %.2f"
    % jaccard_score(y_true=y_test_df, y_pred=svm_yhat, pos_label="PAIDOFF")
)
print(
    "SVM F1-score: %.2f"
    % f1_score(y_true=y_test_df, y_pred=svm_yhat, average='weighted')
)

In [None]:
# Score logistic regression on the evaluation set: Jaccard index for the "PAIDOFF" class,
# weighted F1, and log loss computed from the predicted class probabilities.
yhat_LR = LR.predict(test_df_X)
yhat_LR_prob = LR.predict_proba(test_df_X)
print("LR Jaccard index: %.2f" % jaccard_score(y_test_df,yhat_LR, pos_label = "PAIDOFF"))
print("LR F1-score: %.2f" % f1_score(y_test_df, yhat_LR, average='weighted') )
# The columns of predict_proba follow LR.classes_. Passing that label set explicitly keeps
# log_loss from inferring it from y_test_df, which fails or misaligns the columns when the
# evaluation labels do not contain every class.
print("LR LogLoss: %.2f" % log_loss(y_test_df, yhat_LR_prob, labels=LR.classes_))