# The goal of the following code is to classify the Lending Club loan grade of the borrower based on various parameters.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_wdo = pd.read_csv('new_df.csv')

### Since this particular dataset has almost 2.2 million rows and 168 columns, it would be computationally expensive to run machine learning models on this dataset. So we decide to take a small sample of this dataset and proceed.

In [3]:
df_wdo=df_wdo.sample(frac=0.01, replace=False, random_state=0)

### Setting grade as the dependant variable.

In [4]:
grade_y= df_wdo['grade']

In [5]:
df_wdo.drop('grade',axis =1,inplace = True)

In [6]:
df_wdo.drop('sub_grade',axis =1 ,inplace = True)

In [7]:
df_wdo.shape

(21633, 45)

In [8]:
df_wdo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21633 entries, 20564 to 925977
Data columns (total 45 columns):
Unnamed: 0                        21633 non-null int64
loan_amnt                         21633 non-null int64
int_rate                          21633 non-null float64
emp_length                        21633 non-null object
home_ownership                    21633 non-null object
annual_inc                        21633 non-null float64
verification_status               21633 non-null object
pymnt_plan                        21633 non-null object
purpose                           21633 non-null object
addr_state                        21633 non-null object
dti                               21633 non-null float64
delinq_2yrs                       21633 non-null float64
inq_last_6mths                    21633 non-null float64
mths_since_last_delinq            21633 non-null int64
mths_since_last_record            21633 non-null int64
open_acc                          21633 non-n

In [9]:
df2_wdo_dum =pd.get_dummies(df_wdo, columns=['emp_length','home_ownership','verification_status','pymnt_plan','purpose','addr_state'])

In [10]:
from scipy import stats
import numpy as np

In [11]:
df2_wdo_dum.shape

(21633, 123)

### The outliers exceeding a z score of 5 are removed.

In [12]:
z = np.abs(stats.zscore(df2_wdo_dum))
print(z)
df2_out = df2_wdo_dum[(z < 5).all(axis=1)]

[[1.69580923 0.5715904  2.8892725  ... 0.11050901 0.05531935 0.04861152]
 [1.64953866 1.32983171 0.27023355 ... 0.11050901 0.05531935 0.04861152]
 [1.19913797 1.00487115 0.65419421 ... 0.11050901 0.05531935 0.04861152]
 ...
 [0.7892407  0.26829388 0.33354621 ... 0.11050901 0.05531935 0.04861152]
 [0.48673579 1.11319133 0.78974295 ... 0.11050901 0.05531935 0.04861152]
 [0.25523527 0.78823077 0.68687171 ... 0.11050901 0.05531935 0.04861152]]


In [13]:
df2_out.shape

(7710, 123)

In [14]:
grade_y = grade_y[(z < 5).all(axis=1)]

In [15]:
grade_y.shape

(7710,)

### The x and y variables are split into training sets and test sets respectively. Once they are split, they are scaled by using the StandardScaler(). 

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

X_train_org, X_test_org, y_train, y_test = train_test_split(df2_out, grade_y, random_state = 0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_org)
X_test = scaler.transform(X_test_org)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  # Remove the CWD from sys.path while we load stuff.


## Decision Tree Classifier:

In [17]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier().fit(X_train, y_train)

In [18]:
dt_tr_sc=cross_val_score(tree, X_train, y_train).mean()
dt_te_sc=cross_val_score(tree, X_test, y_test).mean()
print(dt_tr_sc)
print(dt_te_sc)



0.9635069799936474
0.8973220332423909


## K Nearest Neighbors Classifier:

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
from sklearn.neighbors import KNeighborsClassifier
kfold =KFold(n_splits=3, random_state=0)

param_grid={'n_neighbors':range(1,10)}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=kfold, return_train_score=True)
grid_search.fit(X_train,y_train)

k_test=cross_val_score(grid_search, X_test, y_test, cv=kfold).mean()

In [21]:
print("Train:",grid_search.best_score_)
print("Test",k_test)
print("parameters",grid_search.best_params_)

Train: 0.3614666205465237
Test 0.3449118149122509
parameters {'n_neighbors': 8}


### Adding results to mr1(model_results 1)

In [22]:
mr1 = pd.DataFrame(columns=('S.No','Model_Name','Parameters', 'Train_Score', 'Test_Score'))

In [23]:
mr1.loc[len(mr1)]=[1,'KNN Classifier',grid_search.best_params_,grid_search.best_score_,k_test]

In [24]:
mr1

Unnamed: 0,S.No,Model_Name,Parameters,Train_Score,Test_Score
0,1,KNN Classifier,{'n_neighbors': 8},0.361467,0.344912


## Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
kfold =KFold(n_splits=3, random_state=0)
param_grid={'penalty':['l1','l2'],'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=kfold, return_train_score=True)
grid_search.fit(X_train,y_train)

k_test=cross_val_score(grid_search, X_test, y_test, cv=kfold).mean()





















In [26]:
print("Train:",grid_search.best_score_)
print("Test",k_test)
print("parameters",grid_search.best_params_)

Train: 0.6961259079903148
Test 0.6504039831462398
parameters {'C': 0.01, 'penalty': 'l1'}


In [27]:
mr1.loc[len(mr1)]=[2,'Logistic Model',grid_search.best_params_,grid_search.best_score_,k_test]

In [28]:
mr1

Unnamed: 0,S.No,Model_Name,Parameters,Train_Score,Test_Score
0,1,KNN Classifier,{'n_neighbors': 8},0.361467,0.344912
1,2,Logistic Model,"{'C': 0.01, 'penalty': 'l1'}",0.696126,0.650404


## Linear SVC

In [29]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 10, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LinearSVC(),param_grid, cv=kfold, return_train_score=True);

grid_search.fit(X_train, y_train)

test=cross_val_score(grid_search,X_test, y_test,cv=3).mean();





In [30]:
print("Train:",grid_search.best_score_)
print("Test",test)
print("parameters",grid_search.best_params_)

Train: 0.6629194050501557
Test 0.6052853079468277
parameters {'C': 1}


In [31]:
mr1.loc[len(mr1)]=[3,'Linear SVC',grid_search.best_params_,grid_search.best_score_,test]

In [32]:
mr1

Unnamed: 0,S.No,Model_Name,Parameters,Train_Score,Test_Score
0,1,KNN Classifier,{'n_neighbors': 8},0.361467,0.344912
1,2,Logistic Model,"{'C': 0.01, 'penalty': 'l1'}",0.696126,0.650404
2,3,Linear SVC,{'C': 1},0.662919,0.605285


## SVC - Kernel : Linear

In [33]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100, 250]}
grid_search = GridSearchCV(SVC(kernel='linear'), param_grid, cv=3, return_train_score=True);

grid_search.fit(X_train, y_train)

test=cross_val_score(grid_search,X_test, y_test,cv=3).mean();

In [34]:
print("Train:",grid_search.best_score_)
print("Test",test)
print("parameters",grid_search.best_params_)

Train: 0.8420961604980975
Test 0.8080822739498075
parameters {'C': 100}


In [35]:
mr1.loc[len(mr1)]=[4,'Linear SVC - Kernel Trick',grid_search.best_params_,grid_search.best_score_,test]

In [36]:
mr1

Unnamed: 0,S.No,Model_Name,Parameters,Train_Score,Test_Score
0,1,KNN Classifier,{'n_neighbors': 8},0.361467,0.344912
1,2,Logistic Model,"{'C': 0.01, 'penalty': 'l1'}",0.696126,0.650404
2,3,Linear SVC,{'C': 1},0.662919,0.605285
3,4,Linear SVC - Kernel Trick,{'C': 100},0.842096,0.808082


## SVC - RBF Kernel 

In [37]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100, 250],'gamma':[0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=3, return_train_score=True);

grid_search.fit(X_train, y_train)

test=cross_val_score(grid_search,X_test, y_test,cv=3).mean();

In [38]:
print("Train:",grid_search.best_score_)
print("Test",test)
print("parameters",grid_search.best_params_)

Train: 0.8355240401245244
Test 0.7914626909954013
parameters {'C': 100, 'gamma': 0.001}


In [39]:
mr1.loc[len(mr1)]=[5,'Linear SVC - RBF',grid_search.best_params_,grid_search.best_score_,test]

In [40]:
mr1

Unnamed: 0,S.No,Model_Name,Parameters,Train_Score,Test_Score
0,1,KNN Classifier,{'n_neighbors': 8},0.361467,0.344912
1,2,Logistic Model,"{'C': 0.01, 'penalty': 'l1'}",0.696126,0.650404
2,3,Linear SVC,{'C': 1},0.662919,0.605285
3,4,Linear SVC - Kernel Trick,{'C': 100},0.842096,0.808082
4,5,Linear SVC - RBF,"{'C': 100, 'gamma': 0.001}",0.835524,0.791463


## SVC - Polynomial Kernel 

In [41]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100, 250],'degree':[2,3]}
grid_search = GridSearchCV(SVC(kernel='poly'), param_grid, cv=3, return_train_score=True);

grid_search.fit(X_train, y_train)

test=cross_val_score(grid_search,X_test, y_test,cv=3).mean();













In [42]:
print("Train:",grid_search.best_score_)
print("Test",test)
print("parameters",grid_search.best_params_)

Train: 0.5913178830854375
Test 0.5000016124558994
parameters {'C': 100, 'degree': 3}


In [43]:
mr1.loc[len(mr1)]=[6,'Linear SVC - Polynomial Kernel',grid_search.best_params_,grid_search.best_score_,test]

In [44]:
mr1

Unnamed: 0,S.No,Model_Name,Parameters,Train_Score,Test_Score
0,1,KNN Classifier,{'n_neighbors': 8},0.361467,0.344912
1,2,Logistic Model,"{'C': 0.01, 'penalty': 'l1'}",0.696126,0.650404
2,3,Linear SVC,{'C': 1},0.662919,0.605285
3,4,Linear SVC - Kernel Trick,{'C': 100},0.842096,0.808082
4,5,Linear SVC - RBF,"{'C': 100, 'gamma': 0.001}",0.835524,0.791463
5,6,Linear SVC - Polynomial Kernel,"{'C': 100, 'degree': 3}",0.591318,0.500002


## Decision Tree Classifier

In [45]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier().fit(X_train, y_train)
dt_tr_sc=cross_val_score(tree, X_train, y_train, cv=5).mean()
dt_te_sc=cross_val_score(tree, X_test, y_test, cv=5).mean()
print(dt_tr_sc)
print(dt_te_sc)

0.9731995784075934
0.9367666494459822


In [46]:
mr1.loc[len(mr1)]=[7,'Decision Tree Classifier',"None",dt_tr_sc,dt_te_sc]

In [47]:
mr1

Unnamed: 0,S.No,Model_Name,Parameters,Train_Score,Test_Score
0,1,KNN Classifier,{'n_neighbors': 8},0.361467,0.344912
1,2,Logistic Model,"{'C': 0.01, 'penalty': 'l1'}",0.696126,0.650404
2,3,Linear SVC,{'C': 1},0.662919,0.605285
3,4,Linear SVC - Kernel Trick,{'C': 100},0.842096,0.808082
4,5,Linear SVC - RBF,"{'C': 100, 'gamma': 0.001}",0.835524,0.791463
5,6,Linear SVC - Polynomial Kernel,"{'C': 100, 'degree': 3}",0.591318,0.500002
6,7,Decision Tree Classifier,,0.9732,0.936767


### So the best model to classify LC Grade is the Decision Tree Classifier with a train scores of 0.973 and test scores of 0.936.