In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import lightgbm as lgb

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [2]:
os.chdir(r'/Users/arivarasuperumal/downloads/Onboarding_project_MW')

In [3]:
data = pd.read_excel('Lending_Club_Data.xlsx')

In [4]:
data['Max_credit_age'] = round((pd.to_datetime("now") - data['earliest_cr_line'])/np.timedelta64(1,'Y'),2)

In [5]:
data['ratio_open_total_acc'] = round(data['open_acc']/data['total_acc'],2)

In [6]:
data['emp_length'] = data['emp_length'].apply(lambda x: x/11 if x in [11,22,33] else x)

In [7]:
data.drop(['purpose', 'id', 'emp_title', 'zip_code'],axis = 1, inplace = True)

In [8]:
data['Notes'] = data['Notes'].astype(str)
data["Notes_length"] = data['Notes'].apply(len)

### Subjectivity & Polarity

In [9]:
from textblob import TextBlob

def get_polarity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        pol = textblob.sentiment.polarity
    except:
        pol = 0.0
    return pol

def get_subjectivity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        subj = textblob.sentiment.subjectivity
    except:
        subj = 0.0
    return subj

data['polarity'] = data['Notes'].apply(get_polarity)
data['subjectivity'] = data['Notes'].apply(get_subjectivity)

### TF-Idf

In [10]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation
# from sklearn.feature_extraction.text import CountVectorizer

# Notes = list(data['Notes'].values)

# vect_word = TfidfVectorizer(max_features=2500, analyzer='word', stop_words=stopwords, ngram_range=(1,3), dtype=np.float32) 
# vect_word.fit(Notes)
# tfidf_complete = vect_word.transform(Notes)

# tfidf = dict(zip(vect_word.get_feature_names(), vect_word.idf_))
# tfidf = pd.DataFrame(columns=['Notes_tfidf']).from_dict(dict(tfidf), orient='index')
# tfidf.columns = ['Notes_tfidf']

# tfidf.sort_values(by=['Notes_tfidf'], ascending=True).head(10)

In [11]:
data['delinq'] = data['mths_since_last_delinq'].apply(lambda x: 0 if pd.isnull(x) else 1)
data['record'] = data['mths_since_last_record'].apply(lambda x: 0 if pd.isnull(x) else 1)

In [12]:
final_data = data.loc[:, data.columns != 'Notes']
final_data = final_data.loc[:, final_data.columns != 'earliest_cr_line']
final_data = final_data.loc[:, final_data.columns != 'mths_since_last_delinq']
final_data = final_data.loc[:, final_data.columns != 'mths_since_last_record']

final_data = final_data.dropna()

final_data = pd.get_dummies(final_data)

In [13]:
final_data.head()

Unnamed: 0,is_bad,annual_inc,debt_to_income,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,...,addr_state_WI,addr_state_WV,addr_state_WY,initial_list_status_f,initial_list_status_m,policy_code_PC1,policy_code_PC2,policy_code_PC3,policy_code_PC4,policy_code_PC5
0,0,50000.0,10.87,0.0,0.0,15.0,0.0,12087,12.1,44.0,...,0,0,0,1,0,0,0,0,1,0
1,0,39216.0,9.15,0.0,2.0,4.0,0.0,10114,64.0,5.0,...,0,0,0,1,0,1,0,0,0,0
2,0,65000.0,11.24,0.0,0.0,4.0,0.0,81,0.6,8.0,...,0,0,0,1,0,0,0,0,1,0
3,0,57500.0,6.18,1.0,0.0,6.0,0.0,10030,37.1,23.0,...,0,0,0,1,0,0,1,0,0,0
4,0,50004.0,19.03,0.0,4.0,8.0,0.0,10740,40.4,21.0,...,0,0,0,1,0,0,0,1,0,0


### Test-Train Split

In [14]:
def test_train_split(train_data):
    return train_test_split(train_data, test_size=0.2, random_state=20, stratify=train_data[['is_bad']])

### Xy_split

In [15]:
def Xy_split(test,train):
    X_train = train.loc[:, train.columns != 'is_bad']
    y_train = train.loc[:, train.columns == 'is_bad']
    X_test = test.loc[:, test.columns != 'is_bad']
    y_test = test.loc[:, test.columns == 'is_bad']
    return X_train, X_test, y_train, y_test

### Logistic Regression

In [16]:
def log_reg(train, test, smote):
    scaler = StandardScaler()

    X_train, X_test, y_train, y_test = Xy_split(test,train)
    
    X_train = pd.DataFrame(scaler.fit_transform(X_train))
    X_test = pd.DataFrame(scaler.transform(X_test))
    
    if smote == "yes":
        sm = SMOTE(random_state=42)
        X_smote, y_smote = sm.fit_resample(X_train, y_train)

        model = LogisticRegression(random_state=20)
        solvers = ['newton-cg', 'lbfgs', 'liblinear']
        penalty = ['l2']
        c_values = [100, 10, 1.0, 0.1, 0.01]
        grid = dict(solver=solvers,penalty=penalty,C=c_values)
        cv = StratifiedKFold(n_splits=5)
        grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='precision',error_score=0)
        grid_result = grid_search.fit(X_train, y_train)

        y_pred=grid_result.predict(X_test)

        print ("Logistic Regression")
        print (classification_report(y_test, y_pred))
    if smote == 'no':
        model = LogisticRegression(random_state=20)
        solvers = ['newton-cg', 'lbfgs', 'liblinear']
        penalty = ['l2']
        c_values = [100, 10, 1.0, 0.1, 0.01]
        grid = dict(solver=solvers,penalty=penalty,C=c_values)
        cv = StratifiedKFold(n_splits=5)
        grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='precision',error_score=0)
        grid_result = grid_search.fit(X_train, y_train)

        y_pred=grid_result.predict(X_test)

        print ("Logistic Regression")
        print (classification_report(y_test, y_pred))
        

### Random Forest Classifier

In [17]:
def random_forest(train, test):
    
    X_train, X_test, y_train, y_test = Xy_split(test,train)
    
    model = RandomForestClassifier(random_state=20)
    cv = StratifiedKFold(n_splits=5)
    parameter = dict()
    parameter['n_estimators'] = [10, 50, 100, 200]
    parameter['max_features'] = ['auto', 'sqrt', 'log2']
    parameter['max_depth'] = [5, 10, 25]

    # Model fitting
    grid_search = GridSearchCV(model, parameter, scoring='precision', n_jobs=-1, cv=cv,error_score=0)
    grid_result = grid_search.fit(X_train, y_train)

    y_pred=grid_result.predict(X_test)

    print ("Random Forest Classifier")
    print (classification_report(y_test, y_pred))

### SVM Classifier

In [18]:
def svm_classifier(train, test):
    
    X_train, X_test, y_train, y_test = Xy_split(test,train)
    
    svc = LinearSVC(random_state=20)
    cv = StratifiedKFold(n_splits=5)
    loss = ['hinge', 'squared_hinge']
    max_iter = [10, 20, 50, 100]
    penalty = ['l1', 'l2']
    C = [0.01, 0.1, 1, 10, 100]
    parameters = dict(loss=loss,penalty=penalty,max_iter=max_iter, C=C)

    grid_search = GridSearchCV(estimator=svc, param_grid=parameters,  n_jobs=-1, cv=cv, scoring='precision',error_score=0)
    grid_result = grid_search.fit(X_train, y_train)

    y_pred=grid_result.predict(X_test)

    print ("SVM Classifier")
    print (classification_report(y_test, y_pred))

### LightGBM

In [19]:
def lightgbm(train, test):
    
    X_train, X_test, y_train, y_test = Xy_split(test,train)
    
    model = lgb.LGBMClassifier(random_state=20)
    cv = StratifiedKFold(n_splits=5)

    parameter = dict()
    parameter['n_estimators'] = [10, 50, 100, 200]
    parameter['max_depth'] = [5, 10, 25, 50, 100]
    parameter['learning_rate'] = [0.01, 0.1, 1, 5,10]

    # Model fitting
    grid_search = GridSearchCV(model, parameter, scoring='precision', n_jobs=-1, cv=cv,error_score=0)
    grid_result = grid_search.fit(X_train, y_train)

    y_pred=grid_result.predict(X_test)

    print ("Random Forest Classifier")
    print (classification_report(y_test, y_pred))

In [20]:
train, test = test_train_split(final_data)

log_reg(train, test, "no")
random_forest(train, test)
svm_classifier(train, test)
lightgbm(train, test)

Logistic Regression
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1732
           1       0.97      0.14      0.24       258

    accuracy                           0.89      1990
   macro avg       0.93      0.57      0.59      1990
weighted avg       0.90      0.89      0.85      1990

Random Forest Classifier
              precision    recall  f1-score   support

           0       0.88      1.00      0.93      1732
           1       1.00      0.04      0.08       258

    accuracy                           0.88      1990
   macro avg       0.94      0.52      0.51      1990
weighted avg       0.89      0.88      0.82      1990

SVM Classifier
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1732
           1       0.00      0.00      0.00       258

    accuracy                           0.87      1990
   macro avg       0.44      0.50      0.47      1990
weighted avg  

### Over Sampling

In [21]:
train, test = test_train_split(final_data)

count_class_0, count_class_1 = train.is_bad.value_counts()

class_0 = train[train['is_bad'] == 0]
class_1 = train[train['is_bad'] == 1]

class_1_over = class_1.sample(count_class_0, replace=True)
train = pd.concat([class_0, class_1_over], axis=0)

print('Random over-sampling:')
print(train.is_bad.value_counts())

log_reg(train, test, "no")
random_forest(train, test)
svm_classifier(train, test)
lightgbm(train, test)

Random over-sampling:
0    6924
1    6924
Name: is_bad, dtype: int64
Logistic Regression
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      1732
           1       0.22      0.53      0.31       258

    accuracy                           0.69      1990
   macro avg       0.57      0.63      0.56      1990
weighted avg       0.82      0.69      0.74      1990

Random Forest Classifier
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1732
           1       0.91      0.15      0.26       258

    accuracy                           0.89      1990
   macro avg       0.90      0.57      0.60      1990
weighted avg       0.89      0.89      0.85      1990

SVM Classifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1732
           1       0.13      1.00      0.23       258

    accuracy                           0.13      1990

### Under Sampling

In [22]:
train, test = test_train_split(final_data)

count_class_0, count_class_1 = train.is_bad.value_counts()

class_0 = train[train['is_bad'] == 0]
class_1 = train[train['is_bad'] == 1]

class_0_under = class_0.sample(count_class_1, replace=True)
train = pd.concat([class_0_under, class_1], axis=0)

print('Random under-sampling:')
print(train.is_bad.value_counts())

log_reg(train, test,"no")
random_forest(train, test)
svm_classifier(train, test)
lightgbm(train, test)

Random under-sampling:
0    1034
1    1034
Name: is_bad, dtype: int64
Logistic Regression
              precision    recall  f1-score   support

           0       0.90      0.68      0.78      1732
           1       0.19      0.50      0.28       258

    accuracy                           0.66      1990
   macro avg       0.55      0.59      0.53      1990
weighted avg       0.81      0.66      0.71      1990

Random Forest Classifier
              precision    recall  f1-score   support

           0       0.91      0.75      0.82      1732
           1       0.23      0.50      0.31       258

    accuracy                           0.72      1990
   macro avg       0.57      0.62      0.57      1990
weighted avg       0.82      0.72      0.76      1990

SVM Classifier
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1732
           1       0.00      0.00      0.00       258

    accuracy                           0.87      199

### SMOTE

In [23]:
from imblearn.over_sampling import SMOTE

X = final_data.loc[:, final_data.columns != 'is_bad']
y = final_data.loc[:, final_data.columns == 'is_bad']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=20)

sm = SMOTE(random_state=42)
X_smote, y_smote = sm.fit_resample(X_train, y_train)

train = pd.concat([X_smote, y_smote], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [24]:
log_reg(train, test,"yes")
random_forest(train, test)
svm_classifier(train, test)
lightgbm(train, test)

Logistic Regression
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1732
           1       0.97      0.15      0.26       258

    accuracy                           0.89      1990
   macro avg       0.93      0.57      0.60      1990
weighted avg       0.90      0.89      0.85      1990

Random Forest Classifier
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1732
           1       0.80      0.02      0.03       258

    accuracy                           0.87      1990
   macro avg       0.84      0.51      0.48      1990
weighted avg       0.86      0.87      0.81      1990

SVM Classifier
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1732
           1       0.13      1.00      0.23       258

    accuracy                           0.13      1990
   macro avg       0.06      0.50      0.11      1990
weighted avg  