In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import scale, StandardScaler
from sklearn.pipeline import Pipeline

**Import data and create arrays for features and targets**

In [2]:
# import loan data and drop rows with missing values
loans = pd.read_csv('data/clean_data/other_loans.csv', low_memory=True, header=0)
loans = loans.dropna()

In [3]:
# split data to obtain targets and scaled features
y = loans['default'].values
X = scale(loans.drop('default', axis=1).values)

In [4]:
# split data ( training and holdout)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.98, random_state=33)

In [5]:
X_test = X_test[0:5000]
y_test = y_test[0:5000]

**Train svm without parameter tuning**

In [6]:
# instantiate SVM classifier
svm = SVC()

In [7]:
# fit SVM to training data
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
# make predictions on test features
y_pred = svm.predict(X_test)

In [9]:
# compute confusion matrix and classification report
conf_mtrx = confusion_matrix(y_test, y_pred)
class_rprt = classification_report(y_test, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# print results to evaluate model performance
svm_conf_mtrx = pd.DataFrame(conf_mtrx)
svm_conf_mtrx.columns = ['Predict: NO', 'Predict: YES']
svm_conf_mtrx.index = ['Actual: NO', 'Actual: YES']
svm_conf_mtrx.head()

Unnamed: 0,Predict: NO,Predict: YES
Actual: NO,4998,0
Actual: YES,2,0


**Once again the class imbalance hinders the model's ability to correctly predict loans that will default. In this case the model incorrectly predicted NO for all loans. This means all loans in the holdout set were classified as not defaulting.**

**Fix class imbalance with SMOTE from imb-learn**

In [11]:
# inspect class imbalance for default column
unique, count = np.unique(y_train, return_counts=True)
value_counts = {k:v for (k,v) in zip(unique, count)}
value_counts

{0: 17702, 1: 36}

In [12]:
# Applt Synthetic Minority Over-sampling Technique (SMOTE)
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_sample(X_train, y_train)

In [13]:
# inspect balanced training data
unique, count = np.unique(y_train_bal, return_counts=True)
value_counts = {k:v for (k,v) in zip(unique, count)}
value_counts

{0: 17702, 1: 17702}

In [14]:
X_train_bal = scale(X_train_bal)



**Train and score SVM with Balanced classes and default hyperparameters**

In [15]:
# instantiate a new svm classifier
svm = SVC()

In [16]:
# fit the classifier to the balanced training data
svm.fit(X_train_bal, y_train_bal)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
# make predictions on the holdout set
y_pred = svm.predict(X_test)

In [18]:
# compute confusion matrix and classification report
conf_mtrx = confusion_matrix(y_test, y_pred)
class_rprt = classification_report(y_test, y_pred)

In [19]:
# display results to evaluate model performance
svm_conf_mtrx = pd.DataFrame(conf_mtrx)
svm_conf_mtrx.columns = ['Predict: NO', 'Predict: YES']
svm_conf_mtrx.index = ['Actual: NO', 'Actual: YES']
svm_conf_mtrx.head()

Unnamed: 0,Predict: NO,Predict: YES
Actual: NO,4885,113
Actual: YES,2,0


**The goal is to minimize the number of False Positives and Maximize the number of True Positives, meaning we do not want to incorrectly classify loans that will default.**

**Implement GridsearchCV to find best combination of C and gamma hyperparameters**

In [20]:
# instantiate the classifier
clf = SVC()

# define hyperparameter space
c_values = [0.001, 0.01, 0.1, 1]
gamma_values = [0.001, 0.01, 0.1]
param_grid = {'C': c_values, 'gamma': gamma_values}

# create grid-search object
grid_search = GridSearchCV(clf, param_grid, cv=5)

In [21]:
# fit apply gridsearch to SVM with training data
grid_search.fit(X_train_bal, y_train_bal)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1],
                         'gamma': [0.001, 0.01, 0.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [22]:
# identify best C and gamma combination
best_params = grid_search.best_params_
best_params

{'C': 1, 'gamma': 0.1}

In [23]:
# make predictions on the holdout set
y_pred = grid_search.predict(X_test)

# compute confusion matrix and classification report
conf_mtrx = confusion_matrix(y_test, y_pred)
class_rprt = classification_report(y_test, y_pred)

# display results to evaluate model performance
svm_conf_mtrx = pd.DataFrame(conf_mtrx)
svm_conf_mtrx.columns = ['Predict: NO', 'Predict: YES']
svm_conf_mtrx.index = ['Actual: NO', 'Actual: YES']
svm_conf_mtrx.head()

Unnamed: 0,Predict: NO,Predict: YES
Actual: NO,4989,9
Actual: YES,2,0


**Implement Pipeline to preprocess and and fit the model with GridSearch**

In [24]:
# Define pipeline steps
#steps = [('scaler', scale()), ('svm', SVC())]
#steps = [('scaler', StandardScaler()), ('svm', SVC())]
steps = [('svm', SVC())]


# Create pipeline object
pipeline = Pipeline(steps)


# Define parameter grid
#c_values = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
#gamma_values = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
#param_grid = {'C': c_values, 'gamma': gamma_values}
param_grid = dict(svm__C=[0.001, 0.01, 0.1],
                 svm__gamma=[0.001, 0.01, 0.1])

# Run gridsearch cross validation
cv = GridSearchCV(pipeline, param_grid=param_grid)
cv.fit(X_train_bal, y_train_bal)

# make predictions with best params from grid search
y_pred = cv.predict(X_test)

# evaluate model
print(cv.best_params_)
print(cv.score(X_test, y_test))
print(classification_report(y_test, y_pred))

{'svm__C': 0.1, 'svm__gamma': 0.1}
0.9952
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4998
           1       0.00      0.00      0.00         2

    accuracy                           1.00      5000
   macro avg       0.50      0.50      0.50      5000
weighted avg       1.00      1.00      1.00      5000



In [25]:
np.mean(y_pred)

0.0044

In [26]:

# compute confusion matrix and classification report
conf_mtrx = confusion_matrix(y_test, y_pred)
class_rprt = classification_report(y_test, y_pred)

# display results to evaluate model performance
svm_conf_mtrx = pd.DataFrame(conf_mtrx)
svm_conf_mtrx.columns = ['Predict: NO', 'Predict: YES']
svm_conf_mtrx.index = ['Actual: NO', 'Actual: YES']
svm_conf_mtrx.head()

Unnamed: 0,Predict: NO,Predict: YES
Actual: NO,4976,22
Actual: YES,2,0
