## Preprocessed Data for Machine Learning (also see separate Data Cleaning file)

In [1]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this line is
# suppressed for the rest of the session.
# NOTE(review): this also hides library deprecation notices — confirm that
# silencing everything (rather than specific categories) is intended.
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the cleaned credit-risk CSV into a DataFrame, then preview
# its first five rows.

df = pd.read_csv(filepath_or_buffer='../Resources/cleaned_credit_risk.csv')
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,2,5.0,1,1,1000,11.14,0,0.1,1,2
1,25,9600,0,1.0,3,2,5500,12.87,1,0.57,1,3
2,23,65500,1,4.0,3,2,35000,15.23,1,0.53,1,2
3,24,54400,1,8.0,3,2,35000,14.27,1,0.55,0,4
4,21,9900,2,2.0,5,0,2500,7.14,1,0.25,1,2


In [3]:
# Labels (y): the loan_status column is what the model will learn to predict.
# target_names are the display names for the two label values
# (presumably 0 = "non default", 1 = "default" -- verify against the data-cleaning step).

target_names = ["non default", "default"]
target = df.loc[:, "loan_status"]

In [4]:
# Features (X): every column of df except the loan_status label.

data = df.drop(columns=["loan_status"])
feature_names = data.columns
data.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,2,5.0,1,1,1000,11.14,0.1,1,2
1,25,9600,0,1.0,3,2,5500,12.87,0.57,1,3
2,23,65500,1,4.0,3,2,35000,15.23,0.53,1,2
3,24,54400,1,8.0,3,2,35000,14.27,0.55,0,4
4,21,9900,2,2.0,5,0,2500,7.14,0.25,1,2


In [5]:
# Sanity check: the feature matrix and the label vector should have the same row count.
print(*(frame.shape for frame in (data, target)))

(28632, 11) (28632,)


## Split the data into training and testing sets

In [6]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test parts (default split size).
# stratify=target keeps the label proportions the same in both parts;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    stratify=target,
    random_state=42,
)

X_train.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
7029,24,37200,1,8.0,1,0,7200,7.9,0.19,1,2
9445,23,34000,1,7.0,1,1,10000,10.36,0.29,1,4
2930,24,36000,0,3.0,4,1,4750,11.83,0.13,1,3
28116,36,38568,1,7.0,1,4,2400,18.39,0.06,0,13
23372,28,28896,1,6.0,4,2,15000,13.48,0.52,0,8


In [7]:
# Report the shape of each of the four pieces produced by the split.
print(" ".join(str(part.shape) for part in (X_train, X_test, y_train, y_test)))

(21474, 11) (7158, 11) (21474,) (7158,)


## Scale (standardize) the features with StandardScaler

In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, so the test set has no
# influence on the scaling statistics.
X_scaler = StandardScaler()
X_scaler.fit(X_train);  # trailing ';' keeps the cell output empty, as before

In [9]:
# Apply the already-fitted scaler to the training split and then the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Fit the model to the scaled training data and make predictions on the scaled test data

In [10]:
# Train a support vector classifier with a linear kernel on the scaled
# training features; the fitted estimator is the cell's displayed value.
from sklearn.svm import SVC

model = SVC(kernel="linear")
model.fit(X=X_train_scaled, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [11]:
# Score the fitted classifier on the scaled test split and print it
# to three decimal places.
test_accuracy = model.score(X_test_scaled, y_test)
print('Test Acc: %.3f' % test_accuracy)

Test Acc: 0.839


In [12]:
# Predict on the scaled test split and print the per-class
# precision / recall / F1 report, labelled with target_names.
from sklearn.metrics import classification_report

predictions = model.predict(X_test_scaled)
report = classification_report(
    y_true=y_test,
    y_pred=predictions,
    target_names=target_names,
)
print(report)

              precision    recall  f1-score   support

 non default       0.85      0.96      0.90      5608
     default       0.73      0.41      0.52      1550

    accuracy                           0.84      7158
   macro avg       0.79      0.68      0.71      7158
weighted avg       0.83      0.84      0.82      7158

