## Importing Libraries ##

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import random
import time

In [4]:
random.seed(100)

In [5]:
dataset=pd.read_csv('financial_data.csv')

## Feature Engineering ##

In [6]:
dataset = dataset.drop(columns = ['months_employed'])

In [7]:
dataset['personal_account_months']=(dataset.personal_account_m + (dataset.personal_account_y * 12))

In [8]:
dataset[['personal_account_m','personal_account_y','personal_account_months']].head()

Unnamed: 0,personal_account_m,personal_account_y,personal_account_months
0,6,2,30
1,2,7,86
2,7,1,19
3,2,7,86
4,2,8,98


In [9]:
dataset=dataset.drop(columns=['personal_account_m','personal_account_y'])

## One Hot Encoding ##

In [11]:
dataset = pd.get_dummies(dataset)

In [12]:
dataset.columns

Index(['entry_id', 'age', 'home_owner', 'income', 'years_employed',
       'current_address_year', 'has_debt', 'amount_requested', 'risk_score',
       'risk_score_2', 'risk_score_3', 'risk_score_4', 'risk_score_5',
       'ext_quality_score', 'ext_quality_score_2', 'inquiries_last_month',
       'e_signed', 'personal_account_months', 'pay_schedule_bi-weekly',
       'pay_schedule_monthly', 'pay_schedule_semi-monthly',
       'pay_schedule_weekly'],
      dtype='object')

In [13]:
dataset=dataset.drop(columns=['pay_schedule_semi-monthly'])

## Removing Extra Columns ##

In [14]:
response = dataset['e_signed']
users = dataset['entry_id']
dataset=dataset.drop(columns = ["e_signed","entry_id"])

## Splitting into Train and Test Set ##

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                    response,
                                                    test_size=0.2,
                                                   random_state=0)

## Feature Scaling ##

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
sc_X = StandardScaler()

In [18]:
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))

In [19]:
X_test2 = pd.DataFrame(sc_X.fit_transform(X_test))

In [20]:
X_train2.columns = X_train.columns.values

In [21]:
X_test2.columns = X_test.columns.values

In [22]:
X_train2.index = X_train.index.values

In [23]:
X_test2.index = X_test.index.values

In [24]:
X_train = X_train2

In [25]:
X_test = X_test2

## Model Building ##

### Logistic Regression ##

In [26]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0, penalty= 'l2')
classifier.fit(X_train,y_train)

LogisticRegression(random_state=0)

In [27]:
y_pred = classifier.predict(X_test)

In [28]:
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score

In [29]:
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)

In [30]:
results=pd.DataFrame([['Linear Regression(Lasso)',acc,prec,rec,f1]],
            columns =['Model','Accuracy','Precision','Recall','F1 Score'])

In [31]:
print(results)

                      Model  Accuracy  Precision    Recall  F1 Score
0  Linear Regression(Lasso)  0.563372   0.577844  0.700726   0.63338


## Support Vector Machine ##

In [32]:
from sklearn.svm import SVC
classifier = SVC(random_state = 0, kernel='linear')
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
model_results=pd.DataFrame([['SVM (Linear)',acc,prec,rec,f1]],
            columns =['Model','Accuracy','Precision','Recall','F1 Score'])
results= results.append(model_results,ignore_index=True)
print(results)

                      Model  Accuracy  Precision    Recall  F1 Score
0  Linear Regression(Lasso)  0.563372   0.577844  0.700726  0.633380
1              SVM (Linear)  0.568398   0.578536  0.729772  0.645413


In [33]:
from sklearn.svm import SVC
classifier = SVC(random_state = 0, kernel='rbf')
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
model_results1=pd.DataFrame([['SVM (rbf)',acc,prec,rec,f1]],
            columns =['Model','Accuracy','Precision','Recall','F1 Score'])
results= results.append(model_results1,ignore_index=True)
print(results)

                      Model  Accuracy  Precision    Recall  F1 Score
0  Linear Regression(Lasso)  0.563372   0.577844  0.700726  0.633380
1              SVM (Linear)  0.568398   0.578536  0.729772  0.645413
2                 SVM (rbf)  0.592686   0.607519  0.687241  0.644926


In [34]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state = 0, n_estimators=100,
                                   criterion = 'entropy')
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
model_results2=pd.DataFrame([['Random Forest n=100',acc,prec,rec,f1]],
            columns =['Model','Accuracy','Precision','Recall','F1 Score'])
results= results.append(model_results2,ignore_index=True)
print(results)

                      Model  Accuracy  Precision    Recall  F1 Score
0  Linear Regression(Lasso)  0.563372   0.577844  0.700726  0.633380
1              SVM (Linear)  0.568398   0.578536  0.729772  0.645413
2                 SVM (rbf)  0.592686   0.607519  0.687241  0.644926
3       Random Forest n=100  0.623953   0.643741  0.674793  0.658901


## K-fold Cross Validation ##

In [35]:
from sklearn.model_selection import cross_val_score

In [36]:
accuracies = cross_val_score(estimator = classifier, X=X_train,
                             y=y_train, cv= 10)

In [37]:
print("Random Forest Classifier Accuracy: %0.2f (+/- %0.2f)" % (accuracies.mean(),accuracies.std() * 2))

Random Forest Classifier Accuracy: 0.63 (+/- 0.03)


## Parameter Tuning ##

In [41]:
parameters = {"max_depth": [3, None],
              "max_features": [1,5,10],
              'min_samples_split': [2,5,10],
              "bootstrap": [True, False],
              "criterion": ["entropy"]}

In [42]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier,
                          param_grid = parameters,
                          scoring = "accuracy",
                          cv = 10,
                          n_jobs = -1)

In [43]:
t0 = time.time()
grid_search = grid_search.fit(X_train,y_train)
t1=time.time()
print("Took %0.2f Seconds" % (t1-t0))

Took 825.32 Seconds


In [44]:
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy,rf_best_parameters

(0.6329763417762478,
 {'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 5,
  'min_samples_split': 5})

In [46]:
parameters = {"max_depth": [None],
              "max_features": [3,5,7],
              'min_samples_split': [3,5,7],
              "bootstrap": [True],
              "criterion": ["entropy"]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier,
                          param_grid = parameters,
                          scoring = "accuracy",
                          cv = 10,
                          n_jobs = -1)
t0 = time.time()
grid_search = grid_search.fit(X_train,y_train)
t1=time.time()
print("Took %0.2f Seconds" % (t1-t0))

Took 247.50 Seconds


In [47]:
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy,rf_best_parameters

(0.6366070906446998,
 {'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 7,
  'min_samples_split': 5})

In [48]:
parameters = {"max_depth": [None],
              "max_features": [3,5,7],
              'min_samples_split': [6,7,8],
              "bootstrap": [True],
              "criterion": ["entropy"]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier,
                          param_grid = parameters,
                          scoring = "accuracy",
                          cv = 10,
                          n_jobs = -1)
t0 = time.time()
grid_search = grid_search.fit(X_train,y_train)
t1=time.time()
print("Took %0.2f Seconds" % (t1-t0))

Took 240.94 Seconds


In [49]:
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy,rf_best_parameters

(0.6357692967443384,
 {'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 7,
  'min_samples_split': 7})

In [51]:
y_pred = grid_search.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
model_results2=pd.DataFrame([['Random Forest (n=100, GSx3 + ENTROPY)',acc,prec,rec,f1]],
            columns =['Model','Accuracy','Precision','Recall','F1 Score'])
results= results.append(model_results2,ignore_index=True)
print(results)

                                   Model  Accuracy  Precision    Recall  \
0               Linear Regression(Lasso)  0.563372   0.577844  0.700726   
1                           SVM (Linear)  0.568398   0.578536  0.729772   
2                              SVM (rbf)  0.592686   0.607519  0.687241   
3                    Random Forest n=100  0.623953   0.643741  0.674793   
4  Random Forest (n=100, GSx2 + ENTROPY)  0.636795   0.653151  0.693465   

   F1 Score  
0  0.633380  
1  0.645413  
2  0.644926  
3  0.658901  
4  0.672704  


In [52]:
parameters = {"max_depth": [3, None],
              "max_features": [1,5,10],
              'min_samples_split': [2,5,10],
              "bootstrap": [True, False],
              "criterion": ["gini"]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier,
                          param_grid = parameters,
                          scoring = "accuracy",
                          cv = 10,
                          n_jobs = -1)
t0 = time.time()
grid_search = grid_search.fit(X_train,y_train)
t1=time.time()
print("Took %0.2f Seconds" % (t1-t0))
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy,rf_best_parameters
parameters = {"max_depth": [None],
              "max_features": [3,5,7],
              'min_samples_split': [3,5,7],
              "bootstrap": [True],
              "criterion": ["gini"]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier,
                          param_grid = parameters,
                          scoring = "accuracy",
                          cv = 10,
                          n_jobs = -1)
t0 = time.time()
grid_search = grid_search.fit(X_train,y_train)
t1=time.time()
print("Took %0.2f Seconds" % (t1-t0))
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy,rf_best_parameters
parameters = {"max_depth": [None],
              "max_features": [3,5,7],
              'min_samples_split': [6,7,8],
              "bootstrap": [True],
              "criterion": ["gini"]}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = classifier,
                          param_grid = parameters,
                          scoring = "accuracy",
                          cv = 10,
                          n_jobs = -1)
t0 = time.time()
grid_search = grid_search.fit(X_train,y_train)
t1=time.time()
print("Took %0.2f Seconds" % (t1-t0))
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy,rf_best_parameters
y_pred = grid_search.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
model_results2=pd.DataFrame([['Random Forest (n=100, GSx3 + GINI)',acc,prec,rec,f1]],
            columns =['Model','Accuracy','Precision','Recall','F1 Score'])
results= results.append(model_results2,ignore_index=True)
print(results)

Took 627.18 Seconds
Took 186.70 Seconds
Took 165.24 Seconds
                                   Model  Accuracy  Precision    Recall  \
0               Linear Regression(Lasso)  0.563372   0.577844  0.700726   
1                           SVM (Linear)  0.568398   0.578536  0.729772   
2                              SVM (rbf)  0.592686   0.607519  0.687241   
3                    Random Forest n=100  0.623953   0.643741  0.674793   
4  Random Forest (n=100, GSx2 + ENTROPY)  0.636795   0.653151  0.693465   
5     Random Forest (n=100, GSx3 + GINI)  0.623953   0.642612  0.678942   

   F1 Score  
0  0.633380  
1  0.645413  
2  0.644926  
3  0.658901  
4  0.672704  
5  0.660277  


## Formatting Final Results ##

In [53]:
final_results = pd.concat([y_test,users], axis = 1).dropna()
final_results['predictions'] = y_pred
final_results = final_results[['entry_id','e_signed','predictions']]

In [54]:
final_results

Unnamed: 0,entry_id,e_signed,predictions
8,6493191,1.0,0
9,8908605,1.0,1
12,6889184,1.0,1
16,9375601,0.0,1
18,8515555,1.0,1
...,...,...,...
17881,5028251,1.0,1
17888,8958068,0.0,0
17890,3605941,0.0,1
17901,1807355,0.0,0
