In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, precision_recall_curve, recall_score,accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [2]:
# Load the pre-split feature matrices and labels (paths are relative to the
# notebook's working directory).
X_train = pd.read_csv('X_train')
X_test = pd.read_csv('X_test')
# read_csv always returns a DataFrame. A single-column label frame is a column
# vector, which makes scikit-learn emit DataConversionWarning on every fit, so
# squeeze it down to a 1-D Series. Frames with more than one column are left
# unchanged by squeeze('columns').
y_train = pd.read_csv('y_train').squeeze('columns')
y_test = pd.read_csv('y_test').squeeze('columns')


## 1. Fitting the Models

### Logistic Regression

In [8]:
def run_logisticRegressor(X_train, X_test, y_train, y_test):
    """Fit a default LogisticRegression and print train/test metrics.

    Prints accuracy and recall on the training set, then accuracy, recall and
    the classification report on the test set. Returns None.

    X_train, X_test: feature matrices given to fit/predict.
    y_train, y_test: labels, either 1-D or a single-column frame; they are
        flattened to 1-D before use.
    """
    # A column-vector y (e.g. a one-column DataFrame from pd.read_csv) makes
    # scikit-learn emit DataConversionWarning on fit, so flatten it first.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    print('Train set')
    pred = lr.predict(X_train)
    # accuracy_score on the predictions we already have gives the same value
    # as lr.score(X, y) without running predict a second time.
    print('Logistic Regression Mean Accuracy: {}'.format(accuracy_score(y_train, pred)))
    print('Logistic Regression Recall: {}'.format(recall_score(y_train, pred)))

    print('Test set')
    pred = lr.predict(X_test)
    print('Logistic Regression Mean Accuracy: {}'.format(accuracy_score(y_test, pred)))
    print('Logistic Regression Recall: {}'.format(recall_score(y_test, pred)))

    print(classification_report(y_test, pred))

In [30]:
# Fit and evaluate the logistic-regression model on the loaded train/test data.
run_logisticRegressor(X_train, X_test, y_train, y_test)




  return f(**kwargs)


Train set
Logistic Regression Mean Accuracy: 0.9324888578477432
Logistic Regression Recall: 0.8686356579020094
Test set
Logistic Regression Mean Accuracy: 0.9967463689243121
Logistic Regression Recall: 0.8503401360544217
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.33      0.85      0.47       147

    accuracy                           1.00     85443
   macro avg       0.66      0.92      0.74     85443
weighted avg       1.00      1.00      1.00     85443



### Nearest Neighbors

In [15]:
def run_nearestNeighbors(X_train, X_test, y_train, y_test, n_neighbors=50):
    """Fit a KNeighborsClassifier and print train/test metrics.

    Prints accuracy and recall on the training set, then accuracy, recall and
    the classification report on the test set. Returns None.

    X_train, X_test: feature matrices given to fit/predict.
    y_train, y_test: labels, either 1-D or a single-column frame; they are
        flattened to 1-D before use.
    n_neighbors: number of neighbours used by the classifier (default 50,
        the value previously hard-coded here).
    """
    # A column-vector y (e.g. a one-column DataFrame from pd.read_csv) makes
    # scikit-learn emit DataConversionWarning on fit, so flatten it first.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
    neigh.fit(X_train, y_train)

    print('Train set')
    pred = neigh.predict(X_train)
    # Reuse the predictions for accuracy: neigh.score(X, y) would run the
    # neighbour search a second time and return the same number.
    print('Nearest Neighbors Classifier Mean Accuracy: {}'.format(accuracy_score(y_train, pred)))
    print('Nearest Neighbors Classifier Recall: {}'.format(recall_score(y_train, pred)))

    print('Test set')
    pred = neigh.predict(X_test)
    print('Nearest Neighbors Classifier Mean Accuracy: {}'.format(accuracy_score(y_test, pred)))
    print('Nearest Neighbors Classifier Recall: {}'.format(recall_score(y_test, pred)))

    print(classification_report(y_test, pred))

In [31]:
# Fit and evaluate the nearest-neighbours model on the loaded train/test data.
run_nearestNeighbors(X_train, X_test, y_train, y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


Train set
Nearest Neighbors Classifier Mean Accuracy: 0.9870263643169748
Nearest Neighbors Classifier Recall: 0.9906742572317216
Test set
Nearest Neighbors Classifier Mean Accuracy: 0.9830998443406715
Nearest Neighbors Classifier Recall: 0.8639455782312925
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85296
           1       0.08      0.86      0.15       147

    accuracy                           0.98     85443
   macro avg       0.54      0.92      0.57     85443
weighted avg       1.00      0.98      0.99     85443



### SVM

In [16]:
def run_SVC(X_train, X_test, y_train, y_test):
    """Fit a default SVC and print train/test metrics.

    Prints accuracy and recall on the training set, then accuracy, recall and
    the classification report on the test set. Returns None.

    X_train, X_test: feature matrices given to fit/predict.
    y_train, y_test: labels, either 1-D or a single-column frame; they are
        flattened to 1-D before use.
    """
    # A column-vector y (e.g. a one-column DataFrame from pd.read_csv) makes
    # scikit-learn emit DataConversionWarning on fit, so flatten it first.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    svc = SVC()
    svc.fit(X_train, y_train)

    print('Train set')
    pred = svc.predict(X_train)
    # Score the predictions we already have instead of calling svc.score,
    # which would repeat the prediction pass for the same accuracy value.
    print('Support Vector Classifier Mean Accuracy: {}'.format(accuracy_score(y_train, pred)))
    print('Support Vector Classifier Recall: {}'.format(recall_score(y_train, pred)))

    print('Test set')
    pred = svc.predict(X_test)
    print('Support Vector Classifier Mean Accuracy: {}'.format(accuracy_score(y_test, pred)))
    print('Support Vector Classifier Recall: {}'.format(recall_score(y_test, pred)))

    print(classification_report(y_test, pred))


In [32]:
# Fit and evaluate the support-vector classifier on the loaded train/test data.
run_SVC(X_train, X_test, y_train, y_test)

  return f(**kwargs)


Train set
Support Vector Classifier Mean Accuracy: 0.9466407729915234
Support Vector Classifier Recall: 0.9026223626889895
Test set
Support Vector Classifier Mean Accuracy: 0.9911988109031752
Support Vector Classifier Recall: 0.8163265306122449
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     85296
           1       0.14      0.82      0.24       147

    accuracy                           0.99     85443
   macro avg       0.57      0.90      0.62     85443
weighted avg       1.00      0.99      0.99     85443



### Decision Tree

In [18]:
def run_decisionTree(X_train, X_test, y_train, y_test, random_state=0):
    """Fit a DecisionTreeClassifier and print train/test metrics.

    Prints accuracy and recall on the training set, then accuracy, recall and
    the classification report on the test set. Returns None.

    X_train, X_test: feature matrices given to fit/predict.
    y_train, y_test: labels, either 1-D or a single-column frame; they are
        flattened to 1-D before use.
    random_state: seed passed to the tree so repeated runs give the same
        model (the random-forest and gradient-boosting runs are seeded too).
    """
    # A column-vector y (e.g. a one-column DataFrame from pd.read_csv) makes
    # scikit-learn emit DataConversionWarning on fit, so flatten it first.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    dt = DecisionTreeClassifier(random_state=random_state)
    dt.fit(X_train, y_train)

    print('Train set')
    pred = dt.predict(X_train)
    # accuracy_score on the existing predictions equals dt.score(X, y) but
    # does not predict a second time.
    print('Decision Tree Classifier Mean Accuracy: {}'.format(accuracy_score(y_train, pred)))
    print('Decision Tree Classifier Recall: {}'.format(recall_score(y_train, pred)))

    print('Test set')
    pred = dt.predict(X_test)
    print('Decision Tree Classifier Mean Accuracy: {}'.format(accuracy_score(y_test, pred)))
    print('Decision Tree Classifier Recall: {}'.format(recall_score(y_test, pred)))

    print(classification_report(y_test, pred))


In [33]:
# Fit and evaluate the decision-tree model on the loaded train/test data.
run_decisionTree(X_train, X_test, y_train, y_test)

Train set
Decision Tree Classifier Mean Accuracy: 0.9991834950431868
Decision Tree Classifier Recall: 0.9997085705384913
Test set
Decision Tree Classifier Mean Accuracy: 0.996044146389991
Decision Tree Classifier Recall: 0.7755102040816326
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.27      0.78      0.40       147

    accuracy                           1.00     85443
   macro avg       0.64      0.89      0.70     85443
weighted avg       1.00      1.00      1.00     85443



### Random Forest

In [23]:
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a seeded RandomForestClassifier and print train/test metrics.

    The forest uses 200 trees, max_depth=4 and random_state=39. Prints
    accuracy and recall on the training set, then accuracy, recall and the
    classification report on the test set. Returns None.

    X_train, X_test: feature matrices given to fit/predict.
    y_train, y_test: labels, either 1-D or a single-column frame; they are
        flattened to 1-D before use.
    """
    # A column-vector y (e.g. a one-column DataFrame from pd.read_csv) makes
    # scikit-learn emit DataConversionWarning on fit, so flatten it first.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict(X_train)
    print('Random Forests Mean Accuracy: {}'.format(accuracy_score(y_train, pred)))
    print('Random Forests Recall: {}'.format(recall_score(y_train, pred)))

    print('Test set')
    pred = rf.predict(X_test)
    print('Random Forests Mean Accuracy: {}'.format(accuracy_score(y_test, pred)))
    print('Random Forests Recall: {}'.format(recall_score(y_test, pred)))

    print(classification_report(y_test, pred))


In [34]:
# Fit and evaluate the random-forest model on the loaded train/test data.
run_randomForests(X_train, X_test, y_train, y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


Train set
Random Forests Mean Accuracy: 0.9694426160316352
Random Forests Recall: 0.9487335380039091
Test set
Random Forests Mean Accuracy: 0.9902625141907471
Random Forests Recall: 0.8639455782312925
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     85296
           1       0.14      0.86      0.23       147

    accuracy                           0.99     85443
   macro avg       0.57      0.93      0.61     85443
weighted avg       1.00      0.99      0.99     85443



### Gradient Boosting

In [25]:
def run_gradientboosting(X_train, X_test, y_train, y_test):
    """Fit a seeded GradientBoostingClassifier and print train/test metrics.

    The model uses 100 estimators and random_state=0. Prints accuracy and
    recall on the training set, then accuracy, recall and the classification
    report on the test set. Returns None.

    X_train, X_test: feature matrices given to fit/predict.
    y_train, y_test: labels, either 1-D or a single-column frame; they are
        flattened to 1-D before use.
    """
    # A column-vector y (e.g. a one-column DataFrame from pd.read_csv) makes
    # scikit-learn emit DataConversionWarning on fit, so flatten it first.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    gb = GradientBoostingClassifier(n_estimators=100,random_state=0)
    gb.fit(X_train, y_train)

    print('Train set')
    pred = gb.predict(X_train)
    print('Gradient Boosting Mean Accuracy: {}'.format(accuracy_score(y_train, pred)))
    print('Gradient Boosting Recall: {}'.format(recall_score(y_train, pred)))

    print('Test set')
    pred = gb.predict(X_test)
    print('Gradient Boosting Mean Accuracy: {}'.format(accuracy_score(y_test, pred)))
    print('Gradient Boosting Recall: {}'.format(recall_score(y_test, pred)))

    print(classification_report(y_test, pred))


In [35]:
# Fit and evaluate the gradient-boosting model on the loaded train/test data.
run_gradientboosting(X_train, X_test, y_train, y_test)

  return f(**kwargs)


Train set
Gradient Boosting Mean Accuracy: 0.98089880865646
Gradient Boosting Recall: 0.9724398173038755
Test set
Gradient Boosting Mean Accuracy: 0.9893613286050349
Gradient Boosting Recall: 0.8775510204081632


In [29]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,V3,V4,V10,V11,V12,V14,V16,V17
0,0.227616,0.018083,0.099318,0.188327,0.029645,0.020877,0.742579,0.097345
1,0.469216,0.083397,0.075784,0.019857,0.015978,0.026405,0.125902,0.283255
2,0.026194,0.140226,0.044724,0.974699,0.053992,0.181934,0.140324,0.283255
3,0.117023,0.018083,0.072942,0.095292,0.342553,0.026405,0.330268,0.156723
4,0.068908,0.018083,0.195926,0.051653,0.342553,2.68385,0.125064,0.156723


## 2. Tuning Hyperparameters

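Next, we tune the SVC's hyperparameters (`C` and `gamma` of the RBF kernel) with a randomized search scored on recall, to see whether its results can be improved. Note: in the test-set results above, the SVC is not the top model by recall (0.82, versus 0.88 for Gradient Boosting and 0.86 for Random Forest and Nearest Neighbors), so the premise that SVC is the best-performing model should be re-checked before investing in this search.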

In [None]:
# Search space for the RBF-kernel SVC: 5 values of C x 5 values of gamma.
# NOTE(review): both ranges are linearly spaced; C and gamma are commonly
# searched on a log scale (np.logspace) - confirm this spacing is intended.
param_grid = {'C': [float(x) for x in np.linspace(0.1, 1000, num = 5)],
              'gamma': [float(x) for x in np.linspace(0.01, 1, num = 5)],
              'kernel': ['rbf']}

# Every entry is a finite list, so the space is their Cartesian product
# (25 candidates). Asking RandomizedSearchCV for more iterations than that
# triggers a warning (or an error, depending on the scikit-learn version),
# so cap n_iter at the size of the space.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

svc = SVC()

svc_random = RandomizedSearchCV(estimator=svc,
                                param_distributions=param_grid,
                                n_iter=min(100, n_candidates),
                                cv=3,
                                scoring='recall',
                                n_jobs=-1,
                                random_state=0)  # seeded so the sampled candidates are reproducible

# Flatten the labels: a column-vector y (one-column DataFrame) makes
# scikit-learn emit DataConversionWarning on every fit.
svc_random.fit(X_train, np.ravel(y_train))


