To Do:
* Confusion matrix diagram for best model
* Add explanation for why to use SMOTE, options of over and/or under sampling.

# Library Imports

In [3]:
# Dataframes
import pandas as pd
import numpy as np

# Graphing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Data Preparation
    # Train:Test
from sklearn.model_selection import train_test_split

# Model Tuning and Cross Validation
from imblearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV

# Model metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve

# SMOTE
from imblearn.over_sampling import SMOTE

# Classifiers
from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier 
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier 

import itertools

# # KNN
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn import metrics
# from scipy.spatial import distance
# # Logistic Regression

# from sklearn.linear_model import LogisticRegression
#     # Statsmodel
# import statsmodels.api as sm
# from patsy import dmatrices

# # Random Forest
# from sklearn.tree import DecisionTreeClassifier  # Decision Tree
# from sklearn.ensemble import BaggingClassifier, RandomForestClassifier  #Bagging & Random Forest
# from sklearn import tree
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
#     # Visuals for Random Forest
# from sklearn.externals.six import StringIO
# from IPython.display import Image
# from sklearn.tree import export_graphviz

# # SVC
# from sklearn.svm import SVC
# from sklearn.metrics import classification_report, confusion_matrix , accuracy_score

# Import Cleaned Data 

In [4]:
# Load the cleaned (unscaled) transactions from the gzip-compressed CSV
# and preview the first three rows.
cleaned_path = '../data/processed/cleaned_dataframe.gz'
df = pd.read_csv(cleaned_path, compression='gzip')
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [5]:
# Load the cleaned transactions with scaled Amount/Time columns from the
# gzip-compressed CSV and preview the first three rows.
scaled_path = '../data/processed/cleaned_dataframe_scaled.gz'
df_scaled = pd.read_csv(scaled_path, compression='gzip')
df_scaled.head(3)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,Scaled_Amount,Scaled_Time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.774718,-0.99529
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.26853,-0.99529
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,4.959811,-0.995279


# Train:Test Split

## Without Scaling

In [6]:
# Target is the 'Class' label; features are every remaining column.
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 30% of the rows for testing, stratified on the label so both
# splits keep the same class distribution; fixed seed for reproducibility.
split_options = dict(test_size=0.3, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print("No. of samples in each training set:\t{}".format(n_train))
print("No. of samples in each test set:\t{}".format(n_test))

No. of samples in each training set:	198608
No. of samples in each test set:	85118


We choose to stratify as we want to keep the distribution of classes the same in the training set as the test set. 

## With Scaling

In [7]:
# Separate the scaled features from the 'Class' target label.
X_sc = df_scaled.drop(['Class'], axis=1)
y_sc = df_scaled['Class']

# 70:30 split, stratified on the label so both splits keep the same class
# distribution; fixed seed for reproducibility.
X_train_sc, X_test_sc, y_train_sc, y_test_sc = train_test_split(X_sc, y_sc,
                                                                test_size=0.3,
                                                                stratify=y_sc,
                                                                random_state=1)

# Report the sizes of the *scaled* splits created in this cell.
# (Previously this printed X_train / X_test, i.e. the unscaled splits, so the
# output did not actually verify the split made above.)
print("No. of samples in each training set:\t{}".format(X_train_sc.shape[0]))
print("No. of samples in each test set:\t{}".format(X_test_sc.shape[0]))

No. of samples in each training set:	198608
No. of samples in each test set:	85118


# Baseline Model & Model Metrics

The Baseline Model predicts only the majority class (non-fraud) for every transaction. Because the dataset is highly imbalanced, this baseline already achieves a very high accuracy.

In [26]:
# Baseline: always predict the majority class (non-fraud, Class == 0).
# Its accuracy is therefore the share of non-fraud rows in the dataset.
num_of_transactions = len(df)
# Count of rows labelled 0 (non-fraud). This was previously stored under the
# misleading name `num_of_frauds`, although it never counted frauds.
num_of_non_frauds = len(df[df['Class']==0])
accuracy = round((num_of_non_frauds/num_of_transactions)*100, 2)
explanation = 'A Baseline Model, where we say all transactions are not fraud, gives us an accuracy of'
print(explanation, '{}%'.format(accuracy))

A Baseline Model, where we say all transactions are not fraud, gives us an accuracy of 99.83%


A good model will catch as many frauds as possible while minimizing the number of transactions flagged as fraud that are actually legitimate.

If we want to know how well a model predicts the minority class (fraud), F1 is the most useful metric. F1 is the harmonic mean of *Precision* and *Recall*:

### Precision 
\begin{equation*}
\frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}
\end{equation*}

### Recall
\begin{equation*}
\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}
\end{equation*}

### F1 Score
\begin{equation*}
2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
\end{equation*}

We also looked at Precision-Recall curves to get a sense of how well the model captures true positives. A PR curve plots the Precision and Recall scores across different thresholds for a positive identification. This helps us to see whether a model captures true positives while minimizing false positives and false negatives.

# Techniques Used

Here are the techniques we used to get the best model:

* Balancing the data with over-sampling 

> We used the imblearn library's Synthetic Minority Oversampling Technique (SMOTE) to generate additional instances of fraud between the existing ones and their k-nearest neighbors. 

* A Random Forest classifier

> A Random Forest classifier uses an ensemble of decision trees that are trained using different portions of the data and different combinations of features. The forest predicts the class that the majority of its trees predict.

* Using class weights

> Class weights ensure that there is a higher penalty for misclassifying instances of fraud. 

Before settling on these techniques, we compared three class-balancing methods using a number of candidate classifiers. Here is a summary of the resulting F1 scores:


| Classifier           | Oversampled minority | Undersampled majority class | Mix of Both |
|----------------------|----------------------|-----------------------------|-------------|
|Random Forest         | .63                  | .14                         | .27         | 
|Logistic Regression   | .04                  | .03                         | .05         |
|K-nearest Neighbors   | .27                  | .14                         | .06         |
|Support Vector Machine| .06                  | .04                         | .09         |
|Gradient Boosting     | .30                  | .07                         | .14         |

# Gaussian Naive Bayes

In [14]:
# Build a pipeline that applies SMOTE and then fits a Gaussian Naive Bayes
# classifier, and train it on the unscaled training split.
smote = SMOTE(random_state=1)
classifier = GaussianNB()

pipe = make_pipeline(smote, classifier)
model = pipe.fit(X_train, y_train)

# Predict on both splits so train and test scores can be compared.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# print scores
rounding = 4
scorers = (metrics.accuracy_score,
           metrics.precision_score,
           metrics.recall_score,
           metrics.f1_score)
# Label templates are kept verbatim (including tab padding) so the printed
# columns line up exactly as before.
train_templates = ('Train Accuracy: \t{}',
                   'Train Precision:\t{}',
                   'Train Recall:\t\t{}',
                   'Train F1:\t\t{}')
test_templates = ('Test Accuracy:\t\t{}',
                  'Test Precision:\t\t{}',
                  'Test Recall:\t\t{}',
                  'Test F1:\t\t{}')

for template, scorer in zip(train_templates, scorers):
    print(template.format(str(round(scorer(y_train, train_predictions), rounding))))
print('\n')
for template, scorer in zip(test_templates, scorers):
    print(template.format(str(round(scorer(y_test, test_predictions), rounding))))

Train Accuracy: 	0.9927
Train Precision:	0.1576
Train Recall:		0.7825
Train F1:		0.2624


Test Accuracy:		0.9925
Test Precision:		0.1406
Test Recall:		0.6831
Test F1:		0.2332


# KNN