# PS Machine Learning Project - Financial Transactions

This notebook uses the scikit-learn implementation of the Gaussian Naive Bayes model to predict whether a financial transaction is fraudulent.

In [18]:
# import libraries

import pandas as pd
import numpy as np
import os
import pathlib
import pickle

import sklearn
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [19]:
# Load the transactions data set (the one from OLAT) and separate the
# feature matrix X from the target column y.

# use the shared data directory when it exists, otherwise the working directory
shared_dir = "/data/mlproject22"
path = shared_dir if os.path.exists(shared_dir) else "."
csv_archive = os.path.join(path, "transactions.csv.zip")
train_data = pd.read_csv(csv_archive)

y = train_data["Class"]
X = train_data.drop(columns=["Class"])

# class balance: rows labelled 1 are counted as fraud, rows labelled 0 as non-fraud
fraud = y.eq(1).sum()
no_fraud = y.eq(0).sum()

# print number of fraudulent vs. non-fraudulent data points
print(fraud)
print(no_fraud)

394
227451


In [20]:
# Hold out 30% of the rows as test data and train on the remaining 70%.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
# Baseline: Gaussian Naive Bayes with default hyper-parameters.
gnb = GaussianNB()

# Fit on the training split, then label the held-out split (very quick).
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Error counts on the test split. Assuming 0/1 labels, y_test < y_pred marks a
# false positive and y_test > y_pred marks a false negative.
n_test = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()
n_false_pos = (y_test < y_pred).sum()
n_false_neg = (y_test > y_pred).sum()

print("Number of mislabeled points out of a total %d points : %d" % (n_test, n_mislabeled))
print("Number of false positives out of a total %d points : %d" % (n_test, n_false_pos))
print("Number of false negatives out of a total %d points : %d" % (n_test, n_false_neg))

# Output recorded from an earlier run:
#   Number of mislabeled points out of a total 68354 points : 426
#   Number of false positives out of a total 68354 points : 393
#   Number of false negatives out of a total 68354 points : 33

Number of mislabeled points out of a total 68354 points : 426
Number of false positives out of a total 68354 points : 393
Number of false negatives out of a total 68354 points : 33


In [22]:
# Model accuracy: share of test transactions that were classified correctly.
baseline_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)

# NOTE(review): the AUC below is computed from the hard class predictions in
# y_pred rather than from predict_proba scores — confirm this is intended.
baseline_auc = roc_auc_score(y_test, y_pred)
print(baseline_auc)

Accuracy: 0.993767738537613
0.8414604173900735


# Gaussian Naive Bayes using 10-fold cross validation

In [48]:
# Tune var_smoothing for Gaussian Naive Bayes with a 10-fold cross-validated
# grid search (much slower than the single baseline fit).

# 100 candidate values, log-spaced from 1e0 down to 1e-9
param_grid_nb = {"var_smoothing": np.logspace(0, -9, num=100)}

gnb_cv = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_nb,
    cv=10,
    verbose=1,
)
gnb_cv.fit(X_train, y_train)
print(gnb_cv.best_estimator_)

# An earlier run printed:
#   Fitting 10 folds for each of 100 candidates, totalling 1000 fits
#   GaussianNB(priors=None, var_smoothing=1.0)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
GaussianNB(var_smoothing=2.848035868435799e-08)


In [99]:
# Evaluate the grid-searched model on the held-out test split.
y_pred2 = gnb_cv.predict(X_test)

# Same error breakdown as for the baseline model. Assuming 0/1 labels,
# y_test < y_pred2 is a false positive and y_test > y_pred2 a false negative.
n_test_cv = X_test.shape[0]
n_mislabeled_cv = (y_test != y_pred2).sum()
n_false_pos_cv = (y_test < y_pred2).sum()
n_false_neg_cv = (y_test > y_pred2).sum()

print("Number of mislabeled points out of a total %d points : %d" % (n_test_cv, n_mislabeled_cv))
print("Number of false positives out of a total %d points : %d" % (n_test_cv, n_false_pos_cv))
print("Number of false negatives out of a total %d points : %d" % (n_test_cv, n_false_neg_cv))

# reference point: how many test rows carry the label 1 at all
n_fraud_test = (y_test == 1).sum()
print("Number of fraudulent transactions in test data :", n_fraud_test)

# print accuracy
cv_accuracy = metrics.accuracy_score(y_test, y_pred2)
print("Accuracy:", cv_accuracy)

# NOTE(review): as with the baseline, the AUC is computed from hard class
# predictions rather than probability scores — confirm this is intended.
cv_auc = roc_auc_score(y_test, y_pred2)
print(cv_auc)

Number of mislabeled points out of a total 68354 points : 78
Number of false positives out of a total 68354 points : 2
Number of false negatives out of a total 68354 points : 76
Number of fraudulent transactions in test data : 106
Accuracy: 0.99885888170407
0.6414947815182365


In the second attempt, false negatives account for most of the misclassifications,
whereas in the first attempt false positives did. The number of false negatives is
also higher than before (76 vs. 33), even though there are fewer misclassifications
overall (78 vs. 426). I therefore think the first version actually performs better,
because it is more reliable at classifying actual fraud as such: the cross-validated
model found only 30 of the 106 fraudulent transactions in the test data, and it takes far longer to train.

Also, the ROC AUC score is lower than for the initial implementation (about 0.64 vs. 0.84).

In [72]:
# Remove highly correlated features from the data set: for every pair of
# features whose absolute correlation exceeds 0.95, one feature of the pair is
# dropped. Correlated features have a negative impact on Naive Bayes models.
# In the recorded run there appeared to be none to remove, which might help
# explain the high accuracy of the model.

from sklearn.covariance import empirical_covariance  # NOTE(review): not used in this cell

cor_matrix = X.corr().abs()
#print(cor_matrix)

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# feature pair is inspected once. The builtin `bool` replaces the `np.bool`
# alias, which is deprecated since NumPy 1.20 and absent from newer releases.
upper_mask = np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1)
upper_tri = cor_matrix.where(upper_mask)

# Column labels that are too strongly correlated with an earlier column.
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
#print(); print(to_drop)

# `to_drop` holds column labels, so drop by label. Indexing `X.columns` with
# these labels (as before) fails as soon as the list is non-empty.
X_reduced = X.drop(columns=to_drop)
print(); print(X_reduced.head())


       Time  Feature0  Feature1  Feature2  Feature3  Feature4  Feature5  \
0   12187.0  1.127257  0.170387  1.675702  1.662017 -1.093046 -0.447651   
1  149717.0 -0.723098 -1.307087  1.119492 -2.486829 -1.781857  0.382495   
2   72288.0  1.357358 -0.802677  1.135552 -0.490788 -1.672022 -0.509976   
3  168435.0  1.891806 -0.123111 -1.791275  0.342303  0.235308 -0.723866   
4   55416.0 -0.378806  0.449422  0.154983 -0.899310 -0.678177 -1.419243   

   Feature6  Feature7  Feature8  ...  Feature19  Feature20  Feature21  \
0 -0.590031 -0.071291  2.015259  ...  -0.150918  -0.170890   0.009832   
1  0.221389 -0.021550 -1.964369  ...  -0.405059  -0.457047  -0.980797   
2 -1.192288  0.044009 -0.001600  ...   0.034062   0.299976   0.911314   
3  0.082800 -0.114664  0.767009  ...  -0.058299  -0.186402  -0.408728   
4  0.130648  0.087002  0.509679  ...  -0.048294   0.053215   0.266919   

   Feature22  Feature23  Feature24  Feature25  Feature26  Feature27  Amount  
0   0.066699   0.877103   0.350

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))


Save and load the Naive Bayes Model:

In [5]:
# Save the baseline model (gnb) to disk.
filename = 'naive_bayes_model.sav'

# the context manager closes the file handle even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(gnb, model_file)

In [6]:
# Load the model from disk and score it on the test split.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

result = loaded_model.score(X_test, y_test)
print(result)

0.993767738537613
