# Project: Fraud Detection 

## 1. Overview

### PaySim simulates mobile money transactions based on a sample of real transactions extracted from one month of financial logs from a mobile money service implemented in an African country. The original logs were provided by a multinational company that provides the mobile financial service, which currently runs in more than 14 countries around the world. The objective of the project is to predict whether a transaction is fraudulent or not.

## 2. Preprocess the data

### Libraries

In [None]:
# libraries: mathematical computing 
import numpy as np
import pandas as pd

# libraries: sklearn
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split

# libraries: pyspark sql
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from  pyspark.sql.functions import monotonically_increasing_id, desc, row_number

# libraries: pyspark machine learning
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.stat import Statistics

# libraries: visualization
import seaborn as sb
import matplotlib.pyplot as mpt
import functools
from collections import Counter

In [None]:
# global variables
# NOTE: a `global` statement at module (notebook-cell) level has no effect:
# it neither creates nor initialises `df_bank` or `results`.

global df_bank, results 

#### We´ll use PySpark to preprocess the data.

In [None]:
# build (or reuse) the SparkSession used by every cell of this notebook

session_builder = SparkSession.builder.appName("FraudDetection")
spark = session_builder.getOrCreate()
spark

In [None]:
# load the raw CSV (first line is the header) into a Spark dataframe

csv_reader = spark.read
df = csv_reader.csv('fraudDetection.csv', header=True)

#### Now, we´ll convert this "df" dataframe into a parquet file using the following method of pyspark. The file will be named "fraudDetection.parquet"

In [None]:
# Write next to the notebook, at the same relative path the following read
# uses; "overwrite" lets the notebook be re-run without failing on an
# already existing output directory.
df.write.mode("overwrite").parquet("fraudDetection.parquet")

#### Now, we´ll read the file as a parquet file. The calculation will be faster.

In [None]:
df_bank_par = spark.read.parquet("fraudDetection.parquet")

In [None]:
df_bank_par.show(10)

#### Let's take a look at the first 10 rows of the data.

In [None]:
df_bank_par.show(10)

In [None]:
df_bank_par.printSchema()

#### There are 11 columns, some of them are numerical and others are categorical. Let´s count the number of registers.

In [None]:
# the count is interpolated into the f-string (the f-prefix had no placeholder before)
print(f"The total number of registers is: {df_bank_par.count()}")

#### We have more than six million transactions in the dataset.

### 2.1 Feature Engineering

#### First, we'll derive a new variable from the existing columns.

In [None]:
### 2.1.1.- creation of a new variable: type2
### (first character of "nameOrig" followed by first character of "nameDest")

origin_prefix = f.substring("nameOrig", 1, 1)
destination_prefix = f.substring("nameDest", 1, 1)
df_type2 = df_bank_par.withColumn("type2", f.concat(origin_prefix, destination_prefix))

In [None]:
df_type2.show(5)

#### We´ve created a new column named "type2" which is composed by the first character of the column "nameOrig" and the first character of the column "nameDest"

In [None]:
### 2.1.2.1.- One Hot Encoding: column "type"

df_type2.show(3)

#### We´ll use some libraries of Spark for Machine Learning (SparkML).

In [None]:
### StringIndexer Initialization
### column: type

indexer_type = StringIndexer(inputCol="type",outputCol="types_indexed")
indexerModel_type = indexer_type.fit(df_type2)


In [None]:
### Transform the DataFrame using the fitted StringIndexer model

indexed_df_type2 = indexerModel_type.transform(df_type2)
indexed_df_type2.show(10)

#### Here, we´ve set each of the elements of the "type" column into indexes.

In [None]:
### one-hot encode "types_indexed" into the vector column "types_onehot";
### dropLast=False keeps one slot per category

encoder_type = OneHotEncoder(
    inputCol="types_indexed",
    outputCol="types_onehot",
    dropLast=False,
)
encoder_type_model = encoder_type.fit(indexed_df_type2)
encoder_type_df = encoder_type_model.transform(indexed_df_type2)
encoder_type_df.show(truncate=False)


In [None]:
encoder_type_df.printSchema()

In [None]:
# append the one-hot vector as a plain array column so it can be indexed per position
onehot_as_array = vector_to_array('types_onehot')
encoder_type_df_split = encoder_type_df.withColumn('types_onehot_split', onehot_as_array)
encoder_type_df_split.show(5)

In [None]:
### now, we'll split the "types_onehot_split" into one column per category

# The encoder was built with dropLast=False, so the one-hot array has one
# slot per StringIndexer label: the label list gives the width directly,
# without running a Spark job just to read the first row.
type_labels = indexerModel_type.labels
num_categories = len(type_labels)
cols_expanded = [f.col('types_onehot_split')[i].alias(label) for i, label in enumerate(type_labels)]
type_df = encoder_type_df_split.select('*', *cols_expanded)


In [None]:
type_df.show(100)

#### We´ve applied One-Hot-Encoding to the column "type" resulting in five new columns:
+ CASH_OUT
+ CASH_IN
+ PAYMENT
+ TRANSFER 
+ DEBIT

#### Now, we´ll apply this procedure to the column "type2".

In [None]:
### 2.1.2.2.- One Hot Encoding: column "type2"

type_df.show(5)

In [None]:
### StringIndexer Initialization
### column: type2

indexer_type = StringIndexer(inputCol="type2",outputCol="types_indexed2")
indexerModel_type = indexer_type.fit(type_df)

In [None]:
### Transform the DataFrame using the fitted StringIndexer model

indexed_df_type = indexerModel_type.transform(type_df)
indexed_df_type.show(10)

In [None]:
### apply One-Hot-Encoding to the indexed column, that is, 
### "types_indexed2"

encoder_type2 = OneHotEncoder(dropLast=False, inputCol="types_indexed2", outputCol="types_onehot2")
encoder_type2_df = encoder_type2.fit(indexed_df_type).transform(indexed_df_type)
encoder_type2_df.show(truncate=False)

In [None]:
encoder_type2_df.printSchema()

In [None]:
encoder_type2_df_split = encoder_type2_df.select('*',vector_to_array('types_onehot2').alias('types_onehot_split2'))
encoder_type2_df_split.show(5)

In [None]:
### now, we'll split the "types_onehot_split2" into one column per category

# The encoder was built with dropLast=False, so the one-hot array has one
# slot per StringIndexer label: the label list gives the width directly,
# without running a Spark job just to read the first row.
type2_labels = indexerModel_type.labels
num_categories = len(type2_labels)
cols_expanded = [f.col('types_onehot_split2')[i].alias(label) for i, label in enumerate(type2_labels)]
encoder_type2_df_split = encoder_type2_df_split.select('*', *cols_expanded)

In [None]:
encoder_type2_df_split.show(5)

#### We've split the "type2" column into two columns based on One-Hot-Encoding. Now, we'll eliminate some unnecessary columns. Let's check out all the columns.

In [None]:
encoder_type2_df_split.printSchema()

#### Now, we´ll eliminate the unnecessary columns:
+ nameOrig
+ nameDest
+ isFlaggedFraud
+ newbalanceDest
+ oldbalanceDest
+ oldbalanceOrg
+ newbalanceOrig 
+ types_indexed
+ types_onehot
+ types_onehot_split
+ types_indexed2
+ types_onehot2
+ types_onehot_split2
+ type
+ type2

In [None]:
# raw identifiers, balances and intermediate encoding columns that are
# removed from the dataset
unneeded_columns = [
    "nameOrig", "nameDest", "isFlaggedFraud",
    "newbalanceDest", "oldbalanceDest", "oldbalanceOrg", "newbalanceOrig",
    "type", "types_indexed", "types_onehot", "types_onehot_split",
    "type2", "types_indexed2", "types_onehot2", "types_onehot_split2",
]
df_bank_par = encoder_type2_df_split.drop(*unneeded_columns)
df_bank_par.show(5)

In [None]:
df_bank_par.count()

#### We can see that there are the same quantity of registers.

### 2.2 Data Cleaning

In [None]:
### 2.2.1.- Eliminate duplicated

num_all_rows = df_bank_par.count()
num_all_rows

In [None]:
# NOTE: despite its name, this holds the number of DISTINCT rows; the number
# of duplicates is obtained later as num_all_rows - num_duplicated_rows.
num_duplicated_rows = df_bank_par.distinct().count() 

In [None]:
# duplicates = all rows minus distinct rows (interpolated; the f-prefix had no placeholder before)
print(f"The total number of duplicated rows is: {num_all_rows - num_duplicated_rows}")

#### We can see that there are 7597 duplicated rows. Let´s remove the null values and duplicated values from the df_bank_par dataframe.

In [None]:
# first discard rows containing nulls, then collapse exact duplicate rows
df_bank_par = df_bank_par.dropna().dropDuplicates()

In [None]:
df_bank_par.count()

#### We can see the duplicated registers have been removed because there are fewer registers than before. Let´s take a look at the "clean" dataset.

In [None]:
df_bank_par.show(10)

## 3. Exploratory Data Analysis (EDA)

### 3.1 Visualization

#### The visualization will be done using a function which leverages the histogram() method of pyspark.

In [None]:
# definition of the "histogram" function

def histogram(df, col, bins=10, xname=None, yname=None):
    """Plot a histogram of one numeric column of a Spark dataframe.

    The bucket counts are computed by Spark through ``RDD.histogram``; only
    the bucket edges and the counts are used for plotting.

    Parameters
    ----------
    df : Spark dataframe that contains the column.
    col : name of the numeric column to plot.
    bins : passed unchanged to ``RDD.histogram`` (number of buckets, or a
        list of bucket edges).
    xname : label of the x axis; the column name is used when omitted.
    yname : label of the y axis; no label is set when omitted.
    """
    # Calculating histogram in Spark: (bucket edges, counts per bucket)
    edges, counts = df.select(col).rdd.flatMap(lambda row: row).histogram(bins)

    # One width per bucket, and each bar centred on the middle of its bucket
    # (centring on the right-hand edge would shift the plot by half a bucket).
    widths = [right - left for left, right in zip(edges, edges[1:])]
    centers = [left + width / 2 for left, width in zip(edges, widths)]

    # Making a bar plot
    mpt.bar(centers, counts, width=widths)
    mpt.xlabel(col if xname is None else xname)
    if yname is not None:
        mpt.ylabel(yname)
    mpt.show()

In [None]:
df_bank_par.printSchema()

#### There are some features that need to be converted to integers such as "step","amount" and "isFraud".

In [None]:
# convert string columns into integer columns

df_bank_par = df_bank_par.withColumn("step",df_bank_par["step"].cast(IntegerType()))

In [None]:
# NOTE(review): casting to IntegerType discards any fractional part of
# "amount" - confirm that whole-unit precision is intended (a floating-point
# type such as FloatType would keep the decimals).
df_bank_par = df_bank_par.withColumn("amount",df_bank_par["amount"].cast(IntegerType()))

In [None]:
df_bank_par = df_bank_par.withColumn("isFraud",df_bank_par["isFraud"].cast(IntegerType()))

In [None]:
df_bank_par.printSchema()

#### We´ve seen that all the features are "integer" types now. Therefore, we´re able to perform various visualizations with the histogram method. That´s what we´ll do next.

In [None]:
# histogram: "step"

histogram(df_bank_par, 'step', bins=15, yname='frequency')

In [None]:
# histogram: "amount"

histogram(df_bank_par, 'amount', bins=15, yname='frequency')

In [None]:
# histogram: "DEBIT" (exact column name produced by the one-hot encoding of "type")

histogram(df_bank_par, 'DEBIT', bins=15, yname='frequency')


In [None]:
# histogram: "PAYMENT" (exact column name produced by the one-hot encoding of "type")

histogram(df_bank_par, 'PAYMENT', bins=15, yname='frequency')


In [None]:
# histogram: "CASH_OUT"

histogram(df_bank_par, 'CASH_OUT', bins=15, yname='frequency')


In [None]:
# histogram: "CASH_IN"

histogram(df_bank_par, 'CASH_IN', bins=15, yname='frequency')


In [None]:
# histogram: "TRANSFER"

histogram(df_bank_par, 'TRANSFER', bins=15, yname='frequency')


In [None]:
# histogram: "CC"

histogram(df_bank_par, 'CC', bins=15, yname='frequency')


In [None]:
# histogram: "CM"

histogram(df_bank_par, 'CM', bins=15, yname='frequency')

In [None]:
# histogram: "isFraud"

histogram(df_bank_par, 'isFraud', bins=15, yname='frequency')

#### Remember that our label is "isFraud", therefore, we can see that this class is unbalanced as we can see from the previous graphic. We need to perform an **Oversampling** through ***Data Balancing*** using *pyspark*.

### 3.2 Data Balancing

In [None]:
### oversampling with "pyspark": split the rows by class first

fraud_label = f.col("isFraud")
minor_df = df_bank_par.where(fraud_label == 1)
major_df = df_bank_par.where(fraud_label == 0)

In [None]:
num_df_bank_par = df_bank_par.count()

In [None]:
num_df_bank_par

In [None]:
num_major_df = major_df.count()

In [None]:
num_major_df

In [None]:
ratio = int(major_df.count()/minor_df.count())

In [None]:
print("The ratio is:",ratio)

In [None]:
a = range(ratio)

In [None]:
# let's duplicate the minority rows: exploding an array with one element per
# value in "a" emits each row once per element; the helper column is then dropped

replica_ids = f.array([f.lit(x) for x in a])
exploded_minor_df = minor_df.withColumn("dummy", f.explode(replica_ids))
oversampled_df = exploded_minor_df.drop("dummy")

In [None]:
oversampled_df.show(5)

In [None]:
oversampled_df.printSchema()

In [None]:
# we drop the unnecessary columns in the "oversampled_df" dataframe

oversampled_df = oversampled_df.drop("step","amount","CASH_OUT","CASH_IN","PAYMENT","TRANSFER","DEBIT","CC","CM")

In [None]:
oversampled_df.printSchema()

In [None]:
num_oversampled_df = oversampled_df.count()

In [None]:
num_oversampled_df

In [None]:
num_oversampled_df + num_major_df

#### We can see that the sum of the rows in "oversampled_df" and "major_df" exceeds the total number of samples. Therefore, we need to reduce them: we'll keep half of the majority rows and fill the remainder with oversampled rows.

In [None]:
# now, we need to aggregate indexes to the "oversampled_df" dataframe

oversampled_df = oversampled_df.withColumn("index",monotonically_increasing_id())
oversampled_df.show(5)

In [None]:
oversampled_df.count()

In [None]:
# we create a view of the "oversampled_df" dataframe to use sparkSQL

oversampled_df.createOrReplaceTempView("isFraud")

In [None]:
limit_major_df = num_major_df / 2

In [None]:
limit_major_df

In [None]:
limit_oversampled_df = num_df_bank_par - limit_major_df

In [None]:
limit_oversampled_df = int(limit_oversampled_df)

In [None]:
limit_oversampled_df

In [None]:
type(limit_oversampled_df)

In [None]:
# we use this query to select some rows of the "oversampled_df" dataframe

query = f"SELECT * FROM isFraud LIMIT {limit_oversampled_df}"

In [None]:
oversampled_df = spark.sql(query)

In [None]:
oversampled_df.show(5)

In [None]:
oversampled_df.count()

In [None]:
oversampled_df.show(10)

In [None]:
# we drop the unnecessary columns in the "major_df" dataframe

major_df = major_df.drop("step","amount","CASH_OUT","CASH_IN","PAYMENT","TRANSFER","DEBIT","CC","CM")
major_df.show()

In [None]:
major_df.count()

In [None]:
# now, we need to aggregate indexes to the "major_df" dataframe

major_df = major_df.withColumn("index",monotonically_increasing_id())
major_df.show(5)

In [None]:
major_df.count()

In [None]:
limit_major_df = int(limit_major_df)

In [None]:
limit_major_df

In [None]:
# we create a view from "major_df" dataframe to do some queries

major_df.createOrReplaceTempView("isFraud")

In [None]:
# we use this query to select some rows of the "major_df" dataframe

query = f"SELECT * FROM isFraud LIMIT {limit_major_df}"

In [None]:
major_df = spark.sql(query)

In [None]:
major_df.show(5)

In [None]:
major_df.count()

In [None]:
combined_df = major_df.unionAll(oversampled_df)

In [None]:
combined_df.show(50)

In [None]:
combined_df.count()

#### The previous table contains the rebalanced "isFraud" labels; the count shows that we have the same number of registers as in the original dataset. Let's check whether the classes are balanced in this case.

In [None]:
class_1 = combined_df.filter(f.col("isFraud")==1)
class_0 = combined_df.filter(f.col("isFraud")==0)

In [None]:
class_1.show(50)

In [None]:
class_1.count()

In [None]:
class_0.show(50)

In [None]:
class_0.count()

#### We can see that the two "isFraud" classes now have almost the same number of samples, and that the total matches the number of samples in the original dataset. Now, we need to merge the original dataframe "df_bank_par" with the "combined_df" dataframe.

In [None]:
df_bank_par.show(10)

In [None]:
df_bank_par = df_bank_par.drop("isFraud")

In [None]:
df_bank_par.show(10)

In [None]:
combined_df.count(), df_bank_par.count()

In [None]:
combined_df.printSchema()

In [None]:
df_bank_par.printSchema()

In [None]:
# now, we need to aggregate indexes to the "df_bank_par" dataframe

df_bank_par = df_bank_par.withColumn("index",monotonically_increasing_id())
df_bank_par.show(5)

In [None]:
df_bank_par = df_bank_par.join(combined_df,on=['index']).drop('index')

In [None]:
df_bank_par.show(50)

#### Let´s check out again the number of samples of each class in the feature "isFraud" (label) in this dataset.

In [None]:
class_1 = df_bank_par.filter(f.col("isFraud")==1)
class_0 = df_bank_par.filter(f.col("isFraud")==0)

In [None]:
df_bank_par.show(50)

In [None]:
type(df_bank_par)

In [None]:
df_bank_par.count()

In [None]:
class_0.show(50)

In [None]:
type(class_0)

In [None]:
class_0.count()

In [None]:
class_1.show(50)

In [None]:
class_1.count()

#### Our latest valid and "clean" dataframe is *df_bank_par* as follows:

In [None]:
df_bank_par.show(10)

#### We have a balanced class in "isFraud". Let´s check out with a histogram.

In [None]:
histogram(df_bank_par, 'isFraud', bins=15, yname='frequency')

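#### If we want to work with this PySpark dataframe "df_bank_par" through the pandas API, we can use the method to_pandas_on_spark, which returns a pandas-on-Spark dataframe (the data stays in Spark; it is not a plain pandas dataframe).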

In [None]:
# pandas dataframe

df_bank_par_pandas = df_bank_par.to_pandas_on_spark()
df_bank_par_pandas.head(10)

In [None]:
df_bank_par_pandas.describe()

In [None]:
type(df_bank_par_pandas)

#### Let´s create a function to find a correlation between the target variable "isFraud" and the features. 

In [None]:
# definition of the function "correlation_df"

def correlation_df(df,target_var,feature_cols, method):
    """Correlate one target column with each of the given feature columns.

    Parameters
    ----------
    df : Spark dataframe containing the target and the feature columns.
    target_var : name of the target column.
    feature_cols : names of the feature columns to correlate with the target.
    method : correlation method, forwarded to ``Correlation.corr``.

    Returns
    -------
    Spark dataframe with one row per feature and the columns "feature",
    "correlation" and "abs_correlation".
    """
    feature_cols = list(feature_cols)

    # the target goes first, so column 0 of the correlation matrix holds the
    # correlation of every assembled column with the target
    ordered_cols = [target_var] + feature_cols
    assembler = VectorAssembler(inputCols=ordered_cols, outputCol="features")
    assembled = assembler.transform(df.select(ordered_cols))

    # calculate correlation matrix
    correlation_matrix = Correlation.corr(assembled, "features", method=method).head()[0]

    # row i (i >= 1) of column 0 belongs to the (i-1)-th feature
    correlation_data = [
        (name, float(correlation_matrix[i, 0]))
        for i, name in enumerate(feature_cols, start=1)
    ]

    result = spark.createDataFrame(correlation_data, ["feature", "correlation"])

    # the absolute value makes it easy to rank features by strength
    return result.withColumn("abs_correlation", f.abs("correlation"))


In [None]:
target = "isFraud"

indep_cols = [x for x in df_bank_par.columns if x not in [target] ]

corr_values_df = correlation_df(df=df_bank_par, target_var= target, feature_cols= indep_cols, method='pearson')

print(f"The corelation between {target} and the other features is: ")

corr_values_df.show()


In [None]:
target = "amount"

indep_cols = [x for x in df_bank_par.columns if x not in [target] ]

corr_values_df = correlation_df(df=df_bank_par, target_var= target, feature_cols= indep_cols, method='pearson')

print(f"The corelation between {target} and the other features is: ")

corr_values_df.show()


In [None]:
target = "step"

indep_cols = [x for x in df_bank_par.columns if x not in [target] ]

corr_values_df = correlation_df(df=df_bank_par, target_var= target, feature_cols= indep_cols, method='pearson')

print(f"The corelation between {target} and the other features is: ")

corr_values_df.show()

## 4. Construction of models

## 4.1 train/test split

In [None]:
# a fixed seed makes the 70/30 split (and therefore every metric below) reproducible
train, test = df_bank_par.randomSplit([0.7, 0.3], seed=42)

In [None]:
type(train) , type(test)

#### Let´s assemble these datasets "train" and "test" into a single feature vector using VectorAssembler class per each one.

In [None]:
# let´s assemble the train dataset as a single feature vector using VectorAssembler class

columns = ['step','amount','CASH_OUT','PAYMENT','CASH_IN','TRANSFER','DEBIT','CC','CM','isFraud']

assembler = VectorAssembler(inputCols=columns, outputCol='features')

train = assembler.transform(train)

train.show(10)

In [None]:
# let´s assemble the test dataset as a single feature vector using VectorAssembler class

columns = ['step','amount','CASH_OUT','PAYMENT','CASH_IN','TRANSFER','DEBIT','CC','CM','isFraud']

assembler = VectorAssembler(inputCols=columns, outputCol='features')

test = assembler.transform(test)

test.show(10)

## 4.2 Models

We´ll use several machine learning algorithms to evaluate all of them and to select the best one. We´ll start with Random Forest.

### 4.2.1 Random Forest

#### Training

In [None]:
# train the model "random forest" (rf)

rf = RandomForestClassifier(featuresCol='features', labelCol='isFraud')
model_RF = rf.fit(train)

In [None]:
type(model_RF)

### Predictions

In [None]:
# make predictions of the random forest model using the test dataset

predictions = model_RF.transform(test)


In [None]:
type(predictions)

In [None]:
predictions.show(50)

#### We can see that there are three more columns: rawPrediction, probability and prediction. We can clearly compare the actual values and predicted values with the output below:

In [None]:
predictions.select("isFraud","prediction").show(50)

#### At a glance we can see that the predicted values are the same of the actual values, at least for the first fifty registers.

### Evaluation

#### We need to evaluate our random forest machine learning algorithm.

In [None]:
# MulticlassClassificationEvaluator reports F1 by default, so the metric is
# requested explicitly for the value to really be the accuracy.
evaluator = MulticlassClassificationEvaluator(labelCol="isFraud", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)


In [None]:
type(accuracy)

In [None]:
print(f"The accuracy is {accuracy}")

In [None]:
Test_Error = (1 - accuracy)
print(f"The Test Error is {Test_Error}")

#### Let's check out the Confusion Matrix.

In [None]:
preds_and_labels = predictions.select(["prediction","isFraud"])
preds_and_labels = preds_and_labels.withColumn("isFraud", f.col("isFraud").cast(FloatType())).orderBy("prediction")

In [None]:
preds_and_labels.show(20)

In [None]:
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

In [None]:
type(metrics)

In [None]:
print("The Confusion Matrix is:")

metrics.confusionMatrix().toArray()

#### According to the confusion matrix, all the actual values are correctly predicted. Such a perfect result is suspicious: it may indicate overfitting, or that the label has leaked into the features.

### 4.2.2 Logistic Regression

#### Training

In [None]:
# train the model Logistic Regression (lr)

lr = LogisticRegression(featuresCol='features', labelCol='isFraud')

model_LR = lr.fit(train)

In [None]:
type(model_LR)

#### To better understand the model, we can examine its coefficients and intercept. The values represent the weights assigned to each feature and the bias term, respectively.

In [None]:
coefficients = model_LR.coefficients

intercept = model_LR.intercept

print("Coefficients: ", coefficients)

print("Intercept: ", intercept)


#### Predictions

In [None]:
# make predictions of the logistic regression model using the test dataset

predictions = model_LR.transform(test)

predictions.show(50)

#### Evaluation

In [None]:
# AUC - ROC (computed from the raw prediction scores)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="isFraud")

auc = evaluator.evaluate(predictions)

# Accuracy, Precision and Recall (the last two are the weighted variants)

metrics = MulticlassClassificationEvaluator(labelCol="isFraud", predictionCol="prediction")

accuracy = metrics.evaluate(predictions, {metrics.metricName: "accuracy"})

precision = metrics.evaluate(predictions, {metrics.metricName: "weightedPrecision"})

recall = metrics.evaluate(predictions, {metrics.metricName: "weightedRecall"})

print(f"AUC-ROC: {auc}")

print(f"Accuracy: {accuracy}")

print(f"Precision: {precision}")

print(f"Recall: {recall}")

### 4.2.3 Decision Tree

#### Training

In [None]:
# train the model Decision Tree (dt)

dt = DecisionTreeClassifier(featuresCol='features', labelCol='isFraud')

model_dt = dt.fit(train)

#### Predictions

In [None]:
# make predictions of the decision tree model using the test dataset

predictions = model_dt.transform(test)

predictions.show(50)

#### Evaluation

In [None]:
# AUC - ROC (computed from the raw prediction scores)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="isFraud")

auc = evaluator.evaluate(predictions)

# Accuracy, Precision and Recall (the last two are the weighted variants)

metrics = MulticlassClassificationEvaluator(labelCol="isFraud", predictionCol="prediction")

accuracy = metrics.evaluate(predictions, {metrics.metricName: "accuracy"})

precision = metrics.evaluate(predictions, {metrics.metricName: "weightedPrecision"})

recall = metrics.evaluate(predictions, {metrics.metricName: "weightedRecall"})

print(f"AUC-ROC: {auc}")

print(f"Accuracy: {accuracy}")

print(f"Precision: {precision}")

print(f"Recall: {recall}")

#### Let´s check out the Confusion Matrix.

In [None]:
preds_and_labels = predictions.select(["prediction","isFraud"])
preds_and_labels = preds_and_labels.withColumn("isFraud", f.col("isFraud").cast(FloatType())).orderBy("prediction")

In [None]:
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

In [None]:
print("The Confusion Matrix is:")

metrics.confusionMatrix().toArray()

### 4.2.4 Naive Bayes

#### Training

In [None]:
# train the model Naive Bayes (nb)

nb = NaiveBayes(featuresCol='features', labelCol='isFraud')

model_nb = nb.fit(train)

#### Predictions

In [None]:
# make predictions of the naive bayes model using the test dataset

predictions = model_nb.transform(test)

predictions.show(50)

#### Evaluation

In [None]:
# AUC - ROC (computed from the raw prediction scores)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="isFraud")

auc = evaluator.evaluate(predictions)

# Accuracy, Precision and Recall (the last two are the weighted variants)

metrics = MulticlassClassificationEvaluator(labelCol="isFraud", predictionCol="prediction")

accuracy = metrics.evaluate(predictions, {metrics.metricName: "accuracy"})

precision = metrics.evaluate(predictions, {metrics.metricName: "weightedPrecision"})

recall = metrics.evaluate(predictions, {metrics.metricName: "weightedRecall"})

print(f"AUC-ROC: {auc}")

print(f"Accuracy: {accuracy}")

print(f"Precision: {precision}")

print(f"Recall: {recall}")

## 5. Storage

### 5.1 Model

In [None]:
# Persist the fitted models. write().overwrite() replaces a previously saved
# copy, so re-running the notebook does not fail on an existing path.

# model: Random Forest

model_RF.write().overwrite().save("randomF_model")

# model: Logistic Regression

model_LR.write().overwrite().save("logit_model")

# model: Decision Tree

model_dt.write().overwrite().save("decisionT_model")

# model: Naive Bayes

model_nb.write().overwrite().save("naiveB_model")


### 5.2 Load

In [None]:
# Fitted models are saved as *Model objects, so they have to be loaded
# through the matching *Model classes (not through the estimator classes).
from pyspark.ml.classification import (
    DecisionTreeClassificationModel,
    LogisticRegressionModel,
    NaiveBayesModel,
    RandomForestClassificationModel,
)

# model: Random Forest

loaded_model_RF = RandomForestClassificationModel.load("randomF_model")

# model: Logistic Regression

loaded_model_LR = LogisticRegressionModel.load("logit_model")

# model: Decision Tree

loaded_model_dt = DecisionTreeClassificationModel.load("decisionT_model")

# model: Naive Bayes

loaded_model_nb = NaiveBayesModel.load("naiveB_model")