# Project: Fraud Detection 

## 1. Overview

### PaySim simulates mobile money transactions based on a sample of real transactions extracted from one month of financial logs from a mobile money service implemented in an African country. The original logs were provided by a multinational company, which is the provider of the mobile financial service that is currently running in more than 14 countries around the world. The objective of the project is to predict whether a transaction is fraudulent or not.

## 2. Preprocess the data

#### We'll use PySpark to preprocess the data.

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.functions import vector_to_array
import functools
import seaborn as sb
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from collections import Counter
#from sklearn.datasets import make_classification

In [None]:
# global variables
# NOTE: at module (notebook-cell) top level a `global` statement has no
# effect; it only changes name binding inside a function body. It does not
# create or initialise `df_bank` or `results`.

global df_bank, results 

In [None]:
# Build (or reuse) the Spark session used by every DataFrame in this notebook.

session_builder = SparkSession.builder.appName("FraudDetection")
spark = session_builder.getOrCreate()
spark  # last expression of the cell, so the notebook displays the session

In [None]:
# Load the raw transactions into a Spark DataFrame, using the first line of
# the file as column names.
# No schema is supplied and inferSchema is left at its default, so every
# column is read as a string.

csv_reader = spark.read.option("header", True)
df = csv_reader.csv('fraudDetection.csv')

#### Let's take a look at the first 10 rows of the data.

In [None]:
df.show(10)

In [None]:
df.printSchema()

#### There are 11 columns; some of them are numerical and others are categorical. Let's count the number of records.

In [None]:
# Count the rows of the raw DataFrame and report the total.
# The count is interpolated into the f-string (the original f-string had no
# placeholder); the printed text is unchanged.
print(f"The total number of registers is: {df.count()}")

#### We have more than six million transactions in the dataset.

### 2.1 Feature Engineering

#### First, we'll create a new variable.

In [None]:
### 2.1.1.- creation of a new variable: type2
# "type2" is the first character of "nameOrig" followed by the first
# character of "nameDest".

orig_initial = f.substring("nameOrig", 1, 1)
dest_initial = f.substring("nameDest", 1, 1)
df_type2 = df.withColumn("type2", f.concat(orig_initial, dest_initial))

In [None]:
df_type2.show(5)

#### We've created a new column named "type2", which is composed of the first character of the column "nameOrig" and the first character of the column "nameDest".

In [None]:
### 2.1.2.1.- One Hot Encoding: column "type"

df_type2.show(3)

#### We'll use some of Spark's Machine Learning libraries (Spark ML).

In [None]:
### StringIndexer for the "type" column
### input column: "type"  ->  output column: "types_indexed"

indexer_type = StringIndexer(
    inputCol="type",
    outputCol="types_indexed",
)
# Fitting learns the label -> index mapping from df_type2.
indexerModel_type = indexer_type.fit(df_type2)


In [None]:
### Add the "types_indexed" column by applying the fitted StringIndexer model

indexed_df_type2 = indexerModel_type.transform(df_type2)
indexed_df_type2.show(n=10)

#### Here, we've mapped each value of the "type" column to a numeric index.

In [None]:
### One-hot encode the indexed column "types_indexed" into "types_onehot".
### dropLast=False is passed so that the last category is not dropped.

encoder_type = OneHotEncoder(
    inputCol="types_indexed",
    outputCol="types_onehot",
    dropLast=False,
)
encoder_type_model = encoder_type.fit(indexed_df_type2)
encoder_type_df = encoder_type_model.transform(indexed_df_type2)
encoder_type_df.show(truncate=False)


In [None]:
encoder_type_df.printSchema()

In [None]:
# Convert the "types_onehot" ML vector into a plain array column so that its
# elements can be addressed by position.
onehot_as_array = vector_to_array('types_onehot').alias('types_onehot_split')
encoder_type_df_split = encoder_type_df.select('*', onehot_as_array)
encoder_type_df_split.show(n=5)

In [None]:
### now, we'll split the "types_onehot_split" array into one column per category

# The encoder was created with dropLast=False, so the one-hot vector has one
# slot per StringIndexer label. Taking the size from the labels avoids the
# Spark action that first() would run just to measure the array.
type_labels = list(indexerModel_type.labels)
num_categories = len(type_labels)

# Element `position` of the array becomes a column named after the label that
# the StringIndexer mapped to that index.
cols_expanded = [
    f.col('types_onehot_split')[position].alias(label)
    for position, label in enumerate(type_labels)
]
type_df = encoder_type_df_split.select('*', *cols_expanded)


In [None]:
type_df.show(100)

#### We've applied One-Hot-Encoding to the column "type", resulting in five new columns:
+ CASH_OUT
+ CASH_IN
+ PAYMENT
+ TRANSFER 
+ DEBIT

#### Now, we'll apply this procedure to the column "type2".

In [None]:
### 2.1.2.2.- One Hot Encoding: column "type2"

type_df.show(5)

In [None]:
### StringIndexer for the "type2" column
### input column: "type2"  ->  output column: "types_indexed2"
# NOTE: this rebinds indexer_type / indexerModel_type, which previously held
# the indexer fitted on "type"; from here on they refer to "type2".

indexer_type = StringIndexer(
    inputCol="type2",
    outputCol="types_indexed2",
)
indexerModel_type = indexer_type.fit(type_df)

In [None]:
### Add the "types_indexed2" column by applying the fitted StringIndexer model

indexed_df_type = indexerModel_type.transform(type_df)
indexed_df_type.show(n=10)

In [None]:
### One-hot encode the indexed column "types_indexed2" into "types_onehot2".
### dropLast=False is passed so that the last category is not dropped.

encoder_type2 = OneHotEncoder(
    inputCol="types_indexed2",
    outputCol="types_onehot2",
    dropLast=False,
)
encoder_type2_model = encoder_type2.fit(indexed_df_type)
encoder_type2_df = encoder_type2_model.transform(indexed_df_type)
encoder_type2_df.show(truncate=False)

In [None]:
encoder_type2_df.printSchema()

In [None]:
# Convert the "types_onehot2" ML vector into a plain array column so that its
# elements can be addressed by position.
onehot2_as_array = vector_to_array('types_onehot2').alias('types_onehot_split2')
encoder_type2_df_split = encoder_type2_df.select('*', onehot2_as_array)
encoder_type2_df_split.show(n=5)

In [None]:
### now, we'll split the "types_onehot_split2" array into one column per category

# The encoder was created with dropLast=False, so the one-hot vector has one
# slot per StringIndexer label. Taking the size from the labels avoids the
# Spark action that first() would run just to measure the array.
type2_labels = list(indexerModel_type.labels)
num_categories = len(type2_labels)

# Element `position` of the array becomes a column named after the label that
# the StringIndexer mapped to that index.
cols_expanded = [
    f.col('types_onehot_split2')[position].alias(label)
    for position, label in enumerate(type2_labels)
]
# NOTE: the result is bound to the same name as the input, so running this
# cell twice would try to add the label columns a second time.
encoder_type2_df_split = encoder_type2_df_split.select('*', *cols_expanded)

In [None]:
encoder_type2_df_split.show(5)

#### We've split the "type2" column into two columns based on One-Hot-Encoding. Now, we'll eliminate some unnecessary columns. Let's check out all the columns.

In [None]:
encoder_type2_df_split.printSchema()

In [None]:
### 2.1.3.- Eliminate unnecessary columns: "nameOrig","nameDest","isFlaggedFraud","newbalanceDest",
### "oldbalanceDest","oldbalanceOrg","newbalanceOrig","types_indexed","types_onehot","types_onehot_split",
### "types_indexed2","types_onehot2", "types_onehot_split2"


#### Now, we'll eliminate the unnecessary columns:
+ nameOrig
+ nameDest
+ isFlaggedFraud
+ newbalanceDest
+ oldbalanceDest
+ oldbalanceOrg
+ newbalanceOrig 
+ types_indexed
+ types_onehot
+ types_onehot_split
+ types_indexed2
+ types_onehot2
+ types_onehot_split2
+ type
+ type2

In [None]:
# Remove the identifier, balance and flag columns together with the original
# categorical columns and the intermediate encoding columns.
columns_to_drop = [
    "nameOrig", "nameDest", "isFlaggedFraud", "newbalanceDest", "oldbalanceDest",
    "oldbalanceOrg", "newbalanceOrig", "type", "types_indexed", "types_onehot",
    "types_onehot_split", "type2", "types_indexed2", "types_onehot2",
    "types_onehot_split2",
]
df_bank = encoder_type2_df_split.drop(*columns_to_drop)
df_bank.show(n=5)

In [None]:
df_bank.count()

#### We can see that the number of records is unchanged.

### 2.2 Data Cleaning

In [None]:
### 2.2.1.- Eliminate duplicated

num_all_rows = df_bank.count()
num_all_rows

In [None]:
# NOTE: despite its name, this holds the number of DISTINCT rows. The number
# of duplicates is obtained in the next cell by subtracting it from
# num_all_rows.
num_duplicated_rows = df_bank.distinct().count() 

In [None]:
# num_duplicated_rows holds the distinct-row count, so the difference is the
# number of surplus duplicate rows. The value is interpolated into the
# f-string (the original f-string had no placeholder); the printed text is
# unchanged.
print(f"The total number of duplicated rows is: {num_all_rows - num_duplicated_rows}")

#### We can see that there are 7597 duplicated rows.

In [None]:
df_bank = df_bank.dropDuplicates()

In [None]:
df_bank.count()

#### We can see that the duplicated records have been removed because there are fewer records than before.

In [None]:
### 2.2.2.- Eliminate null values

# DataFrame.dropna() returns a new DataFrame and leaves the original
# untouched, so the result must be assigned back for the rows to be removed.
df_bank = df_bank.dropna()

In [None]:
df_bank.count()

#### The number of records is the same as before. Note that `dropna()` returns a new DataFrame, so this only shows that there were no null values if its result is assigned back to `df_bank`. Let's take a look at the "clean" dataset.

In [None]:
df_bank.show(10)

## 3. Exploratory Data Analysis (EDA)

### 3.1 Data Balancing

#### Remember that our label is "isFraud", therefore, we need to check if this class is balanced or unbalanced.

In [None]:
# Drop the step, amount and one-hot indicator columns.
# NOTE(review): this presumably leaves only the "isFraud" label, so the
# resulting frame carries no feature columns - confirm that is intended.
non_label_columns = [
    "step", "amount", "CASH_OUT", "CASH_IN", "TRANSFER",
    "DEBIT", "CC", "CM", "PAYMENT",
]
df_bank_partitioned = df_bank.drop(*non_label_columns)
df_bank_partitioned.show(n=5)

#### Now, we'll transform it into a Pandas dataframe.

In [None]:
# Convert the Spark DataFrame into an in-memory pandas DataFrame.
# NOTE: toPandas() collects every row to the driver, so the whole frame has
# to fit in driver memory.
df_bank_partitioned_pd = df_bank_partitioned.toPandas()

In [None]:
type(df_bank_partitioned_pd)

In [None]:
df_bank_partitioned_pd.describe()

In [None]:
df_bank_partitioned_pd.head(5)

In [None]:
df_bank_partitioned_pd.info()

#### From the previous view, we can see that the column has the "object" dtype. We need to cast it to integer using the method "astype()".

In [None]:
df_bank_partitioned_pd['isFraud'] = df_bank_partitioned_pd['isFraud'].astype(int)

In [None]:
df_bank_partitioned_pd['isFraud']

#### We can see that the type has been changed from "object" to "int64" (integer).

In [None]:
df_bank_partitioned_pd.plot.hist()

In [None]:
#6346820
len(df_bank_partitioned_pd)


#### From the previous plot we can see that the classes are unbalanced. Remember that "0" means that there is no fraud and "1" is the opposite case. We need to apply a technique to correct this. We'll use an algorithm named "SMOTE".

In [None]:
# create an object "SMOTE"

smt = SMOTE()

#### Let's split the dataset into training and test sets for "X" and "y".

In [None]:
# Split the data 60 % / 40 % into training and test sets.
# - stratify keeps the fraud ratio identical in both splits, which matters
#   because the positive class is rare;
# - random_state makes the split reproducible between runs.
# NOTE(review): the same frame is passed as both X and y, so the "features"
# given to the model are the label column itself - confirm and replace X with
# the real feature columns.
X_train, X_test, y_train, y_test = train_test_split(
    df_bank_partitioned_pd,
    df_bank_partitioned_pd,
    test_size=0.4,
    stratify=df_bank_partitioned_pd['isFraud'],
    random_state=42,
)

In [None]:
len(X_train),len(y_train)

In [None]:
len(X_test),len(y_test)

#### Summing both results gives about 6.35 million records.

In [None]:
# Resample the training split with SMOTE and keep the resampled X and y.
# NOTE(review): "Y_resambled" looks like a typo for "Y_resampled"; the name is
# kept because later cells refer to it.
X_resampled, Y_resambled = smt.fit_resample(X_train,y_train)

In [None]:
smt.fit

In [None]:
type(X_resampled)

In [None]:
len(X_resampled)

In [None]:
Counter(df_bank_partitioned_pd['isFraud'])

In [None]:
Counter(X_resampled['isFraud'])

In [None]:
Counter(Y_resambled['isFraud'])

In [None]:
X_resampled

In [None]:
len(X_resampled)

In [None]:
Y_resambled

In [None]:
X_resampled.hist()

In [None]:
X_resampled.count()

In [None]:
len(X_resampled)

In [None]:
len(Y_resambled)

In [None]:
df_bank_partitioned_pd['isFraud']

In [None]:
# Look the counts up by class label instead of by position: value_counts()
# orders its result by frequency, so positional unpacking only yields
# (count of 0, count of 1) while class 0 is the majority.
# A missing class raises KeyError rather than silently mis-assigning counts.
class_counts = df_bank_partitioned_pd.isFraud.value_counts()
values0, value1 = int(class_counts.loc[0]), int(class_counts.loc[1])

In [None]:
values0,value1

In [None]:
# Split the frame by label value: rows with isFraud == 0 and rows with
# isFraud == 1.
fraud_flag = df_bank_partitioned_pd['isFraud']
df_class_0 = df_bank_partitioned_pd[fraud_flag == 0]
df_class_1 = df_bank_partitioned_pd[fraud_flag == 1]


In [None]:
df_class_0, df_class_1 

In [None]:
### Random over-sampling
# Draw `values0` rows from the class-1 frame with replacement, so that the
# sampled frame has as many rows as `values0`.

df_class_1_overSampling = df_class_1.sample(n=values0, replace=True)

In [None]:
df_class_1_overSampling

In [None]:
# Stack the class-0 rows and the over-sampled class-1 rows into one frame
# (row-wise concatenation).
balanced_parts = [df_class_0, df_class_1_overSampling]
df_test_overSampling = pd.concat(balanced_parts, axis=0)

In [None]:
type(df_test_overSampling)

In [None]:
df_test_overSampling.isFraud.value_counts()

In [None]:
df_test_overSampling.hist()

In [None]:
###