In [1]:
# Configure MLflow Experiment
mlflow_experiment_id = 866112

# Including MLflow
import mlflow
import mlflow.spark
import os
print("MLflow Version: %s" % mlflow.__version__)

MLflow Version: 2.1.1


In [2]:
import pandas as pd
import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

### Source Data

In [3]:
#Data set is synthetic dataset is scaled down 1/4 of the original dataset and it is created just for Kaggle
data_urls = "https://media.githubusercontent.com/media/FelixQLe/Detect_Financial_Fraud_at_Scale_with_decision_Trees/main/Synthetic_Financial_datasets_log.csv"

In [4]:
fin_fraud_dataset = pd.read_csv(data_urls, delimiter=',', header = 0)

In [5]:
#large dataset takes longer to load, so i make a copy to reuse in case
fin_fraud_copy = fin_fraud_dataset.copy()

In [6]:
fin_fraud_copy.shape

(6362620, 11)

### Create SQL database using PySpark

In [7]:
#Create a spark Context class
sc = SparkContext()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/03 23:44:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
#Create a spark session
spark = SparkSession.builder.appName("Python Spark Dataframes Financial Fruad").config(
        "spark.some.config.option", "some-value").getOrCreate()

In [9]:
fin_fraud_copy.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [10]:
#Loading data into Spark, take long
spark_df = spark.createDataFrame(fin_fraud_copy)

In [11]:
#create table view fin_fraud_table, we can treat it as sql table
spark_df.createTempView("fin_fraud_table")

In [12]:
# Create df DataFrame which contains our simulated financial fraud detection dataset
fin_fraud_df = spark.sql("select step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest from fin_fraud_table")

In [None]:
fin_fraud_df.show()

23/02/03 23:47:32 ERROR Inbox: An error happened while processing message in the inbox for LocalSchedulerBackendEndpoint
java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1877)
	at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1786)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1189)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.JavaSerializerInst

Exception in thread "dispatcher-event-loop-2" java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1877)
	at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1786)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1189)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:115)
	at org.apache.spark.scheduler.Tas

In [None]:
# Review the schema of your data 
fin_fraud_df.printSchema()

In [None]:
#add column orgDiff and destDiff based on the difference between Originating and Destination
fin_fraud_df = fin_fraud_df.withColumn("orgDiff",
                            fin_fraud_df.newbalanceOrig - 
                                       fin_fraud_df.oldbalanceOrg).withColumn("destDiff",
                            fin_fraud_df.newbalanceDest - fin_fraud_df.oldbalanceDest)
    
#create temporary view
fin_fraud_df.createOrReplaceTempView("financials")

In [None]:
#review the new table
fin_fraud_df.show()

In [None]:
fin_fraud_df.printSchema()

### Exploring Data Analysis

#### What are the type of transactions?

In [None]:
%%sql
select type, count(1) from financials group by type