In [1]:
# Configure MLflow Experiment
mlflow_experiment_id = 866112

# Including MLflow
import mlflow
import mlflow.spark
import os
print("MLflow Version: %s" % mlflow.__version__)

MLflow Version: 2.1.1


In [2]:
import pandas as pd
import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

### Source Data

In [3]:
#Data set is synthetic dataset is scaled down 1/4 of the original dataset and it is created just for Kaggle
data_urls = "https://media.githubusercontent.com/media/FelixQLe/Detect_Financial_Fraud_at_Scale_with_decision_Trees/main/Synthetic_Financial_datasets_log.csv"

In [4]:
fin_fraud_dataset = pd.read_csv(data_urls, delimiter=',', header = 0)

In [5]:
#large dataset takes longer to load, so i make a copy to reuse in case
fin_fraud_copy = fin_fraud_dataset.copy()

In [6]:
fin_fraud_copy.shape

(6362620, 11)

### Create SQL database using PySpark

In [7]:
#Create a spark Context class
sc = SparkContext()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/03 21:43:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
#Create a spark session
spark = SparkSession.builder.appName("Python Spark Dataframes Financial Fruad").config(
        "spark.some.config.option", "some-value").getOrCreate()

In [9]:
fin_fraud_copy.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [10]:
#Loading data into Spark, take long
spark_df = spark.createDataFrame(fin_fraud_copy)

In [11]:
#create table view fin_fraud_table, we can treat it as sql table
spark_df.createTempView("fin_fraud_table")

In [12]:
# Create df DataFrame which contains our simulated financial fraud detection dataset
fin_fraud_df = spark.sql("select step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest from fin_fraud_table")

In [18]:
fin_fraud_df.show()

[Stage 0:>                                                          (0 + 0) / 1]

23/02/03 21:52:24 WARN TaskSetManager: Stage 0 contains a task of very large size (64404 KiB). The maximum recommended task size is 1000 KiB.


[Stage 0:>                                                          (0 + 1) / 1]

23/02/03 21:52:29 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 0 (TID 0): Attempting to kill Python Worker


                                                                                

+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|
|   1| PAYMENT| 11668.14|C2048537720|      41554.0|      29885.86|M1230701703|           0.0|           0.0|
|   1| PAYMENT|  7817.71|  C90045638|      53860.0|      46042.29| M573487274|           0.0|           0.0|
|   1| PAYMENT|  71

In [20]:
# Review the schema of your data 
fin_fraud_df.printSchema()

root
 |-- step: long (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)



In [27]:
#add column orgDiff and destDiff based on the difference between Originating and Destination
fin_fraud_df = fin_fraud_df.withColumn("orgDiff",
                            fin_fraud_df.newbalanceOrig - 
                                       fin_fraud_df.oldbalanceOrg).withColumn("destDiff",
                            fin_fraud_df.newbalanceDest - fin_fraud_df.oldbalanceDest)
    
#create temporary view
fin_fraud_df.createOrReplaceTempView("financials")

In [28]:
#review the new table
display(fin_fraud_df)

DataFrame[step: bigint, type: string, amount: double, nameOrig: string, oldbalanceOrg: double, newbalanceOrig: double, nameDest: string, oldbalanceDest: double, newbalanceDest: double, orgDiff: double, destDiff: double]