## Run SparkSession

In [None]:
import os
import sys

In [None]:
# Make the bundled PySpark and Py4J sources importable.
# Raw strings are required here: in a normal literal the "\t" in "\to" is a TAB
# character (and "\p", "\H", "\s", "\l" are invalid escapes), which silently
# corrupts the Windows path. Replace the placeholder prefix with the real
# location of the Spark distribution.
sys.path.insert(0, r"Full\path\to\Hadoop\spark-2.4.7-bin-hadoop2.7\python")
sys.path.insert(0, r"Full\path\to\Hadoop\spark-2.4.7-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip")

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [None]:
# Configuration for an application named "Graph" running on the "local" master.
conf = SparkConf().setAppName("Graph").setMaster("local")
# Reuse an already-running context if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate(conf=conf)
# SQL/DataFrame entry point wrapping that context.
spark = SparkSession(sc)

In [None]:
# Evaluate the session as the cell's last expression so the notebook renders it.
spark

### Import modules

In [None]:
from pyspark.sql.types import *
import pyspark.sql.functions as f
from func import *

### Load data

In [None]:
%%time
# Explicit column layout for the input file (read without a header row).
# NOTE(review): 'Amount' is single-precision FloatType; confirm that is precise
# enough for the values in data.csv.
schema = (
    StructType()
    .add('Payer', StringType())
    .add('Recipient', StringType())
    .add('Amount', FloatType())
    .add('Date', StringType())  # kept as text here; parsed in a later cell
)
# Read the ';'-separated file with the schema above and spread it over 4 partitions.
df = (
    spark.read
    .csv('data.csv', schema=schema, sep=';', header=False)
    .repartition(4)
)

### Change date string column to timestamp

In [None]:
# Replace the textual 'Date' column with a timestamp parsed from "yyyy-MM-dd HH:mm:ss".
df = df.withColumn(
    'Date',
    f.to_timestamp('Date', format="yyyy-MM-dd HH:mm:ss"),
)

In [None]:
# Print the first 5 rows to check the load and the date conversion.
df.show(5)

### Data preprocessing

In [None]:
# Build the graph-mining helper (Graph_miner comes from the local `func` module)
# from the DataFrame and the names of its four columns.
# NOTE(review): presumably the arguments map to source node, target node, edge
# weight and time column, in that order — confirm against func.Graph_miner.
init_graph = Graph_miner(df, "Payer", "Recipient", "Amount", "Date")

In [None]:
# Inspect the first 10 entries of the helper's `result` attribute.
# NOTE(review): the type of `result` (pandas vs. Spark) is defined in func — confirm.
init_graph.result.head(10)

### Draw graphs

In [None]:
# Wrap the mined graph in the drawing helper (Painter comes from the local `func` module).
paint = Painter(init_graph)

In [None]:
# Draw the graph as built, before any filtering is applied in this notebook.
paint.draw(filename="start_graph")

In [None]:
# Apply the r_count threshold of 1, then draw the result.
# NOTE(review): the exact meaning of `r_count_tresh` is defined in func.Painter — confirm.
paint.filtering_df(r_count_tresh=1)
paint.draw(filename="r_count_tresh_1")

In [None]:
# Filter with both the sum threshold (3000) and the r_count threshold (1), then draw.
paint.filtering_df(sum_tresh=3000, r_count_tresh=1)
paint.draw(filename="sum_tresh_3000_r_count_tresh_1")

In [None]:
# Same thresholds as the previous cell, additionally restricted to "account_96", then draw.
paint.filtering_df(
    sum_tresh=3000,
    r_count_tresh=1,
    acc_name="account_96",
)
paint.draw(filename="sum_tresh_3000_r_count_tresh_1_name_account_96")

## Stop SparkSession

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()