In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *            

In [2]:
import json

In [3]:
# Build (or reuse) the Spark session and hand it the MySQL JDBC connector jar.
# The jar path is a raw string so the Windows backslashes are never parsed as
# escape sequences (the plain literal triggers invalid-escape warnings).
# NOTE(review): this is an absolute, machine-specific path.
spark = (
    SparkSession.builder
    .appName("spark_dataframe_py")
    .config("spark.jars", r"C:\spark-3.5.1-bin-hadoop3\jars\mysql-connector-j-8.4.0.jar")
    .getOrCreate()
)

In [4]:
# JDBC connection settings used by every spark.read.jdbc call in this notebook.
url = "jdbc:mysql://localhost:3306/extenso_assignment"
properties = {
    # NOTE(review): credentials are hardcoded; prefer environment variables
    # or a secrets store before sharing this notebook.
    "user": "root",
    "password": "root",
    # Connector/J 8.x driver class. The legacy "com.mysql.jdbc.Driver" name is
    # deprecated and only kept by the connector as a compatibility alias.
    "driver": "com.mysql.cj.jdbc.Driver"
}

In [5]:
# Column layout of the monthly aggregate table, as (name, Spark type) pairs.
# Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('payer_account_id', IntegerType()),
        ('first_day_of_month', DateType()),
        ('txn_flow', StringType()),
        ('sum(amount)', DoubleType()),
        ('count(amount)', LongType()),
        ('last_modified_date', DateType()),
        ('min(last_modified_date)', DateType()),
    ]
])

# Empty DataFrame with that layout; it is the seed passed to the first new_data() call.
emptyRDD = spark.sparkContext.emptyRDD()
Schema = spark.createDataFrame(emptyRDD, schema)

In [6]:
# Print the column names, types and nullability of the empty seed DataFrame.
Schema.printSchema()

root
 |-- payer_account_id: integer (nullable = true)
 |-- first_day_of_month: date (nullable = true)
 |-- txn_flow: string (nullable = true)
 |-- sum(amount): double (nullable = true)
 |-- count(amount): long (nullable = true)
 |-- last_modified_date: date (nullable = true)
 |-- min(last_modified_date): date (nullable = true)



In [7]:
def config(table,last_date, config_path = "config.json"):
    """Write the date window for the next incremental run to a JSON file.

    The file receives two keys:
      * ``last_transaction_date``  - ``last_date`` exactly as passed in.
      * ``start_transaction_date`` - the largest ``last_modified_date`` found
        in ``table``, formatted as ``YYYY-MM-DD``.

    Args:
        table: DataFrame with a ``last_modified_date`` column.
        last_date: Upper bound of the next window, stored unchanged.
        config_path: Destination JSON file; defaults to ``config.json``.

    Raises:
        ValueError: if ``table`` has no non-null ``last_modified_date``
            (for example, it is empty), so no start date can be derived.
    """
    # The dict form of agg() avoids depending on the star-imported Spark
    # `max` having shadowed Python's builtin `max` in this namespace.
    last_transaction_date = table.agg({"last_modified_date": "max"}).collect()[0][0]
    if last_transaction_date is None:
        raise ValueError(
            "cannot derive start_transaction_date: table has no last_modified_date values"
        )
    config_data = {
      "last_transaction_date" : last_date,
      "start_transaction_date" : last_transaction_date.strftime('%Y-%m-%d')
    }
    with open(config_path, "w") as json_file:
        json.dump(config_data, json_file)

In [8]:
def get_start_last_transaction_date(config_path = "config.json"):
    """Read the processing window from a JSON config file.

    Args:
        config_path: JSON file to read; defaults to ``config.json``.

    Returns:
        A ``(start_transaction_date, last_transaction_date)`` tuple. A key
        that is absent from the file comes back as ``None``.
    """
    with open(config_path, 'r') as handle:
        settings = json.load(handle)
    window_start = settings.get('start_transaction_date')
    window_end = settings.get('last_transaction_date')
    return window_start, window_end

In [9]:
# Load the date window from config.json. The file must already exist on disk:
# the only call to config() that writes it comes later in this notebook.
start_date,last_date = get_start_last_transaction_date()

In [10]:
def table(table_name, start_date, end_date):
    """Load a MySQL table over JDBC as a DataFrame.

    Only ``rw_transaction_data`` is date-filtered: its rows are kept when
    ``last_modified_date`` lies between ``start_date`` and ``end_date``, both
    bounds inclusive. Every other table is returned in full and the two date
    arguments are ignored.

    Args:
        table_name: Name of the table to read.
        start_date: Inclusive lower bound, passed through ``to_date``.
        end_date: Inclusive upper bound, passed through ``to_date``.

    Returns:
        The (possibly filtered) DataFrame.
    """
    frame = spark.read.jdbc(url=url, table=table_name, properties=properties)
    if table_name != "rw_transaction_data":
        return frame

    modified = frame.last_modified_date
    not_after_end = modified <= to_date(lit(end_date))
    not_before_start = modified >= to_date(lit(start_date))
    return frame.filter(not_after_end & not_before_start)

In [11]:
# Lookup tables: table() returns these unfiltered, the dates are passed only
# because the signature requires them.
customer_profile, product_category, product_category_map, products = (
    table(lookup_name, start_date, last_date)
    for lookup_name in (
        'customer_profile',
        "product_category",
        "product_category_map",
        "products",
    )
)

# Transactions are restricted by table() to the [start_date, last_date] window.
rw_transaction_data = table("rw_transaction_data", start_date, last_date)

In [12]:
# Number of transaction rows that fall inside the loaded date window.
rw_transaction_data.count()

4691

In [13]:
def new_data(final_table , rw_transaction_data , product_category_map = product_category_map):
    """Aggregate a batch of transactions and upsert it into ``final_table``.

    The batch is joined to ``product_category_map`` on ``product_id``,
    ``product_type_id`` and ``module_id``, then grouped by
    ``(payer_account_id, first_day_of_month, txn_flow)`` to produce the sum
    and count of ``amount`` and the max/min of ``last_modified_date``.

    Merge rule: a row of ``final_table`` whose key also occurs in the new
    aggregates is replaced by the new aggregate row; all other existing rows
    are kept; exact duplicate rows are dropped.

    NOTE(review): matching keys are overwritten, not added together. If one
    month can be split across two batches, the earlier partial totals are
    lost - confirm this is the intended behaviour.

    Args:
        final_table: Existing aggregate DataFrame (may be empty). Its columns
            must line up positionally with the aggregate columns above.
        rw_transaction_data: Batch of raw transactions to fold in.
        product_category_map: Mapping table joined onto the transactions.
            The default is bound once, when this function is defined.

    Returns:
        A new DataFrame holding the merged aggregates.
    """
    key_columns = ["payer_account_id", "first_day_of_month", "txn_flow"]

    with_month = rw_transaction_data.withColumn(
        "first_day_of_month", trunc(col("last_modified_date"), "month")
    )
    joined = with_month.join(product_category_map, ['product_id', 'product_type_id', 'module_id'])
    table_to_join = joined.groupBy(key_columns).agg(
        sum("amount"),
        count("amount"),
        max("last_modified_date").alias("last_modified_date"),
        min("last_modified_date"),
    )

    # Existing rows whose key is absent from the new batch. A single anti-join
    # replaces the previous approach of one Spark filter + collect per row.
    # The select() restores final_table's column order, because joining on a
    # list of names moves the key columns to the front.
    untouched = (
        final_table
        .join(table_to_join.select(key_columns), on=key_columns, how="left_anti")
        .select(*final_table.columns)
    )
    merged = untouched.union(table_to_join).distinct()

    # Materialise on the driver, as before, so the result is a fixed snapshot
    # and later actions on it do not re-run the JDBC read and aggregation.
    return spark.createDataFrame(merged.collect(), merged.schema)

In [14]:
# First run: merge the first transaction batch into the empty seed DataFrame.
final_table = new_data(Schema,rw_transaction_data)

In [15]:
# Aggregates for the first window: 2022-01-01 -> 2023-01-01
final_table.show(n=23)

+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|payer_account_id|first_day_of_month|   txn_flow|sum(amount)|count(amount)|last_modified_date|min(last_modified_date)|
+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|              34|        2022-12-01|Value Chain|     4351.0|            6|        2022-12-01|             2022-12-01|
|              56|        2022-12-01|    OutFlow|    12125.0|            4|        2022-12-01|             2022-12-01|
|             531|        2022-12-01|Value Chain|   600054.0|         3836|        2022-12-01|             2022-12-01|
|             471|        2022-12-01|Value Chain|     3048.0|           20|        2022-12-01|             2022-12-01|
|              26|        2023-01-01|     InFlow|    11240.0|            3|        2023-01-01|             2023-01-01|
|              26|        2022-12-01|Value Chain

In [16]:
# Persist the next window to config.json: it starts at the latest
# last_modified_date in final_table and ends at 2024-01-01.
config(final_table,"2024-01-01")

In [17]:
# Re-read config.json to pick up the window that config() just wrote.
start_date,last_date = get_start_last_transaction_date()

In [18]:
# Reload the transactions for the new window; this rebinds rw_transaction_data,
# so the first batch is no longer reachable under this name.
rw_transaction_data = table("rw_transaction_data",start_date,last_date)

In [19]:
# Number of transaction rows in the second window.
rw_transaction_data.count()

8232

In [20]:
# Second run: merge the new batch into the aggregates from the first run.
final_tables = new_data(final_table,rw_transaction_data)

In [23]:
# Number of aggregate rows after the second merge.
final_tables.count()

26

In [22]:
# Aggregates after the second window, which ends at 2024-01-01.
# NOTE(review): this comment originally read "2023-01-02 -> 2024-01-01", but
# config() stores the max last_modified_date as the start and table() filters
# with >=, so the window appears to begin at 2023-01-01 inclusive - confirm.
final_tables.show(n=26)

+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|payer_account_id|first_day_of_month|   txn_flow|sum(amount)|count(amount)|last_modified_date|min(last_modified_date)|
+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|             531|        2022-12-01|Value Chain|   600054.0|         3836|        2022-12-01|             2022-12-01|
|              34|        2022-12-01|Value Chain|     4351.0|            6|        2022-12-01|             2022-12-01|
|              56|        2022-12-01|    OutFlow|    12125.0|            4|        2022-12-01|             2022-12-01|
|             471|        2022-12-01|Value Chain|     3048.0|           20|        2022-12-01|             2022-12-01|
|            1056|        2022-12-01|Value Chain|      400.0|            2|        2022-12-01|             2022-12-01|
|              56|        2022-12-01|Value Chain