In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# from pyspark.sql.types import *
from pyspark.sql.window import *

In [2]:
import json
import os

In [3]:
# Build (or reuse) the Spark session and register the MySQL JDBC connector jar.
# The jar path is a raw string: in a normal string literal the Windows
# backslashes form invalid escape sequences (\s, \j, \m), which Python warns
# about and will eventually reject.
spark = (
    SparkSession.builder
    .appName("spark_dataframe")
    .config("spark.jars", r"C:\spark-3.5.1-bin-hadoop3\jars\mysql-connector-j-8.4.0.jar")
    .getOrCreate()
)


In [4]:
# JDBC connection settings for the local MySQL database.
url = "jdbc:mysql://localhost:3306/extenso_assignment"
properties = {
    # Credentials are taken from the environment when present; the fallbacks
    # are the previous local-development values so existing setups still run.
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    # Driver class shipped with Connector/J 8.x; "com.mysql.jdbc.Driver" is
    # the deprecated legacy name.
    "driver": "com.mysql.cj.jdbc.Driver"
}

In [5]:
def table(table_name):
    """Read the database table ``table_name`` over JDBC and return it as a DataFrame.

    Uses the notebook-level ``spark`` session together with the ``url`` and
    ``properties`` connection settings.
    """
    return spark.read.jdbc(url=url, table=table_name, properties=properties)

In [6]:
# Load every source table used by the notebook into its own DataFrame.
(
    customer_profile,
    product_category,
    product_category_map,
    products,
    rw_transaction_data,
) = (
    table(source_name)
    for source_name in (
        'customer_profile',
        "product_category",
        "product_category_map",
        "products",
        "rw_transaction_data",
    )
)

In [7]:
# Print a preview of the raw transaction table.
rw_transaction_data.show()

+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|module_id|product_id|product_type_id|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|
+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|660612529|        2023-03-29|           2079-12-15|  2022-11-23|  50.0|     1|        1|        77|             29|             531|                  2|         0.0|             0.0|           0.0|                   4|14:07:40|
|666435422|        2022-12-01|           2079-08-15|  2022-12-01| 750.0|     1|     

In [77]:
# Expression for the newest last_modified_date in the raw data.
# This only builds the (lazy) DataFrame, so the cell displays its schema,
# not the date itself. `max` is pyspark.sql.functions.max (star import above).
rw_transaction_data.agg(max("last_modified_date"))

DataFrame[max(last_modified_date): date]

In [52]:
# Split the raw data at a cutoff date: part1 holds rows up to and including
# the cutoff, part2 holds everything after it.
# The cutoff is computed as the earliest last_modified_date. Taking `.first()`
# of the unordered JDBC read returns whichever row happens to come back first,
# so the split point was not reproducible between runs.
# NOTE(review): the earliest date is assumed to be the intended cutoff — confirm.
# `min` is pyspark.sql.functions.min (star import above), not the builtin.
last_date = rw_transaction_data.agg(min("last_modified_date")).first()[0]
rw_transaction_data_part1 = rw_transaction_data.filter(col("last_modified_date") <= last_date)
rw_transaction_data_part2 = rw_transaction_data.filter(col("last_modified_date") > last_date)

In [53]:
# Preview the rows whose last_modified_date is after the cutoff.
rw_transaction_data_part2.show()

+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|module_id|product_id|product_type_id|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|
+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|660612529|        2023-03-29|           2079-12-15|  2022-11-23|  50.0|     1|        1|        77|             29|             531|                  2|         0.0|             0.0|           0.0|                   4|14:07:40|
|689868970|        2023-01-01|           2079-09-17|  2023-01-01| 100.0|     1|     

In [54]:
def config(table, config_path="config.json"):
    """Store the newest ``last_modified_date`` of ``table`` in a JSON config file.

    Args:
        table: DataFrame exposing a ``last_modified_date`` date column.
        config_path: File to (over)write; defaults to ``"config.json"``.

    Raises:
        ValueError: If ``table`` yields no date (no rows, or only nulls). The
            config file is left untouched in that case.

    The file content is ``{"last_transaction_date": "YYYY-MM-DD"}``.
    """
    # `max` is pyspark.sql.functions.max (star import at the top of the notebook).
    last_transaction_date = table.select(max("last_modified_date")).collect()[0][0]
    if last_transaction_date is None:
        # Fail before opening the file so an existing watermark is not clobbered.
        raise ValueError("table has no last_modified_date values; cannot write " + config_path)
    config_data = {
      "last_transaction_date":last_transaction_date.strftime('%Y-%m-%d')
    }
    with open(config_path, "w") as json_file:
        json.dump(config_data, json_file)

In [55]:
# Write the newest last_modified_date of the part1 slice to config.json.
config(rw_transaction_data_part1)

In [56]:
def get_last_transaction_date(config_path):
    """Return the ``last_transaction_date`` entry of the JSON file at ``config_path``.

    The value is returned exactly as stored; ``None`` is returned when the key
    is not present in the file.
    """
    with open(config_path, 'r') as handle:
        settings = json.load(handle)
    return settings.get('last_transaction_date')
# Read the stored date back from config.json (None if the key is missing).
last_transaction_date = get_last_transaction_date("config.json")

In [57]:
# Column expression holding the stored date as a DATE value.
# lit() is required here: a bare string passed to to_date() is resolved as a
# column *name*, not as a literal value.
date = to_date(lit(last_transaction_date))

In [58]:
# Keep only the rows dated on or before the stored last transaction date.
rw_transaction_data1 = rw_transaction_data.where(
    col("last_modified_date") <= lit(last_transaction_date)
)

In [59]:
# Preview the rows kept by the date filter.
rw_transaction_data1.show()

+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|module_id|product_id|product_type_id|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|
+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+
|666435422|        2022-12-01|           2079-08-15|  2022-12-01| 750.0|     1|        1|        76|             29|             531|                 81|         0.0|             0.0|         33.75|                   4|00:03:41|
|666436001|        2022-12-01|           2079-08-15|  2022-12-01|  50.0|     1|     

In [60]:
# Add two columns derived from last_modified_date: the month number and the
# date truncated to the first day of its month. Then preview the result.
rw_transaction_data1 = (
    rw_transaction_data1
    .withColumn("months", month(col("last_modified_date")))
    .withColumn("first_day_of_month", trunc(col("last_modified_date"), "month"))
)
rw_transaction_data1.show()

+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+------+------------------+
|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|module_id|product_id|product_type_id|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|months|first_day_of_month|
+---------+------------------+---------------------+------------+------+------+---------+----------+---------------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+------+------------------+
|666435422|        2022-12-01|           2079-08-15|  2022-12-01| 750.0|     1|        1|        76|             29|             531|                 81|         0.0|             0.0|         33.75|                   4|00:03:41|    12|

In [61]:
# Attach the product category mapping to each transaction; rows match on the
# product, product type and module ids together.
joined = rw_transaction_data1.join(
    product_category_map,
    on=['product_id', 'product_type_id', 'module_id'],
    how='inner',
)

In [62]:
# Number of transaction rows that matched a product_category_map entry.
joined.count()

3935

In [63]:
# Aggregate per payer, month and transaction flow: total amount, transaction
# count, and the newest / oldest last_modified_date in each group.
# sum/count/max/min are the pyspark.sql.functions versions (star import above).
final_table = (
    joined
    .groupBy("payer_account_id", "first_day_of_month", "txn_flow")
    .agg(
        sum("amount"),
        count("amount"),
        # Aliased back to last_modified_date so config() can read it from this table.
        max("last_modified_date").alias("last_modified_date"),
        min("last_modified_date"),
    )
)

In [64]:
# Preview the per-payer / month / flow aggregate.
final_table.show()

+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|payer_account_id|first_day_of_month|   txn_flow|sum(amount)|count(amount)|last_modified_date|min(last_modified_date)|
+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|              34|        2022-12-01|Value Chain|     4351.0|            6|        2022-12-01|             2022-12-01|
|              56|        2022-12-01|    OutFlow|    12125.0|            4|        2022-12-01|             2022-12-01|
|             531|        2022-12-01|Value Chain|   600054.0|         3836|        2022-12-01|             2022-12-01|
|             471|        2022-12-01|Value Chain|     3048.0|           20|        2022-12-01|             2022-12-01|
|              26|        2022-12-01|Value Chain|      460.0|           11|        2022-12-01|             2022-12-01|
|            1056|        2022-12-01|Value Chain

In [65]:
# Overwrite config.json with the newest last_modified_date found in the
# aggregate (the column is available through the alias set in its aggregation).
config(final_table)

In [66]:
def new_data(final_table, rw_transaction_data, product_category_map=product_category_map, config_path="config.json"):
    """Fold transactions newer than the stored date into an existing aggregate.

    Args:
        final_table: Existing aggregate with the columns ``payer_account_id``,
            ``first_day_of_month``, ``txn_flow``, ``sum(amount)``,
            ``count(amount)``, ``last_modified_date`` and
            ``min(last_modified_date)``.
        rw_transaction_data: Raw transactions; only rows whose
            ``last_modified_date`` is after the stored date are used.
        product_category_map: Mapping table joined on product, product type
            and module ids.
        config_path: JSON file holding ``last_transaction_date``; defaults to
            ``"config.json"``.

    Returns:
        A DataFrame with the same columns as ``final_table`` and one row per
        (payer_account_id, first_day_of_month, txn_flow).

    NOTE(review): the stored date is not advanced here; call ``config`` on the
    result afterwards if the next run should skip these rows.
    """
    group_keys = ["payer_account_id", "first_day_of_month", "txn_flow"]

    last_transaction_date = get_last_transaction_date(config_path)
    fresh_rows = (
        rw_transaction_data
        .filter(col("last_modified_date") > lit(last_transaction_date))
        .withColumn("first_day_of_month", trunc(col("last_modified_date"), "month"))
    )
    joined = fresh_rows.join(product_category_map, ['product_id', 'product_type_id', 'module_id'])
    table_to_join = joined.groupBy(group_keys).agg(
        sum("amount"),
        count("amount"),
        max("last_modified_date").alias("last_modified_date"),
        min("last_modified_date"),
    )

    # A plain union leaves two rows for any group that already exists in
    # final_table (e.g. more transactions arriving in an already-seen month).
    # Re-aggregate the partial results so each group appears exactly once:
    # sums and counts add up, max/min combine as max/min.
    final_tables = (
        final_table.unionAll(table_to_join)
        .groupBy(group_keys)
        .agg(
            sum("sum(amount)").alias("sum(amount)"),
            sum("count(amount)").alias("count(amount)"),
            max("last_modified_date").alias("last_modified_date"),
            min("min(last_modified_date)").alias("min(last_modified_date)"),
        )
    )
    return final_tables

In [67]:
# Build the aggregate that also covers transactions newer than the stored
# date, then preview it.
final_tables = new_data(final_table,rw_transaction_data)
final_tables.show()

+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|payer_account_id|first_day_of_month|   txn_flow|sum(amount)|count(amount)|last_modified_date|min(last_modified_date)|
+----------------+------------------+-----------+-----------+-------------+------------------+-----------------------+
|              34|        2022-12-01|Value Chain|     4351.0|            6|        2022-12-01|             2022-12-01|
|              56|        2022-12-01|    OutFlow|    12125.0|            4|        2022-12-01|             2022-12-01|
|             531|        2022-12-01|Value Chain|   600054.0|         3836|        2022-12-01|             2022-12-01|
|             471|        2022-12-01|Value Chain|     3048.0|           20|        2022-12-01|             2022-12-01|
|              26|        2022-12-01|Value Chain|      460.0|           11|        2022-12-01|             2022-12-01|
|            1056|        2022-12-01|Value Chain

In [68]:
# Inspect the column names and types of the combined aggregate.
final_tables.printSchema()

root
 |-- payer_account_id: integer (nullable = true)
 |-- first_day_of_month: date (nullable = true)
 |-- txn_flow: string (nullable = true)
 |-- sum(amount): double (nullable = true)
 |-- count(amount): long (nullable = false)
 |-- last_modified_date: date (nullable = true)
 |-- min(last_modified_date): date (nullable = true)

