In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

In [2]:
# Location of the MySQL JDBC driver jar handed to Spark via "spark.jars".
# Written as a raw string: the Windows path contains backslash sequences
# (\s, \j, \m) that are invalid escapes in a normal string literal and trigger
# warnings on current Python versions. The resulting value is unchanged.
MYSQL_CONNECTOR_JAR = r"C:\spark-3.5.1-bin-hadoop3\jars\mysql-connector-j-8.4.0.jar"

# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("spark_session")
    .config("spark.jars", MYSQL_CONNECTOR_JAR)
    .getOrCreate()
)

In [3]:
# JDBC endpoint of the MySQL database that holds the source tables.
url = "jdbc:mysql://localhost:3306/extenso_assignment"

# Connection properties passed to spark.read.jdbc.
# - Credentials come from MYSQL_USER / MYSQL_PASSWORD when set; the previous
#   hard-coded values remain as defaults so existing local setups keep working.
# - Connector/J 8.x registers its driver as com.mysql.cj.jdbc.Driver; the old
#   com.mysql.jdbc.Driver name is a deprecated legacy alias.
properties = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "driver": "com.mysql.cj.jdbc.Driver",
}

In [4]:
def table(table_name):
    """Read one database table over JDBC and return it as a Spark DataFrame.

    Relies on the notebook-level ``spark`` session and on the ``url`` /
    ``properties`` connection settings.

    Args:
        table_name: Name of the table to read.

    Returns:
        The DataFrame returned by ``spark.read.jdbc`` for that table.
    """
    reader = spark.read
    return reader.jdbc(url=url, table=table_name, properties=properties)

In [5]:
# Load each source table through the JDBC helper, one DataFrame per table.
customer_profile = table(table_name="customer_profile")
product_category = table(table_name="product_category")
product_category_map = table(table_name="product_category_map")
products = table(table_name="products")
rw_transaction_data = table(table_name="rw_transaction_data")

In [6]:
# Preview customer_profile (show() prints the first 20 rows by default).
# NOTE(review): the rendered output may contain customer details — confirm it is safe to keep in a shared notebook.
customer_profile.show()

In [7]:
# Inspect the transaction table's column names and types as loaded over JDBC.
rw_transaction_data.printSchema()

In [8]:
# Derive the grouping month first (while last_modified_date still has its
# loaded type), then turn last_modified_date into a string so it can later be
# concatenated with the time column.
# NOTE(review): "months" is the month-of-year number only; if the data spans
# more than one calendar year, same-numbered months are pooled — confirm.
rw_transaction_data = (
    rw_transaction_data
    .withColumn("months", month("last_modified_date"))
    .withColumn("last_modified_date", col("last_modified_date").cast(StringType()))
)

In [9]:
# Re-check the schema after adding "months" and casting last_modified_date to string.
rw_transaction_data.printSchema()

In [10]:
# Preview the transaction rows (first 20 by default).
rw_transaction_data.show()

In [11]:
# List distinct payer_account_id values; show() prints at most 20 of them by default.
rw_transaction_data.select("payer_account_id").distinct().show()

In [12]:
# Attach the product/category mapping to every transaction. Inner join on the
# three shared key columns, so transactions without a mapping row are dropped.
join_keys = ["product_id", "product_type_id", "module_id"]
joined = rw_transaction_data.join(product_category_map, on=join_keys, how="inner")

In [13]:
# Preview the joined transaction/mapping rows (first 20 by default).
joined.show()

In [14]:
# Amount per payer and month, with one column per distinct txn_flow value.
# Payer/month combinations with no rows for a flow get 0 instead of null.
joined_total = (
    joined
    .groupBy("payer_account_id", "months")
    .pivot("txn_flow")
    .sum("amount")
    .na.fill(0)
)

In [15]:
# Preview the per-payer, per-month amount pivot.
joined_total.show()

In [16]:
# Transaction count per payer and month, with one column per distinct txn_flow
# value. Missing combinations are filled with 0.
joined_count = (
    joined
    .groupBy("payer_account_id", "months")
    .pivot("txn_flow")
    .count()
    .na.fill(0)
)

In [17]:
# Preview the per-payer, per-month count pivot.
joined_count.show()

In [18]:
# Total transaction count per payer for each flow, summed over the monthly counts.
# `sum` here is the Spark aggregate brought in by the functions star import.
total_count = joined_count.groupBy("payer_account_id").agg(
    sum("InFlow").alias("TotalCountInflow"),
    sum("Outflow").alias("TotalCountOutflow"),
    sum("Value Chain").alias("TotalCountValueChain"),
)

In [19]:
# Total amount per payer for each flow, summed over the monthly amounts.
# `sum` here is the Spark aggregate brought in by the functions star import.
total_amount = joined_total.groupBy("payer_account_id").agg(
    sum("InFlow").alias("TotalAmtInflow"),
    sum("Outflow").alias("TotalAmtOutflow"),
    sum("Value Chain").alias("TotalAmtValueChain"),
)

In [20]:
# Preview the per-payer total amounts by flow.
total_amount.show()

In [21]:
# Average monthly transaction count per payer for each flow. The average runs
# over the payer's rows in joined_count, i.e. the months in which the payer has
# at least one joined transaction.
avg_count = joined_count.groupBy("payer_account_id").agg(
    avg("InFlow").alias("AvgCountInflow"),
    avg("Outflow").alias("AvgCountOutflow"),
    avg("Value Chain").alias("AvgCountValueChain"),
)

In [22]:
# Preview the per-payer average monthly counts by flow.
avg_count.show()

In [23]:
# Average monthly amount per payer for each flow, taken over the payer's rows
# in joined_total.
avg_total = joined_total.groupBy("payer_account_id").agg(
    avg("InFlow").alias("AvgAmtInflow"),
    avg("Outflow").alias("AvgAmtOutflow"),
    avg("Value Chain").alias("AvgAmtValueChain"),
)

In [24]:
# Preview the per-payer average monthly amounts by flow.
avg_total.show()

In [25]:
# Sum of reward points per payer across all joined transactions.
reward_point = (
    joined
    .groupBy("payer_account_id")
    .agg(sum("reward_point").alias("TotalRewardPoint"))
)

In [26]:
# Preview the per-payer reward point totals.
reward_point.show()

In [27]:
# Build a "Date" string from the date and time columns, separated by a single space.
date_and_time = concat_ws(" ", "last_modified_date", "time")
joined = joined.withColumn("Date", date_and_time)

In [28]:
# Preview joined, now including the "Date" column.
joined.show()

In [29]:
# Parse the "Date" column into a timestamp using the explicit pattern below.
joined = joined.withColumn(
    "Date",
    to_timestamp("Date", format="yyyy-MM-dd HH:mm:ss"),
)

In [30]:
# joined.show()

In [31]:
# Most recent transaction timestamp per payer.
# `max` here is the Spark aggregate brought in by the functions star import.
last_transaction = (
    joined
    .groupBy("payer_account_id")
    .agg(max("Date").alias("Latest_Transaction_Date"))
)

In [32]:
# Alias the joined frame as "joined" so its columns can be referenced through a named handle.
joined_alias = joined.alias("joined")

In [33]:
# For every payer, keep the single most recent transaction and the product it used.
#
# Rows are ranked inside a per-payer window, so a row can only win against
# other rows of the same payer. Matching on the timestamp alone (without the
# payer key) would also keep a payer's older rows whenever their Date happens
# to equal some other payer's latest timestamp, and a following dropDuplicates
# could then retain one of those older rows as the "latest" product.
# product_name is a tiebreaker so the result is deterministic when one payer
# has several transactions at the same timestamp.
latest_txn_window = Window.partitionBy("payer_account_id").orderBy(
    col("Date").desc(),
    col("product_name").asc(),
)
joined_and_last = (
    joined
    # Rows whose Date is null cannot be identified as the latest transaction.
    .filter(col("Date").isNotNull())
    .withColumn("_latest_txn_rank", row_number().over(latest_txn_window))
    .filter(col("_latest_txn_rank") == 1)
    .select(
        col("payer_account_id"),
        col("Date").alias("Latest_Transaction_Date"),
        col("product_name").alias("latest_Product"),
    )
)

In [34]:
# Preview each payer's latest transaction timestamp and product.
joined_and_last.show()

In [35]:
# Revenue per payer and month.
rev_amt = (
    joined
    .groupBy("payer_account_id", "months")
    .agg(sum("revenue_amount").alias("rev_amt"))
)

In [36]:
# Preview the per-payer, per-month revenue.
rev_amt.show()

In [37]:
# Per payer: total revenue and the mean of the monthly revenue figures.
rev_amt_tot_mean = rev_amt.groupBy("payer_account_id").agg(
    sum("rev_amt").alias("Total_Revenue_Amount"),
    mean("rev_amt").alias("Monthly_Revenue_Amount"),
)

In [38]:
# Preview the per-payer total and mean monthly revenue.
rev_amt_tot_mean.show()

In [40]:
# Number of joined transactions per (payer, product) pair; the result column is named "count".
product_used_count = (
    joined
    .groupBy(["payer_account_id", "product_name"])
    .count()
)

In [41]:
# Preview the per-payer product usage counts.
product_used_count.show()

In [42]:
# Per-payer window ordering products from most to least used.
# product_name is a secondary sort key: with "count" alone, products tied on
# usage would receive row numbers in an arbitrary order, so the most/second/third
# used product could change between runs.
window_spec = Window.partitionBy("payer_account_id").orderBy(
    col("count").desc(),
    col("product_name").asc(),
)

In [43]:
# Usage counts sorted by payer (descending), then by count (descending).
data = product_used_count.orderBy(
    col("payer_account_id").desc(),
    col("count").desc(),
)

In [44]:
# Preview the sorted per-payer product usage counts.
data.show()

In [45]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [46]:
# Number each payer's products 1, 2, 3, ... following the ordering defined by window_spec.
rank_within_payer = row_number().over(window_spec)
datas_count_sorted_with_row_num = data.withColumn("row_num", rank_within_payer)


In [47]:
# Preview the usage counts together with their per-payer row number.
datas_count_sorted_with_row_num.show()

In [48]:
def _product_at_rank(rank, label):
    """Return payer_account_id plus the product at the given row_num, with product_name renamed to `label`."""
    return (
        datas_count_sorted_with_row_num
        .where(col("row_num") == rank)
        .select("payer_account_id", col("product_name").alias(label))
    )


# One frame per rank; payers with fewer products simply have no row at that rank.
most_used_product = _product_at_rank(1, "most_used_product")
second_used_product = _product_at_rank(2, "Second_most_used_product")
third_used_product = _product_at_rank(3, "Third_most_used_product")

# Total number of joined transactions per payer (column "count").
product_usage = joined.groupBy("payer_account_id").count()

In [49]:
# Assemble the per-payer summary: start from the total amounts and left-join
# every other per-payer frame on payer_account_id, in this fixed order.
# NOTE(review): reward_point (TotalRewardPoint) is computed earlier but is not
# part of this join chain — confirm whether it should be included.
final_table = total_amount
for per_payer_frame in (
    avg_total,
    total_count,
    avg_count,
    joined_and_last,
    rev_amt_tot_mean,
    product_usage,
    most_used_product,
    second_used_product,
    third_used_product,
):
    final_table = final_table.join(per_payer_frame, ["payer_account_id"], how="left")

In [50]:
# Preview the assembled per-payer summary table (first 20 rows by default).
final_table.show()