### **Importing the necessary libraries**

In [1]:
import json
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

### **Creating SparkSession**

In [2]:
# Raw string: the Windows path contains backslashes (\s, \j, \m) that a normal
# string literal would treat as invalid escape sequences.
MYSQL_CONNECTOR_JAR = r"C:\spark-3.5.1-bin-hadoop3\jars\mysql-connector-j-8.4.0.jar"

# Create (or reuse) the Spark session with the MySQL JDBC connector jar registered.
spark = (
    SparkSession.builder
    .appName("spark_dataframe_py")
    .config("spark.jars", MYSQL_CONNECTOR_JAR)
    .getOrCreate()
)

### **Adding Necessary configs for connection**

In [3]:
# JDBC endpoint of the MySQL database the tables are read from.
url = "jdbc:mysql://localhost:3306/extenso_config"

# Connection properties handed to spark.read.jdbc.
# Credentials come from the environment so they do not have to be stored in the
# notebook; the defaults keep the previous local-development values.
properties = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    # Driver class of Connector/J 8.x; "com.mysql.jdbc.Driver" is the deprecated legacy name.
    "driver": "com.mysql.cj.jdbc.Driver",
}

### **Function to create dataframe from each sql table**

In [4]:
def table(table_name, start_date=None, end_date=None):
    """Load a MySQL table into a Spark DataFrame over JDBC.

    Uses the notebook-level ``spark`` session together with the ``url`` and
    ``properties`` connection settings.

    Args:
        table_name: Name of the table to read.
        start_date: Inclusive lower bound on ``last_modified_date``, as a date
            string such as ``'2023-01-01'``. Only applied when ``table_name`` is
            ``"rw_transaction_data"``; ``None`` means no lower bound.
        end_date: Inclusive upper bound on ``last_modified_date``, with the same
            rules as ``start_date``.

    Returns:
        The table as a DataFrame; ``rw_transaction_data`` is restricted to the
        requested date window, every other table is returned unfiltered.
    """
    df = spark.read.jdbc(url=url, table=table_name, properties=properties)
    # Only the transaction table is windowed by date; other tables are read whole.
    if table_name == "rw_transaction_data":
        modified = col("last_modified_date")
        if start_date is not None:
            df = df.filter(modified >= to_date(lit(start_date)))
        if end_date is not None:
            df = df.filter(modified <= to_date(lit(end_date)))
    return df


# The category map is a lookup table, so no date window is needed.
product_category_map = table("product_category_map")
rw_transaction_data = table("rw_transaction_data", "2023-01-01", "2024-01-01")

### **Joining rw_transaction_data with product_category_map**

In [5]:
# Attach product metadata to every transaction. The three key columns are
# matched by name, so each appears only once in the result.
join_keys = ["product_id", "product_type_id", "module_id"]
joined = rw_transaction_data.join(product_category_map, on=join_keys, how="inner")
joined.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_type_id: integer (nullable = true)
 |-- module_id: integer (nullable = true)
 |-- txn_id: long (nullable = true)
 |-- last_modified_date: date (nullable = true)
 |-- last_modified_date_bs: string (nullable = true)
 |-- created_date: date (nullable = true)
 |-- amount: double (nullable = true)
 |-- status: integer (nullable = true)
 |-- payer_account_id: integer (nullable = true)
 |-- receiver_account_id: integer (nullable = true)
 |-- reward_point: double (nullable = true)
 |-- cash_back_amount: double (nullable = true)
 |-- revenue_amount: double (nullable = true)
 |-- transactor_module_id: long (nullable = true)
 |-- time: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- product_category_id: long (nullable = true)
 |-- txn_flow: string (nullable = true)



In [6]:
# Row count of the joined DataFrame.
joined.count()

212488

### **Adding a `months` column derived from last_modified_date**

In [7]:
# Month number of each transaction's last_modified_date, stored as "months".
# NOTE(review): only the month is kept, so the same month of different years
# would share one value; confirm the date window never spans two years.
modified_month = month(col("last_modified_date"))
joined = joined.withColumn("months", modified_month)
joined.show(2)

+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+------+
|product_id|product_type_id|module_id|   txn_id|last_modified_date|last_modified_date_bs|created_date|amount|status|payer_account_id|receiver_account_id|reward_point|cash_back_amount|revenue_amount|transactor_module_id|    time|        product_name|product_category_id|   txn_flow|months|
+----------+---------------+---------+---------+------------------+---------------------+------------+------+------+----------------+-------------------+------------+----------------+--------------+--------------------+--------+--------------------+-------------------+-----------+------+
|       143|             59|        1|693893736|        2023-01-06|           2079-09-22|  2023-01-06| 175.0|     1|             531|

In [8]:
# Every distinct product_name in the joined data, collected to the driver as Row objects.
joined.select('product_name').distinct().collect()

[Row(product_name='Smart Cell Topup'),
 Row(product_name='Kolhuwa Khanepani'),
 Row(product_name='Gajedi 2 Bakalghad Khanepani'),
 Row(product_name='Ncell Pack'),
 Row(product_name='Postpaid Topup'),
 Row(product_name='Karahiya Khanepani upabhokta tatha Sarsafai Sanstha'),
 Row(product_name='Lalbandi khanepani'),
 Row(product_name='Parsa Khanepani'),
 Row(product_name='Kerwani Khanepani'),
 Row(product_name='Kushma Shivalaya Khanepani'),
 Row(product_name='Waling Khanepani'),
 Row(product_name='Jaluke Water Supply And Sanitation Users Organisation'),
 Row(product_name='Shankarnagar Khanepani'),
 Row(product_name='eSewa agent to Century'),
 Row(product_name='Prabhu TV'),
 Row(product_name='Electricity'),
 Row(product_name='Sipadol Khanepani'),
 Row(product_name='BUDDHA AIR'),
 Row(product_name='Damak Khanepani'),
 Row(product_name='Parasi Khanepani'),
 Row(product_name='Devdaha Khanepani'),
 Row(product_name='Siddhipur Khanepani'),
 Row(product_name='Sau Farsatikar Khanepani'),
 Row(pro

### **Grouping by payer_account_id and product_name**

In [9]:
# Transactions per (payer_account_id, product_name), with one column per
# distinct "months" value; combinations with no transactions become 0.
product_map = (
    joined
    .groupBy("payer_account_id", "product_name")
    .pivot("months")
    .count()
    .na.fill(0)
)

### **Function to get the top 10 most-used products**

In [10]:
def most_used_product(top_n=10):
    """Return the names of the most frequently transacted products.

    Counts the rows of the notebook-level ``joined`` DataFrame per
    ``product_name``.

    Args:
        top_n: Number of product names to return (default 10).

    Returns:
        A list of ``product_name`` values ordered by descending transaction
        count. Ties are broken by product name so repeated runs give the same
        list.
    """
    usage = joined.groupBy("product_name").count()
    # Secondary sort key makes the cut-off deterministic when counts are equal.
    ranked = usage.orderBy(col("count").desc(), col("product_name").asc())
    return [row["product_name"] for row in ranked.select("product_name").take(top_n)]


top_product = most_used_product()

In [11]:
# Display the product names returned by most_used_product().
top_product

['NT Topup via Bank',
 'Ncell Topup via Bank',
 'TOPUP VIA BANK DIRECT',
 'Send Money',
 'Ncell Data Via Bank',
 'Ncell Topup',
 'Wordlink Topup via Bank',
 'NT Prepaid Topup',
 'Prepaid Topup',
 'Electricity']

In [12]:
# Restrict the data to the top products, then count transactions per payer,
# product and month (one column per month; missing combinations become 0).
filtered_df = joined.where(col("product_name").isin(top_product))
product_used_count = (
    filtered_df
    .groupBy("payer_account_id", "product_name")
    .pivot("months")
    .count()
    .na.fill(0)
)
product_used_count.show()

+----------------+--------------------+-----+----+----+-----+-----+
|payer_account_id|        product_name|    1|   2|   3|    4|    5|
+----------------+--------------------+-----+----+----+-----+-----+
|              34|          Send Money| 1444|1330|1524| 1430| 1188|
|              26|         Electricity|    4|   1|   7|    3|    3|
|             471|          Send Money|   18|  18|  34|   44|   14|
|              56|         Electricity|  255| 251| 256|  273|  191|
|              34|    NT Prepaid Topup|   71|  85|  93|   65|   53|
|             531| Ncell Data Via Bank|  764| 516| 660| 1166|  686|
|             531|   NT Topup via Bank|16977|4675|6278| 6205|24663|
|             222|       Prepaid Topup|    1|   1|   2|    1|    2|
|             471|       Prepaid Topup|  137| 120| 118|  144|  113|
|             531|TOPUP VIA BANK DI...|    0|   0|3453|35028|18912|
|              34|         Ncell Topup|   60|  44|  86|   68|   76|
|              34|         Electricity|  107| 11

In [13]:
# Column names after the pivot: the two grouping keys followed by one column per month value.
product_used_count.columns

['payer_account_id', 'product_name', '1', '2', '3', '4', '5']

### **If a monthly count is greater than 0, replace it with 1**

In [14]:
# The pivot names each month column after its month number ("1", "2", ...).
# Take them from the DataFrame instead of hard-coding the list, so the cell
# keeps working when the date window covers a different set of months.
cols_to_change = [name for name in product_used_count.columns if name.isdigit()]

# Turn each monthly count into a 0/1 "used" flag. A single select rewrites all
# month columns at once instead of stacking one projection per withColumn call.
product_used_count = product_used_count.select(
    *[
        when(col(name) > 0, 1).otherwise(col(name)).alias(name)
        if name in cols_to_change
        else col(name)
        for name in product_used_count.columns
    ]
)
product_used_count.show()

+----------------+--------------------+---+---+---+---+---+
|payer_account_id|        product_name|  1|  2|  3|  4|  5|
+----------------+--------------------+---+---+---+---+---+
|              34|          Send Money|  1|  1|  1|  1|  1|
|              26|         Electricity|  1|  1|  1|  1|  1|
|             471|          Send Money|  1|  1|  1|  1|  1|
|              56|         Electricity|  1|  1|  1|  1|  1|
|              34|    NT Prepaid Topup|  1|  1|  1|  1|  1|
|             531| Ncell Data Via Bank|  1|  1|  1|  1|  1|
|             531|   NT Topup via Bank|  1|  1|  1|  1|  1|
|             222|       Prepaid Topup|  1|  1|  1|  1|  1|
|             471|       Prepaid Topup|  1|  1|  1|  1|  1|
|             531|TOPUP VIA BANK DI...|  0|  0|  1|  1|  1|
|              34|         Ncell Topup|  1|  1|  1|  1|  1|
|              34|         Electricity|  1|  1|  1|  1|  1|
|             471|    NT Prepaid Topup|  1|  1|  1|  1|  1|
|             222|          Send Money| 

### **Final result**

In [18]:
# Concatenate the 0/1 month flags, in column order, into one string per row
# (e.g. "00111"). Reusing cols_to_change keeps the bitmap aligned with the
# columns that were binarised above instead of repeating the month list.
month_flags = [col(name) for name in cols_to_change]
product_used_count = product_used_count.withColumn("used_map", concat(*month_flags))
product_used_count.show(n=37)

+----------------+--------------------+---+---+---+---+---+--------+
|payer_account_id|        product_name|  1|  2|  3|  4|  5|used_map|
+----------------+--------------------+---+---+---+---+---+--------+
|              34|          Send Money|  1|  1|  1|  1|  1|   11111|
|              26|         Electricity|  1|  1|  1|  1|  1|   11111|
|             471|          Send Money|  1|  1|  1|  1|  1|   11111|
|              56|         Electricity|  1|  1|  1|  1|  1|   11111|
|              34|    NT Prepaid Topup|  1|  1|  1|  1|  1|   11111|
|             531| Ncell Data Via Bank|  1|  1|  1|  1|  1|   11111|
|             531|   NT Topup via Bank|  1|  1|  1|  1|  1|   11111|
|             222|       Prepaid Topup|  1|  1|  1|  1|  1|   11111|
|             471|       Prepaid Topup|  1|  1|  1|  1|  1|   11111|
|             531|TOPUP VIA BANK DI...|  0|  0|  1|  1|  1|   00111|
|              34|         Ncell Topup|  1|  1|  1|  1|  1|   11111|
|              34|         Electri