In [2]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import month, year, quarter, count
import matplotlib.pyplot as plt

<h2>Sales DataFrame</h2>

In [9]:
# Explicit schema for the sales CSV; every column is declared nullable.
schema = StructType([
    StructField('product_id', IntegerType(), True),
    StructField('customer_id', StringType(), True),
    StructField('order_date', DateType(), True),
    StructField('location', StringType(), True),
    StructField('source_order', StringType(), True)
])

# getOrCreate() reuses an already-running session instead of starting a second one.
spark = SparkSession.builder.appName('Analysis').getOrCreate()

# The schema is supplied explicitly, so no schema inference is requested:
# an "inferschema" option would be ignored by the reader and only mislead
# the next person reading this cell.
sales_df = (
    spark.read
    .format("csv")
    .schema(schema)
    .load("Data/sales.csv.txt")
)

sales_df.show(5)

+----------+-----------+----------+--------+------------+
|product_id|customer_id|order_date|location|source_order|
+----------+-----------+----------+--------+------------+
|         1|          A|2023-01-01|   India|      Swiggy|
|         2|          A|2022-01-01|   India|      Swiggy|
|         2|          A|2023-01-07|   India|      Swiggy|
|         3|          A|2023-01-10|   India|  Restaurant|
|         3|          A|2022-01-11|   India|      Swiggy|
+----------+-----------+----------+--------+------------+
only showing top 5 rows



<h2>Deriving year, month, quarter</h2>

In [10]:
# Append calendar breakdown columns computed from order_date.
# withColumn replaces a column of the same name, so re-running this cell
# yields the same set of columns.
sales_df = (
    sales_df
    .withColumn('order_year', year('order_date'))
    .withColumn('order_month', month('order_date'))
    .withColumn('order_quarter', quarter('order_date'))
)
sales_df.show(5)

+----------+-----------+----------+--------+------------+----------+-----------+-------------+
|product_id|customer_id|order_date|location|source_order|order_year|order_month|order_quarter|
+----------+-----------+----------+--------+------------+----------+-----------+-------------+
|         1|          A|2023-01-01|   India|      Swiggy|      2023|          1|            1|
|         2|          A|2022-01-01|   India|      Swiggy|      2022|          1|            1|
|         2|          A|2023-01-07|   India|      Swiggy|      2023|          1|            1|
|         3|          A|2023-01-10|   India|  Restaurant|      2023|          1|            1|
|         3|          A|2022-01-11|   India|      Swiggy|      2022|          1|            1|
+----------+-----------+----------+--------+------------+----------+-----------+-------------+
only showing top 5 rows



<h2>Menu DataFrame</h2>

In [None]:
# Explicit schema for the menu CSV; every column is declared nullable.
# This rebinds the name `schema`, which held the sales schema until now.
# NOTE(review): price is read as a string — if it is later summed or
# multiplied, confirm whether a numeric type is intended here.
schema = StructType([
    StructField('product_id', IntegerType(), True),
    StructField('product_name', StringType(), True),
    StructField('price', StringType(), True)
])

# The schema is supplied explicitly, so no schema inference is requested:
# an "inferschema" option would be ignored by the reader.
menu_df = (
    spark.read
    .format("csv")
    .schema(schema)
    .load("Data/menu.csv.txt")
)
menu_df.show(5)

<h2> Total amount spent by each customer </h2>