In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import count
from pyspark.sql.functions import countDistinct

In [0]:
# Obtain the Spark session for this notebook (reuses an active one if present),
# registering the application under the name 'SalesAnalysis'.
spark = (
    SparkSession.builder
    .appName('SalesAnalysis')
    .getOrCreate()
)

In [0]:
# Explicit schema for the sales file: one row per ordered item.
schema = StructType([
    StructField("Product_id", IntegerType(), True),
    StructField("Customer_id", StringType(), True),
    StructField("Order_date", DateType(), True),
    StructField("Location", StringType(), True),
    StructField("Source_order", StringType(), True),
])

# Load the sales CSV using the schema above.
# NOTE(review): no `header` option is set, so the first line of the file is
# parsed as data. If the file starts with a header row, add
# `.option("header", True)` — confirm against the actual file.
sales_df = (
    spark.read.format('csv')
    .schema(schema)
    .load('/FileStore/tables/sales_csv.txt')
)
sales_df.display()

Product_id,Customer_id,Order_date,Location,Source_order
1,A,2023-01-01,India,Swiggy
2,A,2022-01-01,India,Swiggy
2,A,2023-01-07,India,Swiggy
3,A,2023-01-10,India,Restaurant
3,A,2022-01-11,India,Swiggy
3,A,2023-01-11,India,Restaurant
2,B,2022-02-01,India,Swiggy
2,B,2023-01-02,India,Swiggy
1,B,2023-01-04,India,Restaurant
1,B,2023-02-11,India,Swiggy


In [0]:
from pyspark.sql.functions import month, year, quarter

# Derive the calendar year of each order from its Order_date.
sales_df = sales_df.withColumn(
    "Order_year",
    year(sales_df["Order_date"]),
)
sales_df.display()

Product_id,Customer_id,Order_date,Location,Source_order,Order_year
1,A,2023-01-01,India,Swiggy,2023
2,A,2022-01-01,India,Swiggy,2022
2,A,2023-01-07,India,Swiggy,2023
3,A,2023-01-10,India,Restaurant,2023
3,A,2022-01-11,India,Swiggy,2022
3,A,2023-01-11,India,Restaurant,2023
2,B,2022-02-01,India,Swiggy,2022
2,B,2023-01-02,India,Swiggy,2023
1,B,2023-01-04,India,Restaurant,2023
1,B,2023-02-11,India,Swiggy,2023


In [0]:
# Derive the month and the quarter of each order from its Order_date.
sales_df = (
    sales_df
    .withColumn("Order_month", month(sales_df["Order_date"]))
    .withColumn("Order_quarter", quarter(sales_df["Order_date"]))
)
sales_df.display()

Product_id,Customer_id,Order_date,Location,Source_order,Order_year,Order_month,Order_quarter
1,A,2023-01-01,India,Swiggy,2023,1,1
2,A,2022-01-01,India,Swiggy,2022,1,1
2,A,2023-01-07,India,Swiggy,2023,1,1
3,A,2023-01-10,India,Restaurant,2023,1,1
3,A,2022-01-11,India,Swiggy,2022,1,1
3,A,2023-01-11,India,Restaurant,2023,1,1
2,B,2022-02-01,India,Swiggy,2022,2,1
2,B,2023-01-02,India,Swiggy,2023,1,1
1,B,2023-01-04,India,Restaurant,2023,1,1
1,B,2023-02-11,India,Swiggy,2023,2,1


In [0]:
# Explicit schema for the menu file. Price is read as a string here and
# converted to an integer right after loading.
schema = StructType([
    StructField("Product_id", IntegerType(), True),
    StructField("Product_name", StringType(), True),
    StructField("Price", StringType(), True),
])

# Load the menu CSV using the schema above.
# NOTE(review): no `header` option is set, so the first line of the file is
# parsed as data. If the file starts with a header row, add
# `.option("header", True)` — confirm against the actual file.
menu_df = (
    spark.read.format('csv')
    .schema(schema)
    .load('/FileStore/tables/menu_csv.txt')
)

# Cast Price to integer so it can be summed in the aggregations that follow.
menu_df = menu_df.withColumn("Price", menu_df["Price"].cast("integer"))
menu_df.display()

Product_id,Product_name,Price
1,PIZZA,100
2,Chowmin,150
3,sandwich,120
4,Dosa,110
5,Biryani,80
6,Pasta,180


In [0]:
# Total amount spent by each customer, ordered by customer id.
# join() is called without a `how` argument, so it performs an inner join:
# only sales rows whose Product_id exists in menu_df contribute to the sum.
# The chain is wrapped in parentheses so it can legally span several lines.
TotalAmountSpendByEachCustomer = (
    sales_df.join(menu_df, sales_df.Product_id == menu_df.Product_id)
    .groupBy(sales_df.Customer_id)
    .sum("Price")
    .orderBy(sales_df.Customer_id)
)
TotalAmountSpendByEachCustomer.display()

Customer_id,sum(Price)
A,4260
B,4440
C,2400
D,1200
E,2040


Databricks visualization. Run in Databricks to view.

In [0]:
# Total amount spent on each menu item: join sales to the menu on Product_id,
# then sum Price per Product_name.
TotalAmountSpendByEachFood = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy('Product_name')
    .sum('Price')
)
TotalAmountSpendByEachFood.display()

Product_name,sum(Price)
Pasta,1080
PIZZA,2100
sandwich,5760
Biryani,480
Chowmin,3600
Dosa,1320


Databricks visualization. Run in Databricks to view.

In [0]:
# Total sales amount per order month (summed across all years present),
# sorted by month number.
TotalAmountOfSalesEachMonth = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy('Order_month')
    .sum('Price')
    .orderBy('Order_month')
)
TotalAmountOfSalesEachMonth.display()

Order_month,sum(Price)
1,2960
2,2730
3,910
5,2960
6,2960
7,910
11,910


Databricks visualization. Run in Databricks to view.

In [0]:
# Total sales amount per order year, sorted chronologically.
YearlySales = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy('Order_year')
    .sum('Price')
    .orderBy('Order_year')
)
YearlySales.display()

Order_year,sum(Price)
2022,4350
2023,9990


Databricks visualization. Run in Databricks to view.

In [0]:
# Total sales amount per order quarter (summed across all years present),
# sorted by quarter number.
QuarterlySales = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy('Order_quarter')
    .sum('Price')
    .orderBy('Order_quarter')
)
QuarterlySales.display()

Order_quarter,sum(Price)
1,6600
2,5920
3,910
4,910


Databricks visualization. Run in Databricks to view.

In [0]:
# Number of times each product was ordered, most-ordered first.
# Rows are grouped by product id and name; the id is dropped from the result
# so only the name and its count are shown.
ProductPurchaseCount = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy(sales_df["Product_id"], 'Product_name')
    .agg(count('Product_name').alias('Product_count'))
    .orderBy('Product_count', ascending=False)
    .drop('Product_id')
)
display(ProductPurchaseCount)

Product_name,Product_count
sandwich,48
Chowmin,24
PIZZA,21
Dosa,12
Biryani,6
Pasta,6


Databricks visualization. Run in Databricks to view.

In [0]:
# The single most frequently ordered product: count orders per product,
# sort by count descending and keep only the first row.
MostOrderedItem = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy(sales_df["Product_id"], 'Product_name')
    .agg(count('Product_name').alias('Product_count'))
    .orderBy('Product_count', ascending=False)
    .drop('Product_id')
    .limit(1)
)
display(MostOrderedItem)

Product_name,Product_count
sandwich,48


Databricks visualization. Run in Databricks to view.

In [0]:

# For each customer, the number of distinct dates on which they placed an
# order whose Source_order is 'Restaurant', sorted by customer id.
FreqOfCustomer = (
    sales_df
    .filter(sales_df["Source_order"] == 'Restaurant')
    .groupBy('Customer_id')
    .agg(countDistinct('Order_date'))
    .orderBy('Customer_id')
)
display(FreqOfCustomer)

Customer_id,count(Order_date)
A,6
B,6
C,3
D,1
E,5


Databricks visualization. Run in Databricks to view.

In [0]:
# Total sales amount per Location: join sales to the menu on Product_id,
# then sum Price within each Location value.
TotalSalesByEachCountry = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy('Location')
    .sum('Price')
)
display(TotalSalesByEachCountry)

Location,sum(Price)
India,4860
USA,2460
UK,7020


Databricks visualization. Run in Databricks to view.

In [0]:
# Total sales amount per order source: join sales to the menu on Product_id,
# then sum Price within each Source_order value.
TotalSalesBySourceOrder = (
    sales_df
    .join(menu_df, sales_df["Product_id"] == menu_df["Product_id"])
    .groupBy('Source_order')
    .sum('Price')
)
display(TotalSalesBySourceOrder)

Source_order,sum(Price)
zomato,4920
Swiggy,6330
Restaurant,3090


Databricks visualization. Run in Databricks to view.