In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
storage_account_name = "XXXXXXXXXXXXX"
storage_account_key = "XXXXXXXXXXXXXX"
container_name = "XXXXXXXXXXX"
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)
input_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/"
 
df = spark.read.format("parquet").load(input_path)

In [0]:
df.show()

+--------+------------------+-------------------+------------+----------+-----------------+----+-------------+--------------+------------------+------+----------------+-----+--------------+--------+-----------------+----------+----------+------+---+--------------------+---------+----------+-----+
|Category|ship-service-level|           Order ID|ship-country|Fulfilment|              SKU|Size|Sales Channel|Courier Status|         ship-city|Amount|ship-postal-code|  B2B|    ship-state|currency|promotion_applied|      Date|      ASIN| Style|Qty|              Status|DayOfWeek|DayOfmonth|Month|
+--------+------------------+-------------------+------------+----------+-----------------+----+-------------+--------------+------------------+------+----------------+-----+--------------+--------+-----------------+----------+----------+------+---+--------------------+---------+----------+-----+
|     Set|          Standard|171-0157225-5497910|          IN|  Merchant|   SET319-KR-NP-S|   S|    Amazon

In [0]:
num_rows = df.count()

num_columns = len(df.columns)

print(f"Shape of DataFrame: ({num_rows}, {num_columns})")

Shape of DataFrame: (120347, 24)


In [0]:
df.createOrReplaceTempView("sales_table")

In [0]:
%sql
select count(*) as orders_count from sales_table

orders_count
120347


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select sum(amount) as total_sales_amount from sales_table

total_sales_amount
72011702.0


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select sum(qty) as Total_Products_Ordered  from sales_table

Total_Products_Ordered
108733


Databricks visualization. Run in Databricks to view.

### Total Sales Amount Per Category

In [0]:
%sql
select category,sum(amount) total_amount
from sales_table
group by category
order by total_amount

category,total_amount
Dupatta,305.0
Saree,101996.0
Bottom,133958.0
Blouse,433244.0
Ethnic Dress,727525.0
Top,4978948.0
Western Dress,10225225.0
kurta,19209857.0
Set,36200644.0


Databricks visualization. Run in Databricks to view.

### Outlier Sales Within Month

In [0]:
%sql
SELECT month, amount
FROM
  (SELECT *,
      PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY amount) OVER(PARTITION BY month) AS Q1,
      PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY amount) OVER(PARTITION BY month) AS Q3,
      (PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY amount) OVER(PARTITION BY month) -
      PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY amount) OVER(PARTITION BY month)) AS IQR
  FROM sales_table
  where amount>0) as Q
WHERE (amount <  Q1 - 1.5 * (IQR) OR amount >  Q3 + 1.5 * (IQR)) AND amount>0

In [0]:
%sql 
select month,amount from sales_table
where amount>0

People Frequently Order As Quantities Of 1

### Most Frequently Order Price Range Of Products

In [0]:
%sql
select `order id`,amount/qty as price_per_product
from sales_table
where qty!=0 and amount!=0


### Top 10 Cities By Sales Amount

In [0]:
%sql
select * from
(select *,dense_rank() OVER(order by total_amount desc) as RANK from
(select `ship-city`,sum(amount) as total_amount from sales_table
group by `ship-city`
order by total_amount desc) as _) as t
where RANK<=10


ship-city,total_amount,RANK
BENGALURU,6387938.0,1
HYDERABAD,4437508.0,2
MUMBAI,3393315.0,3
NEW DELHI,3370803.0,4
CHENNAI,2769001.0,5
PUNE,2153386.0,6
KOLKATA,1308886.0,7
GURUGRAM,1156793.0,8
THANE,943504.0,9
LUCKNOW,866524.0,10


Databricks visualization. Run in Databricks to view.

Sales is maximum in the cases of Urban Cities

In [0]:
%sql
select * from
(select *,dense_rank() OVER(order by total_amount desc) as RANK from
(select `ship-state`,sum(amount) as total_amount from sales_table
group by `ship-state`
order by total_amount desc) as _) as t
where RANK<=10


ship-state,total_amount,RANK
MAHARASHTRA,12288678.0,1
KARNATAKA,9732089.0,2
UTTAR PRADESH,6284460.0,3
TELANGANA,6207798.0,4
TAMIL NADU,5840676.0,5
DELHI,3941826.0,6
KERALA,3416881.0,7
WEST BENGAL,3247246.0,8
ANDHRA PRADESH,2840705.0,9
HARYANA,2684285.0,10


Databricks visualization. Run in Databricks to view.

Maharashtra, Karnataka, Uttar Pradesh, Telangana, and Tamilnad are some of the largest states in terms of population and economic activity, which could explain their high sales numbers.

### Impact Of Promotion On Revenue

In [0]:
%sql
select promotion_applied,sum(amount) from sales_table
group by promotion_applied

promotion_applied,sum(amount)
No,21861871.0
Yes,50149831.0


Databricks visualization. Run in Databricks to view.

# TRENDS

### Order Count By Day Of Month

In [0]:
%sql
select dayofmonth,count(`Order ID`) from sales_table
group by dayofmonth
order by count(`Order ID`) desc

dayofmonth,count(Order ID)
2,4685
3,4667
4,4597
1,4443
8,4435
5,4361
15,4212
14,4211
9,4199
7,4182


Databricks visualization. Run in Databricks to view.

The Order Count Is Peak At the Start Of The Month As People Recieve The Salaries At That Time

### Order Count By Day Of Week

In [0]:
%sql
select dayofweek,count(`Order ID`) from sales_table
group by dayofweek
order by count(`Order ID`) desc

dayofweek,count(Order ID)
1,18205
3,17585
2,17366
4,17363
7,17255
6,16573
5,16000


Databricks visualization. Run in Databricks to view.

On Sundays Offline Stores Are Closed
Saturdays There is rise compared to the week days but its still low,
maybe cause of the the availability of Offline Stores.
Also There is a possibility of people working on saturdays.

### Day On Day Sales Growth

In [0]:
%sql
SELECT month, dayofmonth as Day, sum(qty) as Total_Quantity 
FROM sales_table 
where month!=3 and dayofmonth<=29
GROUP BY month,dayofmonth
ORDER BY month,dayofmonth

month,Day,Total_Quantity
4,1,1226
4,2,1312
4,3,1426
4,4,1253
4,5,1382
4,6,1341
4,7,1286
4,8,1420
4,9,1380
4,10,1512


Databricks visualization. Run in Databricks to view.

The irregular peak in sales in the month of may pertains to the summer 
sale amazon usually has during the period of may 2nd to 5th 

### Sales Decline Day By Day For Top 4 categories

In [0]:
%sql
select *,lag(total_sales,1) OVER(partition by month order by dayofmonth) as prev_total_sales,total_sales-(lag(total_sales,1) OVER(partition by month order by dayofmonth)) as diff
from
(select month,dayofmonth,sum(amount) as total_sales
from sales_table
where month!=3 and (category="Set" OR category="Top" OR category="Western Dress" OR category="kurta" or category="Ethnic Dress")
group by month,dayofmonth
order by month,dayofmonth) as _

month,dayofmonth,total_sales,prev_total_sales,diff
4,1,786508.0,,
4,2,828031.0,786508.0,41523.0
4,3,911878.0,828031.0,83847.0
4,4,810034.0,911878.0,-101844.0
4,5,874403.0,810034.0,64369.0
4,6,812933.0,874403.0,-61470.0
4,7,805707.0,812933.0,-7226.0
4,8,918597.0,805707.0,112890.0
4,9,888859.0,918597.0,-29738.0
4,10,976199.0,888859.0,87340.0


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT
    `order id`,
    category,
    date,
    amount,
    SUM(amount) OVER(PARTITION BY category ORDER BY date) AS cumulative_sales_by_product
FROM sales_table
ORDER BY category,date;


In [0]:
%sql
select fulfilment,`courier status`,count(`Order ID`) as order_count from sales_table
group by fulfilment,`courier status`
order by fulfilment

fulfilment,courier status,order_count
Amazon,Unshipped,5705
Amazon,Shipped,72662
Amazon,Cancelled,5616
Merchant,Shipped,29498
Merchant,Cancelled,6385
Merchant,Unshipped,481


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select `ship-service-level`,`Courier Status`,count(`Order ID`) as order_count from sales_table
group by `ship-service-level`,`Courier status`
order by `ship-service-level`,count(`Order ID`)

ship-service-level,Courier Status,order_count
Expedited,Cancelled,5574
Expedited,Unshipped,5578
Expedited,Shipped,71754
Standard,Unshipped,608
Standard,Cancelled,6427
Standard,Shipped,30406


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 
    `ship-service-level`,
    fulfilment,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN `Courier status` = 'Cancelled' THEN 1 ELSE 0 END) AS total_cancellations,
    SUM(CASE WHEN `Courier status` = 'Cancelled' THEN 1 ELSE 0 END) / COUNT(*) AS cancellation_rate
FROM sales_table
GROUP BY `ship-service-level`, fulfilment
ORDER BY cancellation_rate DESC

ship-service-level,fulfilment,total_orders,total_cancellations,cancellation_rate
Standard,Merchant,36364,6385,0.1755857441425585
Expedited,Amazon,82906,5574,0.0672327696427279
Standard,Amazon,1077,42,0.0389972144846796


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
select * from (
select *,RANK() OVER(order by cancellation_count desc)  as rank from
(select `ship-city`,count(*) as cancellation_count from sales_table 
where `courier status`!="Cancelled"
group by `ship-city`
order by count(*) desc)
as _)
where rank<=5

ship-city,cancellation_count,rank
BENGALURU,9678,1
HYDERABAD,6691,2
MUMBAI,5219,3
NEW DELHI,4923,4
CHENNAI,4492,5


Databricks visualization. Run in Databricks to view.

### Categorize Product Category Sales Performance as 'High', 'Medium', or 'Low' Based on Percentiles

In [0]:
%sql
with sales_category_table as
(with ntiles as (select category,amount,ntile(3) over(order by amount) as ntile from
sales_table
where amount!=0)
select *, case when ntile=1 then "Low"
when ntile=2 then "Medium"
when ntile=3 then "High"
end as sales_category
from ntiles)
select category,sales_category,count(sales_category) from sales_category_table
where category="Set" OR category="Top" OR category="Western Dress" OR category="kurta" or category="Ethnic Dress"
group by category,sales_category
order by category

category,sales_category,count(sales_category)
Ethnic Dress,Low,243
Ethnic Dress,Medium,101
Ethnic Dress,High,631
Set,Low,1596
Set,Medium,15633
Set,High,25207
Top,Low,3408
Top,Medium,5155
Top,High,726
Western Dress,Low,360


Databricks visualization. Run in Databricks to view.

In the Case of Sets Very Few People Go For Cheap Items
In case of Kurtas Majority prefer the cheap option


### Size vs Category Sales Heatmap

In [0]:
%sql
select category,size,sum(qty)
from sales_table
where category="Set" OR category="Top" OR category="Western Dress" OR category="kurta"
group by category,size


category,size,sum(qty)
Set,6XL,66
Western Dress,XL,2019
Set,5XL,54
kurta,3XL,4656
Top,XXL,1594
kurta,6XL,560
kurta,XXL,6382
Set,L,6972
Western Dress,M,2221
Set,XL,6463


Databricks visualization. Run in Databricks to view.