## Loading Data

In [None]:
df = spark.read.csv('/FileStore/tables/train.csv',header=True,inferSchema=True)
df.show(5)

+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+
|     1|CA-2017-152156|2017-11-08|2017-11-11|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset Col...|  261.96|
|     2|CA-2017-152156|2017-11-08|2017-11-11|  Second Class|   C

In [None]:
df.printSchema()

root
 |-- Row ID: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Ship Date: date (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: string (nullable = true)



In [None]:
df.count()

Out[3]: 9800

## Dropping irrelevant columns

In [None]:
df = df.drop('Row ID','Customer ID','Customer Name','Postal Code','Product ID','Product Name')
df.show(5)

+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+
|      Order ID|Order Date| Ship Date|     Ship Mode|  Segment|      Country|           City|     State|Region|       Category|Sub-Category|   Sales|
+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|   Bookcases|  261.96|
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|      Chairs|  731.94|
|CA-2017-138688|2017-06-12|2017-06-16|  Second Class|Corporate|United States|    Los Angeles|California|  West|Office Supplies|      Labels|   14.62|
|US-2016-108966|2016-10-11|2016-10-18|Standard Class| Consumer|United States|Fort Lauderdale|   Flor

In [None]:
cols = list(df.columns)
cols

Out[5]: ['Order ID',
 'Order Date',
 'Ship Date',
 'Ship Mode',
 'Segment',
 'Country',
 'City',
 'State',
 'Region',
 'Category',
 'Sub-Category',
 'Sales']

## Lowering columns name

In [None]:
for col in cols:
    df = df.withColumnRenamed(col,col.lower())

## Segment wise count

In [None]:
sdf = df.groupBy('segment').count().orderBy('count',ascending=False)
sdf.show()
display(sdf)

+-----------+-----+
|    segment|count|
+-----------+-----+
|   Consumer| 5101|
|  Corporate| 2953|
|Home Office| 1746|
+-----------+-----+



segment,count
Consumer,5101
Corporate,2953
Home Office,1746


Databricks visualization. Run in Databricks to view.

## Segment category wise total sale

In [None]:
from pyspark.sql import functions as f

csdf = df.groupBy(['segment','category']).agg(
    {'sales':'sum'}
)

csdf = csdf.withColumn('total_sale',f.round(csdf['sum(sales)'],1)).drop('sum(sales)')
csdf = csdf.withColumnRenamed('sum(sales)','total_sales')
csdf.show()
display(csdf)

+-----------+---------------+----------+
|    segment|       category|total_sale|
+-----------+---------------+----------+
|Home Office|Office Supplies|  119379.7|
|  Corporate|     Technology|  243931.9|
|   Consumer|Office Supplies|  353174.5|
|Home Office|     Technology|  182379.6|
|   Consumer|     Technology|  400890.4|
|Home Office|      Furniture|  119295.3|
|  Corporate|      Furniture|  217434.7|
|   Consumer|      Furniture|  383061.5|
|  Corporate|Office Supplies|  217585.6|
+-----------+---------------+----------+



segment,category,total_sale
Home Office,Office Supplies,119379.7
Corporate,Technology,243931.9
Consumer,Office Supplies,353174.5
Home Office,Technology,182379.6
Consumer,Technology,400890.4
Home Office,Furniture,119295.3
Corporate,Furniture,217434.7
Consumer,Furniture,383061.5
Corporate,Office Supplies,217585.6


Databricks visualization. Run in Databricks to view.

## Lets concatenate segment category columns

In [None]:
seg_df = df.withColumn('segment_cat',f.concat(df['segment'],f.lit('_'),f.concat(df['category'])))
seg_df.show(5)

+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+--------------------+
|      order id|order date| ship date|     ship mode|  segment|      country|           city|     state|region|       category|sub-category|   sales|         segment_cat|
+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+--------------------+
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|   Bookcases|  261.96|  Consumer_Furniture|
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|      Chairs|  731.94|  Consumer_Furniture|
|CA-2017-138688|2017-06-12|2017-06-16|  Second Class|Corporate|United States|    Los Angeles|California|  West|Office Supplies|      Labels|   14

In [None]:
seg_df1 = seg_df.groupBy('segment_cat').agg(
    f.sum('sales')
)
seg_df1 = seg_df1.withColumn('sum(sales)',f.round(seg_df1['sum(sales)'],1))
display(seg_df1)

segment_cat,sum(sales)
Home Office_Office Supplies,119379.7
Home Office_Technology,182379.6
Consumer_Office Supplies,353174.5
Corporate_Office Supplies,217585.6
Consumer_Furniture,383061.5
Consumer_Technology,400890.4
Home Office_Furniture,119295.3
Corporate_Furniture,217434.7
Corporate_Technology,243931.9


Databricks visualization. Run in Databricks to view.

In [None]:
df.show(5)

+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+
|      order id|order date| ship date|     ship mode|  segment|      country|           city|     state|region|       category|sub-category|   sales|
+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|   Bookcases|  261.96|
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|      Chairs|  731.94|
|CA-2017-138688|2017-06-12|2017-06-16|  Second Class|Corporate|United States|    Los Angeles|California|  West|Office Supplies|      Labels|   14.62|
|US-2016-108966|2016-10-11|2016-10-18|Standard Class| Consumer|United States|Fort Lauderdale|   Flor

## Creating widgets

In [None]:
dbutils.widgets.combobox('segment','',['Consumer','Corporate','Home Office'],'Choose segment')
dbutils.widgets.combobox('category','',['Office Supplies','Furniture','Technology'],'Choose category')

In [None]:
seg = dbutils.widgets.get('segment')
cat = dbutils.widgets.get('category')

In [None]:
wdf = df.filter(df['category']==seg)
wdf = df.filter(df['category']==cat)
wdf.show()

+--------------+----------+----------+--------------+-----------+-------------+---------------+--------------+-------+---------------+------------+---------------+
|      order id|order date| ship date|     ship mode|    segment|      country|           city|         state| region|       category|sub-category|          sales|
+--------------+----------+----------+--------------+-----------+-------------+---------------+--------------+-------+---------------+------------+---------------+
|CA-2017-138688|2017-06-12|2017-06-16|  Second Class|  Corporate|United States|    Los Angeles|    California|   West|Office Supplies|      Labels|          14.62|
|US-2016-108966|2016-10-11|2016-10-18|Standard Class|   Consumer|United States|Fort Lauderdale|       Florida|  South|Office Supplies|     Storage|         22.368|
|CA-2015-115812|2015-06-09|2015-06-14|Standard Class|   Consumer|United States|    Los Angeles|    California|   West|Office Supplies|         Art|           7.28|
|CA-2015-115812|

## Create new columns 'Order year','order month','monthname'

In [None]:
from pyspark.sql.functions import year,month,quarter,date_format

df = df.withColumn('order year',year(df['order date']))
df = df.withColumn('order month',month(df['order date']))
df = df.withColumn('monthname',date_format(df['order date'],'MMMM'))
df.show()

+--------------+----------+----------+--------------+-----------+-------------+---------------+--------------+-------+---------------+------------+--------+----------+-----------+---------+
|      order id|order date| ship date|     ship mode|    segment|      country|           city|         state| region|       category|sub-category|   sales|order year|order month|monthname|
+--------------+----------+----------+--------------+-----------+-------------+---------------+--------------+-------+---------------+------------+--------+----------+-----------+---------+
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class|   Consumer|United States|      Henderson|      Kentucky|  South|      Furniture|   Bookcases|  261.96|      2017|         11| November|
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class|   Consumer|United States|      Henderson|      Kentucky|  South|      Furniture|      Chairs|  731.94|      2017|         11| November|
|CA-2017-138688|2017-06-12|2017-06-16|  Second Cla

## Yearly total sales

In [None]:
ydf = df.groupBy('order year').agg(
    f.sum('sales')
)
ydf = ydf.withColumn('sum(sales)',f.round(ydf['sum(sales)'],1))
ydf = ydf.withColumnRenamed('sum(sales)','total sale')
ydf.show()
display(ydf)

+----------+----------+
|order year|total sale|
+----------+----------+
|      2018|  713928.6|
|      2015|  477480.4|
|      2016|  453395.7|
|      2017|  592328.4|
+----------+----------+



order year,total sale
2018,713928.6
2015,477480.4
2016,453395.7
2017,592328.4


Databricks visualization. Run in Databricks to view.

## Monthly total sales

In [None]:
mdf = df.groupBy('monthname').agg(
    f.sum('sales')
)
mdf = mdf.withColumn('sum(sales)',f.round(mdf['sum(sales)'],1))
mdf = mdf.withColumnRenamed('sum(sales)','monthly_total sale')
mdf = mdf.orderBy(mdf['monthly_total sale'].desc())
mdf.show()
display(mdf)

+---------+------------------+
|monthname|monthly_total sale|
+---------+------------------+
| November|          346581.5|
| December|          317347.7|
|September|          296147.3|
|  October|          197629.7|
|    March|          196287.4|
|   August|          155914.1|
|      May|          152626.8|
|     June|          144219.8|
|     July|          143931.8|
|    April|          133908.2|
|  January|           93906.1|
| February|           58632.7|
+---------+------------------+



monthname,monthly_total sale
November,346581.5
December,317347.7
September,296147.3
October,197629.7
March,196287.4
August,155914.1
May,152626.8
June,144219.8
July,143931.8
April,133908.2


Databricks visualization. Run in Databricks to view.

## Region wise total sales

In [None]:
reg_df = df.groupBy('region').agg(
    f.sum('sales')
)
reg_df = reg_df.withColumn('sum(sales)',f.round(reg_df['sum(sales)'],1))
reg_df = reg_df.withColumnRenamed('sum(sales)','regionwise_sale')
reg_df.show()
display(reg_df)

+-------+---------------+
| region|regionwise_sale|
+-------+---------------+
|  South|       386413.1|
|Central|       489321.4|
|   East|       663043.9|
|   West|       698354.8|
+-------+---------------+



region,regionwise_sale
South,386413.1
Central,489321.4
East,663043.9
West,698354.8


Databricks visualization. Run in Databricks to view.

## City wise top sales

In [None]:
city_df = df.groupBy('city').agg(
    f.sum('sales')
)
city_df = city_df.orderBy(city_df['sum(sales)'].desc()).limit(10)
city_df = city_df.withColumn('sum(sales)',f.round(city_df['sum(sales)'],1))
city_df = city_df.withColumnRenamed('sum(sales)','citywise_sales')
city_df.show()
display(city_df)

+-------------+--------------+
|         city|citywise_sales|
+-------------+--------------+
|New York City|      251343.4|
|  Los Angeles|      170737.7|
|      Seattle|      114338.2|
|San Francisco|      107313.3|
| Philadelphia|      107261.6|
|      Houston|       63840.8|
|    San Diego|       47115.1|
|      Chicago|       47047.9|
| Jacksonville|       44595.4|
|      Detroit|       42302.9|
+-------------+--------------+



city,citywise_sales
New York City,251343.4
Los Angeles,170737.7
Seattle,114338.2
San Francisco,107313.3
Philadelphia,107261.6
Houston,63840.8
San Diego,47115.1
Chicago,47047.9
Jacksonville,44595.4
Detroit,42302.9


Databricks visualization. Run in Databricks to view.

## State wise top sales

In [None]:
st_df = df.groupBy('state').agg(
    f.sum('sales')
)
st_df = st_df.withColumn('sum(sales)',f.round(st_df['sum(sales)'],1)).limit(10)
st_df = st_df.withColumnRenamed('sum(sales)','statewise_sale')
st_df = st_df.orderBy(st_df['statewise_sale'].desc())
st_df.show()
display(st_df)

+------------+--------------+
|       state|statewise_sale|
+------------+--------------+
|       Texas|      167954.5|
|Pennsylvania|      114686.3|
|        Ohio|       73489.9|
|   Minnesota|       29863.1|
|      Oregon|       17135.4|
| Connecticut|       13317.8|
|    Arkansas|       11495.2|
|        Utah|       10575.0|
|    Nebraska|        7447.3|
|North Dakota|         919.9|
+------------+--------------+



state,statewise_sale
Texas,167954.5
Pennsylvania,114686.3
Ohio,73489.9
Minnesota,29863.1
Oregon,17135.4
Connecticut,13317.8
Arkansas,11495.2
Utah,10575.0
Nebraska,7447.3
North Dakota,919.9


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

## Lets concatenate category and sub-category columns

In [None]:
con_df = df.withColumn('cat_subcat',f.concat(df['category'],f.lit('-'),df['sub-category']))
con_df.show(5)

+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+----------+-----------+---------+--------------------+
|      order id|order date| ship date|     ship mode|  segment|      country|           city|     state|region|       category|sub-category|   sales|order year|order month|monthname|          cat_subcat|
+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+----------+-----------+---------+--------------------+
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|   Bookcases|  261.96|      2017|         11| November| Furniture-Bookcases|
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|      Chairs|  731.94|      2017|         11| November|  

## Category/sub-category wise total sales

In [None]:
from pyspark.sql.functions import col,sum
catsub_df = con_df.groupBy('cat_subcat') \
    .agg(sum('sales').alias('total_sales')) \
    .orderBy(col('total_sales').desc())

catsub_df = catsub_df.withColumn('total_sales',f.round(catsub_df['total_sales'],1))
catsub_df.show()
display(catsub_df)

+--------------------+-----------+
|          cat_subcat|total_sales|
+--------------------+-----------+
|   Technology-Phones|   327528.5|
|    Furniture-Chairs|   322822.7|
|Office Supplies-S...|   212303.0|
|    Furniture-Tables|   202810.6|
|Office Supplies-B...|   196538.2|
| Technology-Machines|   189238.6|
|Technology-Access...|   164186.7|
|  Technology-Copiers|   146248.1|
| Furniture-Bookcases|   113813.2|
|Office Supplies-A...|   104618.4|
|Furniture-Furnish...|    80344.9|
|Office Supplies-P...|    73797.5|
|Office Supplies-S...|    45852.1|
| Office Supplies-Art|    26705.4|
|Office Supplies-E...|    14991.1|
|Office Supplies-L...|    12347.7|
|Office Supplies-F...|     2986.3|
+--------------------+-----------+



cat_subcat,total_sales
Technology-Phones,327528.5
Furniture-Chairs,322822.7
Office Supplies-Storage,212303.0
Furniture-Tables,202810.6
Office Supplies-Binders,196538.2
Technology-Machines,189238.6
Technology-Accessories,164186.7
Technology-Copiers,146248.1
Furniture-Bookcases,113813.2
Office Supplies-Appliances,104618.4


Databricks visualization. Run in Databricks to view.

In [None]:
df.show(5)

+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+----------+-----------+---------+
|      order id|order date| ship date|     ship mode|  segment|      country|           city|     state|region|       category|sub-category|   sales|order year|order month|monthname|
+--------------+----------+----------+--------------+---------+-------------+---------------+----------+------+---------------+------------+--------+----------+-----------+---------+
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|   Bookcases|  261.96|      2017|         11| November|
|CA-2017-152156|2017-11-08|2017-11-11|  Second Class| Consumer|United States|      Henderson|  Kentucky| South|      Furniture|      Chairs|  731.94|      2017|         11| November|
|CA-2017-138688|2017-06-12|2017-06-16|  Second Class|Corporate|United States|    Los 

## Ship mode analysis

In [None]:
ship_df = df.groupBy('ship mode').count().orderBy('count',ascending=False)
ship_df.show()

+--------------+-----+
|     ship mode|count|
+--------------+-----+
|Standard Class| 5859|
|  Second Class| 1902|
|   First Class| 1501|
|      Same Day|  538|
+--------------+-----+



## Total sales on behalf of ship mode

In [None]:
ship_df1 = df.groupBy('ship mode') \
    .agg(sum('sales').alias('shipmode_sales')) \
    .orderBy(col('shipmode_sales').desc())
ship_df1 = ship_df1.withColumn('shipmode_sales',f.round(col('shipmode_sales'),1))
ship_df1.show()
display(ship_df1)

+--------------+--------------+
|     ship mode|shipmode_sales|
+--------------+--------------+
|Standard Class|     1325110.6|
|  Second Class|      444062.5|
|   First Class|      343716.6|
|      Same Day|      124243.5|
+--------------+--------------+



ship mode,shipmode_sales
Standard Class,1325110.6
Second Class,444062.5
First Class,343716.6
Same Day,124243.5


Databricks visualization. Run in Databricks to view.