In [47]:
# Import library
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import *

In [48]:
# Start (or reuse) the Spark session named "SupplyChain"
spark = (
    SparkSession.builder
    .appName("SupplyChain")
    .getOrCreate()
)

# Read the supply-chain CSV: comma-separated, first row is the header,
# column types are inferred from the data
df = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/content/supply_chain_data.csv")
)
# Preview the first 5 rows
df.show(5)

+------------+----+------------------+------------+-----------------------+-----------------+---------------------+------------+----------+----------------+--------------+-----------------+------------------+-------------+--------+---------+------------------+-----------------------+-------------------+------------------+-------------------+--------------------+-------+------------------+
|Product type| SKU|             Price|Availability|Number of products sold|Revenue generated|Customer demographics|Stock levels|Lead times|Order quantities|Shipping times|Shipping carriers|    Shipping costs|Supplier name|Location|Lead time|Production volumes|Manufacturing lead time|Manufacturing costs|Inspection results|       Defect rates|Transportation modes| Routes|             Costs|
+------------+----+------------------+------------+-----------------------+-----------------+---------------------+------------+----------+----------------+--------------+-----------------+------------------+--------

In [49]:
# Check dataframe schema: prints every column with its inferred type and
# nullability as a tree (types come from inferSchema at load time)
df.printSchema()

root
 |-- Product type: string (nullable = true)
 |-- SKU: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Availability: integer (nullable = true)
 |-- Number of products sold: integer (nullable = true)
 |-- Revenue generated: double (nullable = true)
 |-- Customer demographics: string (nullable = true)
 |-- Stock levels: integer (nullable = true)
 |-- Lead times: integer (nullable = true)
 |-- Order quantities: integer (nullable = true)
 |-- Shipping times: integer (nullable = true)
 |-- Shipping carriers: string (nullable = true)
 |-- Shipping costs: double (nullable = true)
 |-- Supplier name: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Lead time: integer (nullable = true)
 |-- Production volumes: integer (nullable = true)
 |-- Manufacturing lead time: integer (nullable = true)
 |-- Manufacturing costs: double (nullable = true)
 |-- Inspection results: string (nullable = true)
 |-- Defect rates: double (nullable = true)
 |-- Transportatio

In [50]:
# Check missing values
# Count NULLs for every column in ONE aggregation (a single Spark job) instead of
# launching a separate filter().count() job per column.
# when(...) yields 1 for NULL cells and NULL otherwise; count() only counts the
# non-NULL results, so each aggregate is the number of NULLs in that column
# (and is 0, not None, for an empty DataFrame).
df_null = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first().asDict()  # single result row -> {column name: null count}
df_null

{'Product type': 0,
 'SKU': 0,
 'Price': 0,
 'Availability': 0,
 'Number of products sold': 0,
 'Revenue generated': 0,
 'Customer demographics': 0,
 'Stock levels': 0,
 'Lead times': 0,
 'Order quantities': 0,
 'Shipping times': 0,
 'Shipping carriers': 0,
 'Shipping costs': 0,
 'Supplier name': 0,
 'Location': 0,
 'Lead time': 0,
 'Production volumes': 0,
 'Manufacturing lead time': 0,
 'Manufacturing costs': 0,
 'Inspection results': 0,
 'Defect rates': 0,
 'Transportation modes': 0,
 'Routes': 0,
 'Costs': 0}

In [51]:
# Register df as the temporary view "SupplyChain" so it can be queried by name
# with spark.sql(...). This is a view over df, not a persisted database table.
df.createOrReplaceTempView("SupplyChain")

In [57]:
# Revenue summed per (Location, Product Type) pair, highest total first
q = """
  SELECT
    Location,
    `Product Type`,
    ROUND(SUM(`Revenue generated`),2) AS `Total Revenue`
  FROM SupplyChain
  GROUP BY Location, `Product Type`
  ORDER BY `Total Revenue` DESC;
"""
(
    spark.sql(q)  # run the query against the SupplyChain temp view
    .show()       # print the result as a text table
)

+---------+------------+-------------+
| Location|Product Type|Total Revenue|
+---------+------------+-------------+
|  Kolkata|    skincare|     77886.27|
|  Chennai|    skincare|     58957.42|
|Bangalore|    haircare|     51654.35|
|   Mumbai|   cosmetics|     49156.51|
|   Mumbai|    haircare|     44423.98|
|   Mumbai|    skincare|     44174.54|
|    Delhi|   cosmetics|     37429.68|
|  Kolkata|    haircare|     35027.71|
|Bangalore|    skincare|     31637.82|
|  Chennai|   cosmetics|     31461.95|
|    Delhi|    skincare|     28972.12|
|  Chennai|    haircare|     28723.45|
|  Kolkata|   cosmetics|     24163.57|
|Bangalore|   cosmetics|     19309.56|
|    Delhi|    haircare|      14625.9|
+---------+------------+-------------+



In [60]:
# Revenue summed per (Product Type, Customer Demographics) pair, highest total first.
# Only rows whose demographic is 'Male' or 'Female' are included.
q = """
  SELECT
    `Product Type`,
    `Customer Demographics`,
    ROUND(SUM(`Revenue generated`),2) AS `Total Revenue`
  FROM SupplyChain
  WHERE `Customer Demographics` IN('Male','Female')
  GROUP BY `Product Type`, `Customer Demographics`
  ORDER BY `Total Revenue` DESC;
"""
(
    spark.sql(q)  # run the query against the SupplyChain temp view
    .show()       # print the result as a text table
)

+------------+---------------------+-------------+
|Product Type|Customer Demographics|Total Revenue|
+------------+---------------------+-------------+
|    skincare|               Female|     79241.11|
|   cosmetics|               Female|     69548.54|
|    skincare|                 Male|      54643.5|
|    haircare|                 Male|     50599.93|
|   cosmetics|                 Male|     21390.97|
|    haircare|               Female|     12724.83|
+------------+---------------------+-------------+



In [61]:
# Revenue summed per (Routes, Product Type) pair, highest total first
q = """
  SELECT
    Routes,
    `Product Type`,
    ROUND(SUM(`Revenue generated`),2) AS `Total Revenue`
  FROM SupplyChain
  GROUP BY Routes, `Product Type`
  ORDER BY `Total Revenue` DESC;
"""
(
    spark.sql(q)  # run the query against the SupplyChain temp view
    .show()       # print the result as a text table
)

+-------+------------+-------------+
| Routes|Product Type|Total Revenue|
+-------+------------+-------------+
|Route A|    skincare|     96012.38|
|Route A|    haircare|     87610.87|
|Route B|    skincare|     85278.42|
|Route A|   cosmetics|      69575.6|
|Route B|   cosmetics|     63292.04|
|Route C|    skincare|     60337.36|
|Route B|    haircare|     55913.55|
|Route C|    haircare|     30930.97|
|Route C|   cosmetics|     28653.63|
+-------+------------+-------------+



In [62]:
# Revenue summed per Product Type, highest total first
q = """
  SELECT
    `Product Type`,
    ROUND(SUM(`Revenue generated`),2) AS `Total Revenue`
  FROM SupplyChain
  GROUP BY `Product Type`
  ORDER BY `Total Revenue` DESC;
"""
(
    spark.sql(q)  # run the query against the SupplyChain temp view
    .show()       # print the result as a text table
)

+------------+-------------+
|Product Type|Total Revenue|
+------------+-------------+
|    skincare|    241628.16|
|    haircare|    174455.39|
|   cosmetics|    161521.27|
+------------+-------------+



In [63]:
# Revenue summed per (Transportation Modes, Product Type) pair, highest total first
q = """
  SELECT
    `Transportation Modes`,
    `Product Type`,
    ROUND(SUM(`Revenue generated`),2) AS `Total Revenue`
  FROM SupplyChain
  GROUP BY `Transportation Modes`, `Product Type`
  ORDER BY `Total Revenue` DESC;
"""
(
    spark.sql(q)  # run the query against the SupplyChain temp view
    .show()       # print the result as a text table
)

+--------------------+------------+-------------+
|Transportation Modes|Product Type|Total Revenue|
+--------------------+------------+-------------+
|                 Air|    skincare|     76043.02|
|                Rail|    skincare|     66049.43|
|                Road|    skincare|     55928.18|
|                Road|    haircare|     54097.78|
|                 Air|    haircare|     53104.62|
|                Rail|   cosmetics|     50404.65|
|                Road|   cosmetics|     49289.27|
|                Rail|    haircare|     48536.33|
|                 Sea|    skincare|     43607.53|
|                 Sea|   cosmetics|     35239.64|
|                 Air|   cosmetics|     26587.71|
|                 Sea|    haircare|     18716.65|
+--------------------+------------+-------------+



In [64]:
# Costs summed per (Transportation Modes, Product Type) pair, highest total first
q = """
  SELECT
    `Transportation Modes`,
    `Product Type`,
    ROUND(SUM(Costs),2) AS `Total Costs`
  FROM SupplyChain
  GROUP BY `Transportation Modes`, `Product Type`
  ORDER BY `Total Costs` DESC;
"""
(
    spark.sql(q)  # run the query against the SupplyChain temp view
    .show()       # print the result as a text table
)

+--------------------+------------+-----------+
|Transportation Modes|Product Type|Total Costs|
+--------------------+------------+-----------+
|                 Air|    skincare|    8434.14|
|                Road|    haircare|    6313.23|
|                Road|    skincare|     5864.9|
|                Rail|    skincare|    5564.25|
|                Rail|   cosmetics|     4908.9|
|                Rail|    haircare|    4695.78|
|                 Air|    haircare|    4200.41|
|                Road|   cosmetics|    3870.06|
|                 Sea|   cosmetics|    2617.45|
|                 Sea|    skincare|    2366.03|
|                 Sea|    haircare|    2119.44|
|                 Air|   cosmetics|    1969.98|
+--------------------+------------+-----------+



In [65]:
# Costs summed per Product Type, highest total first
q = """
  SELECT
    `Product Type`,
    ROUND(SUM(Costs),2) AS `Total Costs`
  FROM SupplyChain
  GROUP BY `Product Type`
  ORDER BY `Total Costs` DESC;
"""
(
    spark.sql(q)  # run the query against the SupplyChain temp view
    .show()       # print the result as a text table
)

+------------+-----------+
|Product Type|Total Costs|
+------------+-----------+
|    skincare|   22229.32|
|    haircare|   17328.86|
|   cosmetics|    13366.4|
+------------+-----------+



In [71]:
# Total Products Sold by Product Type at Bangalore
# `Number of products sold` is an integer column, so its SUM is already a whole
# number; the former ROUND(..., 2) wrapper was a no-op and has been dropped.
q = """
  SELECT
    `Product Type`,
    SUM(`Number of products sold`) AS `Total Products Sold`
  FROM SupplyChain
  WHERE Location = 'Bangalore'
  GROUP BY `Product Type`
  ORDER BY `Total Products Sold` DESC;
"""
# Run against the SupplyChain temp view and print the result table
spark.sql(q).show()

+------------+-------------------+
|Product Type|Total Products Sold|
+------------+-------------------+
|    haircare|               2160|
|    skincare|               1719|
|   cosmetics|               1541|
+------------+-------------------+



In [72]:
# Total Products Sold by Location for female customers
# `Number of products sold` is an integer column, so its SUM is already a whole
# number; the former ROUND(..., 2) wrapper was a no-op and has been dropped.
q = """
  SELECT
    Location,
    SUM(`Number of products sold`) AS `Total Products Sold`
  FROM SupplyChain
  WHERE `Customer Demographics` = 'Female'
  GROUP BY Location
  ORDER BY `Total Products Sold` DESC;
"""
# Run against the SupplyChain temp view and print the result table
spark.sql(q).show()

+---------+-------------------+
| Location|Total Products Sold|
+---------+-------------------+
|    Delhi|               4002|
|  Kolkata|               3989|
|   Mumbai|               1660|
|  Chennai|               1620|
|Bangalore|               1530|
+---------+-------------------+



In [73]:
# Total Products Sold by Routes on air transportation mode
q = """
  SELECT
    Routes,
    ROUND(SUM(`Number of products sold`),2) AS `Total Products Sold`
  FROM SupplyChain
  WHERE `Transportation Modes` = 'Air'
  GROUP BY Routes
  ORDER BY `Total Products Sold` DESC;
"""
spark.sql(q).show()

+-------+-------------------+
| Routes|Total Products Sold|
+-------+-------------------+
|Route A|               5210|
|Route C|               2956|
|Route B|               2716|
+-------+-------------------+

