Import Statements

In [39]:
# Spark entry point, the Colab Drive helper, and the column functions used by the cells below.
# NOTE: `sum` and `round` imported from pyspark.sql.functions shadow Python's built-in
# sum()/round() for the rest of the notebook; here they build Spark column expressions.
from pyspark.sql import SparkSession
from google.colab import drive
from pyspark.sql.functions import sum, count, col, month, year, avg, round

Creating the Spark session

In [40]:
# Build (or reuse, if one already exists) the Spark session under a fixed application name.
session_builder = SparkSession.builder.appName("Reatail Sales Performance Dashboard")
spark = session_builder.getOrCreate()

# Bare last expression so the notebook renders the session summary.
spark

Reading the CSV files from Google Drive

In [41]:
# Mount Google Drive so the CSV exports are reachable from the Colab runtime.
drive.mount('/content/drive')

# All four source tables live in the same Drive folder; keep the location in one place.
RETAIL_DATA_DIR = '/content/drive/MyDrive/CapstoneProjectData/RetailSales'


def _read_retail_csv(file_name):
    """Read one CSV from RETAIL_DATA_DIR into a Spark DataFrame.

    The first row is treated as the header and column types are inferred,
    exactly as every table in this notebook is loaded.
    """
    return spark.read.csv(f'{RETAIL_DATA_DIR}/{file_name}', header=True, inferSchema=True)


sales_df = _read_retail_csv('sales.csv')
employees_df = _read_retail_csv('employees.csv')
stores_df = _read_retail_csv('stores.csv')
products_df = _read_retail_csv('products.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Displaying all the datasets

In [42]:
# Preview every loaded table under its own heading, in load order.
table_previews = [
    ("\n Sales Data", sales_df),
    ("\n Employees Data", employees_df),
    ("\n Stores Data", stores_df),
    ("\n Products Data", products_df),
]

for heading, frame in table_previews:
    print(heading)
    frame.show()


 Sales Data
+-------+----------+--------+-----------+--------+----------+
|sale_id|product_id|store_id|employee_id|quantity| sale_date|
+-------+----------+--------+-----------+--------+----------+
|      1|         1|       1|          1|       2|2025-06-01|
|      2|         2|       2|          2|       5|2025-06-01|
|      3|         3|       3|          3|       1|2025-06-02|
|      4|         4|       4|          4|       4|2025-06-02|
|      5|         5|       5|          5|       3|2025-06-03|
|      6|         6|       6|          6|       6|2025-06-03|
|      7|         7|       7|          7|       1|2025-06-04|
|      8|         8|       8|          8|       2|2025-06-04|
|      9|         9|       9|          9|       2|2025-06-05|
|     10|        10|      10|         10|       3|2025-06-05|
|     11|        11|      11|         11|       1|2025-06-06|
|     12|        12|      12|         12|       4|2025-06-06|
|     13|        13|      13|         13|       2|2025-06

#### Filter data for underperforming products (e.g., low sales, high returns):

Joining the sales DataFrame with the products DataFrame and adding a TotalSales column (price × quantity)

In [49]:
# Attach the product columns to every sale row, then price each sale as price * quantity.
line_revenue = col("price") * col("quantity")

sales_products_df = (
    sales_df
    .join(products_df, on="product_id")
    .withColumn("TotalSales", line_revenue)
)

sales_products_df.show()

+----------+-------+--------+-----------+--------+----------+---------------+-----+----------+
|product_id|sale_id|store_id|employee_id|quantity| sale_date|   product_name|price|TotalSales|
+----------+-------+--------+-----------+--------+----------+---------------+-----+----------+
|         1|      1|       1|          1|       2|2025-06-01|         Laptop|750.0|    1500.0|
|         2|      2|       2|          2|       5|2025-06-01|          Phone|400.0|    2000.0|
|         3|      3|       3|          3|       1|2025-06-02|         Tablet|300.0|     300.0|
|         4|      4|       4|          4|       4|2025-06-02|        Monitor|200.0|     800.0|
|         5|      5|       5|          5|       3|2025-06-03|       Keyboard| 50.0|     150.0|
|         6|      6|       6|          6|       6|2025-06-03|          Mouse| 30.0|     180.0|
|         7|      7|       7|          7|       1|2025-06-04|        Printer|120.0|     120.0|
|         8|      8|       8|          8|       2|

Calculating underperforming products (total quantity sold below 3)

In [60]:
# Products whose total units sold fall below this value are flagged as underperforming.
UNDERPERFORMING_QUANTITY_THRESHOLD = 3

# Total units sold per product, computed from the sales table alone.
quantity_per_product_df = (
    sales_df
    .groupBy("product_id")
    .agg(sum("quantity").alias("total_quantity_sold"))
)

# Start from the product table and LEFT-join the totals: a product with no sales rows
# has no group in quantity_per_product_df, and an inner join from the sales side would
# silently drop it even though it is the weakest performer. Missing totals become 0.
underperforming_products_df = (
    products_df
    .join(quantity_per_product_df, "product_id", "left")
    .fillna(0, subset=["total_quantity_sold"])
    .filter(col("total_quantity_sold") < UNDERPERFORMING_QUANTITY_THRESHOLD)
    .select("product_id", "product_name", "total_quantity_sold")
)

underperforming_products_df.show()

+----------+------------+-------------------+
|product_id|product_name|total_quantity_sold|
+----------+------------+-------------------+
|         1|      Laptop|                  2|
|        13|     SSD 1TB|                  2|
|        16|    RAM 16GB|                  1|
|         3|      Tablet|                  1|
|        20|  Microphone|                  1|
|         9|  Headphones|                  2|
|        17|  Power Bank|                  2|
|         8|     Scanner|                  2|
|         7|     Printer|                  1|
|        11|  Smartwatch|                  1|
|        14|     HDD 2TB|                  2|
+----------+------------+-------------------+



#### Group by store and calculate average monthly revenue

Adding month and year columns to sales_products_df

In [51]:
# Derive the calendar month and year of each sale so revenue can be grouped by month.
sale_date_col = col("sale_date")

sales_products_df = (
    sales_products_df
    .withColumn("sale_month", month(sale_date_col))
    .withColumn("sale_year", year(sale_date_col))
)

Calculating the monthly revenue of each store, then averaging it across months

In [53]:
# Revenue per store for each (year, month) pair; grouping on both keeps the same
# month of different years apart.
store_month_keys = ["store_id", "sale_year", "sale_month"]
monthly_total = sum("TotalSales").alias("monthly_revenue")
monthly_revenue_df = sales_products_df.groupBy(*store_month_keys).agg(monthly_total)

# Mean of those monthly totals for each store.
mean_monthly_revenue = avg("monthly_revenue").alias("avg_monthly_revenue")
avg_monthly_revenue_df = monthly_revenue_df.groupBy("store_id").agg(mean_monthly_revenue)


Joining avg_monthly_revenue_df with stores_df to add the store names

In [59]:
# The store-name column is renamed on stores_df itself and aliased back to "store_name"
# in the select below, so stores_df carries "StoreName" after this cell has run.
# NOTE(review): the rename looks like a workaround for a store_name clash when this
# cell is re-run on an already-joined avg_monthly_revenue_df - confirm.
stores_df = stores_df.withColumnRenamed("store_name", "StoreName")

store_name_col = col("StoreName").alias("store_name")
revenue_with_stores_df = avg_monthly_revenue_df.join(stores_df, on="store_id")
avg_monthly_revenue_df = revenue_with_stores_df.select(
    "store_id", store_name_col, "avg_monthly_revenue"
)


avg_monthly_revenue_df.show()

+--------+---------------+-------------------+
|store_id|     store_name|avg_monthly_revenue|
+--------+---------------+-------------------+
|      12|     GadgetLand|              280.0|
|       1| Downtown Store|             1500.0|
|      13|     Urban Tech|              300.0|
|      16|       ByteMart|               90.0|
|       6|  Market Street|              180.0|
|       3|     Tech Plaza|              300.0|
|      20|    Plug & Play|              110.0|
|       5|  Central Store|              150.0|
|      19|   Device Depot|              240.0|
|      15|    Techie Town|             1500.0|
|      17|Circuit Central|               80.0|
|       9|      ShopSmart|              160.0|
|       4|     Gadget Hub|              800.0|
|       8|  Digital World|              300.0|
|       7|      City Mall|              120.0|
|      10|        FastBuy|              270.0|
|      11|    ElectroCity|              250.0|
|      14|        BuyZone|              200.0|
|       2|   

#### Summary

Underperforming products


In [62]:
# Summary view of the flagged products; show()'s defaults (20 rows, truncated cells) spelled out.
underperforming_products_df.show(n=20, truncate=True)

+----------+------------+-------------------+
|product_id|product_name|total_quantity_sold|
+----------+------------+-------------------+
|         1|      Laptop|                  2|
|        13|     SSD 1TB|                  2|
|        16|    RAM 16GB|                  1|
|         3|      Tablet|                  1|
|        20|  Microphone|                  1|
|         9|  Headphones|                  2|
|        17|  Power Bank|                  2|
|         8|     Scanner|                  2|
|         7|     Printer|                  1|
|        11|  Smartwatch|                  1|
|        14|     HDD 2TB|                  2|
+----------+------------+-------------------+



Average monthly revenue for each store

In [61]:
# Summary view of per-store average monthly revenue; show()'s defaults (20 rows, truncated cells) spelled out.
avg_monthly_revenue_df.show(n=20, truncate=True)

+--------+---------------+-------------------+
|store_id|     store_name|avg_monthly_revenue|
+--------+---------------+-------------------+
|      12|     GadgetLand|              280.0|
|       1| Downtown Store|             1500.0|
|      13|     Urban Tech|              300.0|
|      16|       ByteMart|               90.0|
|       6|  Market Street|              180.0|
|       3|     Tech Plaza|              300.0|
|      20|    Plug & Play|              110.0|
|       5|  Central Store|              150.0|
|      19|   Device Depot|              240.0|
|      15|    Techie Town|             1500.0|
|      17|Circuit Central|               80.0|
|       9|      ShopSmart|              160.0|
|       4|     Gadget Hub|              800.0|
|       8|  Digital World|              300.0|
|       7|      City Mall|              120.0|
|      10|        FastBuy|              270.0|
|      11|    ElectroCity|              250.0|
|      14|        BuyZone|              200.0|
|       2|   