Creating spark session

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
                    .appName("Retail Sales Performance Dashboard") \
                    .getOrCreate() 
spark

<pyspark.sql.connect.session.SparkSession at 0x7fae7f9fcb10>

Loading csv files

In [0]:
employee_df = spark.read.format("csv").option("header", "true") \
                                      .option("inferSchema", "true") \
                                      .load("/Volumes/workspace/default/retail_sales_performance_dashboard/employees.csv")

product_df = spark.read.format("csv").option("header", "true") \
                                     .option("inferSchema", "true") \
                                     .load("/Volumes/workspace/default/retail_sales_performance_dashboard/products.csv")

store_df = spark.read.format("csv").option("header", "true") \
                                   .option("inferSchema", "true") \
                                   .load("/Volumes/workspace/default/retail_sales_performance_dashboard/stores.csv")
                                   
sales_df = spark.read.format("csv").option("header", "true") \
                                   .option("inferSchema", "true") \
                                   .load("/Volumes/workspace/default/retail_sales_performance_dashboard/sales.csv")


Displaying Schema

In [0]:
print("Schema for employee_df")
employee_df.printSchema()

print("\n Schema for product_df")
product_df.printSchema()

print("\n Schema for sales_df:")
sales_df.printSchema()

print("\n Schema for store_df")
store_df.printSchema()

Schema for employee_df
root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- store_id: integer (nullable = true)


 Schema for product_df
root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)


 Schema for sales_df:
root
 |-- sale_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- sale_date: date (nullable = true)


 Schema for store_df
root
 |-- store_id: integer (nullable = true)
 |-- store_name: string (nullable = true)
 |-- region: string (nullable = true)



#### Joining Data

Joining sales and products

In [0]:
sales_product_df = sales_df.join(product_df, sales_df.product_id == product_df.product_id, "inner")
sales_df.show()

+-------+----------+--------+-----------+--------+----------+
|sale_id|product_id|store_id|employee_id|quantity| sale_date|
+-------+----------+--------+-----------+--------+----------+
|      1|         1|       1|          1|       2|2025-06-01|
|      2|         2|       2|          2|       5|2025-06-01|
|      3|         3|       3|          3|       1|2025-06-02|
|      4|         4|       4|          4|       4|2025-06-02|
|      5|         5|       5|          5|       3|2025-06-03|
|      6|         6|       6|          6|       6|2025-06-03|
|      7|         7|       7|          7|       1|2025-06-04|
|      8|         8|       8|          8|       2|2025-06-04|
|      9|         9|       9|          9|       2|2025-06-05|
|     10|        10|      10|         10|       3|2025-06-05|
|     11|        11|      11|         11|       1|2025-06-06|
|     12|        12|      12|         12|       4|2025-06-06|
|     13|        13|      13|         13|       2|2025-06-07|
|     14

In [0]:
sales_product_df.printSchema()

root
 |-- sale_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- employee_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)



In [0]:
employee_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- store_id: integer (nullable = true)



Joining employee and sales_product_df

In [0]:
sales_employee_df = employee_df.join(sales_product_df, on=employee_df.employee_id == sales_product_df.employee_id, how="inner") \
                               .select(
                                          employee_df.employee_id,
                                          employee_df.employee_name,
                                          employee_df.store_id,
                                          sales_product_df.product_name,
                                          sales_product_df.price,
                                          sales_product_df.quantity,
                                          sales_product_df.sale_id,
                                          sales_product_df.sale_date
                                       )
sales_employee_df.show()                               

+-----------+-------------+--------+---------------+-----+--------+-------+----------+
|employee_id|employee_name|store_id|   product_name|price|quantity|sale_id| sale_date|
+-----------+-------------+--------+---------------+-----+--------+-------+----------+
|          1|        Alice|       1|         Laptop|750.0|       2|      1|2025-06-01|
|          2|          Bob|       2|          Phone|400.0|       5|      2|2025-06-01|
|          3|      Charlie|       3|         Tablet|300.0|       1|      3|2025-06-02|
|          4|        David|       4|        Monitor|200.0|       4|      4|2025-06-02|
|          5|          Eva|       5|       Keyboard| 50.0|       3|      5|2025-06-03|
|          6|        Frank|       6|          Mouse| 30.0|       6|      6|2025-06-03|
|          7|        Grace|       7|        Printer|120.0|       1|      7|2025-06-04|
|          8|       Hannah|       8|        Scanner|150.0|       2|      8|2025-06-04|
|          9|          Ian|       9|     He

Joining sales_employee_df and products

In [0]:
product_df.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)



In [0]:
sales_employee_df.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- sale_id: integer (nullable = true)
 |-- sale_date: date (nullable = true)



In [0]:
full_df = sales_employee_df.join(product_df, on=sales_employee_df.product_name == product_df.product_name, how="inner") \
                           .select(
                                    sales_employee_df.employee_id,
                                    sales_employee_df.employee_name,
                                    sales_employee_df.store_id,
                                    product_df.product_id,
                                    sales_employee_df.product_name,
                                    sales_employee_df.price,
                                    sales_employee_df.quantity,
                                    sales_employee_df.sale_id,
                                    sales_employee_df.sale_date
                                  )
full_df.show(5)

+-----------+-------------+--------+----------+------------+-----+--------+-------+----------+
|employee_id|employee_name|store_id|product_id|product_name|price|quantity|sale_id| sale_date|
+-----------+-------------+--------+----------+------------+-----+--------+-------+----------+
|          1|        Alice|       1|         1|      Laptop|750.0|       2|      1|2025-06-01|
|          2|          Bob|       2|         2|       Phone|400.0|       5|      2|2025-06-01|
|          3|      Charlie|       3|         3|      Tablet|300.0|       1|      3|2025-06-02|
|          4|        David|       4|         4|     Monitor|200.0|       4|      4|2025-06-02|
|          5|          Eva|       5|         5|    Keyboard| 50.0|       3|      5|2025-06-03|
+-----------+-------------+--------+----------+------------+-----+--------+-------+----------+
only showing top 5 rows


Calculating Profit margin

In [0]:
from pyspark.sql.functions import col, sum, round
# Adding selling price column
full_df = full_df.withColumn("cost_price", round(col("price") * 0.8, 2))

# Calculating total sales and total cost
full_df = full_df.withColumn("total_sale", round(col("quantity") * col("price"), 2)) \
                 .withColumn("total_cost", round(col("quantity") * col("cost_price"), 2))

# Findfing profit margin
profit_margin = full_df.groupBy("product_id", "product_name") \
                        .agg(sum("total_sale").alias("total_revenue"),
                             sum("total_cost").alias("total_cost"),
                             sum("quantity").alias("total_units_sold"))\
                        .withColumn("profit", col("total_revenue") - col("total_cost")) \
                        .withColumn("profit_margin_percentage", round((col("profit") / col("total_revenue")) * 100, 2))
profit_margin.show()


+----------+---------------+-------------+----------+----------------+------+------------------------+
|product_id|   product_name|total_revenue|total_cost|total_units_sold|profit|profit_margin_percentage|
+----------+---------------+-------------+----------+----------------+------+------------------------+
|         2|          Phone|       2000.0|    1600.0|               5| 400.0|                    20.0|
|        11|     Smartwatch|        250.0|     200.0|               1|  50.0|                    20.0|
|        19|Game Controller|        240.0|     192.0|               4|  48.0|                    20.0|
|         9|     Headphones|        160.0|     128.0|               2|  32.0|                    20.0|
|        20|     Microphone|        110.0|      88.0|               1|  22.0|                    20.0|
|         4|        Monitor|        800.0|     640.0|               4| 160.0|                    20.0|
|        15|  Graphics Card|       1500.0|    1200.0|               5| 30

Saving File

Saving to csv file

In [0]:
profit_margin.write.option("header", True).mode("overwrite").csv("/Volumes/workspace/default/retail_sales_performance_dashboard/profit_margin_csv")

Saving to delta file

In [0]:
profit_margin.write.format("delta").mode("overwrite").save("/Volumes/workspace/default/retail_sales_performance_dashboard/profit_margin_delta")

Find top 3 best-selling products

In [0]:
profit_margin.createOrReplaceTempView("profit_margin_view")

In [0]:
spark.sql("""
            SELECT 
                product_id, 
                product_name, 
                total_units_sold, 
                total_revenue,
                profit_margin_percentage
            FROM profit_margin_view
            ORDER BY total_units_sold DESC
            LIMIT 3
""").show()

+----------+-------------+----------------+-------------+------------------------+
|product_id| product_name|total_units_sold|total_revenue|profit_margin_percentage|
+----------+-------------+----------------+-------------+------------------------+
|         6|        Mouse|               6|        180.0|                    20.0|
|         2|        Phone|               5|       2000.0|                    20.0|
|        15|Graphics Card|               5|       1500.0|                    20.0|
+----------+-------------+----------------+-------------+------------------------+



Extracting top 5 lowest performing stores and storing in CSV file

In [0]:
from pyspark.sql.functions import sum, round, col, asc

store_sales_df = full_df.groupBy("store_id") \
                        .agg(sum("total_sale").alias("total_sales")) \
                        .orderBy(asc("total_sales"))


print("\nTop 5 Lowest Performing Stores")
store_sales_df.limit(5).show()




Top 5 Lowest Performing Stores
+--------+-----------+
|store_id|total_sales|
+--------+-----------+
|      17|       80.0|
|      16|       90.0|
|      20|      110.0|
|       7|      120.0|
|       5|      150.0|
+--------+-----------+



In [0]:
store_sales_df.write.option("header", True) \
                    .mode("overwrite") \
                    .csv("/Volumes/workspace/default/retail_sales_performance_dashboard/lowest_performing_stores.csv")


Displaying all the files

In [0]:
display(dbutils.fs.ls("/Volumes/workspace/default/retail_sales_performance_dashboard/"))

path,name,size,modificationTime
dbfs:/Volumes/workspace/default/retail_sales_performance_dashboard/employees.csv,employees.csv,248,1751346525000
dbfs:/Volumes/workspace/default/retail_sales_performance_dashboard/lowest_performing_stores.csv/,lowest_performing_stores.csv/,0,1751346978054
dbfs:/Volumes/workspace/default/retail_sales_performance_dashboard/products.csv,products.csv,376,1751346525000
dbfs:/Volumes/workspace/default/retail_sales_performance_dashboard/profit_margin_csv/,profit_margin_csv/,0,1751346978054
dbfs:/Volumes/workspace/default/retail_sales_performance_dashboard/profit_margin_delta/,profit_margin_delta/,0,1751346978054
dbfs:/Volumes/workspace/default/retail_sales_performance_dashboard/sales.csv,sales.csv,523,1751346525000
dbfs:/Volumes/workspace/default/retail_sales_performance_dashboard/stores.csv,stores.csv,424,1751346525000
