In [None]:
# Colab-specific upload step: `files.upload()` is called so the input CSVs are
# available to the runtime before Spark reads them.
from google.colab import files
# NOTE: the recorded output shows the sales file was saved as
# "cleaned_sales .csv" (with a space before the extension); any path that
# reads it must use that exact name. `uploaded` itself is not referenced by
# the other cells visible in this notebook.
uploaded = files.upload()

Saving cleaned_products.csv to cleaned_products.csv
Saving cleaned_sales .csv to cleaned_sales .csv


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, month, year, to_date, avg, sum as spark_sum

# Start (or reuse) the Spark session used by the whole notebook.
spark = SparkSession.builder.appName("Retail_Sales").getOrCreate()

# Both inputs are read the same way: first row is the header and column
# types are inferred from the data.
csv_read_kwargs = dict(header=True, inferSchema=True)

# The sales filename contains a space before ".csv"; it matches the name the
# file was saved under by the upload step.
sales_df = spark.read.csv("cleaned_sales .csv", **csv_read_kwargs)
products_df = spark.read.csv("cleaned_products.csv", **csv_read_kwargs)

In [None]:
# Enrich every sale with its product attributes. A left join keeps sales whose
# product_id has no match in products_df (their product columns become NULL).
sales_with_products = sales_df.join(products_df, on="product_id", how="left")

# Column expressions, named up front so the chain below reads as a pipeline.
parsed_sale_date = to_date(col("sale_date").cast("string"), "yyyy-MM-dd")
gross_revenue = col("quantity") * col("price")
# `discount` is divided by 100 here, i.e. it is treated as a percentage.
discount_value = col("price") * (col("discount") / 100) * col("quantity")

df = (
    sales_with_products
    .withColumn("sale_date", parsed_sale_date)
    .withColumn("revenue", gross_revenue)
    .withColumn("discount_amount", discount_value)
    # NOTE(review): "profit" is revenue minus discount only; no cost column is
    # visible here, so this is effectively net revenue — confirm the naming.
    .withColumn("profit", col("revenue") - col("discount_amount"))
)

df.printSchema()
df.show()

root
 |-- product_id: integer (nullable = true)
 |-- sale_id: integer (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- quantity: double (nullable = true)
 |-- sale_date: date (nullable = true)
 |-- discount: double (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- revenue: double (nullable = true)
 |-- discount_amount: double (nullable = true)
 |-- profit: double (nullable = true)

+----------+-------+--------+--------+----------+--------+--------------------+----------------+-----+-------+---------------+------+
|product_id|sale_id|store_id|quantity| sale_date|discount|        product_name|        category|price|revenue|discount_amount|profit|
+----------+-------+--------+--------+----------+--------+--------------------+----------------+-----+-------+---------------+------+
|         1|   1001|       1|    10.0|2022-03-15|    10.0|  Parachute Hair Oil| Health & Beauty| 90.0|  

In [None]:
from pyspark.sql import functions as F

# Dataset-wide averages, computed in ONE aggregation pass. An ungrouped
# `agg` always yields exactly one Row, which unpacks like a tuple. (Previously
# each average was a separate select(...).collect() action, so the join and
# derived columns were evaluated three times.)
avg_revenue, avg_profit, avg_quantity = df.agg(
    F.avg("revenue"),
    F.avg("profit"),
    F.avg("quantity"),
).first()

print(f"Overall Avg Revenue: {avg_revenue:.2f}")
print(f"Overall Avg Profit: {avg_profit:.2f}")
print(f"Overall Avg Quantity: {avg_quantity:.2f}")

# Keep every sale row that falls below the overall average on ANY of the three
# metrics. Rows where all three comparisons evaluate to NULL are dropped by
# `filter`.
# NOTE(review): this works at sale-row granularity (the same product_id can
# appear several times) and uses OR, so a row with above-average revenue is
# still kept when its quantity is below average — confirm this is the intended
# definition of "underperforming".
underperforming_products = df.filter(
    (F.col("revenue") < avg_revenue) |
    (F.col("profit") < avg_profit) |
    (F.col("quantity") < avg_quantity)
)

underperforming_products.select("product_id", "product_name", "revenue", "profit", "quantity").show()

Overall Avg Revenue: 678.00
Overall Avg Profit: 660.33
Overall Avg Quantity: 12.25
+----------+--------------------+-------+------+--------+
|product_id|        product_name|revenue|profit|quantity|
+----------+--------------------+-------+------+--------+
|         1|  Parachute Hair Oil|  900.0| 810.0|    10.0|
|         2|     Dettol Handwash|  600.0| 570.0|     5.0|
|         3|  Britannia Biscuits|  700.0| 630.0|    20.0|
|         5|       Maggi Noodles|  375.0| 337.5|    25.0|
|         1|  Parachute Hair Oil|  450.0| 450.0|     5.0|
|         3|  Britannia Biscuits|  350.0| 350.0|    10.0|
|         5|       Maggi Noodles|  450.0| 450.0|    30.0|
|         6|  Colgate Toothpaste|  320.0| 320.0|     8.0|
|         8|             Unknown|   20.0|  20.0|     1.0|
|         9|             Cheese |  500.0| 500.0|    10.0|
|        10|               Maggi|   75.0|  75.0|     5.0|
|        11|Invalid Price Pro...|    0.0|   0.0|     0.0|
|        12|                NULL|   NULL|  NULL

In [None]:
from pyspark.sql.functions import to_date, year, month, sum as spark_sum, avg
# Import Spark's `round` under an alias (same convention as `spark_sum`) so
# Python's builtin round() is not shadowed for the rest of the kernel session.
from pyspark.sql.functions import round as spark_round

# Derive calendar year and month from the sale date for monthly bucketing.
df = df.withColumn("year", year("sale_date")).withColumn("month", month("sale_date"))

# Step 1: total revenue per store for each (year, month).
monthly_revenue = (
    df.groupBy("store_id", "year", "month")
    .agg(spark_sum("revenue").alias("monthly_revenue"))
)

# Step 2: average those monthly totals per store, rounded to 2 decimals.
# NOTE: the recorded output shows a NULL average for store_id 0, i.e. that
# store has no non-NULL monthly revenue in this data.
avg_monthly_revenue = (
    monthly_revenue.groupBy("store_id")
    .agg(spark_round(avg("monthly_revenue"), 2).alias("avg_monthly_revenue"))
    .orderBy("store_id")
)

avg_monthly_revenue.show()

+--------+-------------------+
|store_id|avg_monthly_revenue|
+--------+-------------------+
|       0|               NULL|
|       1|             423.33|
|       2|              387.5|
|       3|             408.33|
|       4|              535.0|
|       5|              375.0|
|       6|              720.0|
|       7|             3960.0|
+--------+-------------------+



In [None]:
from google.colab import files
# Materialise the Spark result as a pandas DataFrame on the driver, write it
# to a local CSV (without the pandas index column), then hand that file to the
# Colab download helper.
export_filename = "underperforming_products.csv"
underperforming_pdf = underperforming_products.toPandas()
underperforming_pdf.to_csv(export_filename, index=False)
files.download(export_filename)