In [1]:
# Remove old spark folder if exists (just in case)
!rm -rf /content/spark-3.4.1-bin-hadoop3
!rm -rf /content/spark-3.3.0-bin-hadoop3

# Install Java 8 (required by Spark)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Spark 3.3.0 (stable and tested on Colab)
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar xf spark-3.3.0-bin-hadoop3.tgz

# Install findspark to locate Spark from Python
!pip install -q findspark


In [2]:
import os

# Export the locations of the JDK and the Spark distribution as environment
# variables for this process.
# NOTE(review): the JAVA_HOME path is assumed to be where the openjdk-8 package
# installs on this image — confirm if the base image changes.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.0-bin-hadoop3",
    }
)


In [3]:
import findspark

# Locate the Spark installation first; the pyspark import is deliberately kept
# after this call, so do not hoist it above.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise create it under this app name.
spark = (
    SparkSession.builder
    .appName("WarehouseStockAnalysis")
    .getOrCreate()
)


In [6]:
# Stock thresholds used to flag a warehouse. Both comparisons below are strict,
# so a total of exactly 50 or exactly 500 is still reported as NORMAL.
UNDERSTOCK_THRESHOLD = 50
OVERSTOCK_THRESHOLD = 500

# Aggregate total stock by warehouse only.
# `agg_df`, `_sum`, `when` and `col` are not defined in this cell; presumably an
# earlier cell builds agg_df and imports the rest from pyspark.sql.functions — TODO confirm.
warehouse_agg = agg_df.groupBy("warehouse_id").agg(
    _sum("total_stock").alias("warehouse_stock")
)

# Flag warehouse status: UNDERSTOCKED below the lower threshold, OVERSTOCKED above
# the upper one, NORMAL otherwise.
# NOTE(review): a null warehouse_stock would match neither condition and be
# labelled NORMAL — confirm nulls cannot occur upstream.
warehouse_status = warehouse_agg.withColumn(
    "warehouse_stock_status",
    when(col("warehouse_stock") < UNDERSTOCK_THRESHOLD, "UNDERSTOCKED")
    .when(col("warehouse_stock") > OVERSTOCK_THRESHOLD, "OVERSTOCKED")
    .otherwise("NORMAL"),
)

warehouse_status.show()

# Save output CSV with a header row; mode="overwrite" replaces any previous output
# at this path.
warehouse_status.write.csv("warehouse_stock_status_output", header=True, mode="overwrite")


+------------+---------------+----------------------+
|warehouse_id|warehouse_stock|warehouse_stock_status|
+------------+---------------+----------------------+
|         108|             95|                NORMAL|
|         101|            240|                NORMAL|
|         103|              2|          UNDERSTOCKED|
|         107|              1|          UNDERSTOCKED|
|         102|             10|          UNDERSTOCKED|
|         105|              1|          UNDERSTOCKED|
|         106|            200|                NORMAL|
|         104|            200|                NORMAL|
+------------+---------------+----------------------+

