# ⚙️ 2. ETL with PySpark
This notebook extracts logs from MongoDB with PySpark, cleans and transforms them, and writes the result as Parquet.

## 🔧 Install Spark

In [None]:
!apt-get install openjdk-11-jdk-headless -qq
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar -xzf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark

## 🔌 Spark Session

In [None]:
import os

# Tell findspark/pyspark where the JDK and the Spark distribution unpacked by
# the install cell live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

import findspark

# Must run before `pyspark` is imported: it makes the pyspark package bundled
# under SPARK_HOME importable.
findspark.init()

from pyspark.sql import SparkSession

# Read the connection string from the MONGO_URI environment variable so that
# credentials are never saved in the notebook. The original placeholder is
# kept as the fallback, so behaviour is unchanged when the variable is unset.
MONGO_URI = os.environ.get("MONGO_URI", "<mongo-uri>")

# Session with the MongoDB Spark connector on the classpath and the read URI
# preconfigured for every `spark.read.format("mongodb")` call.
spark = (
    SparkSession.builder
    .appName("MongoDB_ETL")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0")
    .config("spark.mongodb.read.connection.uri", MONGO_URI)
    .getOrCreate()
)

## 🧼 Clean sales_logs and write as Parquet

In [None]:
from pyspark.sql.functions import col, to_timestamp

# Extract: load the raw `sales_logs` collection from the `enterprise_logs`
# database through the MongoDB connector configured on the session.
sales_df = (
    spark.read.format("mongodb")
    .option("database", "enterprise_logs")
    .option("collection", "sales_logs")
    .load()
)

# Transform: coerce columns to explicit types and drop MongoDB's `_id` field.
sales_clean = (
    sales_df
    .withColumn("timestamp", to_timestamp("timestamp"))
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("price", col("price").cast("double"))
    .drop("_id")
)

# Load: write Parquet partitioned by region. The default save mode raises if
# the target path already exists, which made this cell fail on every re-run;
# "overwrite" keeps the cell idempotent under Restart & Run All.
(
    sales_clean.write
    .mode("overwrite")
    .partitionBy("region")
    .parquet("warehouse/sales_logs")
)