### 1. INGEST & CLEAN

In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Single Spark session for the notebook; getOrCreate() reuses an existing one.
spark = SparkSession.builder.getOrCreate()

# Locations of the raw sample CSVs, one per source dataset.
aq_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/air_quality.csv"
foot_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/footfall.csv"
sent_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/sentiment.csv"
parks_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/parks.csv"

# Fetch each CSV into a pandas DataFrame, in the order the URLs are listed.
aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd = [
    pd.read_csv(source_url)
    for source_url in (aq_url, foot_url, sent_url, parks_url)
]

# Promote every pandas DataFrame to a Spark DataFrame, keeping the same order.
aq_df, foot_df, sent_df, parks_df = [
    spark.createDataFrame(pandas_frame)
    for pandas_frame in (aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd)
]

# Persist every raw Spark DataFrame as a Delta table in the bronze schema.
# Each write replaces the existing table contents and, via overwriteSchema,
# the stored schema as well.
bronze_targets = (
    (aq_df, "ml_project.bronze.air_quality"),
    (foot_df, "ml_project.bronze.footfall"),
    (sent_df, "ml_project.bronze.sentiment"),
    (parks_df, "ml_project.bronze.parks"),
)

for raw_frame, table_name in bronze_targets:
    delta_writer = raw_frame.write.format("delta")
    delta_writer = delta_writer.mode("overwrite")
    delta_writer = delta_writer.option("overwriteSchema", "true")
    delta_writer.saveAsTable(table_name)

# Optionally display the tables
# display(spark.read.table("ml_project.bronze.air_quality"))
# display(spark.read.table("ml_project.bronze.footfall"))
# display(spark.read.table("ml_project.bronze.sentiment"))
# display(spark.read.table("ml_project.bronze.parks"))


# Load datasets.
# Read from the same bronze schema that the ingest step writes to, so the
# cleaning below always runs on the data that was just ingested rather than
# on tables in a different schema that this script never populates.
aq_df = spark.read.table("ml_project.bronze.air_quality")
foot_df = spark.read.table("ml_project.bronze.footfall")
sent_df = spark.read.table("ml_project.bronze.sentiment")
parks_df = spark.read.table("ml_project.bronze.parks")

# Data cleaning: drop every row that has a missing value in any column, then
# pin the numeric measurement columns to explicit Spark types.
aq_df = aq_df.dropna().withColumn("AQI", col("AQI").cast("float"))
foot_df = foot_df.dropna().withColumn("visitor_count", col("visitor_count").cast("integer"))
sent_df = sent_df.dropna()
parks_df = parks_df.dropna()


# Render the cleaned DataFrames for inspection.
# NOTE(review): `display` is not imported in this file; presumably it is the
# notebook runtime's built-in — confirm before running outside a notebook.
display(aq_df)
display(foot_df)
display(sent_df)
display(parks_df)