In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp

# Reuse the active SparkSession if one exists, otherwise start one.
# The application name is the only setting applied here.
spark = (
    SparkSession.builder
    .appName("Urban Green Space Management System")
    .getOrCreate()
)

# Common prefix of every sample dataset (raw files on the repository's main
# branch). Kept in one place so a repo or branch move needs a single edit.
DATA_BASE_URL = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample"

# URLs for the datasets
aq_url = f"{DATA_BASE_URL}/national_parks_air_quality.csv"
foot_url = f"{DATA_BASE_URL}/national_parks_footfall.csv"
sent_url = f"{DATA_BASE_URL}/national_parks_sentiment.csv"
parks_url = f"{DATA_BASE_URL}/german_national_parks.csv"

def read_data(url):
    """Load the CSV at ``url`` and return it as a Spark DataFrame.

    The file is parsed with ``pandas.read_csv`` using its default settings,
    and the resulting pandas frame is converted through the module-level
    ``spark`` session. No explicit schema is supplied, so column types are
    whatever the conversion infers.

    Args:
        url: Location of the CSV, passed unchanged to ``pandas.read_csv``.

    Returns:
        The Spark DataFrame built from the parsed CSV.
    """
    pandas_frame = pd.read_csv(url)
    return spark.createDataFrame(pandas_frame)
    
def clean_data(df):
    """Drop incomplete rows and normalise the types of well-known columns.

    Steps, in order:

    1. Remove every row that has a missing value in any column whose name
       is not the empty string.
    2. If a ``timestamp`` column exists, parse it with the pattern
       ``M/d/yyyy H:mm``.
    3. If a ``visitor_count`` column exists, cast it to integer.
    4. If an ``event_day`` column exists, cast it to boolean.

    Columns that are absent are skipped, so the same function serves every
    dataset in this file.

    Args:
        df: The DataFrame to clean.

    Returns:
        The cleaned DataFrame.
    """
    # Named so the loop variable cannot be mistaken for pyspark's ``col``.
    named_columns = [name for name in df.columns if name != '']
    cleaned = df.dropna(how='any', subset=named_columns)

    # NOTE(review): the conversions below run after the row filter, so any
    # nulls they introduce are not dropped here. Confirm this is intended.
    present = cleaned.columns
    if 'timestamp' in present:
        parsed = to_timestamp(col("timestamp"), "M/d/yyyy H:mm")
        cleaned = cleaned.withColumn("timestamp", parsed)

    # Plain casts share one shape; timestamp is handled above because it
    # needs a format pattern rather than a type name.
    for column_name, target_type in (
        ("visitor_count", "integer"),
        ("event_day", "boolean"),
    ):
        if column_name in present:
            cleaned = cleaned.withColumn(
                column_name, col(column_name).cast(target_type)
            )
    return cleaned

def write_to_delta(df, table_name):
    """Save ``df`` as the Delta table ``table_name`` on a best-effort basis.

    The write uses the ``delta`` format in ``overwrite`` mode with the
    ``overwriteSchema`` option set to ``"true"``.

    Failures never propagate: any ``Exception`` raised while preparing or
    running the write is caught and reported with ``print``, so one failed
    table does not stop the caller from attempting the next.

    Args:
        df: The DataFrame to persist.
        table_name: Name of the destination table.

    Returns:
        None, whether or not the write succeeded.
    """
    try:
        # Every step stays inside the try so that a bad ``df`` is reported
        # the same way as a failed save.
        writer = df.write.format("delta")
        writer = writer.mode("overwrite")
        writer = writer.option("overwriteSchema", "true")
        writer.saveAsTable(table_name)
    except Exception as err:
        print(f"Error writing to Delta table {table_name}: {err}")

# Load each source CSV and pass it through the shared cleaning step
aq_df = clean_data(read_data(aq_url))
foot_df = clean_data(read_data(foot_url))
sent_df = clean_data(read_data(sent_url))
parks_df = clean_data(read_data(parks_url))

# Pair every cleaned frame with its destination table, then write them in
# order: air quality, footfall, sentiment, parks
bronze_targets = (
    (aq_df, "ml_project.bronze.air_quality"),
    (foot_df, "ml_project.bronze.footfall"),
    (sent_df, "ml_project.bronze.sentiment"),
    (parks_df, "ml_project.bronze.parks"),
)
for cleaned_df, bronze_table in bronze_targets:
    write_to_delta(cleaned_df, bronze_table)