### 1. INGEST & CLEAN

In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Attach to the current Spark session (getOrCreate builds one if none exists).
spark = SparkSession.builder.getOrCreate()

# Disabled alternative sample URLs, kept for reference:
# aq_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/air_quality.csv"
# foot_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/footfall.csv"
# sent_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/sentiment.csv"
# parks_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/parks.csv"

# Active raw-CSV locations, one per dataset.
aq_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_air_quality.csv"
foot_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_footfall.csv"
sent_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_sentiment.csv"
parks_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/german_national_parks.csv"

# Fetch every CSV into a pandas DataFrame, in the order:
# air quality, footfall, sentiment, parks.
aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd = (
    pd.read_csv(csv_url)
    for csv_url in (aq_url, foot_url, sent_url, parks_url)
)

# Turn the pandas frames into Spark DataFrames (same order as above).
aq_df, foot_df, sent_df, parks_df = (
    spark.createDataFrame(pandas_frame)
    for pandas_frame in (aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd)
)


# # Optionally display the tables
# display(spark.read.table("ml_project.bronze.air_quality"))
# display(spark.read.table("ml_project.bronze.footfall"))
# display(spark.read.table("ml_project.bronze.sentiment"))
# display(spark.read.table("ml_project.bronze.parks"))


# # Load datasets
# aq_df = spark.read.table("ml_project.bronze.air_quality")
# foot_df = spark.read.table("ml_project.bronze.footfall")
# sent_df = spark.read.table("ml_project.bronze.sentiment")
# parks_df = spark.read.table("ml_project.bronze.parks")

# Cleaning: discard rows with missing values in every dataset, then pin the
# numeric column types that later steps rely on.
sent_df = sent_df.na.drop()
parks_df = parks_df.na.drop()

# Air quality: AQI is cast to float.
aq_df = aq_df.na.drop()
aq_df = aq_df.withColumn("AQI", col("AQI").cast("float"))

# Footfall: visitor_count is cast to integer.
foot_df = foot_df.na.drop()
foot_df = foot_df.withColumn("visitor_count", col("visitor_count").cast("integer"))

# Save each cleaned DataFrame as a Delta table. "overwrite" mode replaces the
# existing data and overwriteSchema lets the stored schema be replaced as well.
# Tables are written in the order: air quality, footfall, sentiment, parks.
for _frame, _table_name in (
    (aq_df, "ml_project.bronze.air_quality"),
    (foot_df, "ml_project.bronze.footfall"),
    (sent_df, "ml_project.bronze.sentiment"),
    (parks_df, "ml_project.bronze.parks"),
):
    (
        _frame.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(_table_name)
    )


# Render the four cleaned DataFrames in the notebook output, in the order:
# air quality, footfall, sentiment, parks.
for _preview in (aq_df, foot_df, sent_df, parks_df):
    display(_preview)

In [0]:
# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp

# Attach to the current Spark session (getOrCreate builds one if none exists).
spark = SparkSession.builder.getOrCreate()

# Raw-CSV locations, one per dataset.
aq_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_air_quality.csv"
foot_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_footfall.csv"
sent_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_sentiment.csv"
parks_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/german_national_parks.csv"

# Fetch every CSV into a pandas DataFrame, in the order:
# air quality, footfall, sentiment, parks.
aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd = (
    pd.read_csv(csv_url)
    for csv_url in (aq_url, foot_url, sent_url, parks_url)
)

# Turn the pandas frames into Spark DataFrames (same order as above).
aq_df, foot_df, sent_df, parks_df = (
    spark.createDataFrame(pandas_frame)
    for pandas_frame in (aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd)
)

# Cleaning: discard rows with missing values in every dataset.
aq_df = aq_df.na.drop()
sent_df = sent_df.na.drop()
parks_df = parks_df.na.drop()

# Footfall also gets typed columns, applied one step at a time:
#   timestamp     -> parsed with the pattern "M/d/yyyy H:mm"
#   visitor_count -> integer
#   event_day     -> boolean
foot_df = foot_df.na.drop()
foot_df = foot_df.withColumn("timestamp", to_timestamp(col("timestamp"), "M/d/yyyy H:mm"))
foot_df = foot_df.withColumn("visitor_count", col("visitor_count").cast("integer"))
foot_df = foot_df.withColumn("event_day", col("event_day").cast("boolean"))

# Print a heading followed by a text preview (DataFrame.show) for each
# cleaned dataset.
for _heading, _frame in (
    ("Air Quality DataFrame:", aq_df),
    ("Footfall DataFrame:", foot_df),
    ("Sentiment DataFrame:", sent_df),
    ("Parks DataFrame:", parks_df),
):
    print(_heading)
    _frame.show()

# Target table name -> cleaned DataFrame. Dict insertion order fixes the write
# order: air quality, footfall, sentiment, parks.
_bronze_targets = {
    "ml_project.bronze.air_quality": aq_df,
    "ml_project.bronze.footfall": foot_df,
    "ml_project.bronze.sentiment": sent_df,
    "ml_project.bronze.parks": parks_df,
}

# Save each DataFrame as a Delta table, replacing existing data and schema.
for _table_name, _frame in _bronze_targets.items():
    (
        _frame.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(_table_name)
    )

In [0]:
# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, sum

# Attach to the current Spark session (getOrCreate builds one if none exists).
spark = SparkSession.builder.getOrCreate()

# Raw-CSV locations, one per dataset.
aq_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_air_quality.csv"
foot_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_footfall.csv"
sent_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/national_parks_sentiment.csv"
parks_url = "https://raw.githubusercontent.com/Dilshan-Chanuka/ml-anomaly-detection-pipeline/refs/heads/main/data_sample/german_national_parks.csv"

# Fetch every CSV into a pandas DataFrame, in the order:
# air quality, footfall, sentiment, parks.
aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd = (
    pd.read_csv(csv_url)
    for csv_url in (aq_url, foot_url, sent_url, parks_url)
)

# Turn the pandas frames into Spark DataFrames (same order as above).
aq_df, foot_df, sent_df, parks_df = (
    spark.createDataFrame(pandas_frame)
    for pandas_frame in (aq_df_pd, foot_df_pd, sent_df_pd, parks_df_pd)
)

# Cleaning: discard rows with missing values in every dataset.
aq_df = aq_df.na.drop()
sent_df = sent_df.na.drop()
parks_df = parks_df.na.drop()

# Footfall: the timestamp column is renamed to footfall_timestamp so it does
# not clash with other columns after the join, then the columns are typed:
#   footfall_timestamp -> parsed with the pattern "M/d/yyyy H:mm"
#   visitor_count      -> integer
#   event_day          -> boolean
foot_df = foot_df.na.drop()
foot_df = foot_df.withColumnRenamed("timestamp", "footfall_timestamp")
foot_df = foot_df.withColumn(
    "footfall_timestamp",
    to_timestamp(col("footfall_timestamp"), "M/d/yyyy H:mm"),
)
foot_df = foot_df.withColumn("visitor_count", col("visitor_count").cast("integer"))
foot_df = foot_df.withColumn("event_day", col("event_day").cast("boolean"))

# Per-park visitor totals: one row per park_id carrying the summed
# visitor_count as total_visitors. `sum` here is the pyspark.sql.functions
# aggregate imported at the top of this cell, not the Python builtin.
visitor_count_df = foot_df.groupBy("park_id").agg(
    sum("visitor_count").alias("total_visitors")
)

# Left-join everything onto the parks table by park_id, one dataset at a time:
# air quality, footfall rows, sentiment, then the per-park totals.
# NOTE(review): every join matches on park_id only. If aq_df, foot_df and
# sent_df each hold several rows per park, the result contains every
# combination of those rows for a park -- confirm this fan-out is intended.
combined_df = parks_df.join(aq_df, on='park_id', how='left')
combined_df = combined_df.join(foot_df, on='park_id', how='left')
combined_df = combined_df.join(sent_df, on='park_id', how='left')
combined_df = combined_df.join(visitor_count_df, on='park_id', how='left')

# Show the joined result in the notebook output.
print("Combined DataFrame:")
display(combined_df)

# Save the joined result as a Delta table, replacing existing data and schema.
(
    combined_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("ml_project.bronze.park_data")
)

In [0]:
# Load the bronze footfall table from the catalog and render it.
display(spark.read.table("ml_project.bronze.footfall"))