### clean_transform_to_silver

In [0]:
from pyspark.sql.functions import to_date, count, when, col, dayofweek, hour, mean
import pyspark.sql.functions as F

# Load the four bronze-layer source tables (footfall, parks, sentiment, air quality).
foot_df, parks_df, sent_df, aq_df = (
    spark.read.table(bronze_table)
    for bronze_table in (
        "ml_project.bronze.footfall",
        "ml_project.bronze.parks",
        "ml_project.bronze.sentiment",
        "ml_project.bronze.air_quality",
    )
)

# Parse the air-quality `date` column into a date type using the M/d/yyyy pattern.
aq_df = aq_df.withColumn("date", F.to_date("date", "M/d/yyyy"))

# Derive calendar features from the footfall `timestamp`: the calendar date,
# the day of the week and the hour of the day.
foot_df = (
    foot_df
    .withColumn("date", F.to_date("timestamp"))
    .withColumn("day_of_week", F.dayofweek("timestamp"))
    .withColumn("hour", F.hour("timestamp"))
)

# Feature: mean visitor count for each day of the week, computed over all
# footfall rows and attached back onto every footfall row.
avg_visitor_count_per_day = (
    foot_df
    .groupBy("day_of_week")
    .agg(F.mean("visitor_count").alias("avg_visitor_count"))
)
foot_df = foot_df.join(avg_visitor_count_per_day, "day_of_week", "left")

# Feature: mean sentiment score for each park, attached to the park attributes.
# Parks without any sentiment rows keep a null `avg_sentiment_score` (left join).
sentiment_score_per_park = (
    sent_df
    .groupBy("park_id")
    .agg(F.mean("sentiment_score").alias("avg_sentiment_score"))
)
parks_df = parks_df.join(sentiment_score_per_park, "park_id", "left")

def keep_first_of_duplicate_columns(df):
    """Return ``df`` with only the first (left-most) column kept for each name.

    After joining several sources, non-key columns that share a name appear
    more than once. A duplicated name cannot be used to pick a single copy
    (``df.drop(name)`` removes every column carrying that name), so the
    columns are first renamed by position, the first occurrence of each
    original name is selected, and it is aliased back to its original name.

    Args:
        df: Spark DataFrame, possibly containing repeated column names.

    Returns:
        A DataFrame with unique column names in the original left-to-right
        order. ``df`` itself is returned when no name is repeated.
    """
    column_names = df.columns
    positional_names = [f"_c{position}" for position in range(len(column_names))]

    seen_names = set()
    kept_columns = []
    for position, col_name in enumerate(column_names):
        if col_name not in seen_names:
            seen_names.add(col_name)
            kept_columns.append(F.col(positional_names[position]).alias(col_name))

    if len(kept_columns) == len(column_names):
        # Nothing is duplicated; leave the DataFrame (and its plan) untouched.
        return df
    return df.toDF(*positional_names).select(*kept_columns)


# Integrate the DataFrames: footfall rows are the base, enriched with park
# attributes, same-day air quality and sentiment records. Joining with `on=`
# column names keeps a single copy of each join key.
# NOTE(review): the sentiment join needs an exact `timestamp` match with the
# footfall rows -- confirm both sources share the same timestamp granularity,
# and that there is at most one sentiment row per (park_id, timestamp),
# otherwise the join yields mostly nulls or multiplies footfall rows.
integrated_df = foot_df.join(parks_df, on="park_id", how="left") \
                        .join(aq_df, on=["park_id", "date"], how="left") \
                        .join(sent_df, on=["park_id", "timestamp"], how="left")

# Drop duplicate columns, keeping the first occurrence of each name instead of
# discarding every copy.
integrated_df = keep_first_of_duplicate_columns(integrated_df)

# Save the integrated DataFrame as a Delta table. The table is rebuilt in full
# on every run, so `overwriteSchema` lets the overwrite also replace a schema
# stored by an earlier run (e.g. one that lacked the de-duplicated columns).
integrated_df.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("ml_project.silver.integrated")

# Display the DataFrames
display(foot_df)
display(parks_df)
display(integrated_df)

In [0]:
# Visualize the data using plots
import matplotlib.pyplot as plt

def plot_distribution(values, title, xlabel):
    """Draw a 50-bin frequency histogram of ``values`` and show it.

    Args:
        values: pandas Series holding the numbers to plot. Missing values are
            dropped first, because a NaN leaves the histogram range undefined
            and makes ``plt.hist`` fail.
        title: Title of the figure.
        xlabel: Label of the x axis.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(values.dropna(), bins=50, alpha=0.7, color='blue', edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Frequency')
    plt.show()


# Convert the DataFrames to Pandas DataFrames
# NOTE(review): toPandas() collects each full table onto the driver; if no
# later cell needs the complete frames, select only the plotted column first.
foot_df_pd = foot_df.toPandas()
sent_df_pd = sent_df.toPandas()
aq_df_pd = aq_df.toPandas()

# Plot the distribution of visitor_count
plot_distribution(foot_df_pd['visitor_count'], 'Distribution of Visitor Count', 'Visitor Count')

# Plot the distribution of sentiment_score
plot_distribution(sent_df_pd['sentiment_score'], 'Distribution of Sentiment Score', 'Sentiment Score')

# Plot the distribution of the air quality score. The column name is not known
# up front, so the candidates are tried in order and the first one present is used.
air_quality_candidates = (
    ('aqi', 'Distribution of Air Quality Index (AQI)', 'AQI'),
    ('air_quality_score', 'Distribution of Air Quality Score', 'Air Quality Score'),
)
for aq_column, aq_title, aq_xlabel in air_quality_candidates:
    if aq_column in aq_df_pd.columns:
        plot_distribution(aq_df_pd[aq_column], aq_title, aq_xlabel)
        break
else:
    # Reached only when none of the candidate columns exists.
    print("No air quality score column found in the DataFrame.")