In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the predictions table through Spark and pull it into pandas for plotting
df_spark = spark.read.table("ml_project.gold.urban_green_space_predictions")
df = df_spark.toPandas()
# Remove the literal substring 'Nationalpark ' from the park names.
# regex=False is passed explicitly because the pandas default for `regex`
# differs between major versions; this is a plain-text replacement.
df['park_name'] = df['park_name'].str.replace('Nationalpark ', '', regex=False)

# Create a 2x2 grid of subplots and give each axes a readable name
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
(ax_city_count, ax_probability), (ax_status, ax_city_status) = axs

# Plot 1: City-wise Park Count
# NOTE(review): city_intervention_counts is not consumed by any plot in this
# cell (countplot does its own counting); it is kept for inspection.
city_intervention_counts = df['city'].value_counts()
sns.countplot(x='city', data=df, ax=ax_city_count)
ax_city_count.set_title('City-wise Park Count')
ax_city_count.tick_params(axis='x', rotation=45)

# Plot 2: Intervention Probabilities (one point per row, coloured by prediction)
sns.scatterplot(
    x='park_name',
    y='intervention_probability',
    hue='intervention_pred',
    data=df,
    ax=ax_probability,
)
ax_probability.set_title('Intervention Probabilities')
ax_probability.tick_params(axis='x', rotation=90)

# Plot 3: Intervention Status (row count per predicted value)
sns.countplot(x='intervention_pred', data=df, ax=ax_status)
ax_status.set_title('Intervention Status')

# Plot 4: City-wise Intervention Status (sum of intervention_pred per city)
city_intervention_status = df.groupby('city')['intervention_pred'].sum().reset_index()
sns.barplot(x='city', y='intervention_pred', data=city_intervention_status, ax=ax_city_status)
ax_city_status.set_title('City-wise Intervention Status')
ax_city_status.tick_params(axis='x', rotation=45)

# Layout so plots do not overlap
fig.tight_layout()

plt.show()

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Improved and research-aware logic
def generate_recommendation(prob, city):
    """Map an intervention probability (and city) to a recommendation string.

    Args:
        prob: Intervention probability. May be None (Spark hands SQL NULL to a
            Python UDF as None) or NaN.
        city: City name; only consulted for the highest-probability tier.

    Returns:
        The recommendation text for the tier `prob` falls into, or None when
        `prob` is missing (None or NaN) so that no recommendation is invented
        for a row without a usable probability.
    """
    # Comparing None with a float raises TypeError, which would fail the whole
    # Spark job on a single NULL row. `prob != prob` is True only for NaN,
    # which would otherwise fall through every comparison into the last tier.
    if prob is None or prob != prob:
        return None

    if prob > 0.9:
        if city in ['Hamburg', 'Berlin']:
            return "High urgency: Increase tree canopy, launch public awareness events, and improve waste management"
        return "High urgency: Apply a balanced intervention strategy"
    if prob > 0.7:
        return "Moderate concern: Promote eco-tourism and introduce park ranger programs"
    if prob > 0.4:
        return "Low risk: Monitor visitor impact and schedule seasonal clean-up drives"
    return "No immediate action needed: Maintain regular observation and reporting"

# Wrap the Python function as a string-returning Spark UDF, then add its
# result to the Spark DataFrame as a new 'recommendation' column
udf_generate_recommendation = udf(generate_recommendation, returnType=StringType())
df = df_spark.withColumn(
    'recommendation',
    udf_generate_recommendation(
        df_spark['intervention_probability'],
        df_spark['city'],
    ),
)
display(df)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.font_manager import FontProperties # Import FontProperties

# Convert the Spark DataFrame to a Pandas DataFrame
df_pd = df.toPandas()
# Remove the literal substring 'Nationalpark ' from the park names.
# regex=False is passed explicitly because the pandas default for `regex`
# differs between major versions; this is a plain-text replacement.
df_pd['park_name'] = df_pd['park_name'].str.replace('Nationalpark ', '', regex=False)

# Create the strip plot on an explicit figure/axes pair rather than the
# implicit pyplot "current axes"
strip_fig, strip_ax = plt.subplots(figsize=(12, 8))
sns.stripplot(
    x='intervention_probability',
    y='park_name',
    hue='recommendation',
    data=df_pd,
    jitter=True,
    size=10,
    ax=strip_ax,
)

# Add vertical guide lines at 0.0, 0.1, ..., 1.0. linspace fixes the number of
# lines at 11; a float-step arange can gain or lose the end point to rounding.
for x_pos in np.linspace(0, 1, 11):
    strip_ax.axvline(x_pos, linestyle=':', alpha=0.2, color='gray')

# Add one horizontal guide line per distinct park name (categorical positions 0..n-1)
park_count = len(df_pd['park_name'].unique())
for y_pos in range(park_count):
    strip_ax.axhline(y_pos, linestyle=':', alpha=0.2, color='gray')

strip_ax.set_title('Intervention Probability by Park')
strip_ax.set_xlabel('Intervention Probability')
strip_ax.set_ylabel('Park Name')
# NOTE(review): font_properties is created but not passed to any text element
# in this cell, so nothing is rendered bold - confirm where it was meant to apply.
font_properties = FontProperties(weight='bold')

# Move the legend below the chart
strip_ax.legend(bbox_to_anchor=(0.5, -0.25), loc='upper center', ncol=2)
strip_fig.tight_layout()
plt.show()