In [0]:
# Enhanced Urban Green Space Management System

from pyspark.sql.functions import (
    avg, sum, min, max, stddev, count, col, when, isnan, isnull, 
    date_format, hour, dayofweek, month, percentile_approx,
    regexp_extract, split, size, desc, asc, lit
)
from pyspark.sql.types import DoubleType, IntegerType
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session so cell output stays readable.
warnings.filterwarnings("ignore")

# Notebook banner: title, subtitle and a 50-character rule, one per line.
print(
    "=== URBAN GREEN SPACE MANAGEMENT SYSTEM ===",
    "Feature Engineering & Exploratory Data Analysis",
    "=" * 50,
    sep="\n",
)

# =============================================================================
# 2. ENHANCED FEATURE ENGINEERING & ENRICHMENT
# =============================================================================

def load_and_validate_data():
    """Load the four bronze tables and print a row/null summary for each.

    Reads ``ml_project.bronze.air_quality``, ``footfall``, ``sentiment`` and
    ``parks`` through the ambient ``spark`` session. For every table it prints
    the record count and each column that contains at least one null.

    Returns:
        tuple: ``(aq_df, foot_df, sent_df, parks_df)`` Spark DataFrames, in
        that order.

    Raises:
        Exception: any error raised while reading or profiling the tables is
        reported with ``print`` and then re-raised unchanged.
    """
    try:
        print("\n Loading datasets...")

        # Load datasets
        aq_df = spark.read.table("ml_project.bronze.air_quality")
        foot_df = spark.read.table("ml_project.bronze.footfall")
        sent_df = spark.read.table("ml_project.bronze.sentiment")
        parks_df = spark.read.table("ml_project.bronze.parks")

        # Validate data
        datasets = {
            'Air Quality': aq_df,
            'Footfall': foot_df,
            'Sentiment': sent_df,
            'Parks': parks_df
        }

        for name, df in datasets.items():
            # Profile the table in ONE Spark job: the row count plus a null
            # count per column. count() skips nulls and when() without
            # otherwise() yields null for non-matching rows, so each
            # expression counts exactly the rows where the column is null.
            null_exprs = [
                count(when(col(column).isNull(), lit(1)))
                for column in df.columns
            ]
            # Results are read by position so that no alias can collide with
            # a real column name.
            profile = df.agg(count(lit(1)), *null_exprs).first()
            record_count = profile[0]
            print(f"   {name}: {record_count:,} records")

            # Check for nulls
            null_counts = [
                f"{column}: {null_count}"
                for column, null_count in zip(df.columns, profile[1:])
                if null_count > 0
            ]

            if null_counts:
                print(f"      Null values: {', '.join(null_counts)}")

        return aq_df, foot_df, sent_df, parks_df

    except Exception as e:
        print(f" Error loading data: {str(e)}")
        raise

def create_comprehensive_features(aq_df, foot_df, sent_df, parks_df):
    """Build one feature row per park from air-quality, footfall and sentiment data.

    Each source is aggregated per ``park_id`` and left-joined onto
    ``parks_df``; ratio and category columns are then derived and the numeric
    aggregates are null-filled with zeros.

    Args:
        aq_df: air-quality readings; the code reads ``park_id``, ``AQI``,
            ``no2_level``, ``pm25_level`` and ``o3_level``.
        foot_df: footfall records; the code reads ``park_id``,
            ``visitor_count``, ``event_day`` and, when present, ``timestamp``.
        sent_df: sentiment records; the code reads ``park_id``,
            ``sentiment_score`` and ``sentiment_label``.
        parks_df: park master data; the code reads ``park_id`` and
            ``area_sqm``.

    Returns:
        A Spark DataFrame, one of:

        * the full feature set (all ``parks_df`` columns plus aggregated and
          derived features) when the main pipeline succeeds;
        * a reduced set with ``avg_aqi``, ``total_footfall``,
          ``avg_sentiment`` and ``usage_category`` when the main pipeline
          raises;
        * ``parks_df`` unchanged when the reduced pipeline raises as well.

    Notes:
        A park with no rows in a source has NULL aggregates after the left
        join. Its category columns are set to ``"Unknown"`` instead of
        falling through to the top bucket ("Very Unhealthy", "Very High",
        "Very Large"), which is what a NULL comparison would otherwise cause.
    """

    print("\n Creating comprehensive features...")

    try:
        # Enhanced Air Quality Metrics
        print("   Processing air quality metrics...")
        aqi_metrics = aq_df.groupBy("park_id").agg(
            avg("AQI").alias("avg_aqi"),
            min("AQI").alias("min_aqi"),
            max("AQI").alias("max_aqi"),
            stddev("AQI").alias("aqi_volatility"),
            percentile_approx("AQI", 0.5).alias("median_aqi"),
            avg("no2_level").alias("avg_no2"),
            avg("pm25_level").alias("avg_pm25"),
            avg("o3_level").alias("avg_o3"),
            count("*").alias("aqi_measurements")
        )

        # Enhanced Footfall Metrics with Temporal Analysis
        print("   Processing footfall metrics...")

        # 1 when the record is flagged as an event day, else 0. Defined once
        # because every branch below needs the same column.
        is_event_day_expr = when(col("event_day") == True, 1).otherwise(0)

        # Check if timestamp column exists and try temporal features
        footfall_columns = foot_df.columns

        if "timestamp" in footfall_columns:
            try:
                # Add temporal features to footfall data.
                # dayofweek() numbers Sunday as 1 and Saturday as 7.
                foot_temporal = foot_df.withColumn("hour", hour(col("timestamp"))) \
                                      .withColumn("day_of_week", dayofweek(col("timestamp"))) \
                                      .withColumn("month", month(col("timestamp"))) \
                                      .withColumn("is_weekend", when(col("day_of_week").isin([1, 7]), 1).otherwise(0)) \
                                      .withColumn("is_event_day", is_event_day_expr)
            except Exception as temporal_error:
                print(f"      Temporal features failed: {str(temporal_error)}")
                foot_temporal = foot_df.withColumn("is_weekend", lit(0)) \
                                      .withColumn("is_event_day", is_event_day_expr)
        else:
            print("      No timestamp column found, using basic footfall metrics")
            foot_temporal = foot_df.withColumn("is_weekend", lit(0)) \
                                  .withColumn("is_event_day", is_event_day_expr)

        foot_metrics = foot_temporal.groupBy("park_id").agg(
            sum("visitor_count").alias("total_footfall"),
            avg("visitor_count").alias("avg_footfall"),
            max("visitor_count").alias("peak_footfall"),
            stddev("visitor_count").alias("footfall_volatility"),
            sum("is_weekend").alias("weekend_visits"),
            sum("is_event_day").alias("event_days"),
            count("*").alias("footfall_records")
        )

        # Peak hour analysis (if temporal features worked): the highest
        # per-hour average visitor count of each park.
        try:
            if "hour" in foot_temporal.columns:
                peak_hours = foot_temporal.groupBy("park_id", "hour") \
                                         .agg(avg("visitor_count").alias("avg_hourly_visitors")) \
                                         .groupBy("park_id") \
                                         .agg(max("avg_hourly_visitors").alias("peak_hour_avg"))
                foot_metrics = foot_metrics.join(peak_hours, "park_id", "left")
            else:
                foot_metrics = foot_metrics.withColumn("peak_hour_avg", lit(0.0))
        except Exception as peak_error:
            print(f"      Peak hour analysis failed: {str(peak_error)}")
            foot_metrics = foot_metrics.withColumn("peak_hour_avg", lit(0.0))

        # Enhanced Sentiment Metrics
        print("   Processing sentiment metrics...")
        sent_metrics = sent_df.groupBy("park_id").agg(
            avg("sentiment_score").alias("avg_sentiment"),
            min("sentiment_score").alias("min_sentiment"),
            max("sentiment_score").alias("max_sentiment"),
            stddev("sentiment_score").alias("sentiment_volatility"),
            sum(when(col("sentiment_label") == "positive", 1).otherwise(0)).alias("positive_count"),
            sum(when(col("sentiment_label") == "negative", 1).otherwise(0)).alias("negative_count"),
            sum(when(col("sentiment_label") == "neutral", 1).otherwise(0)).alias("neutral_count"),
            count("*").alias("total_mentions")
        )

        # Calculate sentiment ratios. total_mentions comes from count("*")
        # over a group, so it is at least 1 for every row produced here.
        sent_metrics = sent_metrics.withColumn(
            "positive_ratio", col("positive_count") / col("total_mentions")
        ).withColumn(
            "negative_ratio", col("negative_count") / col("total_mentions")
        ).withColumn(
            "sentiment_polarity", (col("positive_count") - col("negative_count")) / col("total_mentions")
        )

        # Merge all features
        print("   Merging all features...")
        features_df = parks_df.join(aqi_metrics, "park_id", "left") \
                             .join(foot_metrics, "park_id", "left") \
                             .join(sent_metrics, "park_id", "left")

        # Create derived features.
        # This must run BEFORE the fillna below: the category columns need to
        # see NULL (no data for the park) in order to report "Unknown". A
        # NULL comparison is not true, so without the explicit isNull()
        # branch such parks would land in the final otherwise() bucket.
        print("  • Creating derived features...")
        features_df = features_df.withColumn(
            "pollution_footfall_ratio",
            when(col("total_footfall") > 0, col("avg_aqi") / col("total_footfall")).otherwise(0)
        ).withColumn(
            "area_efficiency",
            when(col("area_sqm") > 0, col("total_footfall") / col("area_sqm") * 1000).otherwise(0)  # visitors per 1000 sqm
        ).withColumn(
            "sentiment_engagement_ratio",
            when(col("total_footfall") > 0, col("total_mentions") / col("total_footfall") * 100).otherwise(0)
        ).withColumn(
            "air_quality_category",
            when(col("avg_aqi").isNull(), "Unknown")
            .when(col("avg_aqi") <= 50, "Good")
            .when(col("avg_aqi") <= 100, "Moderate")
            .when(col("avg_aqi") <= 150, "Unhealthy for Sensitive")
            .when(col("avg_aqi") <= 200, "Unhealthy")
            .otherwise("Very Unhealthy")
        ).withColumn(
            "usage_category",
            when(col("total_footfall").isNull(), "Unknown")
            .when(col("total_footfall") < 100, "Low")
            .when(col("total_footfall") < 500, "Medium")
            .when(col("total_footfall") < 1000, "High")
            .otherwise("Very High")
        ).withColumn(
            "park_size_category",
            when(col("area_sqm").isNull(), "Unknown")
            .when(col("area_sqm") < 50000, "Small")
            .when(col("area_sqm") < 200000, "Medium")
            .when(col("area_sqm") < 500000, "Large")
            .otherwise("Very Large")
        )

        # Fill null values with appropriate defaults
        fill_values = {
            "avg_aqi": 0.0,
            "total_footfall": 0,
            "avg_sentiment": 0.0,
            "aqi_measurements": 0,
            "footfall_records": 0,
            "total_mentions": 0,
            "min_aqi": 0.0,
            "max_aqi": 0.0,
            "aqi_volatility": 0.0,
            "median_aqi": 0.0,
            "avg_footfall": 0.0,
            "peak_footfall": 0,
            "footfall_volatility": 0.0,
            "weekend_visits": 0,
            "event_days": 0,
            "peak_hour_avg": 0.0,
            "positive_count": 0,
            "negative_count": 0,
            "neutral_count": 0,
            "positive_ratio": 0.0,
            "negative_ratio": 0.0,
            "sentiment_polarity": 0.0,
            "pollution_footfall_ratio": 0.0,
            "area_efficiency": 0.0,
            "sentiment_engagement_ratio": 0.0
        }

        features_df = features_df.fillna(fill_values)

        print(f"   Created feature set with {len(features_df.columns)} columns")

        return features_df

    except Exception as e:
        print(f" Error in feature engineering: {str(e)}")
        print(" Attempting simplified feature engineering...")

        # Fallback to basic feature engineering
        try:
            # Basic aggregations only
            aqi_basic = aq_df.groupBy("park_id").agg(avg("AQI").alias("avg_aqi"))
            foot_basic = foot_df.groupBy("park_id").agg(sum("visitor_count").alias("total_footfall"))
            sent_basic = sent_df.groupBy("park_id").agg(avg("sentiment_score").alias("avg_sentiment"))

            # Basic merge
            features_basic = parks_df.join(aqi_basic, "park_id", "left") \
                                   .join(foot_basic, "park_id", "left") \
                                   .join(sent_basic, "park_id", "left")

            # Basic categories. As above, categorise before fillna so parks
            # without footfall data become "Unknown" rather than "High".
            features_basic = features_basic.withColumn(
                "usage_category",
                when(col("total_footfall").isNull(), "Unknown")
                .when(col("total_footfall") < 100, "Low")
                .when(col("total_footfall") < 500, "Medium")
                .otherwise("High")
            ).fillna({"avg_aqi": 0.0, "total_footfall": 0, "avg_sentiment": 0.0})

            print("   Created basic feature set")
            return features_basic

        except Exception as fallback_error:
            print(f" Fallback also failed: {str(fallback_error)}")
            # Return just the parks data if everything fails
            return parks_df

# Load and process data
aq_df, foot_df, sent_df, parks_df = load_and_validate_data()
features_df = create_comprehensive_features(aq_df, foot_df, sent_df, parks_df)


# Create view and display sample
features_df.createOrReplaceTempView("features_view")
print("\n Feature Summary:")
# create_comprehensive_features can return a reduced frame from one of its
# fallback paths, so only the summary columns that actually exist are
# selected; a hard-coded select would raise on the missing ones.
summary_columns = ["park_id", "name", "city", "avg_aqi", "total_footfall",
                   "avg_sentiment", "usage_category", "air_quality_category"]
available_summary_columns = [
    column_name for column_name in summary_columns
    if column_name in features_df.columns
]
features_df.select(*available_summary_columns).show(100, truncate=False)

In [0]:
# Enhanced Urban Green Space Management System

from pyspark.sql.functions import (
    avg, sum, min, max, stddev, count, col, when, isnan, isnull, 
    date_format, hour, dayofweek, month, percentile_approx,
    regexp_extract, split, size, desc, asc, lit
)
from pyspark.sql.types import DoubleType, IntegerType
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session so cell output stays readable.
warnings.filterwarnings("ignore")

# Notebook banner: title, subtitle and a 50-character rule, one per line.
print(
    "=== URBAN GREEN SPACE MANAGEMENT SYSTEM ===",
    "Feature Engineering & Exploratory Data Analysis",
    "=" * 50,
    sep="\n",
)

# =============================================================================
# 2. ENHANCED FEATURE ENGINEERING & ENRICHMENT
# =============================================================================

def load_and_validate_data():
    """Load the four bronze tables and print a row/null summary for each.

    Reads ``ml_project.bronze.air_quality``, ``footfall``, ``sentiment`` and
    ``parks`` through the ambient ``spark`` session. For every table it prints
    the record count and each column that contains at least one null.

    Returns:
        tuple: ``(aq_df, foot_df, sent_df, parks_df)`` Spark DataFrames, in
        that order.

    Raises:
        Exception: any error raised while reading or profiling the tables is
        reported with ``print`` and then re-raised unchanged.
    """
    try:
        print("\n Loading datasets...")

        # Load datasets
        aq_df = spark.read.table("ml_project.bronze.air_quality")
        foot_df = spark.read.table("ml_project.bronze.footfall")
        sent_df = spark.read.table("ml_project.bronze.sentiment")
        parks_df = spark.read.table("ml_project.bronze.parks")

        # Validate data
        datasets = {
            'Air Quality': aq_df,
            'Footfall': foot_df,
            'Sentiment': sent_df,
            'Parks': parks_df
        }

        for name, df in datasets.items():
            # Profile the table in ONE Spark job: the row count plus a null
            # count per column. count() skips nulls and when() without
            # otherwise() yields null for non-matching rows, so each
            # expression counts exactly the rows where the column is null.
            null_exprs = [
                count(when(col(column).isNull(), lit(1)))
                for column in df.columns
            ]
            # Results are read by position so that no alias can collide with
            # a real column name.
            profile = df.agg(count(lit(1)), *null_exprs).first()
            record_count = profile[0]
            print(f"   {name}: {record_count:,} records")

            # Check for nulls
            null_counts = [
                f"{column}: {null_count}"
                for column, null_count in zip(df.columns, profile[1:])
                if null_count > 0
            ]

            if null_counts:
                print(f"      Null values: {', '.join(null_counts)}")

        return aq_df, foot_df, sent_df, parks_df

    except Exception as e:
        print(f" Error loading data: {str(e)}")
        raise

def create_comprehensive_features(aq_df, foot_df, sent_df, parks_df):
    """Build one feature row per park from air-quality, footfall and sentiment data.

    Each source is aggregated per ``park_id`` and left-joined onto
    ``parks_df``; ratio and category columns are then derived and the numeric
    aggregates are null-filled with zeros.

    Args:
        aq_df: air-quality readings; the code reads ``park_id``, ``AQI``,
            ``no2_level``, ``pm25_level`` and ``o3_level``.
        foot_df: footfall records; the code reads ``park_id``,
            ``visitor_count``, ``event_day`` and, when present, ``timestamp``.
        sent_df: sentiment records; the code reads ``park_id``,
            ``sentiment_score`` and ``sentiment_label``.
        parks_df: park master data; the code reads ``park_id`` and
            ``area_sqm``.

    Returns:
        A Spark DataFrame, one of:

        * the full feature set (all ``parks_df`` columns plus aggregated and
          derived features) when the main pipeline succeeds;
        * a reduced set with ``avg_aqi``, ``total_footfall``,
          ``avg_sentiment`` and ``usage_category`` when the main pipeline
          raises;
        * ``parks_df`` unchanged when the reduced pipeline raises as well.

    Notes:
        A park with no rows in a source has NULL aggregates after the left
        join. Its category columns are set to ``"Unknown"`` instead of
        falling through to the top bucket ("Very Unhealthy", "Very High",
        "Very Large"), which is what a NULL comparison would otherwise cause.
    """

    print("\n Creating comprehensive features...")

    try:
        # Enhanced Air Quality Metrics
        print("   Processing air quality metrics...")
        aqi_metrics = aq_df.groupBy("park_id").agg(
            avg("AQI").alias("avg_aqi"),
            min("AQI").alias("min_aqi"),
            max("AQI").alias("max_aqi"),
            stddev("AQI").alias("aqi_volatility"),
            percentile_approx("AQI", 0.5).alias("median_aqi"),
            avg("no2_level").alias("avg_no2"),
            avg("pm25_level").alias("avg_pm25"),
            avg("o3_level").alias("avg_o3"),
            count("*").alias("aqi_measurements")
        )

        # Enhanced Footfall Metrics with Temporal Analysis
        print("   Processing footfall metrics...")

        # 1 when the record is flagged as an event day, else 0. Defined once
        # because every branch below needs the same column.
        is_event_day_expr = when(col("event_day") == True, 1).otherwise(0)

        # Check if timestamp column exists and try temporal features
        footfall_columns = foot_df.columns

        if "timestamp" in footfall_columns:
            try:
                # Add temporal features to footfall data.
                # dayofweek() numbers Sunday as 1 and Saturday as 7.
                foot_temporal = foot_df.withColumn("hour", hour(col("timestamp"))) \
                                      .withColumn("day_of_week", dayofweek(col("timestamp"))) \
                                      .withColumn("month", month(col("timestamp"))) \
                                      .withColumn("is_weekend", when(col("day_of_week").isin([1, 7]), 1).otherwise(0)) \
                                      .withColumn("is_event_day", is_event_day_expr)
            except Exception as temporal_error:
                print(f"      Temporal features failed: {str(temporal_error)}")
                foot_temporal = foot_df.withColumn("is_weekend", lit(0)) \
                                      .withColumn("is_event_day", is_event_day_expr)
        else:
            print("      No timestamp column found, using basic footfall metrics")
            foot_temporal = foot_df.withColumn("is_weekend", lit(0)) \
                                  .withColumn("is_event_day", is_event_day_expr)

        foot_metrics = foot_temporal.groupBy("park_id").agg(
            sum("visitor_count").alias("total_footfall"),
            avg("visitor_count").alias("avg_footfall"),
            max("visitor_count").alias("peak_footfall"),
            stddev("visitor_count").alias("footfall_volatility"),
            sum("is_weekend").alias("weekend_visits"),
            sum("is_event_day").alias("event_days"),
            count("*").alias("footfall_records")
        )

        # Peak hour analysis (if temporal features worked): the highest
        # per-hour average visitor count of each park.
        try:
            if "hour" in foot_temporal.columns:
                peak_hours = foot_temporal.groupBy("park_id", "hour") \
                                         .agg(avg("visitor_count").alias("avg_hourly_visitors")) \
                                         .groupBy("park_id") \
                                         .agg(max("avg_hourly_visitors").alias("peak_hour_avg"))
                foot_metrics = foot_metrics.join(peak_hours, "park_id", "left")
            else:
                foot_metrics = foot_metrics.withColumn("peak_hour_avg", lit(0.0))
        except Exception as peak_error:
            print(f"      Peak hour analysis failed: {str(peak_error)}")
            foot_metrics = foot_metrics.withColumn("peak_hour_avg", lit(0.0))

        # Enhanced Sentiment Metrics
        print("   Processing sentiment metrics...")
        sent_metrics = sent_df.groupBy("park_id").agg(
            avg("sentiment_score").alias("avg_sentiment"),
            min("sentiment_score").alias("min_sentiment"),
            max("sentiment_score").alias("max_sentiment"),
            stddev("sentiment_score").alias("sentiment_volatility"),
            sum(when(col("sentiment_label") == "positive", 1).otherwise(0)).alias("positive_count"),
            sum(when(col("sentiment_label") == "negative", 1).otherwise(0)).alias("negative_count"),
            sum(when(col("sentiment_label") == "neutral", 1).otherwise(0)).alias("neutral_count"),
            count("*").alias("total_mentions")
        )

        # Calculate sentiment ratios. total_mentions comes from count("*")
        # over a group, so it is at least 1 for every row produced here.
        sent_metrics = sent_metrics.withColumn(
            "positive_ratio", col("positive_count") / col("total_mentions")
        ).withColumn(
            "negative_ratio", col("negative_count") / col("total_mentions")
        ).withColumn(
            "sentiment_polarity", (col("positive_count") - col("negative_count")) / col("total_mentions")
        )

        # Merge all features
        print("   Merging all features...")
        features_df = parks_df.join(aqi_metrics, "park_id", "left") \
                             .join(foot_metrics, "park_id", "left") \
                             .join(sent_metrics, "park_id", "left")

        # Create derived features.
        # This must run BEFORE the fillna below: the category columns need to
        # see NULL (no data for the park) in order to report "Unknown". A
        # NULL comparison is not true, so without the explicit isNull()
        # branch such parks would land in the final otherwise() bucket.
        print("  • Creating derived features...")
        features_df = features_df.withColumn(
            "pollution_footfall_ratio",
            when(col("total_footfall") > 0, col("avg_aqi") / col("total_footfall")).otherwise(0)
        ).withColumn(
            "area_efficiency",
            when(col("area_sqm") > 0, col("total_footfall") / col("area_sqm") * 1000).otherwise(0)  # visitors per 1000 sqm
        ).withColumn(
            "sentiment_engagement_ratio",
            when(col("total_footfall") > 0, col("total_mentions") / col("total_footfall") * 100).otherwise(0)
        ).withColumn(
            "air_quality_category",
            when(col("avg_aqi").isNull(), "Unknown")
            .when(col("avg_aqi") <= 50, "Good")
            .when(col("avg_aqi") <= 100, "Moderate")
            .when(col("avg_aqi") <= 150, "Unhealthy for Sensitive")
            .when(col("avg_aqi") <= 200, "Unhealthy")
            .otherwise("Very Unhealthy")
        ).withColumn(
            "usage_category",
            when(col("total_footfall").isNull(), "Unknown")
            .when(col("total_footfall") < 100, "Low")
            .when(col("total_footfall") < 500, "Medium")
            .when(col("total_footfall") < 1000, "High")
            .otherwise("Very High")
        ).withColumn(
            "park_size_category",
            when(col("area_sqm").isNull(), "Unknown")
            .when(col("area_sqm") < 50000, "Small")
            .when(col("area_sqm") < 200000, "Medium")
            .when(col("area_sqm") < 500000, "Large")
            .otherwise("Very Large")
        )

        # Fill null values with appropriate defaults
        fill_values = {
            "avg_aqi": 0.0,
            "total_footfall": 0,
            "avg_sentiment": 0.0,
            "aqi_measurements": 0,
            "footfall_records": 0,
            "total_mentions": 0,
            "min_aqi": 0.0,
            "max_aqi": 0.0,
            "aqi_volatility": 0.0,
            "median_aqi": 0.0,
            "avg_footfall": 0.0,
            "peak_footfall": 0,
            "footfall_volatility": 0.0,
            "weekend_visits": 0,
            "event_days": 0,
            "peak_hour_avg": 0.0,
            "positive_count": 0,
            "negative_count": 0,
            "neutral_count": 0,
            "positive_ratio": 0.0,
            "negative_ratio": 0.0,
            "sentiment_polarity": 0.0,
            "pollution_footfall_ratio": 0.0,
            "area_efficiency": 0.0,
            "sentiment_engagement_ratio": 0.0
        }

        features_df = features_df.fillna(fill_values)

        print(f"   Created feature set with {len(features_df.columns)} columns")

        return features_df

    except Exception as e:
        print(f" Error in feature engineering: {str(e)}")
        print(" Attempting simplified feature engineering...")

        # Fallback to basic feature engineering
        try:
            # Basic aggregations only
            aqi_basic = aq_df.groupBy("park_id").agg(avg("AQI").alias("avg_aqi"))
            foot_basic = foot_df.groupBy("park_id").agg(sum("visitor_count").alias("total_footfall"))
            sent_basic = sent_df.groupBy("park_id").agg(avg("sentiment_score").alias("avg_sentiment"))

            # Basic merge
            features_basic = parks_df.join(aqi_basic, "park_id", "left") \
                                   .join(foot_basic, "park_id", "left") \
                                   .join(sent_basic, "park_id", "left")

            # Basic categories. As above, categorise before fillna so parks
            # without footfall data become "Unknown" rather than "High".
            features_basic = features_basic.withColumn(
                "usage_category",
                when(col("total_footfall").isNull(), "Unknown")
                .when(col("total_footfall") < 100, "Low")
                .when(col("total_footfall") < 500, "Medium")
                .otherwise("High")
            ).fillna({"avg_aqi": 0.0, "total_footfall": 0, "avg_sentiment": 0.0})

            print("   Created basic feature set")
            return features_basic

        except Exception as fallback_error:
            print(f" Fallback also failed: {str(fallback_error)}")
            # Return just the parks data if everything fails
            return parks_df

# Load and process data
aq_df, foot_df, sent_df, parks_df = load_and_validate_data()
features_df = create_comprehensive_features(aq_df, foot_df, sent_df, parks_df)


# Create view and display sample
features_df.createOrReplaceTempView("features_view")
print("\n Feature Summary:")
# create_comprehensive_features can return a reduced frame from one of its
# fallback paths, so only the summary columns that actually exist are
# selected; a hard-coded select would raise on the missing ones.
summary_columns = ["park_id", "name", "city", "avg_aqi", "total_footfall",
                   "avg_sentiment", "usage_category", "air_quality_category"]
available_summary_columns = [
    column_name for column_name in summary_columns
    if column_name in features_df.columns
]
features_df.select(*available_summary_columns).show(10, truncate=False)

# =============================================================================
# 3. ENHANCED EXPLORATORY DATA ANALYSIS (EDA)
# =============================================================================

print("\n Starting Exploratory Data Analysis...")

# Set up matplotlib for better plots.
# The seaborn style sheets were renamed to 'seaborn-v0_8' in matplotlib 3.6;
# older releases only ship 'seaborn'. Use the first name this installation
# provides, and keep matplotlib's default style if neither exists, instead of
# raising on an unknown style name.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

def create_comprehensive_eda(features_df):
    """Create comprehensive EDA with multiple visualizations"""
    
    # Convert to Pandas for plotting - handle potential errors
    try:
        features_pd = features_df.toPandas()
    except Exception as e:
        print(f" Error converting to Pandas: {str(e)}")
        print("Attempting with limited columns...")
        
        # Try with essential columns only
        essential_cols = ["park_id", "name", "city", "avg_aqi", "total_footfall", 
                         "avg_sentiment", "area_sqm", "usage_category"]
        available_cols = [col for col in essential_cols if col in features_df.columns]
        features_pd = features_df.select(*available_cols).toPandas()
    
    print(f"   Converted {len(features_pd)} records to Pandas DataFrame")
    
    # Handle missing columns gracefully
    required_columns = {
        'avg_aqi': 0.0,
        'total_footfall': 0,
        'avg_sentiment': 0.0,
        'area_sqm': 0.0,
        'usage_category': 'Unknown',
        'air_quality_category': 'Unknown'
    }
    
    for col, bronze_val in required_columns.items():
        if col not in features_pd.columns:
            features_pd[col] = bronze_val
            print(f"    Added missing column '{col}' with bronze value")
    
    # Remove any infinite or extremely large values
    numeric_cols = features_pd.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        features_pd[col] = features_pd[col].replace([np.inf, -np.inf], np.nan)
        if features_pd[col].dtype in ['float64', 'float32']:
            features_pd[col] = features_pd[col].fillna(features_pd[col].median())
        else:
            features_pd[col] = features_pd[col].fillna(0)
    
    # Create comprehensive plots
    fig = plt.figure(figsize=(20, 24))
    
    # 1. Air Quality vs Footfall Analysis
    plt.subplot(4, 3, 1)
    valid_data = features_pd[(features_pd['avg_aqi'] > 0) & (features_pd['total_footfall'] > 0)]
    plt.scatter(valid_data['avg_aqi'], valid_data['total_footfall'], 
                alpha=0.6, s=50, c=valid_data['avg_sentiment'], cmap='RdYlGn')
    plt.xlabel('Average AQI')
    plt.ylabel('Total Footfall')
    plt.title('Air Quality vs Footfall\n(Color = Sentiment)')
    plt.colorbar(label='Avg Sentiment')
    
    # Add correlation coefficient
    if len(valid_data) > 1:
        corr = valid_data['avg_aqi'].corr(valid_data['total_footfall'])
        plt.text(0.05, 0.95, f'Correlation: {corr:.3f}', transform=plt.gca().transAxes, 
                bbox=dict(boxstyle="round", facecolor='wheat', alpha=0.5))
    
    # 2. Usage Category Distribution
    plt.subplot(4, 3, 2)
    usage_counts = features_pd['usage_category'].value_counts()
    colors = plt.cm.Set3(np.linspace(0, 1, len(usage_counts)))
    bars = plt.bar(usage_counts.index, usage_counts.values, color=colors)
    plt.title('Park Usage Category Distribution')
    plt.xlabel('Usage Category')
    plt.ylabel('Number of Parks')
    plt.xticks(rotation=45)
    
    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
                f'{int(height)}', ha='center', va='bottom')
    
    # 3. Air Quality Category Distribution
    plt.subplot(4, 3, 3)
    if 'air_quality_category' in features_pd.columns:
        aqi_counts = features_pd['air_quality_category'].value_counts()
        colors = ['green', 'yellow', 'orange', 'red', 'purple'][:len(aqi_counts)]
        plt.pie(aqi_counts.values, labels=aqi_counts.index, autopct='%1.1f%%', 
                colors=colors, startangle=90)
        plt.title('Air Quality Distribution')
    
    # 4. Sentiment Analysis
    plt.subplot(4, 3, 4)
    sentiment_data = features_pd[features_pd['avg_sentiment'].notna()]
    if len(sentiment_data) > 0:
        plt.hist(sentiment_data['avg_sentiment'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        plt.axvline(sentiment_data['avg_sentiment'].mean(), color='red', linestyle='--', 
                   label=f'Mean: {sentiment_data["avg_sentiment"].mean():.3f}')
        plt.xlabel('Average Sentiment Score')
        plt.ylabel('Number of Parks')
        plt.title('Sentiment Score Distribution')
        plt.legend()
    
    # 5. Park Size vs Usage
    plt.subplot(4, 3, 5)
    size_usage = features_pd[(features_pd['area_sqm'] > 0) & (features_pd['total_footfall'] > 0)]
    if len(size_usage) > 0:
        plt.scatter(size_usage['area_sqm']/1000, size_usage['total_footfall'], 
                   alpha=0.6, s=50)
        plt.xlabel('Park Area (1000 sqm)')
        plt.ylabel('Total Footfall')
        plt.title('Park Size vs Usage')
        plt.xscale('log')
        plt.yscale('log')
        
        # Add trend line
        if len(size_usage) > 1:
            z = np.polyfit(np.log(size_usage['area_sqm']), np.log(size_usage['total_footfall']), 1)
            p = np.poly1d(z)
            x_trend = np.logspace(np.log10(size_usage['area_sqm'].min()), 
                                np.log10(size_usage['area_sqm'].max()), 100)
            y_trend = np.exp(p(np.log(x_trend)))
            plt.plot(x_trend/1000, y_trend, "r--", alpha=0.8, label='Trend')
            plt.legend()
    
    # 6. City-wise Analysis
    plt.subplot(4, 3, 6)
    city_stats = features_pd.groupby('city').agg({
        'total_footfall': 'sum',
        'avg_aqi': 'mean'
    }).reset_index()
    
    if len(city_stats) > 0:
        city_stats = city_stats.sort_values('total_footfall', ascending=True).tail(10)
        bars = plt.barh(range(len(city_stats)), city_stats['total_footfall'])
        plt.yticks(range(len(city_stats)), city_stats['city'])
        plt.xlabel('Total Footfall')
        plt.title('Top 10 Cities by Total Footfall')
        
        # Color bars by AQI
        for i, (bar, aqi) in enumerate(zip(bars, city_stats['avg_aqi'])):
            if not np.isnan(aqi):
                color = 'green' if aqi < 50 else 'yellow' if aqi < 100 else 'red'
                bar.set_color(color)
    
    # 7. Correlation Heatmap
    plt.subplot(4, 3, 7)
    numeric_features = ['avg_aqi', 'total_footfall', 'avg_sentiment', 'area_sqm', 
                       'area_efficiency', 'pollution_footfall_ratio']
    correlation_data = features_pd[numeric_features].select_dtypes(include=[np.number])
    
    if len(correlation_data.columns) > 1:
        correlation_matrix = correlation_data.corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
                   square=True, fmt='.2f')
        plt.title('Feature Correlation Matrix')
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
    
    # 8. Pollution vs Sentiment
    plt.subplot(4, 3, 8)
    pollution_sentiment = features_pd[(features_pd['avg_aqi'] > 0) & (features_pd['avg_sentiment'].notna())]
    if len(pollution_sentiment) > 0:
        plt.scatter(pollution_sentiment['avg_aqi'], pollution_sentiment['avg_sentiment'], 
                   alpha=0.6, s=50)
        plt.xlabel('Average AQI')
        plt.ylabel('Average Sentiment')
        plt.title('Air Quality vs Public Sentiment')
        
        # Add trend line
        if len(pollution_sentiment) > 1:
            z = np.polyfit(pollution_sentiment['avg_aqi'], pollution_sentiment['avg_sentiment'], 1)
            p = np.poly1d(z)
            plt.plot(pollution_sentiment['avg_aqi'], p(pollution_sentiment['avg_aqi']), 
                    "r--", alpha=0.8)
            
            corr = pollution_sentiment['avg_aqi'].corr(pollution_sentiment['avg_sentiment'])
            plt.text(0.05, 0.95, f'Correlation: {corr:.3f}', transform=plt.gca().transAxes,
                    bbox=dict(boxstyle="round", facecolor='wheat', alpha=0.5))
    
    # 9. Weekend vs Weekday Usage
    plt.subplot(4, 3, 9)
    if 'weekend_visits' in features_pd.columns and 'footfall_records' in features_pd.columns:
        valid_weekend_data = features_pd[
            (features_pd['weekend_visits'] >= 0) & 
            (features_pd['footfall_records'] > 0) &
            (features_pd['weekend_visits'].notna()) &
            (features_pd['footfall_records'].notna())
        ].copy()
        
        if len(valid_weekend_data) > 0:
            valid_weekend_data['weekend_ratio'] = (
                valid_weekend_data['weekend_visits'] / valid_weekend_data['footfall_records']
            )
            # Remove any invalid ratios
            valid_weekend_data = valid_weekend_data[
                (valid_weekend_data['weekend_ratio'] >= 0) & 
                (valid_weekend_data['weekend_ratio'] <= 1)
            ]
            
            if len(valid_weekend_data) > 0:
                plt.hist(valid_weekend_data['weekend_ratio'], bins=15, alpha=0.7, 
                        color='lightcoral', edgecolor='black')
                plt.xlabel('Weekend Visit Ratio')
                plt.ylabel('Number of Parks')
                plt.title('Weekend vs Weekday Usage Pattern')
                mean_ratio = valid_weekend_data['weekend_ratio'].mean()
                plt.axvline(mean_ratio, color='red', 
                           linestyle='--', label=f'Mean: {mean_ratio:.2f}')
                plt.legend()
            else:
                plt.text(0.5, 0.5, 'No valid weekend data', ha='center', va='center', 
                        transform=plt.gca().transAxes)
                plt.title('Weekend vs Weekday Usage Pattern')
        else:
            plt.text(0.5, 0.5, 'No weekend data available', ha='center', va='center', 
                    transform=plt.gca().transAxes)
            plt.title('Weekend vs Weekday Usage Pattern')
    else:
        plt.text(0.5, 0.5, 'Weekend data not available', ha='center', va='center', 
                transform=plt.gca().transAxes)
        plt.title('Weekend vs Weekday Usage Pattern')
    
    # 10. Area Efficiency Distribution
    plt.subplot(4, 3, 10)
    if 'area_efficiency' in features_pd.columns:
        efficiency_data = features_pd[
            (features_pd['area_efficiency'] > 0) & 
            (features_pd['area_efficiency'].notna()) &
            (features_pd['area_efficiency'] < features_pd['area_efficiency'].quantile(0.95))  # Remove outliers
        ]
        if len(efficiency_data) > 0:
            plt.hist(efficiency_data['area_efficiency'], bins=20, alpha=0.7, 
                    color='lightgreen', edgecolor='black')
            plt.xlabel('Area Efficiency (visitors per 1000 sqm)')
            plt.ylabel('Number of Parks')
            plt.title('Park Area Efficiency Distribution')
            mean_eff = efficiency_data['area_efficiency'].mean()
            plt.axvline(mean_eff, color='red', 
                       linestyle='--', label=f'Mean: {mean_eff:.2f}')
            plt.legend()
        else:
            plt.text(0.5, 0.5, 'No efficiency data available', ha='center', va='center', 
                    transform=plt.gca().transAxes)
            plt.title('Park Area Efficiency Distribution')
    else:
        plt.text(0.5, 0.5, 'Area efficiency not calculated', ha='center', va='center', 
                transform=plt.gca().transAxes)
        plt.title('Park Area Efficiency Distribution')
    
    # 11. Top Performing Parks
    plt.subplot(4, 3, 11)
    top_parks = features_pd.nlargest(10, 'total_footfall')[['name', 'total_footfall', 'city']]
    if len(top_parks) > 0:
        bars = plt.barh(range(len(top_parks)), top_parks['total_footfall'])
        plt.yticks(range(len(top_parks)), 
                  [f"{name[:20]}... ({city})" if len(name) > 20 else f"{name} ({city})" 
                   for name, city in zip(top_parks['name'], top_parks['city'])])
        plt.xlabel('Total Footfall')
        plt.title('Top 10 Parks by Footfall')
        
        # Add value labels
        for i, (bar, value) in enumerate(zip(bars, top_parks['total_footfall'])):
            plt.text(value + max(top_parks['total_footfall']) * 0.01, i, 
                    f'{int(value):,}', va='center', fontsize=8)
    
    # 12. Sentiment Engagement Analysis
    plt.subplot(4, 3, 12)
    if 'sentiment_engagement_ratio' in features_pd.columns:
        engagement_data = features_pd[
            (features_pd['sentiment_engagement_ratio'] > 0) &
            (features_pd['sentiment_engagement_ratio'].notna()) &
            (features_pd['total_footfall'] > 0) &
            (features_pd['avg_sentiment'].notna())
        ]
        if len(engagement_data) > 0:
            plt.scatter(engagement_data['total_footfall'], 
                       engagement_data['sentiment_engagement_ratio'], 
                       alpha=0.6, s=50, c=engagement_data['avg_sentiment'], cmap='RdYlGn')
            plt.xlabel('Total Footfall')
            plt.ylabel('Sentiment Engagement Ratio (%)')
            plt.title('Footfall vs Social Media Engagement')
            plt.colorbar(label='Avg Sentiment')
            if engagement_data['total_footfall'].min() > 0:
                plt.xscale('log')
        else:
            plt.text(0.5, 0.5, 'No engagement data available', ha='center', va='center', 
                    transform=plt.gca().transAxes)
            plt.title('Footfall vs Social Media Engagement')
    else:
        plt.text(0.5, 0.5, 'Engagement ratio not calculated', ha='center', va='center', 
                transform=plt.gca().transAxes)
        plt.title('Footfall vs Social Media Engagement')
    
    plt.tight_layout(pad=2.0)
    plt.show()
    
    return features_pd

# Generate comprehensive EDA with error handling.
# Degrades in three steps: full EDA -> basic 2x2 EDA on core columns ->
# identifier-only (or empty) pandas frame, so `features_pd` is always bound.
try:
    print("\n Starting Exploratory Data Analysis...")
    features_pd = create_comprehensive_eda(features_df)
except Exception as eda_error:
    print(f" EDA failed with error: {str(eda_error)}")
    print(" Attempting basic EDA...")
    
    # Fallback to basic EDA
    try:
        # Drop any partially drawn figure left open by the failed full EDA so
        # it is not rendered together with the fallback plots.
        plt.close('all')
        
        # Basic identifier columns plus whichever numeric columns exist.
        # A comprehension is used deliberately: a top-level `for col in ...`
        # loop would rebind the notebook-global `col` imported from
        # pyspark.sql.functions to a plain string and break later col(...) calls.
        all_columns = features_df.columns
        basic_columns = ["park_id", "name", "city"] + [
            metric
            for metric in ["avg_aqi", "total_footfall", "avg_sentiment", "area_sqm"]
            if metric in all_columns
        ]
        
        features_pd = features_df.select(*basic_columns).toPandas()
        
        # Create simple visualizations
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        
        # Plot 1: Simple histogram of footfall (if available)
        if 'total_footfall' in features_pd.columns:
            footfall_data = features_pd[features_pd['total_footfall'] > 0]['total_footfall']
            if len(footfall_data) > 0:
                axes[0, 0].hist(footfall_data, bins=20, alpha=0.7)
                axes[0, 0].set_title('Footfall Distribution')
                axes[0, 0].set_xlabel('Total Footfall')
                axes[0, 0].set_ylabel('Number of Parks')
        
        # Plot 2: Simple histogram of AQI (if available)
        if 'avg_aqi' in features_pd.columns:
            aqi_data = features_pd[features_pd['avg_aqi'] > 0]['avg_aqi']
            if len(aqi_data) > 0:
                axes[0, 1].hist(aqi_data, bins=20, alpha=0.7)
                axes[0, 1].set_title('Air Quality Distribution')
                axes[0, 1].set_xlabel('Average AQI')
                axes[0, 1].set_ylabel('Number of Parks')
        
        # Plot 3: Simple histogram of sentiment (if available)
        if 'avg_sentiment' in features_pd.columns:
            sent_data = features_pd[features_pd['avg_sentiment'].notna()]['avg_sentiment']
            if len(sent_data) > 0:
                axes[1, 0].hist(sent_data, bins=20, alpha=0.7)
                axes[1, 0].set_title('Sentiment Distribution')
                axes[1, 0].set_xlabel('Average Sentiment')
                axes[1, 0].set_ylabel('Number of Parks')
        
        # Plot 4: Simple scatter plot (if data available)
        if 'avg_aqi' in features_pd.columns and 'total_footfall' in features_pd.columns:
            valid_data = features_pd[(features_pd['avg_aqi'] > 0) & (features_pd['total_footfall'] > 0)]
            if len(valid_data) > 0:
                axes[1, 1].scatter(valid_data['avg_aqi'], valid_data['total_footfall'], alpha=0.6)
                axes[1, 1].set_title('AQI vs Footfall')
                axes[1, 1].set_xlabel('Average AQI')
                axes[1, 1].set_ylabel('Total Footfall')
        
        plt.tight_layout()
        plt.show()
        
        print(" Basic EDA completed successfully")
        
    except Exception as basic_eda_error:
        print(f" Basic EDA also failed: {str(basic_eda_error)}")
        print("  Skipping visualization, proceeding with analysis...")
        
        # Just get basic pandas dataframe for summary stats
        try:
            features_pd = features_df.select("park_id", "name", "city").limit(100).toPandas()
        except Exception as fetch_error:
            # Report the failure instead of swallowing it silently; a bare
            # `except:` would also trap KeyboardInterrupt/SystemExit.
            print(f"  Could not fetch basic park data: {str(fetch_error)}")
            features_pd = pd.DataFrame()  # Empty dataframe as last resort

# =============================================================================
# 4. SUMMARY STATISTICS AND INSIGHTS
# =============================================================================

# Section banner: title line followed by a 50-character rule
print("\n SUMMARY STATISTICS AND INSIGHTS", "=" * 50, sep="\n")

def generate_insights(features_pd, features_df=None):
    """Print summary statistics, breakdowns, correlations and recommendations.

    Each section runs only when the columns it needs are present in
    ``features_pd`` and reports (rather than raises) any error it encounters.

    Args:
        features_pd: pandas DataFrame of park features. Columns used when
            present: ``total_footfall``, ``avg_aqi``, ``avg_sentiment``,
            ``usage_category``, ``air_quality_category``, ``name``, ``city``
            and ``area_sqm``.
        features_df: optional object exposing ``count()`` (the Spark DataFrame
            in this notebook). Only consulted for a row count when
            ``features_pd`` is empty.

    Returns:
        list: always an empty list; every finding is printed to stdout.
    """
    
    print("📈 Dataset Overview:")
    
    # Handle case where features_pd might be empty or have limited columns
    if len(features_pd) == 0:
        print("    No data available for analysis")
        if features_df is not None:
            try:
                total_parks = features_df.count()
                print(f"  • Total Parks (from Spark DF): {total_parks:,}")
            except Exception as count_error:
                print(f"  • Unable to get park count: {str(count_error)}")
        return []
    
    total_parks = len(features_pd)
    print(f"   Total Parks: {total_parks:,}")
    
    # Basic statistics with error handling
    numeric_columns = features_pd.select_dtypes(include=[np.number]).columns
    
    # (column, aggregation, output template): footfall is summed, the rest averaged
    headline_metrics = [
        ('total_footfall', 'sum', "  • Total Visitors: {:,.0f}"),
        ('avg_aqi', 'mean', "  • Average AQI: {:.1f}"),
        ('avg_sentiment', 'mean', "  • Average Sentiment: {:.3f}"),
    ]
    for column_name, how, template in headline_metrics:
        if column_name not in features_pd.columns:
            continue
        try:
            series = features_pd[column_name]
            value = series.sum() if how == 'sum' else series.mean()
            print(template.format(value))
        except Exception as e:
            print(f"    Could not calculate {column_name}: {str(e)}")
    
    # Usage patterns with error handling
    if 'usage_category' in features_pd.columns:
        try:
            usage_breakdown = features_pd['usage_category'].value_counts()
            print("\n Usage Patterns:")
            for category, n_parks in usage_breakdown.items():
                percentage = (n_parks / total_parks) * 100
                print(f"  • {category} Usage: {n_parks} parks ({percentage:.1f}%)")
        except Exception as e:
            print(f"\n Usage Patterns: Unable to calculate - {str(e)}")
    
    # Air quality breakdown with error handling
    if 'air_quality_category' in features_pd.columns:
        try:
            aqi_breakdown = features_pd['air_quality_category'].value_counts()
            print("\n Air Quality Breakdown:")
            for category, n_parks in aqi_breakdown.items():
                percentage = (n_parks / total_parks) * 100
                print(f"  • {category}: {n_parks} parks ({percentage:.1f}%)")
        except Exception as e:
            print(f"\n Air Quality: Unable to calculate - {str(e)}")
    
    # Top performers with error handling
    if 'total_footfall' in features_pd.columns and 'name' in features_pd.columns:
        try:
            # `city` is optional: select it only when present so a frame
            # without it still gets a ranking instead of a KeyError.
            has_city = 'city' in features_pd.columns
            top_columns = ['name', 'city', 'total_footfall'] if has_city else ['name', 'total_footfall']
            top_footfall = features_pd.nlargest(3, 'total_footfall')[top_columns]
            if len(top_footfall) > 0:
                print("\n Top Parks by Footfall:")
                for rank, (_, row) in enumerate(top_footfall.iterrows(), 1):
                    city_info = f" ({row['city']})" if has_city and pd.notna(row['city']) else ""
                    print(f"  {rank}. {row['name']}{city_info}: {row['total_footfall']:,.0f} visitors")
        except Exception as e:
            print(f"\n Top Parks: Unable to calculate - {str(e)}")
    
    # Simple correlations with error handling
    try:
        correlation_insights = []
        
        if len(numeric_columns) >= 2:
            # Try basic correlations
            candidate_pairs = [
                ('avg_aqi', 'total_footfall'),
                ('avg_aqi', 'avg_sentiment'),
                ('area_sqm', 'total_footfall'),
            ]
            for col1, col2 in candidate_pairs:
                if col1 in features_pd.columns and col2 in features_pd.columns:
                    valid_data = features_pd[[col1, col2]].dropna()
                    if len(valid_data) > 1:
                        corr = valid_data[col1].corr(valid_data[col2])
                        # corr is NaN when either column is constant; skip those
                        if not np.isnan(corr):
                            correlation_insights.append(f"  • {col1.replace('_', ' ').title()} vs {col2.replace('_', ' ').title()}: {corr:.3f}")
        
        if correlation_insights:
            print("\n🔗 Key Correlations:")
            for insight in correlation_insights:
                print(insight)
    except Exception as e:
        print(f"\n🔗 Correlations: Unable to calculate - {str(e)}")
    
    # Basic recommendations
    print("\n Basic Recommendations:")
    
    try:
        # Poor air quality parks
        if 'avg_aqi' in features_pd.columns:
            poor_aqi_parks = features_pd[features_pd['avg_aqi'] > 100]
            if len(poor_aqi_parks) > 0:
                print(f"  • {len(poor_aqi_parks)} parks have poor air quality (AQI > 100) - consider air purification measures")
        
        # Check for data availability
        available_metrics = [
            metric
            for metric in ['avg_aqi', 'total_footfall', 'avg_sentiment']
            if metric in features_pd.columns
        ]
        if len(available_metrics) > 0:
            print(f"  • Continue analysis with available metrics: {', '.join(available_metrics)}")
        else:
            print("  • Limited data available - focus on data collection and validation")
            
    except Exception as e:
        print(f"    Could not generate specific recommendations: {str(e)}")
    
    print("\n Basic analysis completed!")
    
    return []

# Generate final insights with error handling
try:
    insights = generate_insights(features_pd, features_df)
except Exception as insights_error:
    # Keep `insights` bound even on failure so the name always exists after this cell
    insights = []
    print(f" Insights generation failed: {str(insights_error)}")
    print(" Attempting basic dataset summary...")
    
    try:
        park_count = features_df.count()
        print(f" Successfully processed {park_count} parks")
    except Exception as count_error:
        # Report the cause instead of swallowing it; a bare `except:` would
        # also trap KeyboardInterrupt/SystemExit.
        print(f"  Unable to get basic statistics: {str(count_error)}")

# Persist the Spark feature frame as a Delta table, replacing any existing copy
print("\n Saving processed features...")
(
    features_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ml_project.silver.processed_features")
)

print(" Features saved to 'ml_project.silver.processed_features' table")

# Render the pandas feature frame as this cell's output
display(features_pd)

In [0]:
# Load the table that the previous cell saved and render it
# (presumably a sanity check of the write - confirm intent).
df = spark.read.table("ml_project.silver.processed_features")
display(df)