In [0]:
# Preview the bronze-layer sentiment table.
# NOTE(review): `display` and `spark` are not defined above this line in the
# file; presumably both are provided by the notebook runtime — confirm.
display(spark.table("ml_project.bronze.sentiment"))

In [0]:
# =============================================================================
# MODULE 3: MACHINE LEARNING PIPELINE & MODEL DEVELOPMENT
# Urban Green Space Management System 
# =============================================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc
)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Startup banner for the notebook output.
print(
    "=== URBAN GREEN SPACE MANAGEMENT SYSTEM ===",
    "Module 3: Machine Learning Pipeline & Model Development (Improved)",
    "=" * 50,
    sep="\n",
)

# =============================================================================
# 1. INITIALIZATION AND CONFIGURATION
# =============================================================================

# Spark session with adaptive query execution and partition coalescing enabled.
spark = (
    SparkSession.builder
    .appName("UGSM_ML_Pipeline_Improved")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# Catalog / schema names used to build fully-qualified table names.
CATALOG = "ml_project"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"

# MLflow: registry target and the experiment that receives every training run.
mlflow.set_registry_uri("databricks")
experiment_name = "/Users/dilshanchanuka.bc@gmail.com/Urban Green Space Management"

# Experiment setup is best-effort: a failure is reported but does not stop
# the script.
try:
    mlflow.set_experiment(experiment_name)
    print(f" MLflow experiment set: {experiment_name}")
except Exception as e:
    print(f" MLflow experiment setup: {str(e)}")

# =============================================================================
# 2. DATA LOADING AND PREPROCESSING
# =============================================================================

def load_and_prepare_ml_data():
    """Load the processed feature table and return it as a pandas DataFrame.

    Reads ``{CATALOG}.{SILVER_SCHEMA}.processed_features`` through the
    module-level ``spark`` session, converts the result to pandas, and adds
    any required column that is absent, filled with a constant default.

    Returns:
        pandas.DataFrame: The feature table, always containing the
        ``avg_aqi`` and ``avg_sentiment`` columns.

    Raises:
        Exception: Any error raised while reading or converting the table is
            logged and then re-raised. Previously the error was swallowed and
            ``None`` was returned implicitly, which only surfaced later as an
            unrelated ``TypeError`` in the caller.
    """

    print("\n📊 Loading processed features for ML...")

    # Required columns and the constant used when one of them is absent.
    column_defaults = {
        'avg_aqi': 75.0,       # Default AQI
        'avg_sentiment': 0.0,  # Neutral sentiment
    }

    try:
        # Load processed features
        df_spark = spark.read.table(f"{CATALOG}.{SILVER_SCHEMA}.processed_features")
        print(f"  ✅ Loaded features from {CATALOG}.{SILVER_SCHEMA}.processed_features")

        # Convert to Pandas for sklearn
        df = df_spark.toPandas()
        print(f"  ✅ Converted to Pandas: {len(df)} records, {len(df.columns)} columns")

        # Validate essential columns exist. The loop variable is deliberately
        # not called `col`, which would shadow pyspark's `col` import.
        missing_cols = [name for name in column_defaults if name not in df.columns]

        if missing_cols:
            print(f"  ⚠️  Missing required columns: {missing_cols}")
            for name in missing_cols:
                df[name] = column_defaults[name]
            print(f"  ✅ Added missing columns with default values")

        return df
    except Exception as e:
        print(f"  ❌ Error loading features: {str(e)}")
        # Without data nothing downstream can run, so fail here with the
        # original error instead of returning None.
        raise

def create_target_variable(df):
    """Add the binary ``intervention_required`` target column to ``df``.

    A row is flagged (1) when any of these holds:

    1. ``avg_aqi > 75`` and ``avg_sentiment < 0``
    2. ``avg_aqi > 100``
    3. ``avg_sentiment < -0.3`` and ``total_footfall > 200``
       (rule 3 never fires when the ``total_footfall`` column is absent)

    If no row is flagged, the looser rule ``avg_aqi > 70`` or
    ``avg_sentiment < 0.1`` is applied instead. Comparisons against NaN are
    False, so rows with missing inputs are not flagged by that input.

    Args:
        df: pandas DataFrame with ``avg_aqi`` and ``avg_sentiment`` columns
            and, optionally, ``total_footfall``. It is modified in place.

    Returns:
        The same DataFrame object, with ``intervention_required`` added.
    """

    print("\n🎯 Creating target variable...")

    # `df.get` yields the scalar 0 when the column is absent, which makes the
    # footfall comparison a plain False that broadcasts across the rule.
    intervention_conditions = (
        ((df["avg_aqi"] > 75) & (df["avg_sentiment"] < 0)) |
        (df["avg_aqi"] > 100) |
        ((df["avg_sentiment"] < -0.3) & (df.get("total_footfall", 0) > 200))
    )

    df["intervention_required"] = intervention_conditions.astype(int)

    # Print target distribution
    target_dist = df["intervention_required"].value_counts()
    total = len(df)
    negatives = target_dist.get(0, 0)
    positives = target_dist.get(1, 0)

    def _percent(count):
        # An empty frame has no distribution; report 0% instead of dividing by zero.
        return count / total * 100 if total else 0.0

    print(f"  📊 Target Variable Distribution:")
    print(f"    • No Intervention (0): {negatives} parks ({_percent(negatives):.1f}%)")
    print(f"    • Intervention Required (1): {positives} parks ({_percent(positives):.1f}%)")

    if positives == 0:
        print("  ⚠️  No positive cases found, adjusting thresholds...")
        # More lenient criteria
        df["intervention_required"] = ((df["avg_aqi"] > 70) | (df["avg_sentiment"] < 0.1)).astype(int)
        target_dist = df["intervention_required"].value_counts()
        print(f"    • Adjusted - Intervention Required: {target_dist.get(1, 0)} parks")

    return df

def prepare_features_for_ml(df):
    """Build the numeric feature matrix and target vector from ``df``.

    Every column that is not an identifier, a categorical label, or the
    target, and whose dtype is int64/float64/int32/float32, becomes a
    feature. Infinite values are treated as missing, and missing values are
    replaced by the column median (or 0.0 when the column has no finite
    value at all).

    Args:
        df: pandas DataFrame containing the ``intervention_required`` column.

    Returns:
        tuple: ``(X, y, numeric_features)`` where ``X`` is a copy of the
        selected columns with no NaN/inf left, ``y`` is
        ``df["intervention_required"]``, and ``numeric_features`` lists the
        columns of ``X`` in the order they appear in ``df``.
    """

    print("\n🔧 Preparing features for ML...")

    # Identifier, categorical and target columns that must not be features.
    exclude_cols = {
        "park_id", "name", "city", "air_quality_category",
        "park_size_category", "usage_category", "intervention_required"
    }
    numeric_dtypes = ('int64', 'float64', 'int32', 'float32')

    # Walk df.columns rather than a set difference: set iteration order varies
    # between interpreter sessions, which made the feature order (and hence
    # the fitted models and the importance listing) non-reproducible.
    numeric_features = [
        name for name in df.columns
        if name not in exclude_cols and df[name].dtype in numeric_dtypes
    ]

    print(f"  📋 Selected {len(numeric_features)} numeric features:")
    for i, feat in enumerate(numeric_features, 1):
        print(f"    {i:2d}. {feat}")

    X = df[numeric_features].copy()

    # Turn +/-inf into NaN *before* computing medians, so an infinite entry
    # can neither skew the median nor become the fill value itself.
    X = X.replace([np.inf, -np.inf], np.nan)

    # Fill missing values with the column median
    for name in X.columns:
        # Count before filling; counting afterwards always reports 0.
        n_missing = int(X[name].isnull().sum())
        if n_missing > 0:
            median_val = X[name].median()
            if pd.isna(median_val):
                # No finite value in the column: fall back to a constant so
                # the estimators never receive NaN.
                median_val = 0.0
            X[name] = X[name].fillna(median_val)
            print(f"  🔧 Filled {n_missing} missing values in {name} with median: {median_val:.2f}")

    y = df["intervention_required"]

    print(f"  ✅ Feature matrix shape: {X.shape}")
    print(f"  ✅ Target vector shape: {y.shape}")

    return X, y, numeric_features

# =============================================================================
# 3. MODEL DEVELOPMENT AND TRAINING
# =============================================================================

def create_ml_pipelines():
    """Build the candidate pipelines and their hyperparameter search grids.

    Each pipeline is a scaler followed by a classifier, with the steps named
    ``'scaler'`` and ``'classifier'`` so the grid keys can address the
    classifier through the ``classifier__`` prefix.

    Returns:
        tuple: ``(pipelines, param_grids)``, two dicts keyed by the same model
        names (``'logistic_regression'``, ``'random_forest'``,
        ``'gradient_boosting'``).
    """

    print("\n🤖 Creating ML pipelines...")

    def _scaled(scaler, estimator):
        # Every candidate shares the same two-step layout.
        return Pipeline([('scaler', scaler), ('classifier', estimator)])

    pipelines = {}
    param_grids = {}

    # --- Logistic regression -------------------------------------------------
    pipelines['logistic_regression'] = _scaled(
        StandardScaler(),
        LogisticRegression(random_state=42, max_iter=1000),
    )
    param_grids['logistic_regression'] = {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear'],
    }

    # --- Random forest (scaling is not required; kept for a uniform layout) --
    pipelines['random_forest'] = _scaled(
        RobustScaler(),
        RandomForestClassifier(random_state=42, n_jobs=-1),
    )
    param_grids['random_forest'] = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5, 10],
    }

    # --- Gradient boosting ---------------------------------------------------
    pipelines['gradient_boosting'] = _scaled(
        StandardScaler(),
        GradientBoostingClassifier(random_state=42),
    )
    param_grids['gradient_boosting'] = {
        'classifier__n_estimators': [50, 100],
        'classifier__learning_rate': [0.1, 0.2],
        'classifier__max_depth': [3, 5],
    }

    print(f"  ✅ Created {len(pipelines)} ML pipelines")
    return pipelines, param_grids

def _plot_confusion_heatmap(cm, model_name):
    """Draw the confusion matrix as an annotated heatmap."""
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name.replace("_", " ").title()}')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()


def _plot_roc_curve(y_test, y_pred_proba, model_name):
    """Draw the ROC curve with its AUC against the chance diagonal."""
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC-AUC Curve - {model_name.replace("_", " ").title()}')
    plt.legend(loc="lower right")
    plt.show()


def _plot_precision_recall_curve(y_test, y_pred_proba, model_name):
    """Draw the precision-recall curve with its area under the curve."""
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='darkorange', lw=2, label=f'Precision-Recall Curve (AUC = {pr_auc:.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve - {model_name.replace("_", " ").title()}')
    plt.legend(loc="lower left")
    plt.show()


def evaluate_model_performance(model, X_test, y_test, model_name):
    """Score a fitted binary classifier on the test set and plot diagnostics.

    Prints accuracy, precision, recall, F1 and (when computable) ROC AUC,
    prints the confusion matrix cells, and shows a confusion-matrix heatmap
    plus ROC and precision-recall curves.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Test features.
        y_test: True labels, encoded as 0 / 1.
        model_name: Name used in the printed output and plot titles.

    Returns:
        tuple: ``(metrics, y_pred, y_pred_proba)``. ``metrics`` maps metric
        name to value (``'auc_roc'`` is present only when probabilities are
        available and both classes occur in ``y_test``). ``y_pred_proba`` is
        the positive-class probability, or ``None`` when it is unavailable.
    """

    print(f"\n📊 Evaluating {model_name} Performance...")

    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = None
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(X_test)
        # A model fitted on a single class returns one column only, so there
        # is no positive-class probability to take.
        if proba.shape[1] > 1:
            y_pred_proba = proba[:, 1]

    # ROC / PR statistics are undefined unless both classes are present.
    has_both_classes = len(np.unique(y_test)) > 1

    # Calculate metrics
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='binary', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='binary', zero_division=0),
        'f1': f1_score(y_test, y_pred, average='binary', zero_division=0)
    }

    if y_pred_proba is not None and has_both_classes:
        try:
            metrics['auc_roc'] = roc_auc_score(y_test, y_pred_proba)
        except ValueError:
            # Score could not be computed; fall back to the chance level.
            metrics['auc_roc'] = 0.5

    # Print results
    print(f"  📈 Performance Metrics:")
    for metric, value in metrics.items():
        print(f"    • {metric.replace('_', ' ').title()}: {value:.4f}")

    # Confusion Matrix. Pinning the labels keeps the matrix 2x2 even when the
    # test split or the predictions contain a single class; otherwise the
    # cell lookups below raise IndexError on a 1x1 matrix.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(f"  📊 Confusion Matrix:")
    print(f"    • True Negatives: {cm[0,0]}")
    print(f"    • False Positives: {cm[0,1]}")
    print(f"    • False Negatives: {cm[1,0]}")
    print(f"    • True Positives: {cm[1,1]}")

    _plot_confusion_heatmap(cm, model_name)

    if y_pred_proba is not None:
        if has_both_classes:
            _plot_roc_curve(y_test, y_pred_proba, model_name)
            _plot_precision_recall_curve(y_test, y_pred_proba, model_name)
        else:
            print("  ⚠️  Skipping ROC / Precision-Recall curves: test set contains a single class")

    return metrics, y_pred, y_pred_proba

def train_and_evaluate_models(X, y, feature_names):
    """Tune, evaluate and log every candidate pipeline.

    Splits the data 80/20, runs a 3-fold stratified grid search (scored on
    F1) for each pipeline from ``create_ml_pipelines``, evaluates the tuned
    pipeline on the held-out split, and logs parameters, metrics and the
    model to MLflow. A failure in one model is reported and the remaining
    models are still trained; an MLflow failure is reported without
    discarding the trained model.

    Args:
        X: Feature matrix.
        y: Binary target as a pandas Series.
        feature_names: Not used by this function; kept so existing callers
            keep working.

    Returns:
        tuple: ``(model_results, best_model, X_test, y_test)``.
        ``model_results`` maps model name to a dict with the keys ``'model'``,
        ``'params'``, ``'cv_score'``, ``'metrics'``, ``'predictions'`` and
        ``'probabilities'``. ``best_model`` is the name with the highest test
        F1, or ``None`` when no model trained successfully.
    """

    print("\n🚀 Starting Model Training and Evaluation...")

    # A stratified split needs at least two samples of every class; otherwise
    # train_test_split raises, so fall back to a plain random split.
    class_counts = y.value_counts()
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42,
        stratify=y if can_stratify else None
    )

    print(f"  📊 Data Split:")
    print(f"    • Training: {len(X_train)} samples")
    print(f"    • Testing: {len(X_test)} samples")

    # Create pipelines
    pipelines, param_grids = create_ml_pipelines()

    # Store results
    model_results = {}
    best_model = None
    best_score = 0

    # Train and evaluate each model
    for model_name, pipeline in pipelines.items():
        print(f"\n🔄 Training {model_name.replace('_', ' ').title()}...")

        try:
            # Hyperparameter tuning with GridSearchCV
            cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

            grid_search = GridSearchCV(
                pipeline,
                param_grids[model_name],
                cv=cv_strategy,
                scoring='f1',
                n_jobs=1,  # Avoid multiprocessing issues in Databricks
                verbose=0
            )

            # Fit the model
            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_

            print(f"  ✅ Best parameters: {grid_search.best_params_}")
            print(f"  ✅ Best CV score: {grid_search.best_score_:.4f}")

            # Evaluate on test set
            metrics, y_pred, y_pred_proba = evaluate_model_performance(
                best_pipeline, X_test, y_test, model_name
            )

            # Store results
            model_results[model_name] = {
                'model': best_pipeline,
                'params': grid_search.best_params_,
                'cv_score': grid_search.best_score_,
                'metrics': metrics,
                'predictions': y_pred,
                'probabilities': y_pred_proba
            }

            # Track best model. The first successful model is always accepted:
            # comparing against an initial 0 with a strict '>' left
            # best_model as None whenever every model scored F1 == 0.
            # NOTE(review): the winner is chosen on the test split, so its
            # reported test metrics are optimistic; consider selecting on
            # 'cv_score' instead.
            if best_model is None or metrics['f1'] > best_score:
                best_score = metrics['f1']
                best_model = model_name

            # Log to MLflow
            try:
                with mlflow.start_run(run_name=f"{model_name}_improved"):
                    # Log parameters
                    mlflow.log_params(grid_search.best_params_)

                    # Log metrics
                    mlflow.log_metrics(metrics)
                    mlflow.log_metric("cv_score", grid_search.best_score_)

                    # Log model
                    mlflow.sklearn.log_model(best_pipeline, f"{model_name}_pipeline")

                    print(f"  ✅ Logged to MLflow")

            except Exception as mlflow_error:
                print(f"  ⚠️  MLflow logging failed: {str(mlflow_error)}")

        except Exception as model_error:
            # Report the failure and move on to the next candidate.
            print(f"  ❌ Error training {model_name}: {str(model_error)}")

    print(f"\n🏆 Model Training Summary:")
    print(f"  • Trained {len(model_results)} models successfully")
    print(f"  • Best model: {best_model} (F1 Score: {best_score:.4f})")

    return model_results, best_model, X_test, y_test

def create_model_comparison_visualization(model_results):
    """Plot a side-by-side comparison of the trained models.

    The left panel is a grouped bar chart of accuracy, precision, recall and
    F1 per model. The right panel plots each model's cross-validation score
    against its test F1, with a dashed diagonal marking equal scores. Any
    error while plotting is printed; nothing is returned.

    Args:
        model_results: Dict mapping model name to a result dict with
            ``'metrics'`` and ``'cv_score'`` entries.
    """

    print("\n📊 Creating model comparison visualization...")

    try:
        model_keys = list(model_results.keys())
        metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1']

        # Collect one list of scores per metric, aligned with model_keys.
        scores_by_metric = {
            metric: [model_results[key]['metrics'].get(metric, 0) for key in model_keys]
            for metric in metrics_to_plot
        }

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        bar_ax, scatter_ax = axes[0], axes[1]

        # Panel 1: grouped bars, one group per model and one bar per metric.
        group_pos = np.arange(len(model_keys))
        bar_width = 0.2
        for slot, metric in enumerate(metrics_to_plot):
            bar_ax.bar(group_pos + slot*bar_width, scores_by_metric[metric], bar_width, label=metric.title())

        bar_ax.set_xlabel('Models')
        bar_ax.set_ylabel('Score')
        bar_ax.set_title('Model Performance Comparison')
        bar_ax.set_xticks(group_pos + bar_width * 1.5)
        bar_ax.set_xticklabels([key.replace('_', ' ').title() for key in model_keys])
        bar_ax.legend()
        bar_ax.grid(True, alpha=0.3)

        # Panel 2: cross-validation score against test F1.
        cv_scores = [model_results[key]['cv_score'] for key in model_keys]
        test_f1_scores = [model_results[key]['metrics']['f1'] for key in model_keys]

        scatter_ax.scatter(cv_scores, test_f1_scores, s=100, alpha=0.7)
        for key, cv_value, f1_value in zip(model_keys, cv_scores, test_f1_scores):
            scatter_ax.annotate(
                key.replace('_', ' ').title(),
                (cv_value, f1_value),
                xytext=(5, 5),
                textcoords='offset points',
            )

        scatter_ax.set_xlabel('Cross-Validation F1 Score')
        scatter_ax.set_ylabel('Test F1 Score')
        scatter_ax.set_title('CV Score vs Test Performance')
        scatter_ax.grid(True, alpha=0.3)

        # Reference diagonal spanning the observed score range.
        low = min(min(cv_scores), min(test_f1_scores))
        high = max(max(cv_scores), max(test_f1_scores))
        scatter_ax.plot([low, high], [low, high], 'r--', alpha=0.5, label='Perfect Agreement')
        scatter_ax.legend()

        plt.tight_layout()
        plt.show()

        print("  ✅ Model comparison visualization created")

    except Exception as viz_error:
        print(f"  ❌ Visualization error: {str(viz_error)}")

# =============================================================================
# 4. EXECUTE ML PIPELINE
# =============================================================================

# Load and prepare data
df = load_and_prepare_ml_data()
if df is None:
    # Guard against a loader that signals failure by returning None: stop
    # with a clear message instead of failing later on `None[...]`.
    raise RuntimeError("Processed features could not be loaded; aborting ML pipeline")
df = create_target_variable(df)
X, y, feature_names = prepare_features_for_ml(df)

# Train and evaluate models
model_results, best_model_name, X_test, y_test = train_and_evaluate_models(X, y, feature_names)

# Create visualization (skipped when no model trained successfully)
if model_results:
    create_model_comparison_visualization(model_results)

# =============================================================================
# 5. FEATURE IMPORTANCE ANALYSIS
# =============================================================================

def analyze_feature_importance(model_results, feature_names, best_model_name):
    """Report and plot the feature importances of the best model.

    Only classifiers exposing ``feature_importances_`` are supported. The top
    10 features are printed and the top 15 are shown as a horizontal bar
    chart.

    Args:
        model_results: Dict mapping model name to a result dict whose
            ``'model'`` entry is a pipeline with a ``'classifier'`` step.
        feature_names: Feature names aligned with the classifier's inputs.
        best_model_name: Key of the model to analyze.

    Returns:
        pandas.DataFrame with ``feature`` and ``importance`` columns sorted by
        descending importance, or ``None`` when the classifier has no
        importances or the analysis fails.
    """

    print(f"\n🎯 Analyzing Feature Importance for {best_model_name}...")

    try:
        classifier = model_results[best_model_name]['model'].named_steps['classifier']

        # Bail out early for estimators without importances.
        if not hasattr(classifier, 'feature_importances_'):
            print("  ⚠️  Selected model doesn't support feature importance")
            return None

        ranking = pd.DataFrame({
            'feature': feature_names,
            'importance': classifier.feature_importances_
        })
        feature_importance_df = ranking.sort_values('importance', ascending=False)

        print(f"  📊 Top 10 Most Important Features:")
        top_ten = feature_importance_df.head(10)
        for rank, (feat, score) in enumerate(zip(top_ten['feature'], top_ten['importance']), 1):
            print(f"    {rank:2d}. {feat}: {score:.4f}")

        # Bar chart of the 15 strongest features.
        plt.figure(figsize=(12, 8))
        sns.barplot(data=feature_importance_df.head(15), x='importance', y='feature', palette='viridis')
        plt.title(f'Feature Importance - {best_model_name.replace("_", " ").title()}')
        plt.xlabel('Importance Score')
        plt.tight_layout()
        plt.show()

        print("  ✅ Feature importance analysis completed")
        return feature_importance_df

    except Exception as fi_error:
        print(f"  ❌ Feature importance analysis failed: {str(fi_error)}")
        return None

# Analyze feature importance
# Defined up front so the name exists even when no model was trained or
# selected and the analysis below is skipped.
feature_importance_df = None
if model_results and best_model_name:
    feature_importance_df = analyze_feature_importance(model_results, feature_names, best_model_name)

# =============================================================================
# 6. SAVE PREDICTIONS AND MODEL ARTIFACTS
# =============================================================================

def save_predictions_to_spark(df, model_results, best_model_name):
    """Score every row of ``df`` with the best model and store the result.

    The features are rebuilt with ``prepare_features_for_ml``, the predicted
    class and positive-class probability are computed, and the table
    ``{CATALOG}.{GOLD_SCHEMA}.urban_green_space_model_ce`` is overwritten
    with them.

    Args:
        df: pandas DataFrame holding ``park_id`` and the feature columns.
        model_results: Dict mapping model name to a result dict with a
            ``'model'`` entry.
        best_model_name: Key of the model to use.

    Returns:
        The Spark DataFrame that was written, or ``None`` when no valid model
        is available or any step fails.
    """

    print(f"\n💾 Saving predictions to Spark tables...")

    try:
        # Nothing to do without a usable model.
        if not (best_model_name and best_model_name in model_results):
            print("❌ No valid model available for predictions")
            return None

        chosen_model = model_results[best_model_name]['model']

        # Score the full dataset; only the feature matrix is needed here.
        X_full = prepare_features_for_ml(df)[0]
        predicted_labels = chosen_model.predict(X_full)
        positive_proba = chosen_model.predict_proba(X_full)[:, 1]

        predictions_df = pd.DataFrame({
            'park_id': df['park_id'],
            'intervention_pred': predicted_labels,
            'intervention_probability': positive_proba
        })

        # Hand over to Spark and overwrite the target table.
        predictions_spark = spark.createDataFrame(predictions_df)
        writer = predictions_spark.write.mode("overwrite")
        writer.saveAsTable(f"{CATALOG}.{GOLD_SCHEMA}.urban_green_space_model_ce")

        print(f"✅ Predictions saved to {CATALOG}.{GOLD_SCHEMA}.urban_green_space_model_ce")

        # Display sample predictions
        print(f"\n📋 Sample Predictions:")
        predictions_spark.show(10)

        return predictions_spark

    except Exception as save_error:
        print(f"❌ Error saving predictions: {str(save_error)}")
        return None

# Save predictions
# Scores every row of `df` with the best model and writes the result to the
# gold schema; `predictions_spark` is None when no model was selected or the
# save failed.
predictions_spark = save_predictions_to_spark(df, model_results, best_model_name)

# =============================================================================
# 7. MODEL SUMMARY AND NEXT STEPS
# =============================================================================

print(f"\n🎯 ML PIPELINE SUMMARY")
print("=" * 50)

if model_results:
    print(f"✅ Successfully trained {len(model_results)} models")
    print(f"🏆 Best performing model: {best_model_name}")

    if best_model_name:
        best_metrics = model_results[best_model_name]['metrics']
        print(f"📊 Best model performance:")
        for metric, value in best_metrics.items():
            print(f"  • {metric.replace('_', ' ').title()}: {value:.4f}")

    print(f"\n📋 Model artifacts saved:")
    # Report the table that is actually written (catalog + gold schema), and
    # only claim it was saved when the save step returned a DataFrame.
    if predictions_spark is not None:
        print(f"  • Predictions: {CATALOG}.{GOLD_SCHEMA}.urban_green_space_model_ce")
    else:
        print("  • Predictions: not saved (see messages above)")
    print(f"  • MLflow experiments: {experiment_name}")

else:
    print("⚠️  No models were successfully trained")