# Notebook 05: Model Evaluation
Comprehensive model evaluation, business impact analysis, and deployment recommendations
Serverless-compatible version

# Import libraries

In [0]:
%python
import mlflow
from mlflow.tracking import MlflowClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Initialize the Spark session (adaptive query execution switched on) and the
# MLflow tracking/registry client used by the cells below.
spark = (
    SparkSession.builder
    .appName("Hotel_Churn_Model_Evaluation")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

mlflow_client = MlflowClient()

## Load test data

In [0]:
%python
print("Loading test data...")

# Resolve the held-out test set, trying three sources in order:
#   1. the test split persisted by the previous notebook,
#   2. a fresh 70/15/15 split of the gold feature table,
#   3. a minimal feature set engineered from the silver bookings table.
# Only options 2 and 3 also define train_df / val_df.
try:
    test_df = spark.table("hotel_catalog.gold.model_test_set")
    print("Loaded test set from gold.model_test_set")
    print(f"Test set size: {test_df.count():,} records")
except Exception as test_set_error:
    # `except Exception` instead of a bare `except` so KeyboardInterrupt and
    # SystemExit still stop the cell; the reason for the fallback is reported.
    print(f"Could not load gold.model_test_set: {test_set_error}")
    try:
        features_df = spark.table("hotel_catalog.gold.hotel_features_final")
        # Create 70/15/15 split if not done before
        train_df, val_df, test_df = features_df.randomSplit([0.7, 0.15, 0.15], seed=42)
        print("Created new train/val/test split")
        print(f"Test set size: {test_df.count():,} records")
    except Exception as features_error:
        print(f"Could not load gold.hotel_features_final: {features_error}")

        # Fallback: build simple features directly from the silver table
        from pyspark.ml.feature import VectorAssembler

        silver_df = spark.table("hotel_catalog.silver.cleaned_hotel_bookings")

        # Keep only the raw columns needed below; the cast column is aliased
        # explicitly so it can be referenced as "children" afterwards.
        features_df = silver_df.select(
            "hotel", "churn", "lead_time", "arrival_date_year",
            "arrival_date_month", "stays_in_weekend_nights", "stays_in_week_nights",
            "adults", col("children").cast("double").alias("children"), "babies",
            "previous_cancellations", "adr", "deposit_type", "customer_type",
        ).fillna(0)

        # Derived numeric features and integer codes for the categorical columns
        features_df = (
            features_df
            .withColumn(
                "total_nights",
                col("stays_in_weekend_nights") + col("stays_in_week_nights"),
            )
            .withColumn(
                "total_guests",
                col("adults") + col("children") + col("babies"),
            )
            .withColumn(
                "is_weekend_stay",
                when(col("stays_in_weekend_nights") > 0, 1).otherwise(0),
            )
            .withColumn(
                "hotel_code",
                when(col("hotel") == "Resort Hotel", 0).otherwise(1),
            )
            .withColumn(
                "deposit_code",
                when(col("deposit_type") == "No Deposit", 0)
                .when(col("deposit_type") == "Non Refund", 1)
                .otherwise(2),
            )
        )

        # Assemble the feature vector; rows with invalid values are skipped
        feature_cols = [
            "lead_time", "total_nights", "total_guests",
            "previous_cancellations", "adr", "is_weekend_stay",
            "hotel_code", "deposit_code",
        ]
        assembler = VectorAssembler(
            inputCols=feature_cols,
            outputCol="features",
            handleInvalid="skip",
        )

        features_df = assembler.transform(features_df)
        train_df, val_df, test_df = features_df.randomSplit([0.7, 0.15, 0.15], seed=42)
        print("Created features and test split from silver data")
        print(f"Test set size: {test_df.count():,} records")

## Load the best model

In [0]:
%python
print("\n" + "="*60)
print("LOADING MODEL")
print("="*60)

model_name = "hotel_churn_predictor"
best_model = None
model_source = None

# Method 1: Load from Model Registry (Production preferred over Staging).
# Each method has its own try block so that a registry failure (for example an
# unregistered model name) no longer prevents the run-search method from
# being attempted.
try:
    model_versions = mlflow_client.get_latest_versions(model_name, stages=["Production", "Staging"])
    if model_versions:
        # Do not rely on the order of the returned list: rank stages explicitly.
        stage_rank = {"Production": 0, "Staging": 1}
        model_version = min(
            model_versions,
            key=lambda mv: stage_rank.get(mv.current_stage, len(stage_rank)),
        )
        model_uri = f"models:/{model_name}/{model_version.version}"
        best_model = mlflow.spark.load_model(model_uri)
        model_source = f"Model Registry (Version {model_version.version}, Stage: {model_version.current_stage})"
        print(f"Loaded model from Model Registry: {model_uri}")
except Exception as registry_error:
    print(f"Could not load model from Model Registry: {registry_error}")

# Method 2: Search the experiment for the run with the highest validation AUC
if best_model is None:
    try:
        experiment = mlflow.get_experiment_by_name("/Shared/hotel_churn_prediction")
        if experiment is None:
            experiment = mlflow.get_experiment_by_name("hotel_churn_prediction")

        if experiment:
            runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
            # Ignore runs that never logged val_auc so a NaN row cannot win.
            # A missing column raises KeyError, which is reported below.
            runs = runs.dropna(subset=["metrics.val_auc"])
            if len(runs) > 0:
                best_run = runs.sort_values("metrics.val_auc", ascending=False).iloc[0]
                model_uri = f"runs:/{best_run.run_id}/model"
                best_model = mlflow.spark.load_model(model_uri)
                model_source = f"MLflow Run: {best_run.run_id} (AUC: {best_run['metrics.val_auc']:.4f})"
                print(f"Loaded model from MLflow run: {best_run.run_id}")
    except Exception as run_error:
        print(f"Could not load model from MLflow: {run_error}")

# Fallback: Use a simple model if MLflow fails
if best_model is None:
    print("Creating a simple model as fallback...")
    from pyspark.ml.classification import LogisticRegression

    # train_df only exists when the data-loading cell had to re-split the
    # data; fail with an actionable message instead of a NameError otherwise.
    try:
        fallback_train_df = train_df
    except NameError:
        fallback_train_df = None
    if fallback_train_df is None:
        raise RuntimeError(
            "No model could be loaded from MLflow and no training split "
            "(train_df) is available to fit the fallback model."
        )

    lr = LogisticRegression(
        featuresCol="features",
        labelCol="churn",
        maxIter=10
    )

    # Use a sample for training
    train_sample = fallback_train_df.limit(10000)
    best_model = lr.fit(train_sample)
    model_source = "Fallback: Simple Logistic Regression"
    print("Created fallback model")

print(f"Model source: {model_source}")

## Generate predictions

In [0]:
%python
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def _class_one_probability(prob_vector):
    """Return element 1 of the probability vector as a float, or None if absent."""
    if prob_vector is None or len(prob_vector) <= 1:
        return None
    return float(prob_vector[1])


# UDF wrapper that pulls the class-1 probability out of the model's
# "probability" vector column.
get_churn_prob_udf = udf(_class_one_probability, DoubleType())

print("\n" + "="*60)
print("GENERATING PREDICTIONS")
print("="*60)

print("Generating predictions on test set...")
scored_df = best_model.transform(test_df)

# Derived columns, added one at a time:
#   churn_probability  - probability of class 1 (canceled)
#   predicted_label    - 1 when churn_probability is strictly above 0.5
#   prediction_correct - 1 when the predicted label equals the actual label
churn_probability_col = get_churn_prob_udf(col("probability"))
predictions = scored_df.withColumn("churn_probability", churn_probability_col)

label_col = when(col("churn_probability") > 0.5, 1).otherwise(0)
predictions = predictions.withColumn("predicted_label", label_col)

correct_col = when(col("predicted_label") == col("churn"), 1).otherwise(0)
predictions = predictions.withColumn("prediction_correct", correct_col)

print("Predictions generated:")
preview_columns = [
    "hotel", "churn", "predicted_label",
    "churn_probability", "prediction_correct",
]
predictions.select(*preview_columns).show(10, truncate=False)

## Calculate comprehensive metrics

In [0]:
%python
print("\n" + "="*60)
print("COMPREHENSIVE METRICS")
print("="*60)

# Initialize evaluators
evaluator_auc = BinaryClassificationEvaluator(
    labelCol="churn",
    metricName="areaUnderROC"
)

evaluator_pr = BinaryClassificationEvaluator(
    labelCol="churn",
    metricName="areaUnderPR"
)

try:
    # Calculate metrics
    test_auc = evaluator_auc.evaluate(predictions)
    test_auprc = evaluator_pr.evaluate(predictions)
    metrics_ok = True
except Exception as e:
    print(f"Error evaluating metrics: {e}")
    print("This usually means the model was trained on only one class (all churn=0 or all churn=1). BinaryClassificationEvaluator requires both classes in the training data.")
    test_auc = None
    test_auprc = None
    metrics_ok = False

# Confusion matrix from a single grouped aggregation instead of one Spark job
# per cell. Rows with a null label are ignored, as the equality filters did.
confusion_counts = {
    (int(row["churn"]), int(row["predicted_label"])): row["count"]
    for row in predictions.groupBy("churn", "predicted_label").count().collect()
    if row["churn"] is not None
}
TP = confusion_counts.get((1, 1), 0)
TN = confusion_counts.get((0, 0), 0)
FP = confusion_counts.get((0, 1), 0)
FN = confusion_counts.get((1, 0), 0)

# Calculate derived metrics (each guarded against a zero denominator)
total = TP + TN + FP + FN
accuracy = (TP + TN) / total if total > 0 else 0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Prevalence: predicted_label is always 0 or 1, so the actual positives are
# exactly the true positives plus the false negatives.
actual_positives = TP + FN
prevalence = actual_positives / total if total > 0 else 0

print(f"Test Set Size: {total:,}")
print(f"Actual Cancellations: {actual_positives:,} ({prevalence:.1%})")
if metrics_ok:
    print(f"\nPerformance Metrics:")
    print(f"AUC-ROC:            {test_auc:.4f}")
    print(f"AUC-PR:             {test_auprc:.4f}")
else:
    print("\nPerformance Metrics: Cannot compute AUC/PR due to single-class model.")
print(f"Accuracy:           {accuracy:.4f}")
print(f"Precision:          {precision:.4f}")
print(f"Recall:             {recall:.4f}")
print(f"F1-Score:           {f1_score:.4f}")

# Row totals on the right, column totals underneath
print(f"\nConfusion Matrix:")
print(f"                   Predicted")
print(f"                   No     Yes")
print(f"Actual No   [{TN:6d}  {FP:6d}] → {TN+FP:6d}")
print(f"Actual Yes  [{FN:6d}  {TP:6d}] → {FN+TP:6d}")
print(f"               {TN+FN:6d}  {FP+TP:6d}")

## Threshold analysis

In [0]:
%python
print("\n" + "="*60)
print("THRESHOLD ANALYSIS")
print("="*60)

# Convert probabilities to pandas for threshold analysis
prob_df = predictions.select("churn", "churn_probability").toPandas()

# The label masks do not depend on the threshold, so build them once
is_positive = prob_df["churn"] == 1
is_negative = prob_df["churn"] == 0

# Calculate metrics at different thresholds
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
results = []

for threshold in thresholds:
    flagged = prob_df["churn_probability"] >= threshold

    # Confusion-matrix cells at this threshold
    TP_t = (is_positive & flagged).sum()
    TN_t = (is_negative & ~flagged).sum()
    FP_t = (is_negative & flagged).sum()
    FN_t = (is_positive & ~flagged).sum()

    # Calculate metrics (cast to Python float so Spark can infer the schema)
    precision_t = float(TP_t / (TP_t + FP_t)) if (TP_t + FP_t) > 0 else 0.0
    recall_t = float(TP_t / (TP_t + FN_t)) if (TP_t + FN_t) > 0 else 0.0
    f1_t = float(2 * precision_t * recall_t / (precision_t + recall_t)) if (precision_t + recall_t) > 0 else 0.0

    results.append({
        "threshold": float(threshold),
        "precision": precision_t,
        "recall": recall_t,
        "f1_score": f1_t,
        "TP": int(TP_t),
        "FP": int(FP_t),
        "FN": int(FN_t)
    })

# Convert to a Spark DataFrame for display only
threshold_df = spark.createDataFrame(results)

print("Performance at Different Thresholds:")
threshold_df.select("threshold", "precision", "recall", "f1_score", "TP", "FP", "FN").show()

# Find optimal threshold (maximizing F1). Selecting from the local results is
# deterministic (ties resolve to the lowest threshold) and, because the
# threshold list is never empty, always yields a row.
optimal_row = max(results, key=lambda row: row["f1_score"])
optimal_threshold = optimal_row["threshold"]

print(f"\n Optimal Threshold: {optimal_threshold:.2f}")
print(f"   F1-Score at optimal: {optimal_row['f1_score']:.4f}")
print(f"   Precision at optimal: {optimal_row['precision']:.4f}")
print(f"   Recall at optimal: {optimal_row['recall']:.4f}")

## Business impact analysis

In [0]:
%python
print("\n" + "="*60)
print("BUSINESS IMPACT ANALYSIS")
print("="*60)

# Business assumptions (simplified)
AVERAGE_BOOKING_VALUE = 150  # Average revenue per booking
RETENTION_COST = 25  # Cost to retain a customer (discount, upgrade, etc.)
FALSE_POSITIVE_COST = RETENTION_COST  # Cost of unnecessary retention efforts
FALSE_NEGATIVE_COST = AVERAGE_BOOKING_VALUE * 0.5  # Lost revenue opportunity

print("Business Assumptions:")
print(f"  • Average booking value: ${AVERAGE_BOOKING_VALUE}")
print(f"  • Cost of retention action: ${RETENTION_COST}")
print(f"  • Cost of false positive (unnecessary retention): ${FALSE_POSITIVE_COST}")
print(f"  • Cost of false negative (missed cancellation): ${FALSE_NEGATIVE_COST}")

# Cost of the model's mistakes, using the confusion-matrix counts (FP, FN)
# computed in the metrics cell
total_fp_cost = FP * FALSE_POSITIVE_COST
total_fn_cost = FN * FALSE_NEGATIVE_COST
total_misclassification_cost = total_fp_cost + total_fn_cost

# Value of acting on the model's flags: every true positive counts as a fully
# retained booking, and every flagged booking (TP and FP) incurs the
# retention cost.
potential_saved_revenue = TP * AVERAGE_BOOKING_VALUE
retention_cost_for_tp = TP * RETENTION_COST
net_value = potential_saved_revenue - retention_cost_for_tp - total_fp_cost

print(f"\n Financial Impact Analysis:")
print(f"Total misclassification cost: ${total_misclassification_cost:,.2f}")
print(f"  • False Positive cost ({FP} × ${FALSE_POSITIVE_COST}): ${total_fp_cost:,.2f}")
print(f"  • False Negative cost ({FN} × ${FALSE_NEGATIVE_COST}): ${total_fn_cost:,.2f}")
# These figures come from the actual TP/FP counts, not from a perfect
# classifier, so the heading says so.
print(f"\nPotential value from correctly flagged cancellations:")
print(f"  Revenue saved from prevented cancellations: ${potential_saved_revenue:,.2f}")
print(f"  Cost of retention actions: ${retention_cost_for_tp:,.2f}")
print(f"  Net value (saved - costs): ${net_value:,.2f}")

# ROI as defined in this notebook: net value relative to misclassification
# cost; skipped when there is no misclassification cost to divide by.
if total_misclassification_cost > 0:
    roi = (net_value / total_misclassification_cost) * 100
    print(f"\n📈 Return on Investment (ROI): {roi:.1f}%")


## Performance by segments (Fairness/Bias Analysis)

In [0]:
%python
print("\n" + "="*60)
print("FAIRNESS / BIAS ANALYSIS")
print("="*60)

# Ensure we have the necessary columns
if "hotel" in predictions.columns:
    # Performance by hotel type (hotel_perf is reused by the deployment
    # checklist cell)
    print("\nPerformance by Hotel Type:")
    hotel_perf = predictions.groupBy("hotel").agg(
        count("*").alias("total_bookings"),
        avg("churn").alias("actual_cancel_rate"),
        avg("predicted_label").alias("predicted_cancel_rate"),
        avg("prediction_correct").alias("accuracy"),
        sum("churn").alias("actual_cancellations"),
        sum("predicted_label").alias("predicted_cancellations")
    ).orderBy("total_bookings", ascending=False)

    hotel_perf.show(truncate=False)

# Performance by lead time groups.
# A numeric helper column carries the bucket order: sorting by the label
# itself is lexicographic and would place "180+ days" before "31-90 days".
print("\nPerformance by Lead Time Group:")
lead_time_bucketed = predictions.withColumn(
    "lead_time_group",
    when(col("lead_time") <= 7, "0-7 days")
    .when(col("lead_time") <= 30, "8-30 days")
    .when(col("lead_time") <= 90, "31-90 days")
    .when(col("lead_time") <= 180, "91-180 days")
    .otherwise("180+ days")
).withColumn(
    "lead_time_order",
    when(col("lead_time") <= 7, 1)
    .when(col("lead_time") <= 30, 2)
    .when(col("lead_time") <= 90, 3)
    .when(col("lead_time") <= 180, 4)
    .otherwise(5)
)

lead_time_perf = lead_time_bucketed.groupBy("lead_time_order", "lead_time_group").agg(
    count("*").alias("total_bookings"),
    avg("churn").alias("actual_cancel_rate"),
    avg("predicted_label").alias("predicted_cancel_rate"),
    avg("prediction_correct").alias("accuracy")
).orderBy("lead_time_order").drop("lead_time_order")

lead_time_perf.show(truncate=False)

# Performance by deposit type (if available)
if "deposit_type" in predictions.columns:
    print("\nPerformance by Deposit Type:")
    deposit_perf = predictions.groupBy("deposit_type").agg(
        count("*").alias("total_bookings"),
        avg("churn").alias("actual_cancel_rate"),
        avg("predicted_label").alias("predicted_cancel_rate"),
        avg("prediction_correct").alias("accuracy")
    ).orderBy("total_bookings", ascending=False)

    deposit_perf.show(truncate=False)

## Model calibration analysis

In [0]:
%python
print("\n" + "="*60)
print("MODEL CALIBRATION ANALYSIS")
print("="*60)

# Rows without a probability are excluded first: every `when` comparison on a
# null evaluates to null, so they would otherwise fall through to the
# "0.9-1.0" bin and distort it.
calibration_input = predictions.filter(col("churn_probability").isNotNull())

# Create probability bins (ten equal-width bins over [0, 1])
calibration_df = calibration_input.withColumn(
    "probability_bin",
    when(col("churn_probability") < 0.1, "0.0-0.1")
    .when(col("churn_probability") < 0.2, "0.1-0.2")
    .when(col("churn_probability") < 0.3, "0.2-0.3")
    .when(col("churn_probability") < 0.4, "0.3-0.4")
    .when(col("churn_probability") < 0.5, "0.4-0.5")
    .when(col("churn_probability") < 0.6, "0.5-0.6")
    .when(col("churn_probability") < 0.7, "0.6-0.7")
    .when(col("churn_probability") < 0.8, "0.7-0.8")
    .when(col("churn_probability") < 0.9, "0.8-0.9")
    .otherwise("0.9-1.0")
)

# Per-bin calibration: observed cancel rate minus mean predicted probability
calibration_stats = calibration_df.groupBy("probability_bin").agg(
    count("*").alias("count"),
    avg("churn_probability").alias("avg_predicted_prob"),
    avg("churn").alias("actual_cancel_rate"),
    (avg("churn") - avg("churn_probability")).alias("calibration_error")
).orderBy("probability_bin")

print("Model Calibration by Probability Bins:")
calibration_stats.show(truncate=False)

# Overall calibration error: per-bin absolute error weighted by bin size.
# The result is None when there are no rows to aggregate.
calibration_error = calibration_stats.select(
    (sum(abs(col("calibration_error") * col("count"))) / sum(col("count"))).alias("mean_abs_calibration_error")
).first()[0]

if calibration_error is not None:
    print(f"\n Mean Absolute Calibration Error: {calibration_error:.4f}")
else:
    # Formatting None with :.4f would raise a TypeError
    print("\n Mean Absolute Calibration Error: Not available")
print("   • < 0.01: Excellent calibration")
print("   • 0.01-0.03: Good calibration")
print("   • 0.03-0.05: Moderate calibration")
print("   • > 0.05: Poor calibration")

## Create evaluation summary

In [0]:
%python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

print("\n" + "="*60)
print("EVALUATION SUMMARY")
print("="*60)


def _optional_float(value):
    """Convert to float, passing None through unchanged."""
    return None if value is None else float(value)


# One-row summary of everything computed by the earlier cells
evaluation_summary = {
    "evaluation_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model_source": model_source,
    "test_set_size": int(total),
    "test_auc": _optional_float(test_auc),
    "test_auprc": _optional_float(test_auprc),
    "accuracy": _optional_float(accuracy),
    "precision": _optional_float(precision),
    "recall": _optional_float(recall),
    "f1_score": _optional_float(f1_score),
    "true_positives": int(TP),
    "true_negatives": int(TN),
    "false_positives": int(FP),
    "false_negatives": int(FN),
    "optimal_threshold": _optional_float(optimal_threshold),
    "calibration_error": _optional_float(calibration_error),
    "total_misclassification_cost": _optional_float(total_misclassification_cost),
    "potential_net_value": _optional_float(net_value),
}

# Explicit schema (all fields nullable) so Spark does not have to infer types
# from values that may be None
summary_field_types = [
    ("evaluation_timestamp", StringType()),
    ("model_source", StringType()),
    ("test_set_size", IntegerType()),
    ("test_auc", DoubleType()),
    ("test_auprc", DoubleType()),
    ("accuracy", DoubleType()),
    ("precision", DoubleType()),
    ("recall", DoubleType()),
    ("f1_score", DoubleType()),
    ("true_positives", IntegerType()),
    ("true_negatives", IntegerType()),
    ("false_positives", IntegerType()),
    ("false_negatives", IntegerType()),
    ("optimal_threshold", DoubleType()),
    ("calibration_error", DoubleType()),
    ("total_misclassification_cost", DoubleType()),
    ("potential_net_value", DoubleType()),
]
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in summary_field_types
])

# Convert to DataFrame with explicit schema
summary_df = spark.createDataFrame([evaluation_summary], schema=schema)

# Append to the Delta table; fall back to a temp view if the write fails
try:
    summary_writer = summary_df.write.format("delta").mode("append")
    summary_writer.saveAsTable("hotel_catalog.gold.model_evaluation_summary")
    print(" Evaluation summary saved to hotel_catalog.gold.model_evaluation_summary")
except Exception as e:
    print(f"Could not save to Delta table: {e}")
    summary_df.createOrReplaceTempView("model_evaluation_summary_temp")
    print("   Created temp view: model_evaluation_summary_temp")

# Display summary: ratio metrics with 4 decimals, monetary values as currency,
# everything else (including None) with its default string form
ratio_keys = ["test_auc", "test_auprc", "accuracy", "precision", "recall", "f1_score", "calibration_error"]
money_keys = ["total_misclassification_cost", "potential_net_value"]

print("\n Evaluation Summary:")
print("-" * 50)
for field_name, field_value in evaluation_summary.items():
    if isinstance(field_value, float) and field_name in ratio_keys:
        rendered = f"{field_value:.4f}"
    elif isinstance(field_value, float) and field_name in money_keys:
        rendered = f"${field_value:,.2f}"
    else:
        rendered = f"{field_value}"
    print(f"{field_name:30s}: {rendered}")

## Generate deployment recommendations

In [0]:
%python
print("\n" + "="*60)
print("DEPLOYMENT RECOMMENDATIONS")
print("="*60)

# Create deployment checklist
checklist_items = []

# Model performance checklist
if test_auc is not None:
    if test_auc >= 0.8:
        checklist_items.append("AUC-ROC >= 0.8 (Excellent)")
    elif test_auc >= 0.7:
        checklist_items.append("AUC-ROC >= 0.7 (Good)")
    else:
        checklist_items.append(f"AUC-ROC {test_auc:.3f} (Needs improvement)")
else:
    checklist_items.append("AUC-ROC not available (metrics could not be computed)")

if calibration_error is not None:
    if calibration_error < 0.03:
        checklist_items.append("Good calibration (< 0.03)")
    else:
        checklist_items.append(f"Calibration error {calibration_error:.3f}")
else:
    checklist_items.append("Calibration error not available")

if net_value is not None:
    if net_value > 0:
        checklist_items.append(f"Positive business value: ${net_value:,.2f}")
    else:
        checklist_items.append(f"Negative business value: ${net_value:,.2f}")
else:
    checklist_items.append("Business value not available")

# Bias/fairness checklist: spread of per-hotel accuracy. hotel_perf is built
# in the fairness cell under this same "hotel" column condition. stddev is
# null when there are fewer than two hotel groups.
if "hotel" in predictions.columns:
    hotel_acc = hotel_perf.select(stddev("accuracy")).collect()[0][0]
    if hotel_acc is not None and hotel_acc < 0.05:
        checklist_items.append("Low performance variance across hotels")
    elif hotel_acc is not None:
        checklist_items.append(f"Performance varies across hotels (std: {hotel_acc:.3f})")
    else:
        checklist_items.append("Hotel accuracy variance not available")

print("\nDeployment Checklist:")
for item in checklist_items:
    print(f"  {item}")

# Recommendations
print("\n Recommendations:")
print("1. Model Performance:")
if test_auc is not None:
    print(f"   • Current AUC: {test_auc:.3f} - {'Ready for production' if test_auc >= 0.75 else 'Needs improvement'}")
else:
    print("   • Current AUC: Not available")
print(f"   • Optimal threshold: {optimal_threshold:.2f} (adjust based on business needs)")

print("\n2. Business Impact:")
if net_value is not None:
    print(f"   • Net value per {total:,} bookings: ${net_value:,.2f}")
else:
    print(f"   • Net value per {total:,} bookings: Not available")
if total_misclassification_cost is not None and net_value is not None and total_misclassification_cost > 0:
    print(f"   • ROI: {(net_value/total_misclassification_cost*100):.1f}%")
else:
    print(f"   • ROI: Not available")

print("\n3. Monitoring:")
print("   • Monitor model performance weekly")
print("   • Track calibration drift monthly")
print("   • Set up alerts for performance degradation")

# Risk bands match the risk_category thresholds written to the monitoring
# table (Low < 0.3, Medium < 0.7, High otherwise).
print("\n4. Actionable Insights:")
print("   • High-risk bookings (probability >= 0.7): Consider proactive retention")
print("   • Medium-risk (0.3-0.7): Standard monitoring")
print("   • Low-risk (< 0.3): No action needed")

## Create monitoring dataset

In [0]:
%python
print("\n" + "="*60)
print("MONITORING SETUP")
print("="*60)

# Batch identifier: timestamp of this evaluation run
batch_id = datetime.now().strftime("%Y%m%d_%H%M%S")

# Risk bucket derived from the churn probability:
# below 0.3 -> Low, below 0.7 -> Medium, anything else -> High
risk_category_col = (
    when(col("churn_probability") < 0.3, "Low")
    .when(col("churn_probability") < 0.7, "Medium")
    .otherwise("High")
)

# Dataset for monitoring: per-booking outcome, prediction and run metadata
monitoring_columns = [
    "hotel",
    "churn",
    "predicted_label",
    "churn_probability",
    "prediction_correct",
    current_date().alias("evaluation_date"),
    lit(batch_id).alias("evaluation_batch"),
]
monitoring_data = predictions.select(*monitoring_columns)
monitoring_data = monitoring_data.withColumn("risk_category", risk_category_col)

# Append to the date-partitioned Delta table; on failure expose a 1000-row
# sample as a temp view instead
try:
    monitoring_writer = (
        monitoring_data.write
        .format("delta")
        .mode("append")
        .partitionBy("evaluation_date")
    )
    monitoring_writer.saveAsTable("hotel_catalog.gold.model_monitoring_predictions")
    print("Monitoring data saved with partitioning by date")
except Exception as e:
    print(f"Could not save monitoring data: {e}")
    monitoring_sample = monitoring_data.limit(1000)
    monitoring_sample.createOrReplaceTempView("model_monitoring_sample")
    print("Created temp view: model_monitoring_sample")

# Per-risk-category volume, cancel rates and accuracy
summary_aggregates = [
    count("*").alias("count"),
    avg("churn").alias("actual_cancel_rate"),
    avg("predicted_label").alias("predicted_cancel_rate"),
    avg("prediction_correct").alias("accuracy"),
]
monitoring_summary = (
    monitoring_data
    .groupBy("risk_category")
    .agg(*summary_aggregates)
    .orderBy("risk_category")
)

print("\nRisk Category Summary:")
monitoring_summary.show(truncate=False)

## Generate SQL queries for dashboard

In [0]:
%python
print("\n" + "="*60)
print("DASHBOARD QUERIES")
print("="*60)

# SQL queries for monitoring dashboard.
# They only use columns that the monitoring cell writes to
# hotel_catalog.gold.model_monitoring_predictions: hotel, churn,
# predicted_label, churn_probability, prediction_correct, evaluation_date,
# evaluation_batch, risk_category. The label column there is `churn` (not
# `is_canceled`), and `lead_time` is not stored, so query 3 no longer derives
# an estimated arrival date from it.
dashboard_queries = """
-- 1. Daily Performance Metrics
SELECT 
    evaluation_date,
    COUNT(*) as total_predictions,
    AVG(churn) as actual_cancel_rate,
    AVG(predicted_label) as predicted_cancel_rate,
    AVG(prediction_correct) as accuracy,
    SUM(CASE WHEN predicted_label = 1 AND churn = 1 THEN 1 ELSE 0 END) as true_positives,
    SUM(CASE WHEN predicted_label = 1 AND churn = 0 THEN 1 ELSE 0 END) as false_positives
FROM hotel_catalog.gold.model_monitoring_predictions
GROUP BY evaluation_date
ORDER BY evaluation_date DESC;

-- 2. Risk Category Distribution
SELECT 
    risk_category,
    COUNT(*) as count,
    COUNT(*) * 100.0 / SUM(COUNT(*)) OVER () as percentage,
    AVG(churn_probability) as avg_probability,
    AVG(churn) as actual_cancel_rate
FROM hotel_catalog.gold.model_monitoring_predictions
WHERE evaluation_date = CURRENT_DATE()
GROUP BY risk_category
ORDER BY risk_category;

-- 3. High-Risk Bookings for Today
SELECT 
    hotel,
    churn_probability
FROM hotel_catalog.gold.model_monitoring_predictions
WHERE evaluation_date = CURRENT_DATE()
    AND risk_category = 'High'
    AND predicted_label = 1
ORDER BY churn_probability DESC
LIMIT 50;

-- 4. Model Performance Trend (Last 30 days)
SELECT 
    evaluation_date,
    AVG(prediction_correct) as accuracy,
    COUNT(*) as prediction_count
FROM hotel_catalog.gold.model_monitoring_predictions
WHERE evaluation_date >= DATE_ADD(CURRENT_DATE(), -30)
GROUP BY evaluation_date
ORDER BY evaluation_date;
"""

print("SQL Queries for Monitoring Dashboard:")
print(dashboard_queries)

## Final report

In [0]:
%python
print("\n" + "="*60)
print("EVALUATION REPORT")
print("="*60)

# Generate final report from the metrics computed in the earlier cells; each
# value falls back to 'Not available' (or 0 for ROI) when it is missing
report = f"""
HOTEL CHURN PREDICTION MODEL - EVALUATION REPORT
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

1. EXECUTIVE SUMMARY
   • Model Performance: {'Production Ready' if test_auc is not None and test_auc >= 0.75 else 'Needs Improvement' if test_auc is not None else 'Not available'}
   • Business Value: {f'${net_value:,.2f} potential net value' if net_value is not None else 'Not available'}
   • ROI: {((net_value/total_misclassification_cost)*100 if total_misclassification_cost is not None and net_value is not None and total_misclassification_cost > 0 else 0):.1f}%

2. MODEL PERFORMANCE
   • AUC-ROC: {f'{test_auc:.4f}' if test_auc is not None else 'Not available'} {'(Excellent)' if test_auc is not None and test_auc >= 0.8 else '(Good)' if test_auc is not None and test_auc >= 0.7 else '(Needs Improvement)' if test_auc is not None else ''}
   • Precision: {f'{precision:.4f}' if precision is not None else 'Not available'}
   • Recall: {f'{recall:.4f}' if recall is not None else 'Not available'}
   • F1-Score: {f'{f1_score:.4f}' if f1_score is not None else 'Not available'}
   • Accuracy: {f'{accuracy:.4f}' if accuracy is not None else 'Not available'}

3. BUSINESS IMPACT
   • High-risk bookings identified: {TP + FP if TP is not None and FP is not None else 'Not available'}
   • Actual cancellations predicted correctly: {TP if TP is not None else 'Not available'}
   • False Alarms (False Positives): {FP if FP is not None else 'Not available'}
   • Missed Cancellations (False Negatives): {FN if FN is not None else 'Not available'}
   • Net Business Value: {f'${net_value:,.2f}' if net_value is not None else 'Not available'}

4. RECOMMENDATIONS
   • Use threshold {f'{optimal_threshold:.2f}' if optimal_threshold is not None else 'Not available'} for production
   • Monitor weekly performance metrics
   • Focus retention efforts on High-risk category (> 0.7 probability)
   • Review False Positives to refine model

5. MONITORING METRICS TO TRACK
   • Daily AUC degradation (< 0.02 drop)
   • Calibration error (< 0.03)
   • Business value trend (should be positive)
   • Prediction volume by risk category
"""

print(report)

# Save report to file (in Databricks FileStore).
# Python's open() uses the driver's local filesystem, so DBFS locations have
# to be addressed through the /dbfs mount; a bare "/FileStore/..." path does
# not exist locally and made this save fail silently on every run.
# NOTE(review): the /dbfs mount may be unavailable on some compute types
# (e.g. serverless); the failure is then reported below.
report_path = f"/dbfs/FileStore/tables/hotel_churn_evaluation_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
try:
    # The report contains non-ASCII bullet characters, so pin the encoding
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"\n Report saved to: {report_path}")
except OSError as save_error:
    # Saving is best-effort: report the reason instead of hiding it
    print(f"\n Could not save report to file: {save_error}")

print("\n" + "="*60)
print("EVALUATION COMPLETE!")
print("="*60)
print("\n Model evaluation completed successfully!")
print("  Next steps:")
print("   1. Review the evaluation summary")
print("   2. Implement monitoring dashboard")
print("   3. Deploy model with optimal threshold")
print("   4. Set up automated retraining pipeline")