# Notebook 04: Model Training (Serverless-Compatible)
Train models with MLflow tracking, handle class imbalance, and register the best model.

## Import libraries

In [0]:
%python
import mlflow
import mlflow.spark
from mlflow.tracking import MlflowClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, 
    GBTClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import numpy as np
import pandas as pd
from datetime import datetime
import getpass

## Initialize with optimization for Serverless

In [0]:
%python
# Obtain the notebook's Spark session with adaptive query execution enabled.
spark = (
    SparkSession.builder
    .appName("Hotel_Churn_Model_Training")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

# Tracking client; used further down to update the registered model version.
mlflow_client = MlflowClient()

## Set up MLflow experiment

In [0]:
%python
# Resolve the MLflow experiment this notebook logs into.
# Order of preference:
#   1. a per-user experiment under /Users/<login>,
#   2. a shared experiment if the login name cannot be determined,
#   3. a plain-named experiment if looking up / creating the first choice errors.
try:
    current_user = getpass.getuser()
    experiment_name = f"/Users/{current_user}/hotel_churn_prediction"
except Exception:
    # Only ordinary errors are handled; KeyboardInterrupt/SystemExit still propagate.
    experiment_name = "/Shared/hotel_churn_prediction"

print(f"Using experiment: {experiment_name}")

# Defined up front so the final check below never hits an unbound name.
experiment_id = None

try:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
        print(f"Created new experiment: {experiment_name}")
    else:
        experiment_id = experiment.experiment_id
        print(f"Using existing experiment: {experiment_name}")
except Exception as e:
    print(f"Error setting up experiment: {e}")
    # Retry with a simple name
    experiment_name = "hotel_churn_prediction"
    try:
        experiment_id = mlflow.create_experiment(experiment_name)
    except Exception:
        # Creation failed (presumably because the experiment already exists),
        # so fall back to looking it up by name.
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is not None:
            experiment_id = experiment.experiment_id
        else:
            print(f"Failed to create or find experiment: {experiment_name}")
            experiment_id = None

if experiment_id is not None:
    mlflow.set_experiment(experiment_name)
    print(f"MLflow Experiment ID: {experiment_id}")
else:
    print("MLflow Experiment ID could not be determined. Skipping set_experiment.")

## Load features from Gold layer

In [0]:
%python
print("\nLoading features from Gold layer...")

# Candidate Gold feature tables, tried in order; the first that loads is used.
feature_tables = [
    "hotel_catalog.gold.hotel_features_final",
    "hotel_catalog.gold.hotel_features_sql",
    "hotel_catalog.gold.hotel_bookings_features"
]

df = None
for table in feature_tables:
    try:
        df = spark.table(table)
        print(f"Loaded features from: {table}")
        break
    except Exception as load_err:
        # Say why the table was skipped instead of hiding the failure.
        print(f"Could not load {table} ({type(load_err).__name__}); trying next table")

if df is None:
    print("No feature tables found. Creating simple features...")
    from pyspark.ml.feature import VectorAssembler

    # Fallback: build a small feature set straight from the Silver table.
    silver_df = spark.table("hotel_catalog.silver.cleaned_hotel_bookings")

    # Base columns. The label must be named "churn" because every later cell
    # filters and trains on col("churn").
    df = silver_df.select(
        "hotel",
        "churn",
        "lead_time",
        "arrival_date_year",
        "arrival_date_month",
        "stays_in_weekend_nights",
        "stays_in_week_nights",
        "adults",
        col("children").cast("double").alias("children"),
        "babies",
        "previous_cancellations",
        "adr",
        "deposit_type",
        "customer_type"
    ).fillna(0)

    # Derived numeric features
    df = df.withColumn(
        "total_nights",
        col("stays_in_weekend_nights") + col("stays_in_week_nights")
    ).withColumn(
        "total_guests",
        col("adults") + col("children") + col("babies")
    ).withColumn(
        "is_weekend_stay",
        when(col("stays_in_weekend_nights") > 0, 1).otherwise(0)
    )

    # Manual integer encoding of the two categorical inputs
    df = df.withColumn(
        "hotel_code",
        when(col("hotel") == "Resort Hotel", 0).otherwise(1)
    ).withColumn(
        "deposit_code",
        when(col("deposit_type") == "No Deposit", 0)
        .when(col("deposit_type") == "Non Refund", 1)
        .otherwise(2)
    )

    feature_cols = [
        "lead_time", "total_nights", "total_guests",
        "previous_cancellations", "adr",
        "is_weekend_stay", "hotel_code", "deposit_code"
    ]

    # Fill any remaining nulls in the model inputs in one pass.
    df = df.fillna(0, subset=feature_cols)

    # handleInvalid="skip" drops rows that still contain invalid values.
    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features",
        handleInvalid="skip"
    )

    df = assembler.transform(df)

# Count once and reuse: each count() is a separate Spark job.
total_count = df.count()
print(f"Total records loaded: {total_count:,}")

# Check data quality
print("\n=== Data Quality Check ===")
cancel_count = df.filter(col("churn") == 1).count()
# Guard against an empty table so the rate does not raise ZeroDivisionError.
cancel_rate = cancel_count / total_count if total_count else 0.0

print(f"Total bookings: {total_count:,}")
print(f"Canceled bookings: {cancel_count:,}")
print(f"Cancelation rate: {cancel_rate:.2%}")

## Create train/validation/test splits

In [0]:
%python
print("\n=== Creating Train/Validation/Test Splits ===")

# Simple 70/15/15 random split with a fixed seed.
splits = df.randomSplit([0.7, 0.15, 0.15], seed=42)
train_df, val_df, test_df = splits[0], splits[1], splits[2]

# Count each split once and reuse the result; every count() is a Spark job.
train_count = train_df.count()
val_count = val_df.count()
test_count = test_df.count()

print(f"Training set: {train_count:,} records")
print(f"Validation set: {val_count:,} records")
print(f"Test set: {test_count:,} records")


def _positive_rate(split_df, n_rows):
    """Return the share of rows with churn == 1, or 0.0 for an empty split."""
    if n_rows == 0:
        return 0.0
    return split_df.filter(col("churn") == 1).count() / n_rows


# Check class distribution
train_pos_rate = _positive_rate(train_df, train_count)
val_pos_rate = _positive_rate(val_df, val_count)
test_pos_rate = _positive_rate(test_df, test_count)

print(f"\nClass distribution:")
print(f"  Train: {train_pos_rate:.2%} positive")
print(f"  Val: {val_pos_rate:.2%} positive")
print(f"  Test: {test_pos_rate:.2%} positive")

## Define evaluators

In [0]:
%python
print("\n=== Setting up Model Evaluation ===")

# Two binary evaluators over the "churn" label: area under ROC, then area under PR.
evaluator_auc, evaluator_pr = (
    BinaryClassificationEvaluator(labelCol="churn", metricName=metric)
    for metric in ("areaUnderROC", "areaUnderPR")
)


## Model 1 - Logistic Regression

In [0]:
%python
print("\n" + "="*60)
print("MODEL 1: Logistic Regression")
print("="*60)

with mlflow.start_run(run_name="LogisticRegression_Simple"):
    # Hyperparameters live in one dict so the logged values always match the
    # values passed to the estimator (the other model runs log theirs too).
    lr_hyperparams = {
        "max_iter": 20,
        "reg_param": 0.01,
        "elastic_net_param": 0.5
    }

    # Log parameters
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("seed", 42)
    mlflow.log_params(lr_hyperparams)

    # Create and train model (simplified for Serverless)
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="churn",
        maxIter=lr_hyperparams["max_iter"],
        regParam=lr_hyperparams["reg_param"],
        elasticNetParam=lr_hyperparams["elastic_net_param"]
    )

    print("Training Logistic Regression...")
    lr_model = lr.fit(train_df)

    # Score the validation split
    print("Making predictions...")
    val_predictions = lr_model.transform(val_df)

    # Ranking metrics
    val_auc = evaluator_auc.evaluate(val_predictions)
    val_pr = evaluator_pr.evaluate(val_predictions)

    # Accuracy = share of rows where the predicted label equals the true label
    val_accuracy = val_predictions.filter(
        col("prediction") == col("churn")
    ).count() / val_predictions.count()

    # Log metrics
    mlflow.log_metrics({
        "val_auc": val_auc,
        "val_auprc": val_pr,
        "val_accuracy": val_accuracy
    })

    # Log the model under the artifact path "model"; registration and test
    # evaluation load it from runs:/<run_id>/model.
    # NOTE(review): Unity Catalog volume paths normally look like
    # /Volumes/<catalog>/<schema>/<volume>/... - confirm this tmp dir is valid.
    mlflow.spark.log_model(lr_model, "model", dfs_tmpdir="/Volumes/mlflow/tmp")

    print(f"Validation AUC: {val_auc:.4f}")
    print(f"Validation AUPRC: {val_pr:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Start the comparison dict; later model cells add their own entries.
    model_results = {
        "LogisticRegression": {
            "val_auc": val_auc,
            "val_auprc": val_pr,
            "val_accuracy": val_accuracy,
            "run_id": mlflow.active_run().info.run_id
        }
    }

## Model 2 - Random Forest

In [0]:
%python
print("\n" + "="*60)
print("MODEL 2: Random Forest (Lightweight)")
print("="*60)

with mlflow.start_run(run_name="RandomForest_Light"):

    # Create lightweight model for Serverless
    rf = RandomForestClassifier(
        featuresCol="features",
        labelCol="churn",
        numTrees=30,  # Reduced for Serverless
        maxDepth=5,   # Reduced for Serverless
        seed=42
    )

    # Log parameters
    mlflow.log_params({
        "model_type": "RandomForest",
        "num_trees": 30,
        "max_depth": 5,
        "seed": 42
    })

    # Train (no cross-validation to save resources)
    print("Training Random Forest...")
    rf_model = rf.fit(train_df)

    # Evaluate on the validation split
    val_predictions = rf_model.transform(val_df)
    val_auc = evaluator_auc.evaluate(val_predictions)
    val_pr = evaluator_pr.evaluate(val_predictions)
    val_accuracy = val_predictions.filter(
        col("prediction") == col("churn")
    ).count() / val_predictions.count()

    # Log metrics
    mlflow.log_metrics({
        "val_auc": val_auc,
        "val_auprc": val_pr,
        "val_accuracy": val_accuracy
    })

    # Log the model under "model" (same artifact path and tmp dir as the
    # Logistic Regression run). Without this, runs:/<run_id>/model does not
    # exist, and registration / test evaluation fail when this model wins.
    mlflow.spark.log_model(rf_model, "model", dfs_tmpdir="/Volumes/mlflow/tmp")

    print(f"Validation AUC: {val_auc:.4f}")
    print(f"Validation AUPRC: {val_pr:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Store for comparison
    model_results["RandomForest"] = {
        "val_auc": val_auc,
        "val_auprc": val_pr,
        "val_accuracy": val_accuracy,
        "run_id": mlflow.active_run().info.run_id
    }

## Model 3 - Gradient Boosted Trees

In [0]:
%python
print("\n" + "="*60)
print("MODEL 3: Gradient Boosted Trees")
print("="*60)

with mlflow.start_run(run_name="GradientBoostedTrees"):

    # Create model with minimal iterations
    gbt = GBTClassifier(
        featuresCol="features",
        labelCol="churn",
        maxIter=20,  # Reduced for Serverless
        maxDepth=3,  # Reduced for Serverless
        stepSize=0.1,
        seed=42
    )

    # Log parameters (the seed is included because the estimator uses it)
    mlflow.log_params({
        "model_type": "GradientBoostedTrees",
        "max_iter": 20,
        "max_depth": 3,
        "step_size": 0.1,
        "seed": 42
    })

    # Train
    print("Training Gradient Boosted Trees...")
    gbt_model = gbt.fit(train_df)

    # Evaluate on the validation split
    val_predictions = gbt_model.transform(val_df)
    val_auc = evaluator_auc.evaluate(val_predictions)
    val_pr = evaluator_pr.evaluate(val_predictions)
    val_accuracy = val_predictions.filter(
        col("prediction") == col("churn")
    ).count() / val_predictions.count()

    # Log metrics
    mlflow.log_metrics({
        "val_auc": val_auc,
        "val_auprc": val_pr,
        "val_accuracy": val_accuracy
    })

    # Log the model under "model" (same artifact path and tmp dir as the
    # Logistic Regression run). Without this, runs:/<run_id>/model does not
    # exist, and registration / test evaluation fail when this model wins.
    mlflow.spark.log_model(gbt_model, "model", dfs_tmpdir="/Volumes/mlflow/tmp")

    print(f"Validation AUC: {val_auc:.4f}")
    print(f"Validation AUPRC: {val_pr:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Store for comparison
    model_results["GradientBoostedTrees"] = {
        "val_auc": val_auc,
        "val_auprc": val_pr,
        "val_accuracy": val_accuracy,
        "run_id": mlflow.active_run().info.run_id
    }

## Compare all models

In [0]:
%python
print("\n" + "="*60)
print("MODEL COMPARISON")
print("="*60)

# Print one row per trained model and remember the one with the highest
# validation AUC (strictly greater than the running best, starting from 0).
if 'model_results' not in locals():
    print("No model results to compare!")
    best_run_id = None
    best_model_name = None
else:
    divider = "-" * 70
    print("\nModel Performance Summary:")
    print(divider)
    print(f"{'Model':25s} | {'AUC':8s} | {'AUPRC':8s} | {'Accuracy':10s} | Run ID")
    print(divider)

    best_auc, best_model_name, best_run_id = 0, None, None

    for model_name, results in model_results.items():
        candidate_auc = results['val_auc']
        candidate_run = results['run_id']
        print(f"{model_name:25s} | {candidate_auc:.4f} | {results['val_auprc']:.4f} | {results['val_accuracy']:.4f} | {candidate_run}")

        if candidate_auc > best_auc:
            best_auc, best_model_name, best_run_id = candidate_auc, model_name, candidate_run

    print(divider)
    print(f"\n BEST MODEL: {best_model_name}")
    print(f"   Validation AUC: {best_auc:.4f}")
    print(f"   Run ID: {best_run_id}")


## Register the best model

In [0]:
%python
print("\n" + "="*60)
print("MODEL REGISTRATION")
print("="*60)

# Register the winning run's "model" artifact. If registration fails, fall back
# to logging the in-memory model to the same run under "best_model".
if best_run_id:
    model_name = "hotel_churn_predictor"

    try:
        # Register the best model
        model_uri = f"runs:/{best_run_id}/model"

        registered_model = mlflow.register_model(
            model_uri=model_uri,
            name=model_name
        )

        print(f"\n Model registered successfully!")
        print(f"   Model Name: {registered_model.name}")
        print(f"   Model Version: {registered_model.version}")

        # Adding a description is best-effort; a failure must not undo registration.
        try:
            mlflow_client.update_model_version(
                name=model_name,
                version=registered_model.version,
                description=f"Hotel booking churn predictor. Best model: {best_model_name} with AUC: {best_auc:.4f}"
            )
        except Exception as desc_err:
            print("   Could not update model description (Serverless limitation)")
            print(f"   Reason: {desc_err}")

        print(f"   Model URI: models:/{model_name}/{registered_model.version}")

    except Exception as e:
        print(f"Model registration failed: {e}")
        print("Trying alternative registration method...")

        # Alternative: attach the in-memory model to the run without the registry
        try:
            # Pick the trained model object that matches the winning name.
            if best_model_name == "LogisticRegression":
                best_model = lr_model
            elif best_model_name == "RandomForest":
                best_model = rf_model
            elif best_model_name == "GradientBoostedTrees":
                best_model = gbt_model
            else:
                # Do not silently substitute a different model for an unknown name.
                raise ValueError(f"Unknown model name: {best_model_name}")

            # Re-open the winning run and log the model as a run artifact.
            # log_model (not save_model) is what actually attaches it to the run.
            with mlflow.start_run(run_id=best_run_id):
                mlflow.spark.log_model(
                    best_model, "best_model", dfs_tmpdir="/Volumes/mlflow/tmp"
                )
                print(f"Model saved to MLflow run: {best_run_id}")
        except Exception as e2:
            print(f"Could not save model: {e2}")
else:
    print("No best model to register!")

## Evaluate on test set

In [0]:
%python
print("\n" + "="*60)
print("TEST SET EVALUATION")
print("="*60)

# Score the held-out test split with the winning model, print the metrics and a
# confusion matrix, and add the test metrics to the winning MLflow run.
if best_run_id:
    try:
        # Load the best model from MLflow
        model_uri = f"runs:/{best_run_id}/model"
        best_model = mlflow.spark.load_model(model_uri)

        # Make predictions on test set
        print("Evaluating on test set...")
        test_predictions = best_model.transform(test_df)

        # Calculate test metrics
        test_auc = evaluator_auc.evaluate(test_predictions)
        test_pr = evaluator_pr.evaluate(test_predictions)
        test_accuracy = test_predictions.filter(
            col("prediction") == col("churn")
        ).count() / test_predictions.count()

        print(f"\nTest Set Performance:")
        print(f"AUC-ROC: {test_auc:.4f}")
        print(f"AUC-PR: {test_pr:.4f}")
        print(f"Accuracy: {test_accuracy:.4f}")

        # Confusion matrix: rows are the true label, columns the predicted label.
        # The aggregated frame only has churn / prediction / count, so the pivot
        # must group on "churn".
        print("\nConfusion Matrix:")
        confusion = test_predictions.groupBy("churn", "prediction").count()
        confusion_pivot = confusion.groupBy("churn").pivot("prediction").sum("count").fillna(0)
        confusion_pivot.show()

        # Logging test metrics is best-effort; the metrics were already printed.
        try:
            with mlflow.start_run(run_id=best_run_id):
                mlflow.log_metrics({
                    "test_auc": test_auc,
                    "test_auprc": test_pr,
                    "test_accuracy": test_accuracy
                })
        except Exception as log_err:
            print("Could not log test metrics to MLflow run")
            print(f"Reason: {log_err}")

    except Exception as e:
        print(f"Test evaluation failed: {e}")
        print("Using last trained model for evaluation...")

        # Use the last trained model (GBT) if loading from MLflow fails
        if 'gbt_model' in locals():
            test_predictions = gbt_model.transform(test_df)
            test_auc = evaluator_auc.evaluate(test_predictions)
            print(f"Test AUC (using GBT): {test_auc:.4f}")

## Create model performance summary

In [0]:
%python
print("\n" + "="*60)
print("MODEL PERFORMANCE SUMMARY")
print("="*60)

# One record per trained model; stays empty when no model results exist.
summary_data = []

if 'model_results' in locals():
    summary_data = [
        {
            "model_name": model_name,
            "val_auc": float(results["val_auc"]),
            "val_auprc": float(results["val_auprc"]),
            "val_accuracy": float(results["val_accuracy"]),
            "run_id": results["run_id"],
            "training_timestamp": datetime.now()
        }
        for model_name, results in model_results.items()
    ]

if summary_data:
    summary_df = spark.createDataFrame(summary_data)

    # Append to the Gold Delta table; on failure expose a temp view instead.
    try:
        (
            summary_df.write
            .format("delta")
            .mode("append")
            .saveAsTable("hotel_catalog.gold.model_performance_summary")
        )
        print("Model performance summary saved to gold layer")
    except Exception as e:
        print(f"Could not save summary table: {e}")
        summary_df.createOrReplaceTempView("model_performance_summary_temp")
        print("Created temporary view: model_performance_summary_temp")

    # Plain-text table of the validation metrics
    print("\nModel Performance Summary:")
    summary_pd = pd.DataFrame(summary_data)
    metric_columns = ['model_name', 'val_auc', 'val_auprc', 'val_accuracy']
    print(summary_pd[metric_columns].to_string(index=False))

# Cell 13: Create simple prediction function
print("\n" + "="*60)
print("PREDICTION FUNCTION")
print("="*60)

# Template that is only printed, never executed here. It carries its own imports
# so it can be pasted into a fresh notebook. The positive-class probability is
# extracted with vector_to_array because the "probability" column is an ML
# vector, which cannot be indexed with SQL bracket syntax.
prediction_code = """
# Simple function to predict hotel booking cancellations
import mlflow
import mlflow.spark
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col, when

def predict_booking_churn(input_df):
    '''
    Predict churn for new hotel bookings
    
    Args:
        input_df: Spark DataFrame with the same features as training
        
    Returns:
        DataFrame with predictions
    '''
    # Load the model (adjust the URI as needed)
    model_uri = "YOUR_MODEL_URI_HERE"  # e.g., "runs:/<run_id>/model"
    model = mlflow.spark.load_model(model_uri)
    
    # Make predictions
    predictions = model.transform(input_df)
    
    # Add probability column
    predictions = predictions.withColumn(
        "churn_probability",
        vector_to_array(col("probability"))[1]  # Probability of class 1 (canceled)
    ).withColumn(
        "prediction_label",
        when(col("churn_probability") > 0.5, 1).otherwise(0)
    )
    
    return predictions

# Example usage:
# new_bookings = spark.read.table("new_hotel_bookings")
# predictions = predict_booking_churn(new_bookings)
# predictions.select("hotel", "lead_time", "churn_probability", "prediction_label").show()
"""

print("Prediction function template:")
print(prediction_code)

# Cell 14: Final summary
print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60)

# Resolve each value first, falling back to a placeholder when an earlier cell
# did not define it. The AUC is formatted only when it is a number: applying
# ':.4f' to the 'N/A' placeholder raises ValueError.
models_trained = len(model_results) if 'model_results' in locals() else 0
best_name_display = best_model_name if 'best_model_name' in locals() else 'N/A'
best_auc_display = f"{best_auc:.4f}" if 'best_auc' in locals() else 'N/A'
experiment_display = experiment_name if 'experiment_name' in locals() else 'N/A'

print("\n Model training completed successfully!")
print(f" Models trained: {models_trained}")
print(f" Best model: {best_name_display}")
print(f" Best validation AUC: {best_auc_display}")
print(f" MLflow Experiment: {experiment_display}")
print(f" Next step: Run Notebook 05 for detailed evaluation and business insights")