# Part 3: The ML Capstone Pipeline

**Objective**: Build, train, and track a complete, production-style ML pipeline using MLlib and MLflow.


In [None]:
# Setup: Import required libraries
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
import mlflow

# Read the TPC-H customer and orders tables (Parquet) from the Databricks sample datasets
customers_df = spark.read.format("parquet").load("/databricks-datasets/tpch/data-001/customer.parquet")
orders_df = spark.read.format("parquet").load("/databricks-datasets/tpch/data-001/orders.parquet")


## Module 3.1: Feature Engineering Pipeline

**Goal**: Convert raw data into ML-ready "features" vectors.


In [None]:
# Build the base ML table by matching each customer with their orders.
# The inner join keeps only rows where the customer key is present on both sides.
ml_data = (
    customers_df
    .join(orders_df, on=customers_df.c_custkey == orders_df.o_custkey, how="inner")
    # Discard rows that are missing the order total or the account balance
    .na.drop(subset=["o_totalprice", "c_acctbal"])
)


In [None]:
# Derive the calendar month of the order date as a numeric feature
order_month = month(col("o_orderdate"))
ml_data = ml_data.withColumn("month", order_month)

# Preview the columns used for ML
preview_columns = ["c_acctbal", "c_mktsegment", "o_totalprice", "month"]
ml_data.select(*preview_columns).show(5)


### Step 1: StringIndexer (for categorical columns)

Converts categorical text into numeric indices that ML models can use.


In [None]:
# StringIndexer for the single categorical column, c_mktsegment
market_segment_indexer = StringIndexer(inputCol="c_mktsegment", outputCol="market_segment_index")

# fit() learns the string -> index mapping; transform() appends the index column
segment_indexer_model = market_segment_indexer.fit(ml_data)
ml_data = segment_indexer_model.transform(ml_data)

# Compare the raw segment values with their assigned indices
ml_data.select("c_mktsegment", "market_segment_index").show(5)


### Step 2: VectorAssembler (combine all features)

Combines all feature columns into a single vector that ML models require.


In [None]:
# The three model inputs: account balance, indexed market segment, order month
feature_columns = ["c_acctbal", "market_segment_index", "month"]

# VectorAssembler packs the input columns into one vector column named "features"
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Append the assembled vector column to the data
ml_data_vectorized = assembler.transform(ml_data)

# Inspect the assembled vectors next to the order total, without truncating the output
ml_data_vectorized.select("features", "o_totalprice").show(n=5, truncate=False)


### Step 3: Build the Pipeline

A Pipeline chains multiple transformers together for easy reuse.


In [None]:
# Chain the feature-engineering stages so they can be fit and applied as one unit:
# first index the market segment, then assemble the feature vector.
pipeline = Pipeline(stages=[
    market_segment_indexer,
    assembler
])

print("✓ Feature engineering pipeline created!")


## Module 3.2: Model Training & MLflow

**Goal**: Train a model to predict an order's total price (`o_totalprice`) and track it automatically.


In [None]:
# Keep only the feature vector and the target, renaming o_totalprice to "label"
# (this is a regression problem: the label is the order total)
model_data = (
    ml_data_vectorized
    .select("features", "o_totalprice")
    .withColumnRenamed("o_totalprice", "label")
)

# Seeded 80/20 train/test split
splits = model_data.randomSplit(weights=[0.8, 0.2], seed=42)
train_df, test_df = splits

print(f"Training set: {train_df.count():,} orders")
print(f"Test set: {test_df.count():,} orders")


In [None]:
# Random forest regressor: reads the "features" vector column and learns to predict "label"
rf = RandomForestRegressor(featuresCol="features", labelCol="label", numTrees=10, maxDepth=5, seed=42)

# End-to-end pipeline: segment indexing -> vector assembly -> model
full_pipeline_stages = [market_segment_indexer, assembler, rf]
full_pipeline = Pipeline(stages=full_pipeline_stages)


### 🎯 MLflow: Automatic Experiment Tracking

MLflow automatically tracks:
- Model parameters
- Training metrics
- Model artifacts
- Code version


In [None]:
# Imported up front so the tracked run below contains only training and logging work
from pyspark.ml.evaluation import RegressionEvaluator

# Select the raw columns only: full_pipeline does the indexing and vector
# assembly itself, so its input must not already contain those output columns.
train_data = ml_data.select(
    "c_acctbal",
    "c_mktsegment",
    "month",
    col("o_totalprice").alias("label")
)

# Split data (80% train, 20% test)
train_df, test_df = train_data.randomSplit([0.8, 0.2], seed=42)

# RMSE between the "label" and "prediction" columns
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse"
)

# Train model with MLflow tracking
with mlflow.start_run(run_name="TPCH_Order_Value_Model"):
    # Fit the pipeline (handles all transformations + training)
    print("⏳ Training model...")
    pipeline_model = full_pipeline.fit(train_df)

    # Score the held-out test split
    predictions = pipeline_model.transform(test_df)

    # Calculate RMSE on the test predictions
    rmse = evaluator.evaluate(predictions)

    # Log the metric and the fitted pipeline to the active run
    mlflow.log_metric("rmse", rmse)
    mlflow.spark.log_model(pipeline_model, "my_tpch_order_value_model")

    print(f"✓ Model trained! RMSE: {rmse:.2f}")


### 🎉 "Wow" Moment: View Your Experiment

**Check the Experiments tab**:
1. Click **Experiments** in the left sidebar (Databricks)
2. Find your run "TPCH_Order_Value_Model"
3. See your tracked metrics and model artifact!

This is production-grade ML tracking - automatically!


## Module 3.3: Save & Load

**Goal**: Prove the pipeline is a real, reusable "artifact."


In [None]:
# Look up the most recent successfully finished run of our model.
# An unfiltered search_runs() may return a different run in row 0 (another run
# name, or a failed attempt), so filter on run name and status and order
# explicitly by start time.
runs = mlflow.search_runs(
    filter_string=(
        "tags.mlflow.runName = 'TPCH_Order_Value_Model' "
        "and attributes.status = 'FINISHED'"
    ),
    order_by=["attributes.start_time DESC"],
    max_results=1,
)

# Fail with a clear message instead of an IndexError when nothing matches
if runs.empty:
    raise RuntimeError(
        "No finished MLflow run named 'TPCH_Order_Value_Model' was found; "
        "run the training cell first."
    )

latest_run = runs.iloc[0]
run_id = latest_run['run_id']
print(f"Run ID: {run_id}")


In [None]:
# Build the runs:/ URI from the run id and the artifact path the model was logged under,
# then load the fitted pipeline back from MLflow
model_uri = f"runs:/{run_id}/my_tpch_order_value_model"
loaded_model = mlflow.spark.load_model(model_uri)

print("✓ Model loaded from MLflow!")


In [None]:
# Score a 100-row sample with the loaded pipeline. It takes the raw feature
# columns as input; "label" is carried along only for comparison.
raw_feature_columns = ["c_acctbal", "c_mktsegment", "month"]
new_data = ml_data.select(
    *raw_feature_columns,
    col("o_totalprice").alias("label")
).limit(100)

predictions = loaded_model.transform(new_data)

# Show actual vs. predicted order totals
predictions.select("label", "prediction").show(n=10)


### 🎯 Key Takeaways

1. **Feature Engineering**: Transform raw data into ML features
2. **Pipelines**: Chain transformations for reusability
3. **MLflow**: Automatic experiment tracking and model versioning
4. **Model Persistence**: Save and load models for production use
