# Machine Learning Experiment

## Import Modules

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum, col 
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

import mlflow
import mlflow.spark
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

## Data Preparation

In [0]:
# Get the Spark session for this notebook: getOrCreate() reuses an existing
# session when one is available, otherwise it starts one with this app name.
session_builder = SparkSession.builder.appName("SalesPredictionWithMLflow")
spark = session_builder.getOrCreate()

In [0]:
# Read the published sales dataset (Parquet files) from DBFS
sales_df = spark.read.parquet("dbfs:/FileStore/salesdata/published")

In [0]:
# Revenue contributed by a single sales row
line_revenue = col("Price") * col("Quantity")

# Total revenue per city and calendar month; this is the regression target
aggregated_df = sales_df.groupBy("City", "ReportYear", "ReportMonth").agg(
    spark_sum(line_revenue).alias("TotalSales")
)

aggregated_df.show(10)

+-----------+----------+-----------+----------+
|       City|ReportYear|ReportMonth|TotalSales|
+-----------+----------+-----------+----------+
|Los Angeles|      2019|         12| 684044.84|
|     Boston|      2019|         12| 509599.16|
|     Austin|      2019|          4| 172683.59|
|     Austin|      2019|         10| 203196.12|
|     Austin|      2019|         12| 233777.09|
|     Dallas|      2019|          4| 251360.48|
|     Dallas|      2019|         10| 323135.60|
|   Portland|      2019|         12| 303714.11|
|   Portland|      2019|          4| 239978.12|
|   Portland|      2019|         10| 254100.86|
+-----------+----------+-----------+----------+
only showing top 10 rows



## Feature Engineering

In [0]:
# Stage 1: map each city name to a numeric index.
# NOTE(review): no handleInvalid option is set on the indexer or the encoder,
# so how a city that was absent at fit time is treated depends on the library
# defaults -- confirm before scoring cities outside the training data.
city_indexer = StringIndexer(inputCol="City", outputCol="CityIndex")

# Stage 2: one-hot encode (OHE) the city index
city_encoder = OneHotEncoder(inputCols=["CityIndex"], outputCols=["CityOHE"])

# Stage 3: combine the encoded city with the year and month into one vector
feature_columns = ["CityOHE", "ReportYear", "ReportMonth"]
feature_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

## Model Training and Tracking with MLflow

In [0]:
# Hold out roughly 20% of the aggregated rows for testing (fixed seed)
split_weights = [0.8, 0.2]
train_df, test_df = aggregated_df.randomSplit(split_weights, seed=42)

In [0]:
# Enable MLflow autologging for pyspark.ml estimators.
# log_models=False: the training cell already saves the fitted pipeline
# explicitly with mlflow.spark.log_model(..., "sales_prediction_model").
# With autolog's model logging left on, the run output shows the slow
# "Inferring pip requirements" step happening twice, i.e. the pipeline is
# stored a second time for no benefit.
mlflow.pyspark.ml.autolog(log_models=False)

# Define the Linear Regression model: predicts TotalSales from the assembled
# feature vector and writes its output to the "prediction" column
lr = LinearRegression(featuresCol="features", labelCol="TotalSales", predictionCol="prediction")

# Create the ML Pipeline: index city -> one-hot encode -> assemble -> regress
pipeline = Pipeline(stages=[city_indexer, city_encoder, feature_assembler, lr])

In [0]:
# Select the MLflow experiment that subsequent runs are recorded under
experiment_name = "/Users/cesar-martinez1@northwestern.edu/sales-prediction-experiment"
mlflow.set_experiment(experiment_name=experiment_name)

2025/03/17 15:39:41 INFO mlflow.tracking.fluent: Experiment with name '/Users/cesar-martinez1@northwestern.edu/sales-prediction-experiment' does not exist. Creating a new experiment.


Out[12]: <Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3642522158388303', creation_time=1742225981386, experiment_id='3642522158388303', last_update_time=1742225981386, lifecycle_stage='active', name='/Users/cesar-martinez1@northwestern.edu/sales-prediction-experiment', tags={'mlflow.experiment.sourceName': '/Users/cesar-martinez1@northwestern.edu/sales-prediction-experiment',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'cesar-martinez1@northwestern.edu',
 'mlflow.ownerId': '3846535268250455'}>

In [0]:
with mlflow.start_run():

    # Fit every pipeline stage (indexer, encoder, assembler, regression) on the training split
    model = pipeline.fit(train_df)

    # Score the held-out split
    predictions = model.transform(test_df)

    # One evaluator per metric, each comparing TotalSales with the prediction column
    evaluator_rmse, evaluator_mae, evaluator_r2 = (
        RegressionEvaluator(labelCol="TotalSales", predictionCol="prediction", metricName=metric)
        for metric in ("rmse", "mae", "r2")
    )

    rmse = evaluator_rmse.evaluate(predictions)
    mae = evaluator_mae.evaluate(predictions)
    r2 = evaluator_r2.evaluate(predictions)

    # Record the test-set metrics on the active MLflow run
    for metric_key, metric_value in (("RMSE", rmse), ("MAE", mae), ("R-squared", r2)):
        mlflow.log_metric(metric_key, metric_value)

    # Store the fitted pipeline as a run artifact named "sales_prediction_model"
    mlflow.spark.log_model(model, "sales_prediction_model")

    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.2f}")

2025/03/17 15:52:31 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().
2025/03/17 15:54:01 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


RMSE: 69265.94
MAE: 60073.23
R²: 0.84


In [0]:
# Preview the first few test-set predictions next to the actual totals
preview_columns = ["City", "ReportYear", "ReportMonth", "TotalSales", "prediction"]
predictions.select(*preview_columns).show(5)

+-------+----------+-----------+----------+------------------+
|   City|ReportYear|ReportMonth|TotalSales|        prediction|
+-------+----------+-----------+----------+------------------+
|Atlanta|      2019|          3| 231905.38|173040.52566999197|
|Atlanta|      2019|          7| 211766.47|232318.95373147726|
|Atlanta|      2019|          9| 171278.89| 261958.1677621603|
| Austin|      2019|          2| 108787.40|130581.04956585169|
| Austin|      2019|          8| 125713.61|219498.69165802002|
+-------+----------+-----------+----------+------------------+
only showing top 5 rows



## Make Future Predictions with Saved Model

In [0]:
# Build the rows to score: one (city, year, month) tuple per city for February 2020
cities_to_score = ("New York City", "San Francisco", "Los Angeles")
new_rows = [(city, 2020, 2) for city in cities_to_score]
new_columns = ["City", "ReportYear", "ReportMonth"]
new_data = spark.createDataFrame(new_rows, new_columns)

new_data.show()

+-------------+----------+-----------+
|         City|ReportYear|ReportMonth|
+-------------+----------+-----------+
|New York City|      2020|          2|
|San Francisco|      2020|          2|
|  Los Angeles|      2020|          2|
+-------------+----------+-----------+



In [0]:
# Load the saved model from MLflow.
# A hard-coded run ID would keep loading the model from one past run, even
# after the notebook has been re-run and a new model trained. Resolve the ID
# from the most recent run in this session instead; fall back to the originally
# recorded run only when no run has been executed in this session.
FALLBACK_RUN_ID = "9065070942034d3089dc9aab3eeb5094"
last_run = mlflow.last_active_run()
run_id = last_run.info.run_id if last_run is not None else FALLBACK_RUN_ID

logged_model = f"runs:/{run_id}/sales_prediction_model"
loaded_model = mlflow.spark.load_model(logged_model)

2025/03/17 17:29:08 INFO mlflow.spark: 'runs:/9065070942034d3089dc9aab3eeb5094/sales_prediction_model' resolved as 'dbfs:/databricks/mlflow-tracking/3642522158388303/9065070942034d3089dc9aab3eeb5094/artifacts/sales_prediction_model'


In [0]:
# Score the new rows with the loaded pipeline model and show the predictions
new_predictions = loaded_model.transform(new_data)
output_columns = ["City", "ReportYear", "ReportMonth", "prediction"]
new_predictions.select(output_columns).show()

+-------------+----------+-----------+------------------+
|         City|ReportYear|ReportMonth|        prediction|
+-------------+----------+-----------+------------------+
|New York City|      2020|          2|  72888.5575813055|
|San Francisco|      2020|          2|334278.54314494133|
|  Los Angeles|      2020|          2|128456.74791198969|
+-------------+----------+-----------+------------------+

