In [0]:
# Cell 0: Load the gold table into pandas and create a train/test split
from sklearn.model_selection import train_test_split

# Project the two modelling columns and drop incomplete rows on the Spark
# side, then collect the result to the driver as a pandas DataFrame.
df = (
    spark.table("workspace.default.gold_product_performance")
    .select("purchases", "revenue")
    .dropna()
    .toPandas()
)

# Double brackets keep X a one-column DataFrame; y is the target Series.
X, y = df[["purchases"]], df["revenue"]

# Hold out 20% of the rows for scoring; random_state is pinned so the
# shuffle is repeatable for the same input rows in the same order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Cell 1: Imports
import mlflow
# Flavor modules are imported explicitly so a misspelled or missing flavor
# fails here, in the imports cell, rather than deep inside a training run.
import mlflow.sklearn  # used by mlflow.sklearn.log_model in the training loop
import mlflow.spark  # used by mlflow.spark.log_model for the Spark pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [0]:
# Cell 2: Model definitions
# Candidate regressors keyed by the name used later as the MLflow run name
# and as the key in `results`. Insertion order is the training order.
models = dict(
    linear_regression=LinearRegression(),
    decision_tree=DecisionTreeRegressor(max_depth=5, random_state=42),
    random_forest=RandomForestRegressor(n_estimators=100, random_state=42),
)

In [0]:
# Train, score and log every candidate model as its own MLflow run.
mlflow.set_experiment("/Shared/IDC_Day13_MultiModel_Comparison")

# Derive the logged feature name(s) from the training frame itself so the
# tracked parameter always matches the data the model was fitted on.
# (A hard-coded "UnitsSold" did not correspond to any column of X_train.)
feature_names = ", ".join(map(str, X_train.columns))

# Maps model name -> R2 on the held-out test split; read by later cells.
results = {}
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        mlflow.log_param("model_type", name)
        mlflow.log_param("feature", feature_names)

        model.fit(X_train, y_train)

        # For sklearn regressors, .score() returns the coefficient of
        # determination (R2) of the predictions on the given data.
        r2 = model.score(X_test, y_test)
        mlflow.log_metric("r2_score", r2)

        # A few training rows are attached as an input example for the
        # logged model.
        mlflow.sklearn.log_model(
            model,
            artifact_path="model",
            input_example=X_train.head(5)
        )

        results[name] = r2
        print(f"{name}: R2 = {r2:.4f}")

In [0]:
# Cell 5: Select best model
# Pick the (name, score) pair with the highest R2. On ties the first entry
# in insertion order wins, and an empty `results` raises ValueError.
best_model, best_score = max(results.items(), key=lambda entry: entry[1])
print(f"Best model: {best_model} with R2 = {best_score:.4f}")

In [0]:
# Cell 6: Load Spark DataFrame
# Same table and columns as the pandas path, but kept as a Spark DataFrame
# (no .toPandas()) so it can feed the Spark ML pipeline.
spark_df = (
    spark.table("workspace.default.gold_product_performance")
    .select("purchases", "revenue")
    .dropna()
)

In [0]:
# Cell 7: PySpark ML imports
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

In [0]:
# Cell 8: Build Spark ML pipeline
# Stage 1: pack the single input column into the vector column "features".
assembler = VectorAssembler(inputCols=["purchases"], outputCol="features")

# Stage 2: linear regression reading that vector column, with "revenue"
# as the label.
lr = SparkLR(featuresCol="features", labelCol="revenue")

# Stages run in list order: assemble features first, then fit the regressor.
pipeline = Pipeline(stages=[assembler, lr])

In [0]:
# Cell 9: Train and predict with Spark ML
# NOTE(review): this 80/20 split is made independently of the sklearn
# train_test_split above, so the two R2 figures are not computed on the
# same held-out rows — keep that in mind when comparing them.
split_weights = [0.8, 0.2]
train_df, test_df = spark_df.randomSplit(split_weights, seed=42)

# Fit the assembler + regression pipeline on the training portion only.
spark_model = pipeline.fit(train_df)

# Score the held-out portion with the fitted pipeline.
predictions = spark_model.transform(test_df)


In [0]:
# Cell 10: Evaluate Spark model
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the "prediction" column against the "revenue" label using R2,
# the same metric recorded for the sklearn models.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="revenue",
    predictionCol="prediction",
)

spark_r2 = evaluator.evaluate(predictions)
print(f"Spark Linear Regression R2 = {spark_r2:.4f}")


In [0]:
%sql
-- Cell 11: Create MLflow volume
-- Creates the volume workspace.default.mlflow_tmp; IF NOT EXISTS makes the
-- statement safe to re-run. The Python cell that logs the Spark model passes
-- /Volumes/workspace/default/mlflow_tmp as dfs_tmpdir — presumably the path
-- of this volume, so this cell should run before that one (TODO confirm).
CREATE VOLUME IF NOT EXISTS workspace.default.mlflow_tmp;

In [0]:
# Cell 12: Log Spark model to MLflow
# The same identifier is used for the run name and the model_type param.
spark_run_name = "spark_linear_regression"
# Scratch directory handed to MLflow as dfs_tmpdir when it logs the model.
spark_tmp_dir = "/Volumes/workspace/default/mlflow_tmp"

with mlflow.start_run(run_name=spark_run_name):
    mlflow.log_param("model_type", spark_run_name)
    mlflow.log_metric("r2_score", spark_r2)

    # Log the fitted pipeline under "model".
    mlflow.spark.log_model(spark_model, "model", dfs_tmpdir=spark_tmp_dir)

In [0]:
# Cell 13: Final model comparison output
# \u00b2 is the superscript-two character. It is written as an escape so the
# heading cannot be corrupted by a file-encoding mismatch (it previously
# rendered as mojibake instead of "R²").
print("Final selection based on R\u00b2:")

# R2 of each sklearn model, in the order they were trained.
for k, v in results.items():
    print(f"{k}: {v:.4f}")

# R2 of the Spark ML linear regression, reported next to the sklearn scores.
print(f"Spark LR: {spark_r2:.4f}")
