In [0]:
# ======================================
# Fabric Notebook 1: Regression Models (Enhanced)
# ======================================

from pyspark.sql.functions import col, lag, avg
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import mlflow
import mlflow.spark
from mlflow import MlflowClient
import plotly.graph_objects as go

# Optional XGBoost
# sparkxgb is an optional dependency: XGBoost is only added to the model set
# further down when this import succeeds.
try:
    from sparkxgb import XGBoostRegressor
except ImportError:
    # Only a missing/unimportable package should disable XGBoost; a bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    has_xgb = False
else:
    has_xgb = True

# ======================================
# MLflow setup
# ======================================
# All runs started below are logged under this experiment.
mlflow.set_experiment("Retail_ML_Experiments")
client = MlflowClient()
registry_name = "retail_ML_regression"

# Creating the registered model is best-effort: the notebook keeps going either
# way, but the message must reflect what actually happened.
try:
    client.create_registered_model(registry_name)
except Exception as exc:
    # MLflow exceptions carry a string `error_code`; only that specific code
    # means the registry is already there. Anything else (auth, network, bad
    # name, ...) is reported with its real cause instead of being mislabelled.
    if getattr(exc, "error_code", None) == "RESOURCE_ALREADY_EXISTS":
        print(f"ℹ️ Registry {registry_name} already exists")
    else:
        print(f"⚠️ Could not create registry {registry_name}: {exc}")
else:
    print(f"✅ Created registry {registry_name}")

# ======================================
# Load and prepare data
# ======================================
# Month_Num = Year*12 + Month gives a single increasing month index that is
# used both for ordering within a city and as a model feature.
df = spark.table("Gold_Sales_By_City_Month").withColumn(
    "Month_Num", col("Year") * 12 + col("Month")
)

# Lag features and a rolling average, computed per city in month order.
window = Window.partitionBy("City").orderBy("Month_Num")
# The rolling average must look only at *previous* rows. A frame ending at the
# current row (0) would average in the current month's Total_Sales, i.e. the
# label itself, leaking the target into the features.
previous_three = window.rowsBetween(-3, -1)
df = (
    df.withColumn("lag_1", lag("Total_Sales", 1).over(window))
    .withColumn("lag_2", lag("Total_Sales", 2).over(window))
    .withColumn("rolling_avg_3", avg("Total_Sales").over(previous_three))
)

# The first rows of each city have no history, so their lag / rolling values
# are null; replace those with 0 so VectorAssembler does not fail on nulls.
df = df.fillna(0, subset=["lag_1", "lag_2", "rolling_avg_3"])

# Assemble the numeric feature columns into the single vector column that
# Spark ML estimators read.
feature_cols = ["Month_Num", "lag_1", "lag_2", "rolling_avg_3"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(df.select("Total_Sales", *feature_cols))

# Train/test split (random 80/20, fixed seed for reproducibility).
train, test = data.randomSplit([0.8, 0.2], seed=42)

# ======================================
# Models
# ======================================
# Every estimator reads the assembled "features" vector and predicts
# "Total_Sales"; the column wiring is declared once and reused.
shared_cols = {"featuresCol": "features", "labelCol": "Total_Sales"}

# Insertion order matters: it is the order models are trained and plotted.
models = {}
models["LinearRegression"] = LinearRegression(**shared_cols)
models["DecisionTree"] = DecisionTreeRegressor(**shared_cols)
models["RandomForest"] = RandomForestRegressor(numTrees=50, **shared_cols)
models["GBT"] = GBTRegressor(maxIter=50, **shared_cols)

# XGBoost joins the comparison only when the optional import succeeded.
if has_xgb:
    models["XGBoost"] = XGBoostRegressor(
        objective="reg:squarederror", numRound=50, **shared_cols
    )

# ======================================
# Training + MLflow logging
# ======================================
# RMSE on the held-out test split is the single comparison metric.
evaluator = RegressionEvaluator(labelCol="Total_Sales", predictionCol="prediction", metricName="rmse")
predictions_dict = {}  # model name -> pandas DataFrame of test predictions (for plotting)
results = {}           # model name -> test RMSE

# One MLflow run per model: fit, score, log, register, and persist predictions.
for name, model in models.items():
    with mlflow.start_run(run_name=name) as run:
        pipeline = Pipeline(stages=[model])

        # Optional: Cross-validation (example for RandomForest and GBT).
        # The parameter grid is empty, so this is a plain 3-fold CV of the
        # default configuration; expand the grid to actually tune.
        if name in ["RandomForest", "GBT"]:
            paramGrid = ParamGridBuilder().build()  # expand if tuning
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)
            fitted = cv.fit(train)
        else:
            fitted = pipeline.fit(train)

        # Predictions are consumed three times below (evaluate, toPandas, Delta
        # write). Cache them so the model is applied to the test set once
        # instead of being recomputed for every action.
        preds = (
            fitted.transform(test)
            .select("Month_Num", "Total_Sales", "prediction")
            .orderBy("Month_Num")
            .cache()
        )
        try:
            rmse = evaluator.evaluate(preds)

            # MLflow log
            mlflow.log_param("model_type", name)
            mlflow.log_metric("rmse", rmse)
            mlflow.spark.log_model(fitted, "model")

            # Registration is best-effort, but only ordinary errors are
            # tolerated (not KeyboardInterrupt/SystemExit), and the reason is
            # printed so a skipped registration can be diagnosed.
            try:
                mv = client.create_model_version(name=registry_name,
                                                 source=f"runs:/{run.info.run_id}/model",
                                                 run_id=run.info.run_id)
                print(f"{name}: registered as version {mv.version}")
            except Exception as exc:
                print(f"{name}: registry skipped ({exc})")

            results[name] = rmse
            predictions_dict[name] = preds.toPandas()

            # Save table for Power BI
            table_name = f"PowerBI_Regression_{name}"
            preds.write.format("delta").mode("overwrite").saveAsTable(table_name)
            print(f"Saved predictions to {table_name}")
        finally:
            # Release the cached predictions even if logging or saving failed.
            preds.unpersist()

# ======================================
# Plotly visualization
# ======================================
# Overlay each model's predictions on the actual sales.
fig = go.Figure()

# Every model was scored on the same test split, so the "actual" series is
# taken from the first model's prediction frame.
first_model = list(predictions_dict)[0]
actual = predictions_dict[first_model]
actual_trace = go.Scatter(
    x=actual["Month_Num"],
    y=actual["Total_Sales"],
    mode='lines+markers',
    name="Actual",
    line=dict(color='black', width=3),
)
fig.add_trace(actual_trace)

# One trace per model; the palette wraps around if there are more models
# than colours.
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown']
for idx, (name, model_pdf) in enumerate(predictions_dict.items()):
    trace_color = colors[idx % len(colors)]
    model_trace = go.Scatter(
        x=model_pdf["Month_Num"],
        y=model_pdf["prediction"],
        mode='lines+markers',
        name=name,
        line=dict(color=trace_color, width=2),
    )
    fig.add_trace(model_trace)

layout_options = dict(
    title="Actual vs Predicted Total_Sales per Month",
    xaxis_title="Month_Num",
    yaxis_title="Total_Sales",
    legend_title="Models",
    template="plotly_white",
)
fig.update_layout(**layout_options)
fig.show()

# Final per-model test RMSE, keyed by model name.
print("\nRMSE summary:", results)
