## PHASE 4: AI & ML (Days 12-14)

### **DAY 12 (20/01/26) – MLflow Basics**

### Learn:

- MLflow components (tracking, registry, models)
- Experiment tracking
- Model logging
- MLflow UI

### 🛠️ Tasks:

1. Train simple regression model
2. Log parameters, metrics, model
3. View in MLflow UI
4. Compare runs

# Prepare data

In [0]:
# Read the whole gold table through Spark SQL, then collect it to the driver
# as a pandas DataFrame so scikit-learn can consume it.
gold_query = 'select * from ecommerce.default.gold_ecommerce_full_data'
gold_df = spark.sql(gold_query).toPandas()

In [0]:
from sklearn.model_selection import train_test_split
# Predictor columns and the regression target taken from the gold table.
feature_cols = ["views", "cart"]
X = gold_df[feature_cols]
y = gold_df["purchases"]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Train data records :", len(X_train))
print("Test data records :", len(X_test))

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import warnings
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings("ignore")

In [0]:
# Candidate regressors keyed by the short label used for MLflow run names.
# The tree-based estimators are seeded so that re-running the notebook yields
# the same fitted models and scores; without a seed, differences between runs
# could come from randomness rather than from the model choice. The seed
# matches the random_state used for the train/test split.
models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
}

In [0]:
# Multiple models without autolog: each run's parameter, metric and model
# artifact are logged explicitly.
mlflow.set_experiment('/Day_12_Experiments')
mlflow.end_run()  # close any run that is still active before starting new ones

# One-row sample of the training features, logged with each model as its input example.
sample_input = X_train.iloc[:1]

for name in models:
    model = models[name]
    run_label = f"{name}_model"
    with mlflow.start_run(run_name=run_label):
        mlflow.log_param("model_type", name)
        model.fit(X_train, y_train)
        # score() on a regressor returns R^2, here measured on the held-out rows.
        score = model.score(X_test, y_test)
        mlflow.log_metric("r2_score", score)
        mlflow.sklearn.log_model(model, "model", input_example=sample_input)
        print(f"{name}: R¬≤ = {score:.4f}")

In [0]:
# Multiple models with autolog
#
# With scikit-learn autologging enabled, every fit() inside an active run
# already records the estimator parameters, training metrics and the fitted
# model. The model is therefore NOT logged by hand here: doing both stores the
# same model twice in each run. log_input_examples=True asks autolog to keep
# an input example with the model it logs.
mlflow.sklearn.autolog(log_input_examples=True)
mlflow.set_experiment('/Day_12_Experiments')
mlflow.end_run()  # close any run that is still active before starting new ones
for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model_v2"):
        # Short label for the run; autolog only records the estimator's own parameters.
        mlflow.log_param("model_type", name)
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        # Logged under the same key as the manual runs so both sets of runs
        # can be compared on one metric in the MLflow UI.
        mlflow.log_metric("r2_score", score)
        print(f"{name}: R¬≤ = {score:.4f}")

# Spark ML Pipeline

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

In [0]:

# Two-stage Spark ML pipeline: first pack the input columns into a single
# vector column, then fit a linear regression on that vector against the
# "purchases" label.
feature_columns = ["views", "cart"]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)
lr = SparkLR(
    labelCol="purchases",
    featuresCol="features",
)
pipeline = Pipeline(stages=[assembler, lr])

In [0]:
# Load the gold table as a Spark DataFrame.
sparkdf = spark.table('ecommerce.default.gold_ecommerce_full_data')
# Roughly 80/20 train/test split. The seed makes the random assignment
# repeatable, so metrics from different executions are comparable; it matches
# the random_state=42 used for the scikit-learn split.
# NOTE(review): repeatability also assumes the table's contents and
# partitioning are unchanged between executions.
train, test = sparkdf.randomSplit([0.8, 0.2], seed=42)
# Fit every pipeline stage on the training portion only.
model = pipeline.fit(train)

In [0]:
# Run the fitted pipeline over the held-out rows and preview the first five
# rows of the result.
prediction = model.transform(test)
prediction.show(n=5)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
# Regression metrics to compute on the test-set predictions.
metrics = ['r2', 'mae', 'mse', 'rmse']

# Map each metric name to its value; a separate evaluator is built per metric,
# comparing the "prediction" column against the "purchases" label.
result = {
    metric_name: RegressionEvaluator(
        labelCol="purchases",
        predictionCol="prediction",
        metricName=metric_name,
    ).evaluate(prediction)
    for metric_name in metrics
}
print(result)