In [0]:
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the workspace.default.gold_product_performance table through the
# `spark` session (presumably injected by the notebook runtime) and convert
# it to a pandas DataFrame for local modelling.
gold_table = spark.table("workspace.default.gold_product_performance")
df = gold_table.toPandas()

# Last expression of the cell: renders a preview of the first rows.
df.head()

In [0]:
# Data quality check: missing values per column, then the count of rows that
# are exact duplicates of an earlier row.
missing_per_column = df.isna().sum()
print(missing_per_column)

duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

In [0]:
# Clean-up, in order:
#   1. discard rows whose target ("revenue") is missing,
#   2. keep only the columns used for modelling,
#   3. discard any row that still has a missing value in those columns.
model_columns = ["views", "purchases", "conversion_rate", "revenue"]
df = (
    df.dropna(subset=["revenue"])
    .loc[:, model_columns]
    .dropna()
)


In [0]:
# Train-test split and feature scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Feature matrix and regression target.
feature_columns = ["views", "purchases", "conversion_rate"]
X = df[feature_columns]
y = df["revenue"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [0]:
# MLflow experiment: helper that trains, scores, and logs one model per run
import mlflow
import mlflow.sklearn
def model_run(model, run_name="Regression_v1"):
    """Train, evaluate, and log one regressor as a single MLflow run.

    The model is fit on the notebook-level ``X_train``/``y_train`` and scored
    on ``X_test``/``y_test``; those names must already exist in the notebook
    namespace when this function is called.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place.
    run_name : str, optional
        Name given to the MLflow run. Defaults to ``"Regression_v1"``.

    Logged to MLflow
    ----------------
    params : ``model_type``, ``model_class`` (the estimator's class name, so
        runs of different algorithms can be told apart), ``test_size``.
    metrics : ``r2_score`` (the value returned by ``model.score``).
    artifacts : the fitted model, under the path ``"model"``.
    """
    with mlflow.start_run(run_name=run_name):
        # Log parameters. `model_class` distinguishes runs that otherwise
        # share the same run name and `model_type`.
        mlflow.log_param("model_type", "Regression")
        mlflow.log_param("model_class", type(model).__name__)
        # NOTE: hard-coded; must be kept in sync with the test_size passed to
        # train_test_split where the splits are created.
        mlflow.log_param("test_size", 0.2)

        # Train
        model.fit(X_train, y_train)

        # Evaluate on the held-out split
        score = model.score(X_test, y_test)
        mlflow.log_metric("r2_score", score)

        # Log model.
        # NOTE(review): only the estimator is logged; the scaler applied to
        # X_train/X_test is not part of this artifact, so consumers of the
        # logged model must scale their inputs the same way.
        mlflow.sklearn.log_model(model, "model")
        print(f"R2 Score: {score:.4f}")

In [0]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Candidate regressors. The two tree ensembles are stochastic, so they are
# given a fixed seed (the same value used for the train/test split) to keep
# the logged metrics reproducible across reruns of the notebook.
model1 = LinearRegression()
model2 = Ridge()
model3 = RandomForestRegressor(random_state=42)
model4 = GradientBoostingRegressor(random_state=42)

# Train, evaluate, and log each candidate in its own MLflow run.
for candidate in (model1, model2, model3, model4):
    model_run(candidate)