In [0]:
import os, joblib, mlflow
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from mlflow.models.signature import infer_signature
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def train_and_register(train_path, test_path, model_output_path, registered_model_name):
    """Train a random-forest classifier and register it with MLflow.

    Reads the train and test parquet datasets through the ``spark`` session
    (expected to exist as a global; it is not defined in this function),
    label-encodes the ``PaymentIsOutstanding`` target, one-hot encodes the
    object-typed feature columns, fits a ``RandomForestClassifier``, writes
    it to ``model.joblib`` under *model_output_path*, and finally logs the
    model, the label-class mapping and the test-set metrics in a new MLflow
    run, registering the model under *registered_model_name*.

    Args:
        train_path: Parquet location of the training data.
        test_path: Parquet location of the evaluation data.
        model_output_path: Directory that receives ``model.joblib``; it is
            created when missing.
        registered_model_name: Name handed to MLflow as the registered
            model name.

    Returns:
        None. Results are persisted to disk and to MLflow.
    """
    target = "PaymentIsOutstanding"

    # Spark only performs the parquet read; everything below is pandas.
    train_df = spark.read.parquet(train_path).toPandas()
    test_df = spark.read.parquet(test_path).toPandas()

    # Encode the target only. The encoder is fitted on the training labels
    # and reused for the test labels so both share a single mapping.
    le = LabelEncoder()
    train_df[target] = le.fit_transform(train_df[target].astype(str))
    test_df[target] = le.transform(test_df[target].astype(str))

    # Split features/labels.
    X_train = train_df.drop(columns=[target])
    y_train = train_df[target]
    X_test = test_df.drop(columns=[target])
    y_test = test_df[target]

    # One-hot encode the object-typed feature columns (as found in train).
    cat_cols = X_train.select_dtypes(include=['object']).columns
    X_train = pd.get_dummies(X_train, columns=cat_cols)
    X_test = pd.get_dummies(X_test, columns=cat_cols)

    # Train defines the feature space: categories seen only in test are
    # dropped, categories missing from test become all-zero columns.
    X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

    # Train model.
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Save a standalone copy of the model next to the data.
    os.makedirs(model_output_path, exist_ok=True)
    model_file = os.path.join(model_output_path, "model.joblib")
    joblib.dump(model, model_file)

    # The signature only needs column names/dtypes, so a small sample is
    # enough; this avoids a full prediction pass over the training set.
    signature_sample = X_train.head(100)
    signature = infer_signature(signature_sample, model.predict(signature_sample))

    y_pred = model.predict(X_test)

    # sklearn's default average="binary" raises on targets with more than
    # two classes; fall back to a support-weighted average in that case so
    # evaluation does not abort after the model has already been trained.
    average = "binary" if len(le.classes_) <= 2 else "weighted"
    metrics = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "recall": float(recall_score(y_test, y_pred, average=average)),
        "precision": float(precision_score(y_test, y_pred, average=average)),
        "f1_score": float(f1_score(y_test, y_pred, average=average)),
    }

    with mlflow.start_run():
        mlflow.sklearn.log_model(
            model,
            artifact_path="model",
            registered_model_name=registered_model_name,
            signature=signature
        )
        # The model predicts encoded integers; keep the index -> label
        # mapping with the run so predictions can be decoded later.
        mlflow.log_dict(
            {"target": target, "classes": le.classes_.tolist()},
            "label_classes.json",
        )
        mlflow.log_metrics(metrics)

    print(f"✅ Model trained and registered as '{registered_model_name}'")


# Manual run: input/output locations and the registry name for this tutorial.
train_path = "/Volumes/workspace/default/tutorial/train"
test_path = "/Volumes/workspace/default/tutorial/test"
model_path = "/Volumes/workspace/default/tutorial/models"
registered_name = "my_spark_native_model"

# Kick off training, evaluation and MLflow registration with the values above.
train_and_register(
    train_path=train_path,
    test_path=test_path,
    model_output_path=model_path,
    registered_model_name=registered_name,
)
