MLflow-Related Libraries

In [138]:
import os
import shutil

import mlflow
import numpy as np
import pandas as pd
from mlflow.models import infer_signature
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import model.config as config
import model.preprocessors as pp

By default, MLflow tracking writes to a local `mlruns` directory.

In [139]:
# Display the tracking URI currently in effect for this session.
mlflow.get_tracking_uri()

'file:///Users/cipri/Documents/GitHub/IntroMLOps/mlruns'

## Loan Prediction (Example)

In [140]:
# Read the train and test CSV files, then report each frame's (rows, columns).
train_df = pd.read_csv('model/train.csv')
test_df = pd.read_csv('model/test.csv')
for label, frame in (("Train Size", train_df), ("Test Size", test_df)):
    print(label, frame.shape)

Train Size (614, 13)
Test Size (362, 12)


In [141]:
# Predictors are every column except the target; the target is recoded "N"/"Y" -> 0/1.
X = train_df.drop(columns=[config.TARGET])
y = train_df[config.TARGET].map({"N": 0, "Y": 1})

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=20,
)

In [142]:
def create_model_pipeline(classifier):
    """Build the loan-prediction pipeline around the given estimator.

    The preprocessing steps come from ``model.preprocessors`` and are
    configured with the feature lists in ``model.config``; a ``MinMaxScaler``
    follows them, and ``classifier`` is appended as the final step.

    Parameters
    ----------
    classifier
        Estimator placed at the end of the pipeline. The step is always named
        "Linear Model", whatever estimator is supplied.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The assembled, unfitted pipeline.
    """
    steps = [
        ("Numerical Imputer", pp.NumericalImputer(variables=config.NUMERICAL_FEATURES)),
        ("Categorical Imputer", pp.CategoricalImputer(variables=config.CATEGORICAL_FEATURES)),
        (
            "Temporal Features",
            pp.TemporalVariableEstimator(
                variables=config.TEMPORAL_FEATURES,
                reference_variable=config.TEMPORAL_ADDITION,
            ),
        ),
        ("Categorical Encoder", pp.CategoricalEncoder(variables=config.FEATURES_TO_ENCODE)),
        ("Log Transform", pp.LogTransformation(variables=config.LOG_FEATURES)),
        ("Drop Features", pp.DropFeatures(variables_to_drop=config.DROP_FEATURES)),
        ("Scaler Transform", MinMaxScaler()),
        ("Linear Model", classifier),
    ]
    return Pipeline(steps)

def model_metrics(actual, pred):
    """Compute accuracy, F1 and ROC AUC for a set of predictions.

    Parameters
    ----------
    actual
        Ground-truth labels.
    pred
        Predicted values, passed unchanged to each scikit-learn metric. When
        these are hard class labels rather than scores, the ROC curve is built
        from those labels alone.

    Returns
    -------
    dict
        Mapping with the keys "accuracy", "f1" and "auc".
    """
    scores = {
        "accuracy": metrics.accuracy_score(actual, pred),
        "f1": metrics.f1_score(actual, pred, pos_label=1),
    }
    # The thresholds returned by roc_curve are not needed for the area.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(actual, pred)
    scores["auc"] = metrics.auc(false_pos_rate, true_pos_rate)
    return scores



# Train Model

In [143]:
# Random forest hyperparameters, kept in one dict so the exact values used for
# training can be passed around as a unit.
rf_params = {
    "n_estimators": 300,       # trees in the forest
    "criterion": "gini",       # split quality measure ("gini" or "entropy")
    "max_depth": None,         # None lets each tree grow until its leaves are pure
    "min_samples_split": 2,    # fewest samples a node needs before it may split
    "min_samples_leaf": 1,     # fewest samples allowed in a leaf
    "max_features": "sqrt",    # features examined when searching for a split
    "bootstrap": True,         # draw bootstrap samples for each tree
    "oob_score": False,        # skip the out-of-bag estimate
    "n_jobs": -1,              # use every available processor
    "random_state": 42,        # fixed seed for repeatable results
}

# Fit the full preprocessing + classifier pipeline on the training split.
rf_classifier = RandomForestClassifier(**rf_params)
pipeline_model = create_model_pipeline(rf_classifier)
pipeline_model.fit(X_train[config.FEATURES], y_train)

# Evaluate on the hold-out split.
y_pred = pipeline_model.predict(X_test[config.FEATURES])
metrics_model = model_metrics(y_test, y_pred)

# Load a small example input (rows with missing values dropped) and check
# that the fitted pipeline can predict on it.
input_example_raw = pd.read_json("model/input_example.json")
input_example = input_example_raw[config.FEATURES].dropna()
pipeline_model.predict(input_example)

array([1, 0])

# Model Tracking: 

MLflow Tracking is an API and user interface component that records data about machine learning experiments and lets you query it

You can use the tracking UI to visualize, compare, and search runs. It also lets you download run metadata and artifacts for analysis in other tools. MLflow logs information about runs in an `mlruns` directory; to view the data, run `mlflow ui` from the directory that contains the `mlruns` folder.

* Add Experiment and Log Model

In [144]:
# Make "Loan Prediction" the active experiment for the runs started below.
mlflow.set_experiment(experiment_name="Loan Prediction")

<Experiment: artifact_location='file:///Users/cipri/Documents/GitHub/IntroMLOps/mlruns/578337268861345417', creation_time=1741762737026, experiment_id='578337268861345417', last_update_time=1741762737026, lifecycle_stage='active', name='Loan Prediction', tags={}>

In [145]:
class SklearnWrapper(mlflow.pyfunc.PythonModel):
    """Expose a fitted estimator through MLflow's generic pyfunc interface.

    The wrapper stores the estimator it is given and forwards ``predict``
    calls to it unchanged.

    Note: the ``np.ndarray`` return annotation is evaluated when the class is
    defined, so ``numpy`` must be imported as ``np`` before this cell runs.
    """

    def __init__(self, model):
        """Keep a reference to the estimator that will serve predictions."""
        self.model = model

    def predict(self, context: mlflow.pyfunc.PythonModelContext, model_input: pd.DataFrame) -> np.ndarray:
        """Return the wrapped estimator's predictions for ``model_input``.

        ``context`` is accepted to satisfy the pyfunc interface and is not
        used here.
        """
        return self.model.predict(model_input)

In [146]:
with mlflow.start_run(run_name="RandomForestClassifier") as run:
    # Store the run id as a tag on the run itself.
    run_id = run.info.run_id
    mlflow.set_tag("run_id", run_id)

    # Score the fitted pipeline on the hold-out split and infer the model
    # signature from the example input and its predictions.
    y_pred = pipeline_model.predict(X_test[config.FEATURES])
    metrics_model = model_metrics(y_test, y_pred)
    signature = infer_signature(input_example, pipeline_model.predict(input_example))

    mlflow.log_metrics(metrics=metrics_model)
    mlflow.log_params(params=rf_params)

    # Log the pipeline with the sklearn flavor and register it under a model name.
    model_info = mlflow.sklearn.log_model(
        sk_model=pipeline_model,
        signature=signature,
        artifact_path="model",
        input_example=input_example,
        registered_model_name="tracking-RandomForestClassifier",
    )

    # Flavor-agnostic alternative: wrap the pipeline in a pyfunc model and
    # save it to a local directory only.
    model_wrapped = SklearnWrapper(pipeline_model)

    # Remove any previous export so the directory is written fresh.
    model_path = "RandomForestClassifier"
    if os.path.exists(model_path):
        shutil.rmtree(model_path)

    signature = infer_signature(input_example, model_wrapped.predict(None, input_example))
    mlflow.pyfunc.save_model(
        path=model_path,
        python_model=model_wrapped,
        signature=signature,
        input_example=input_example,
    )

Registered model 'tracking-RandomForestClassifier' already exists. Creating a new version of this model...
Created version '20' of model 'tracking-RandomForestClassifier'.
2025/03/12 22:14:12 INFO mlflow.pyfunc: Validating input example against model signature


Inference with logged model

In [147]:
# Load the logged model back through the generic pyfunc interface and run it
# on the example input.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)
loaded_model.predict(input_example)

array([1, 0])

# 📌 MLflow Model UI

```bash
mlflow ui
```

# 📌 MLflow Model Serve

Make sure pyenv is installed; if needed, run:

```bash
pip install pyenv
```


```bash
mlflow models serve -m model_uri -p port_id
```

# 📌 MLflow Model Inference: Correct JSON Request Format

When making predictions using an MLflow model, ensure the request format matches the **expected input schema**.

## ✅ **Correct `curl` Request (Using `dataframe_records`)**
Use the **`dataframe_records`** key to send structured data:

```bash
curl http://127.0.0.1:8000/invocations \
     -H "Content-Type: application/json" \
     --data '{
         "dataframe_records": [
             {
                 "Gender": "Male",
                 "Married": "Yes",
                 "Dependents": "0",
                 "Education": "Graduate",
                 "Self_Employed": "No",
                 "ApplicantIncome": 5720,
                 "CoapplicantIncome": 0,
                 "LoanAmount": 110,
                 "Loan_Amount_Term": 360,
                 "Credit_History": 1,
                 "Property_Area": "Urban"
             }
         ]
     }'