# Training and tracking an XGBoost classifier with MLflow
- [Dataset](https://azuremlexampledata.blob.core.windows.net/data/heart-disease-uci/data/heart.csv) 

In [47]:
import mlflow
import numpy as np
import pandas as pd
from mlflow.models import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier


In [2]:
# Load the heart-disease CSV (the path is relative to the working directory).
heart_csv = "../data/heart.csv"
df = pd.read_csv(heart_csv)
df.head()  # preview the first rows

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0


In [3]:
# Replace the string-valued `thal` column with integer category codes
# (pandas assigns the code -1 to missing values).
thal_as_category = df["thal"].astype("category")
df["thal"] = thal_as_category.cat.codes

In [4]:
# Hold out 30% of the rows for evaluation. `random_state` is fixed so that
# re-running the notebook yields the same split (and comparable metrics).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="target"),
    df["target"],
    test_size=0.3,
    random_state=42,
)

## Model Training

In [7]:
# Connection settings for the MLflow tracking server.
# Replace the host with the public address of the machine (e.g. an EC2
# instance) where the tracking server is running.
TRACKING_SERVER_HOST = "0.0.0.0"
PORT = "8080"
MLFLOW_EXPERIMENT_NAME = "heart-condition-classifier"

tracking_uri = f"http://{TRACKING_SERVER_HOST}:{PORT}"
mlflow.set_tracking_uri(tracking_uri)

# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Turn on automatic logging for XGBoost training.
mlflow.xgboost.autolog()

2023/08/24 11:04:20 INFO mlflow.tracking.fluent: Experiment with name 'heart-condition-classifier' does not exist. Creating a new experiment.


In [9]:
# Start the mlflow run; `run` is kept so that its run_id can be used in later cells.
# The run stays active until `mlflow.end_run()` is called.
run = mlflow.start_run()

In [11]:
# Train an XGBoost classifier, evaluating log-loss on the held-out split.
model = XGBClassifier(eval_metric="logloss")
eval_sets = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=eval_sets, verbose=False)



### Logging evaluation metrics

In [14]:
# Score the trained model on the held-out split.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Report both metrics rounded to two decimals.
for label, value in (("Accuracy", accuracy), ("Recall", recall)):
    print(f"{label}: {value:.2f}")



Accuracy: 0.78
Recall: 0.57


In [15]:
# End the active MLflow run that was opened earlier with `mlflow.start_run()`.
mlflow.end_run()

### Explore the experiment logged by MLflow

In [17]:
# Show the ID of the run; it is used below to look the run up again.
print(run.info.run_id)

fba423b22e434ebd8b498ffc6a9c5a90


In [16]:
# Fetch the run by its ID so that the data logged for it can be inspected.
retrieved_run = mlflow.get_run(run.info.run_id)

In [20]:
# Explore the parameters that got logged: transposed so each parameter is a
# row, and limited to the first five rows.
pd.DataFrame(data=[retrieved_run.data.params], index=["Value"]).T.head()


Unnamed: 0,Value
objective,binary:logistic
base_score,
booster,
colsample_bylevel,
colsample_bynode,


In [21]:
# Explore the logged metric values, transposed so each metric is a row.
pd.DataFrame(data=[retrieved_run.data.metrics], index=["Value"]).T


Unnamed: 0,Value
validation_0-logloss,0.619928
accuracy_score_X_test,0.78022
recall_score_X_test,0.571429


#### Retrieve artifacts

In [22]:
# Create a tracking client and list the artifacts recorded for the run.
client = mlflow.tracking.MlflowClient()
client.list_artifacts(run_id=run.info.run_id)

[<FileInfo: file_size=190, is_dir=False, path='feature_importance_weight.json'>,
 <FileInfo: file_size=23108, is_dir=False, path='feature_importance_weight.png'>,
 <FileInfo: file_size=148, is_dir=False, path='metric_info.json'>,
 <FileInfo: file_size=None, is_dir=True, path='model'>]

As you can see in this example, four artifacts are available in the run:

* `feature_importance_weight.json` -> the feature importance of the model we created.
* `feature_importance_weight.png` -> a plot of the feature importance mentioned above, stored as an image.
* `metric_info.json` -> contains a json representation of all the metrics captured by the XGBoost.
* `model`, the path where the model is stored. Note that this artifact is a directory.

You can download any artifact using the function `mlflow.artifacts.download_artifacts`:

In [36]:
# Local directory that the run's artifacts are downloaded into. The path is
# relative to the working directory (like the dataset path) rather than a
# user-specific absolute path, so the notebook can run on any machine.
model_local_path = f"../model/{MLFLOW_EXPERIMENT_NAME}"

In [32]:
# Download the run's `model` artifact directory into `model_local_path`.
logged_run_id = retrieved_run.info.run_id
file_path = mlflow.artifacts.download_artifacts(
    run_id=logged_run_id,
    artifact_path="model",
    dst_path=model_local_path,
)

In [37]:
# Load the downloaded XGBoost model from the location returned by
# `download_artifacts` instead of rebuilding that path by hand.
classifier = mlflow.xgboost.load_model(file_path)


In [41]:
# No refit is needed here: `classifier` is the already-trained model loaded from the run.
# Left disabled on purpose -- fitting on X_test would train on the evaluation data.
# classifier.fit(X_test, y_test)

In [38]:
# Predict with the reloaded model on the held-out test split.
classifier.predict(X_test)

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0])

## MLflow Pipeline Logging

In [46]:
# Ordinal-encode `thal`; every other column is passed through unchanged.
thal_encoder = OrdinalEncoder(categories="auto", encoded_missing_value=np.nan)
encoder = ColumnTransformer(
    transformers=[("cat_encoding", thal_encoder, ["thal"])],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Fresh, unfitted classifier used as the final pipeline step.
model = XGBClassifier(eval_metric="logloss")

In [48]:
# Chain the column encoder and the classifier into a single estimator.
pipeline = Pipeline(steps=[("encoding", encoder), ("model", model)])

### Signatures

- **Signatures** are used by MLflow to know what type of inputs are expected for a given model.
- This allows the model builder to be explicit about which types are being expected.
- In the first model we logged, all inputs needed to be numeric, including the column `thal`.
- However, our new pipeline can encode these values automatically, so it could take `thal` values in string format. (Note that in this notebook `thal` was already converted to integer codes in an earlier cell, so the signature inferred below still reports it as `integer`.)

In [49]:
# Infer the model signature from the test features (inputs) and the test
# labels (outputs). `infer_signature` is imported with the other imports at
# the top of the notebook.
signature = infer_signature(X_test, y_test)

  inputs = _infer_schema(model_input)
  outputs = _infer_schema(model_output) if model_output is not None else None


In [50]:
# Display the inferred input and output schema.
signature

inputs: 
  ['age': long, 'sex': long, 'cp': long, 'trestbps': long, 'chol': long, 'fbs': long, 'restecg': long, 'thalach': long, 'exang': long, 'oldpeak': double, 'slope': long, 'ca': long, 'thal': integer]
outputs: 
  ['target': long]

### Logging the pipeline model

Now, it's time to fit our entire pipeline and log it inside the run.
> **Note:**  Since `pipeline` is a Scikit-Learn object, we will log using `sklearn` flavor instead of `xgboost`. 

In [51]:
# Fit the whole pipeline inside a new run; the run is closed automatically
# when the `with` block exits.
with mlflow.start_run() as run:
    pipeline.fit(X_train, y_train)
    # `pipeline` is a Scikit-Learn object, so it is logged with the `sklearn`
    # flavor rather than the `xgboost` one.
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="pipeline",
        signature=signature,
    )

In [52]:
# Reload the logged pipeline from the run through a `runs:/` URI.
pipeline_model = mlflow.sklearn.load_model(f"runs:/{run.info.run_id}/pipeline")

In [53]:
# Check which type of object the sklearn flavor returned.
type(pipeline_model)

sklearn.pipeline.Pipeline

In [54]:
# Predict with the reloaded pipeline on the held-out test split.
pipeline_model.predict(X_test)

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0])