# Registar um pipeline no mlflow

Um pipeline não é mais do que a composição de vários modelos/transformações.

In [1]:
import mlflow

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
# Notebook-wide settings.
ROOT_PATH = "../../data/"  # folder holding the CSV splits, relative to this notebook
SEED = 42  # random seed shared by the estimators below
TARGET_COL = "Outcome"  # name of the label column in the CSV files

## Definir a diretoria onde as experiências são guardadas

In [None]:
from pathlib import Path

# Local folder (relative to this notebook) used as the MLflow tracking store.
uri = "../../mlruns"

# Create the folder, including missing parents; do nothing if it already exists.
Path(uri).mkdir(exist_ok=True, parents=True)

# Point MLflow at that folder for every run created below.
mlflow.set_tracking_uri(uri)

## Fazer set da experiência "Diabetes Prediction Experiment"

In [None]:
# Select the experiment under which the runs below are recorded.
mlflow.set_experiment(experiment_name="Diabetes Prediction Experiment")

## Criar os datasets

In [None]:
# Locations of the train and test CSV splits.
train_path = f"{ROOT_PATH}diabetes_train.csv"
test_path = f"{ROOT_PATH}diabetes_test.csv"

train_set = pd.read_csv(train_path)
test_set = pd.read_csv(test_path)

# Separate the feature columns from the target column for each split.
X_train = train_set.drop(columns=[TARGET_COL])
y_train = train_set[TARGET_COL]

X_test = test_set.drop(columns=[TARGET_COL])
y_test = test_set[TARGET_COL]

# Preview the first training rows.
X_train.head()

## Criar uma run

In [None]:
# Open a new MLflow run; it stays active until `mlflow.end_run()` is called.
# The run is named after the model this notebook trains (a random-forest pipeline);
# the previous "Linear Regression ... C0.1" label did not match it.
run = mlflow.start_run(run_name="Random Forest Run - pipeline")
# `run_id` is the supported accessor; `run_uuid` is its deprecated alias.
RUN_ID = run.info.run_id
RUN_ID

## Guardar datasets, modelos, artefactos, métricas e parâmetros da run

In [None]:
# Attach the train and test dataframes to the active run as dataset inputs.
train_dataset = mlflow.data.from_pandas(
    train_set,
    source=train_path,
    targets=TARGET_COL,
    name="Diabetes Train Dataset",
)
test_dataset = mlflow.data.from_pandas(
    test_set,
    source=test_path,
    targets=TARGET_COL,
    name="Diabetes Test Dataset",
)
mlflow.log_input(train_dataset, context="train")
mlflow.log_input(test_dataset, context="test")

# Record the seed that was used as a run parameter.
mlflow.log_param("seed", SEED)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Two-step pipeline: feature scaling followed by a random-forest classifier.
rf_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "random_forest",
            RandomForestClassifier(n_estimators=100, random_state=SEED),
        ),
    ]
)

# Fit the scaler and the classifier together on the training split.
rf_pipeline.fit(X_train, y_train)

# Log the fitted pipeline to the active run and register it as "random_forest".
mlflow.sklearn.log_model(
    rf_pipeline,
    artifact_path="rf_pipeline",
    registered_model_name="random_forest",
)

# Display the pipeline.
rf_pipeline

In [None]:
# Parameters of the pipeline and of each of its steps; step-level entries are
# keyed as "<step name>__<parameter>".
params = rf_pipeline.get_params()

# The classifier step is registered as "random_forest" (lower case), so that is
# the prefix to strip. The previous "Random_Forest__" never matched any key and
# left every name prefixed.
step_prefix = "random_forest__"

# NOTE(review): an un-prefixed name can coincide with a pipeline-level key
# (e.g. `verbose`); the entry inserted last wins - confirm this is acceptable.
modified_params = {}
for k, v in params.items():
    new_key = k[len(step_prefix):] if k.startswith(step_prefix) else k
    modified_params[new_key] = v

# Log all parameters to the active run in a single call, then display them.
mlflow.log_params(modified_params)
modified_params

In [None]:
# Predict on the test features, score against the true labels and log the
# resulting accuracy to the active run.
y_preds = rf_pipeline.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_preds)
mlflow.log_metric(key="accuracy", value=acc)
acc

## Terminar a run

In [None]:
# End the currently active MLflow run.
mlflow.end_run()

## Consultar uma run já concluída

In [None]:
# Fetch the finished run again from the tracking store by its id.
run = mlflow.get_run(run_id=RUN_ID)

In [None]:
# Display the `data` attribute of the fetched run.
run.data

## Ver a experiência na UI do mlflow

A UI do mlflow permite ver de forma visual todas as experiências criadas e permite, por exemplo, comparar, filtrar e ordenar as runs dentro de uma experiência.

Para correr a UI do mlflow é necessário executar, na raiz deste projeto (pasta rumos) e tendo activo o ambiente utilizado neste projeto, o comando:

`mlflow ui --backend-store-uri ./mlruns`

**Nota:** O comando em cima irá iniciar a UI de mlflow na porta 5000. Caso queiram mudar esta porta devem acrescentar `--port <PORT>` ao comando (em que `<PORT>` deve ser substituído pela porta desejada).

O comando acima não irá funcionar caso tenham tido alguns problemas no Windows com a instalação do mlflow. Caso tenham problemas, considerem instalar o `mlflow-ui`, ao invés do mlflow.

Após executarem este comando, vão poder ver a UI do mlflow no vosso browser acedendo a 

`http://127.0.0.1:5000`

(se tiverem alterado a porta em que o mlflow UI é iniciado, então devem alterar também aqui o 5000 por essa porta)

Na tab de `Experiments` podem explorar as experiências e runs que criaram.

In [2]:
import requests
import pandas as pd

In [3]:
# Load the test split from disk.
data_path = "../../data/diabetes_test.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
# Take 10 randomly chosen rows of the dataframe, just to have example input.
input_df = df.sample(n=10)

In [5]:
# Display the sampled rows (target column still included).
input_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
43,5,112,66,0,0,37.8,0.261,41,1
88,2,197,70,99,0,34.7,0.575,62,1
64,3,162,52,38,0,37.2,0.652,24,1
86,0,177,60,29,478,34.6,1.072,21,1
4,3,87,60,18,0,21.8,0.444,21,0
19,3,191,68,15,130,30.9,0.299,34,0
56,0,84,64,22,66,35.8,0.545,21,0
10,0,141,84,26,0,32.4,0.433,22,0
113,2,93,64,32,160,38.0,0.674,23,1
34,10,168,74,0,0,38.0,0.537,34,1


In [6]:
# Remove the target column ("Outcome") so that only the model's input features remain.
input_data = input_df.drop(columns="Outcome")
input_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
43,5,112,66,0,0,37.8,0.261,41
88,2,197,70,99,0,34.7,0.575,62
64,3,162,52,38,0,37.2,0.652,24
86,0,177,60,29,478,34.6,1.072,21
4,3,87,60,18,0,21.8,0.444,21
19,3,191,68,15,130,30.9,0.299,34
56,0,84,64,22,66,35.8,0.545,21
10,0,141,84,26,0,32.4,0.433,22
113,2,93,64,32,160,38.0,0.674,23
34,10,168,74,0,0,38.0,0.537,34


In [7]:
# Check the model registry in the MLflow UI to see which input format the model expects.
# Here each row is converted to a {column: value} dict.
records = input_data.to_dict("records")

In [8]:
# Display the list of row dicts built above.
records

[{'Pregnancies': 5,
  'Glucose': 112,
  'BloodPressure': 66,
  'SkinThickness': 0,
  'Insulin': 0,
  'BMI': 37.8,
  'DiabetesPedigreeFunction': 0.261,
  'Age': 41},
 {'Pregnancies': 2,
  'Glucose': 197,
  'BloodPressure': 70,
  'SkinThickness': 99,
  'Insulin': 0,
  'BMI': 34.7,
  'DiabetesPedigreeFunction': 0.575,
  'Age': 62},
 {'Pregnancies': 3,
  'Glucose': 162,
  'BloodPressure': 52,
  'SkinThickness': 38,
  'Insulin': 0,
  'BMI': 37.2,
  'DiabetesPedigreeFunction': 0.652,
  'Age': 24},
 {'Pregnancies': 0,
  'Glucose': 177,
  'BloodPressure': 60,
  'SkinThickness': 29,
  'Insulin': 478,
  'BMI': 34.6,
  'DiabetesPedigreeFunction': 1.072,
  'Age': 21},
 {'Pregnancies': 3,
  'Glucose': 87,
  'BloodPressure': 60,
  'SkinThickness': 18,
  'Insulin': 0,
  'BMI': 21.8,
  'DiabetesPedigreeFunction': 0.444,
  'Age': 21},
 {'Pregnancies': 3,
  'Glucose': 191,
  'BloodPressure': 68,
  'SkinThickness': 15,
  'Insulin': 130,
  'BMI': 30.9,
  'DiabetesPedigreeFunction': 0.299,
  'Age': 34},
 {

In [11]:
# Send the sampled rows to the scoring endpoint on port 5001
# (presumably a model started with `mlflow models serve` - confirm).
# The path is "/invocations" with a single slash; the previous URL had "//".
response = requests.post(
    "http://127.0.0.1:5001/invocations",
    json={"dataframe_records": records},
    timeout=30,  # do not hang forever if the server is not running
)
# Fail with a clear HTTP error instead of trying to decode an error page as JSON.
response.raise_for_status()
response.json()

{'predictions': [1, 1, 1, 1, 0, 1, 0, 0, 0, 1]}

In [None]:
import fastapi
from fastapi.middleware.cors import CORSMiddleware

import mlflow
from pydantic import BaseModel, conint
import pandas as pd
import json
import uvicorn