# Usar o mlflow para dar track à experiência deste notebook

* Registar o modelo Random Forest
    -   Modelo escolhido por apresentar custos mais baixos.

In [2]:
# Import the required libraries
import mlflow
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from pathlib import Path
# Random seed reused below for the train/test split and the random forest.
seed = 3

In [3]:
# Directory prefix for the input data; the trailing slash is kept because the
# value is concatenated directly with a file name.
root_path = "../data/"

In [4]:
# Build the CSV location from the data directory prefix, then load the file
# into a DataFrame.
data_path = f"{root_path}lending_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [5]:
#df = pd.read_csv(root_path + 'lending_data.csv')

In [6]:
# Preview the first two rows of the loaded data.
df.head(n=2)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1


In [7]:
# Normalise the column labels: trim surrounding whitespace and remove any
# double-quote characters (literal match, not a regular expression).
df.columns = df.columns.str.strip().str.replace('"', '', regex=False)

# Drop the 'ID' column. errors='ignore' keeps this cell re-runnable: once the
# column is gone, running the cell again is a no-op instead of a KeyError.
df = df.drop(columns='ID', errors='ignore')


In [8]:
# Preview the first row after the 'ID' column has been dropped.
df.head(n=1)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1


## Configuração: coluna alvo e URI do servidor de tracking

In [9]:
# Settings used by the cells below.
URI = "http://127.0.0.1:5000"  # address handed to mlflow.set_tracking_uri
TARGET_COL = "default.payment.next.month"  # label column separated from the features

## Definir a diretoria onde as experiências são guardadas

In [10]:
# Make sure a local ./mlruns directory exists (no error if it is already there).
# NOTE(review): the tracking URI set below is an HTTP address, so it is not
# clear that this local directory is actually used — confirm.
Path("./mlruns").mkdir(exist_ok=True, parents=True)

# Point MLflow at the tracking server and select the experiment by name.
# The experiment object is the cell's last expression, so it is displayed.
mlflow.set_tracking_uri(uri=URI)
mlflow.set_experiment(experiment_name="Credit Card Default Prediction Experiment")

2025/03/14 17:50:44 INFO mlflow.tracking.fluent: Experiment with name 'Credit Card Default Prediction Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/870553150525548120', creation_time=1741974644509, experiment_id='870553150525548120', last_update_time=1741974644509, lifecycle_stage='active', name='Credit Card Default Prediction Experiment', tags={}>

In [11]:
# Show the column labels of df at this point in the notebook.
print(df.columns)

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')


## Criar os datasets

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=seed)

# For each partition, separate the feature columns from the target column.
X_train, y_train = train_set.drop(columns=[TARGET_COL]), train_set[TARGET_COL]
X_test, y_test = test_set.drop(columns=[TARGET_COL]), test_set[TARGET_COL]

## Escalando as features

In [13]:

# Min-max scale the features. The scaler is fitted on the training features
# only and then applied unchanged to the test features.
scaler = MinMaxScaler()
features_names = X_train.columns

# The scaler output is wrapped back into DataFrames with the original column
# names. The original row index is passed too: without index=, the frames
# would get a fresh 0..n-1 index that no longer matches y_train / y_test.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=features_names, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=features_names, index=X_test.index
)

## Criar uma run

In [14]:
# Open the MLflow run that the following cells log to; it is closed later by
# mlflow.end_run().
run = mlflow.start_run(run_name="RandomForest")

# Keep the run identifier so the run can be looked up after it has ended.
# run_id is the current attribute; run_uuid is its deprecated alias.
RUN_ID = run.info.run_id
RUN_ID

'7ef7a6ba1a334a46b06d66a0548ae849'

## Guardar datasets, modelos, artefactos, métricas e parâmetros da run

In [15]:
# Wrap the training partition as an MLflow dataset (the CSV path is recorded
# as its source, TARGET_COL as its target) and attach it to the active run.
train_dataset = mlflow.data.from_pandas(
    train_set,
    source=data_path,
    targets=TARGET_COL,
    name="Credit Train Dataset",
)
mlflow.log_input(train_dataset, context="train")

# Same for the test partition.
test_dataset = mlflow.data.from_pandas(
    test_set,
    source=data_path,
    targets=TARGET_COL,
    name="Credit Test Dataset",
)
mlflow.log_input(test_dataset, context="test")

# Record the random seed as a run parameter.
mlflow.log_param("seed", seed)

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


3

In [16]:
# Grid-search the number of trees of a class-weight-balanced random forest
# with 5-fold cross-validation on the training data.
rf = RandomForestClassifier(class_weight='balanced', random_state=seed)
parameters = {'n_estimators': [10, 100, 300, 1000]}
clf_rf = GridSearchCV(estimator=rf, param_grid=parameters, cv=5)
clf_rf.fit(X_train, y_train)

# Log the fitted search object to the run and register it as "random_forest".
# NOTE(review): the model is fitted on scaled features, but the fitted scaler
# is not logged with it in the cells shown here — confirm how consumers of the
# registered model are expected to scale their inputs.
mlflow.sklearn.log_model(
    clf_rf,
    artifact_path="random_forest",
    registered_model_name="random_forest",
)

# Log the hyper-parameters selected by the search.
params = clf_rf.best_params_
mlflow.log_params(params)

Successfully registered model 'random_forest'.
2025/03/14 17:59:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 1
Created version '1' of model 'random_forest'.


In [17]:
# Predict on the held-out test features, compute the accuracy against the
# true labels, and log it to the active run as the "accuracy" metric.
y_preds = clf_rf.predict(X_test)
acc = accuracy_score(y_test, y_preds)
mlflow.log_metric("accuracy", acc)

# Also report the accuracy in the notebook output.
print("Acurácia:", acc)

Acurácia: 0.8256666666666667


## Terminar a run

In [18]:
# Close the active MLflow run.
mlflow.end_run()

🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/870553150525548120/runs/7ef7a6ba1a334a46b06d66a0548ae849
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/870553150525548120


# Consultar uma run já concluída

In [19]:
# Fetch the run by its saved identifier; the returned Run object is displayed.
mlflow.get_run(run_id=RUN_ID)

<Run: data=<RunData: metrics={'accuracy': 0.8256666666666667}, params={'n_estimators': '300', 'seed': '3'}, tags={'mlflow.log-model.history': '[{"run_id": "7ef7a6ba1a334a46b06d66a0548ae849", '
                             '"artifact_path": "random_forest", '
                             '"utc_time_created": "2025-03-14 '
                             '17:58:55.709677", "model_uuid": '
                             '"c6743ae27f1246c69cbae81f873b3fcc", "flavors": '
                             '{"python_function": {"model_path": "model.pkl", '
                             '"predict_fn": "predict", "loader_module": '
                             '"mlflow.sklearn", "python_version": "3.12.9", '
                             '"env": {"conda": "conda.yaml", "virtualenv": '
                             '"python_env.yaml"}}, "sklearn": '
                             '{"pickled_model": "model.pkl", '
                             '"sklearn_version": "1.6.1", '
                             '"seriali