## Utils

In [3]:
import pandas as pd
from catboost import CatBoostClassifier

### functions

In [4]:
def display_info(data, data_name):
    print((f"Size Of {data_name}: {data.shape}\n\nColumns: {data.columns}"))

## Code

In [5]:
original_data = pd.read_csv("train.csv", index_col="id")
original_data.head(1)

Unnamed: 0_level_0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence


### Split Data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
df_copy = original_data.sample(100)
print(f"Shape Of Sample: {df_copy.shape}\n\nColumns: {df_copy.columns}")

Shape Of Sample: (100, 14)

Columns: Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease'],
      dtype='object')


In [8]:
FEATURES = df_copy.drop(columns=["Heart Disease"])
display_info(FEATURES, "FEATURES")

Size Of FEATURES: (100, 13)

Columns: Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium'],
      dtype='object')


In [9]:
TARGET = df_copy["Heart Disease"]
print(f"Size: {TARGET.shape}")

Size: (100,)


In [10]:
X = FEATURES
y = TARGET

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,     
    random_state=42     
)

print(f"Train Shape Rows x Colums :{X_train.shape, y_train.shape}\n\nTest Shape Rows x Colums:{X_test.shape, y_test.shape}")

Train Shape Rows x Colums :((80, 13), (80,))

Test Shape Rows x Colums:((20, 13), (20,))


### Train

In [11]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.datasets import load_breast_cancer

In [12]:
models = {
    "RandomForest":        RandomForestClassifier(n_estimators=100, random_state=42),
    "GradientBoosting":    GradientBoostingClassifier(random_state=42),
    "LogisticRegression":  LogisticRegression(max_iter=1000),
    "SVM":                 SVC(probability=True),
    "KNN":                 KNeighborsClassifier(n_neighbors=5),
}

### MLFLOW

In [13]:
import mlflow
import mlflow.sklearn

In [17]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Kaggle Experiment")


for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        
        # Treinar
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        # M√©tricas
        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)

        # Logar par√¢metros, m√©tricas e modelo
        mlflow.log_param("model_type", model_name)
        mlflow.log_params(model.get_params())   # hiperpar√¢metros do modelo
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("roc_auc",  auc)
        
        mlflow.sklearn.log_model(model, artifact_path="model")

        print(f"{model_name}: acc={acc:.4f} | auc={auc:.4f}")

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


RandomForest: acc=0.8000 | auc=0.9505
üèÉ View run RandomForest at: http://127.0.0.1:5000/#/experiments/654964554820129690/runs/7ab22c6330f94306af4e3802190d69eb
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/654964554820129690


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


GradientBoosting: acc=0.7500 | auc=0.9451
üèÉ View run GradientBoosting at: http://127.0.0.1:5000/#/experiments/654964554820129690/runs/b1d0a1bae6df47fbb858d22fd205b8ce
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/654964554820129690


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


LogisticRegression: acc=0.7500 | auc=0.9121
üèÉ View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/654964554820129690/runs/35cda501bf93410d8810254a013756e8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/654964554820129690


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


SVM: acc=0.3500 | auc=0.5879
üèÉ View run SVM at: http://127.0.0.1:5000/#/experiments/654964554820129690/runs/815bfba20a1f4a46a0f035c7a0de6434
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/654964554820129690


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


KNN: acc=0.5000 | auc=0.5495
üèÉ View run KNN at: http://127.0.0.1:5000/#/experiments/654964554820129690/runs/e479e6b1e20c40c0ac456279de0c885d
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/654964554820129690


In [21]:
# Buscar todos os runs do experimento
runs = mlflow.search_runs(experiment_names=["Kaggle Experiment"])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,metrics.roc_auc,params.model_type,params.metric,...,params.min_impurity_decrease,params.max_leaf_nodes,params.monotonic_cst,params.max_samples,params.bootstrap,params.oob_score,tags.mlflow.runName,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.source.type
0,e479e6b1e20c40c0ac456279de0c885d,654964554820129690,FINISHED,mlflow-artifacts:/654964554820129690/e479e6b1e...,2026-02-19 05:22:20.704000+00:00,2026-02-19 05:22:28.351000+00:00,0.5,0.549451,KNN,minkowski,...,,,,,,,KNN,S (1).ipynb,Seu Computador,NOTEBOOK
1,815bfba20a1f4a46a0f035c7a0de6434,654964554820129690,FINISHED,mlflow-artifacts:/654964554820129690/815bfba20...,2026-02-19 05:22:13.313000+00:00,2026-02-19 05:22:20.626000+00:00,0.35,0.587912,SVM,,...,,,,,,,SVM,S (1).ipynb,Seu Computador,NOTEBOOK
2,35cda501bf93410d8810254a013756e8,654964554820129690,FINISHED,mlflow-artifacts:/654964554820129690/35cda501b...,2026-02-19 05:22:05.034000+00:00,2026-02-19 05:22:13.252000+00:00,0.75,0.912088,LogisticRegression,,...,,,,,,,LogisticRegression,S (1).ipynb,Seu Computador,NOTEBOOK
3,b1d0a1bae6df47fbb858d22fd205b8ce,654964554820129690,FINISHED,mlflow-artifacts:/654964554820129690/b1d0a1bae...,2026-02-19 05:21:56.163000+00:00,2026-02-19 05:22:04.975000+00:00,0.75,0.945055,GradientBoosting,,...,0.0,,,,,,GradientBoosting,S (1).ipynb,Seu Computador,NOTEBOOK
4,7ab22c6330f94306af4e3802190d69eb,654964554820129690,FINISHED,mlflow-artifacts:/654964554820129690/7ab22c633...,2026-02-19 05:21:48.240000+00:00,2026-02-19 05:21:56.077000+00:00,0.8,0.950549,RandomForest,,...,0.0,,,,True,False,RandomForest,S (1).ipynb,Seu Computador,NOTEBOOK
5,1cd1a204f8b047e5b030960f0f37b8a3,654964554820129690,FAILED,mlflow-artifacts:/654964554820129690/1cd1a204f...,2026-02-19 05:21:21.799000+00:00,2026-02-19 05:21:32.964000+00:00,0.8,0.950549,RandomForest,,...,0.0,,,,True,False,RandomForest,S (1).ipynb,Seu Computador,NOTEBOOK
6,cb450bf4f8004213b4cfa8ae01383a2a,654964554820129690,FAILED,mlflow-artifacts:/654964554820129690/cb450bf4f...,2026-02-19 05:21:15.752000+00:00,2026-02-19 05:21:16.207000+00:00,0.8,,RandomForest,,...,0.0,,,,True,False,RandomForest,S (1).ipynb,Seu Computador,NOTEBOOK
7,b90a583b55bc4d9ba3db705088852c73,654964554820129690,FAILED,mlflow-artifacts:/654964554820129690/b90a583b5...,2026-02-19 05:20:47.385000+00:00,2026-02-19 05:20:47.864000+00:00,,,,,...,,,,,,,RandomForest,S (1).ipynb,Seu Computador,NOTEBOOK


In [22]:
colunas = ["tags.mlflow.runName", "metrics.accuracy", "metrics.roc_auc"]
print(runs[colunas].sort_values("metrics.roc_auc", ascending=False))

  tags.mlflow.runName  metrics.accuracy  metrics.roc_auc
4        RandomForest              0.80         0.950549
5        RandomForest              0.80         0.950549
3    GradientBoosting              0.75         0.945055
2  LogisticRegression              0.75         0.912088
1                 SVM              0.35         0.587912
0                 KNN              0.50         0.549451
6        RandomForest              0.80              NaN
7        RandomForest               NaN              NaN


In [None]:
# Pegar o run com melhor AUC
best_run = runs.sort_values("metrics.roc_auc", ascending=False).iloc[0]
best_run_id = best_run["run_id"]

# Registrar no Model Registry
mlflow.register_model(
    model_uri=f"runs:/{best_run_id}/model",
    name="best model"
)

Successfully registered model 'melhor_modelo_producao'.
2026/02/19 02:34:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: melhor_modelo_producao, version 1
Created version '1' of model 'melhor_modelo_producao'.


<ModelVersion: aliases=[], creation_timestamp=1771479287676, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1771479287676, metrics=None, model_id=None, name='melhor_modelo_producao', params=None, run_id='7ab22c6330f94306af4e3802190d69eb', run_link='', source='models:/m-92f6aa611d0143b0b7e2b9532b2af024', status='READY', status_message=None, tags={}, user_id='', version='1'>