# N2 — Model Training Control Panel (Template)

> **Objetivo:** Este notebook implementa um **painel de controle** para seleção de modelos, ajuste de hiperparâmetros, treinamento, avaliação e exportação dos artefatos `.joblib`.

**Como funciona**
1. Selecione os modelos nos *checkboxes*.
2. As **abas** (tabs) aparecem com formulários específicos por modelo.
3. Clique no botão **Don't Panic** para treinar e avaliar.
4. (Opcional) Marque quais modelos exportar e confirme a exportação.

> Código em inglês; narrativa e comentários em português — para manter clareza e consistência.

In [1]:
# Imports principais
import os
import json
import math
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

# Gráficos
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix,
    roc_auc_score, roc_curve
)

# Pré-processamento e modelagem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# LightGBM is optional: HAS_LGBM records whether the import succeeded so the
# model registry can add the LightGBM entry only when it is available.
try:
    from lightgbm import LGBMClassifier
    HAS_LGBM = True
except Exception:
    # NOTE(review): the broad catch presumably also covers failures other than
    # ImportError while loading the package — confirm before narrowing it.
    HAS_LGBM = False

# Widgets (UI)
import ipywidgets as W
from IPython.display import display, HTML, clear_output

# Persistência
import joblib

# Quick styles for the UI (the HTML object is the last expression of the cell).
HTML("""<style>
/* Deixa as abas e cartões mais agradáveis */
.widget-tab > .p-TabBar .p-TabBar-tabLabel { font-weight: 600; }
.card { border-radius: 16px; padding: 16px; box-shadow: 0 6px 24px rgba(0,0,0,.15); margin-bottom: 12px; }
.hrow { display:flex; gap:12px; flex-wrap:wrap; align-items:center; }
.label { font-weight:600; opacity:.9; }
hr { opacity:.25; }
</style>""")

## ⚙️ Configuração e caminhos

In [2]:
# Configuration files: shared defaults first, then machine-local overrides.
CONFIG_DIR = Path("config")
DEFAULTS = CONFIG_DIR / "defaults.json"
LOCAL = CONFIG_DIR / "local.json"

# Shallow merge in precedence order; a file that does not exist is skipped,
# and keys from local.json replace the same keys from defaults.json.
config = {
    key: value
    for source in (DEFAULTS, LOCAL)
    if source.exists()
    for key, value in json.loads(source.read_text(encoding="utf-8")).items()
}

print(
    "Active config:",
    json.dumps(config, indent=2, ensure_ascii=False),
    sep="\n",
)

# Default locations, each overridable from the config.
DATA_RAW = Path(config.get("data_raw", "data/raw"))
DATA_PROCESSED = Path(config.get("data_processed", "data/processed"))
ARTIFACTS = Path(config.get("artifacts_dir", "artifacts"))
ART_MODELS = ARTIFACTS / "models"
ART_MODELS.mkdir(parents=True, exist_ok=True)

# Experiment settings.
RANDOM_STATE = int(config.get("random_state", 42))
TEST_SIZE = float(config.get("test_size", 0.2))
TARGET_COL = config.get("target_column", "target")  # adjust to your dataset

print(
    "\nPaths:",
    f" RAW: {DATA_RAW.resolve()}",
    f" PROCESSED: {DATA_PROCESSED.resolve()}",
    f" ARTIFACTS: {ARTIFACTS.resolve()}",
    f" MODELS: {ART_MODELS.resolve()}",
    sep="\n",
)

Active config:
{}

Paths:
 RAW: C:\Users\fabio\Projetos DEV\data projects\data-project-template\notebooks\data\raw
 PROCESSED: C:\Users\fabio\Projetos DEV\data projects\data-project-template\notebooks\data\processed
 ARTIFACTS: C:\Users\fabio\Projetos DEV\data projects\data-project-template\notebooks\artifacts
 MODELS: C:\Users\fabio\Projetos DEV\data projects\data-project-template\notebooks\artifacts\models


## 📦 Carregamento de dados

In [3]:
# TODO: adapte para seu caso.
# Estratégias:
# 1) Ler um CSV já 'processado' (numérico) e separar features/target;
# 2) Ou montar um ColumnTransformer para lidar com numéricos/categóricos antes do treino.

from pathlib import Path
import json, sys
import pandas as pd
import numpy as np

def _find_processed_candidate():
    """Locate candidate processed datasets for auto-discovery.

    The directory is taken from the ``data_processed_dir`` config key, then
    from ``data_processed`` (the key the configuration cell uses for the
    processed-data directory), and finally defaults to ``data/processed``.

    Returns:
        tuple: ``(proc_dir, candidates)``. ``candidates`` lists the
        ``*.parquet`` files first, then ``*.csv``, then ``*.xlsx``, each group
        sorted by path. It is empty when ``proc_dir`` is not a directory.
    """
    proc_dir = Path(
        config.get(
            "data_processed_dir",
            config.get("data_processed", "data/processed"),
        )
    )
    if not proc_dir.is_dir():
        return proc_dir, []

    # Pattern order encodes the priority callers rely on when taking the
    # first candidate: parquet, then csv, then xlsx.
    candidates = [
        path
        for pattern in ("*.parquet", "*.csv", "*.xlsx")
        for path in sorted(proc_dir.glob(pattern))
    ]
    return proc_dir, candidates

# File locations taken from the config, with project defaults.
META_FILE = Path(config.get("meta_file", "artifacts/metadata/dataset_meta.json"))
DATA_PROCESSED = Path(config.get("data_processed_file", "data/processed/processed.parquet"))

# When the configured file is missing, fall back to auto-discovery and take
# the first candidate (candidates arrive ordered parquet, csv, xlsx).
if not DATA_PROCESSED.exists():
    proc_dir, cands = _find_processed_candidate()
    if not cands:
        raise FileNotFoundError(
            "Nenhum dataset processado encontrado.\n"
            f"Tente uma destas opções:\n"
            f"  1) Salvar seu dataset em {Path('data/processed/processed.parquet').as_posix()} ou\n"
            f"  2) Definir 'data_processed_file' em config/defaults.json apontando para o arquivo certo\n"
            f"  3) Colocar qualquer .parquet/.csv/.xlsx em data/processed/ para auto-descoberta"
        )
    DATA_PROCESSED = cands[0]
    print(f"[info] data_processed_file não encontrado. Usando candidato: {DATA_PROCESSED}")

# Read the dataset with the pandas reader that matches the file extension.
suffix = DATA_PROCESSED.suffix.lower()
if suffix == ".parquet":
    try:
        df = pd.read_parquet(DATA_PROCESSED)
    except ImportError:
        # pandas raises ImportError when no parquet engine is installed; the
        # install hint is only relevant in that case, so other read errors
        # (missing or corrupt file) propagate without the misleading hint.
        print("[warn] Falha ao ler parquet. Você tem 'pyarrow' instalado? Tentando instrução:")
        print("      pip install pyarrow")
        raise
elif suffix == ".csv":
    df = pd.read_csv(DATA_PROCESSED)
elif suffix == ".xlsx":
    df = pd.read_excel(DATA_PROCESSED)
else:
    raise ValueError(f"Extensão não suportada: {suffix}")

print(f"Loaded: {DATA_PROCESSED} — shape: {df.shape}")

# Optional metadata: a read/parse failure is reported and leaves meta empty.
meta = {}
if META_FILE.exists():
    try:
        meta = json.loads(META_FILE.read_text(encoding="utf-8"))
    except Exception as err:
        print(f"[warn] falha lendo meta '{META_FILE}': {err}")
    else:
        print(f"[info] meta carregado de {META_FILE}")

# The target named in the metadata wins over the config value.
TARGET_COL = meta.get("target", config.get("target_column", "target"))
if not (TARGET_COL in df.columns):
    raise KeyError(
        f"Target column '{TARGET_COL}' não encontrada no dataset.\n"
        "→ Defina 'target_column' em config/defaults.json ou ajuste 'meta_file'.\n"
        f"Colunas disponíveis: {list(df.columns)[:10]}..."
    )

# Class mapping (only when the metadata provides one).
class_map = meta.get("class_map")
if class_map:
    mapped_target = df[TARGET_COL].map(class_map)
    # Series.map yields NaN for values absent from the mapping; fail here with
    # a clear message instead of training on a silently corrupted target.
    unmapped = mapped_target.isna() & df[TARGET_COL].notna()
    if unmapped.any():
        raise ValueError(
            f"class_map não cobre os valores do target '{TARGET_COL}': "
            f"{df.loc[unmapped, TARGET_COL].unique().tolist()[:10]}\n"
            f"→ Ajuste 'class_map' em '{META_FILE}'."
        )
    df[TARGET_COL] = mapped_target

# Column roles (metadata takes priority over inference).
cols_meta = meta.get("columns", {})
ignored_cols = set(cols_meta.get("ignored", []))
candidate_features = [c for c in df.columns if c not in ignored_cols and c != TARGET_COL]
candidate_set = set(candidate_features)

# Metadata lists are copied (so `meta` is never mutated below) and restricted
# to usable columns: a name that is missing from the dataset, ignored, or the
# target itself would otherwise raise a KeyError or leak the target.
meta_numeric = cols_meta.get("numeric")
if meta_numeric:
    numeric_cols = [c for c in meta_numeric if c in candidate_set]
else:
    numeric_cols = df[candidate_features].select_dtypes(include=[np.number]).columns.tolist()

meta_categorical = cols_meta.get("categorical")
if meta_categorical:
    # A column declared in both lists is kept as numeric only.
    categorical_cols = [
        c for c in meta_categorical if c in candidate_set and c not in numeric_cols
    ]
else:
    categorical_cols = [c for c in candidate_features if c not in numeric_cols]

skipped_cols = [
    c for c in list(meta_numeric or []) + list(meta_categorical or [])
    if c not in candidate_set
]
if skipped_cols:
    print(f"[warn] colunas do meta ignoradas (ausentes, ignoradas ou target): {skipped_cols}")

# Booleans (optional in the metadata) are treated as categorical.
bool_cols = cols_meta.get("boolean", [])
for c in bool_cols:
    if c in candidate_features and c not in categorical_cols and c not in numeric_cols:
        categorical_cols.append(c)

feature_cols = numeric_cols + categorical_cols
X = df[feature_cols].copy()
y = df[TARGET_COL].copy()

print("numeric_cols:", numeric_cols[:8], "..." if len(numeric_cols) > 8 else "")
print("categorical_cols:", categorical_cols[:8], "..." if len(categorical_cols) > 8 else "")
print("ignored_cols:", list(ignored_cols))


Loaded: data\processed\processed.parquet — shape: (7043, 25)
[info] meta carregado de artifacts\metadata\dataset_meta.json
numeric_cols: ['SeniorCitizen', 'tenure', 'MonthlyCharges'] 
categorical_cols: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity'] ...
ignored_cols: []


## ✂️ Split e Pré‑processamento

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Stratified hold-out split driven by TEST_SIZE and RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Numeric columns: median imputation followed by standard scaling.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=True)),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Columns that belong to neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)

## 🧰 Seletor de Modelos e Formulários (UI)

In [5]:
# Registry of supported models. Each entry maps a display name to the
# estimator class and to the widgets whose current values are passed as
# constructor arguments by build_model_instance.
#
# Integer widgets (IntSlider / IntText) always hold an integer, so they cannot
# express "None". Parameters whose default is None therefore use a Dropdown
# with the string option "None"; build_model_instance converts that string
# (case-insensitively) to None, the same convention already used for the
# LogisticRegression penalty.
MODEL_SPECS = {
    "Dummy": {
        "class": DummyClassifier,
        "params": {
            "strategy": W.Dropdown(options=["most_frequent", "prior", "stratified", "uniform"], value="most_frequent"),
            "random_state": W.IntText(value=RANDOM_STATE)
        }
    },
    "KNN": {
        "class": KNeighborsClassifier,
        "params": {
            "n_neighbors": W.IntSlider(value=5, min=1, max=50, step=1),
            "weights": W.Dropdown(options=["uniform", "distance"], value="uniform"),
            "p": W.Dropdown(options=[1, 2], value=2)
        }
    },
    "RandomForest": {
        "class": RandomForestClassifier,
        "params": {
            "n_estimators": W.IntSlider(value=200, min=50, max=1000, step=50),
            # "None" keeps the estimator default (no depth limit).
            "max_depth": W.Dropdown(options=["None", *range(1, 51)], value="None"),
            "min_samples_split": W.IntSlider(value=2, min=2, max=20, step=1),
            "min_samples_leaf": W.IntSlider(value=1, min=1, max=20, step=1),
            "bootstrap": W.Checkbox(value=True),
            "random_state": W.IntText(value=RANDOM_STATE),
            "n_jobs": W.IntText(value=-1)
        }
    },
    "LogisticRegression": {
        "class": LogisticRegression,
        "params": {
            "C": W.FloatLogSlider(value=1.0, base=10, min=-2, max=2, step=0.1),
            "penalty": W.Dropdown(options=["l2", "l1", "elasticnet", "none"], value="l2"),
            "solver": W.Dropdown(options=["lbfgs", "liblinear", "saga", "newton-cg"], value="lbfgs"),
            "max_iter": W.IntSlider(value=1000, min=100, max=5000, step=100),
            # "None" keeps the estimator default; an integer text box would
            # send 0 here, which is not a valid number of jobs.
            "n_jobs": W.Dropdown(options=["None", 1, 2, 4, -1], value="None")
        }
    },
    "DecisionTree": {
        "class": DecisionTreeClassifier,
        "params": {
            "criterion": W.Dropdown(options=["gini", "entropy", "log_loss"], value="gini"),
            # "None" keeps the estimator default (no depth limit).
            "max_depth": W.Dropdown(options=["None", *range(1, 51)], value="None"),
            "min_samples_split": W.IntSlider(value=2, min=2, max=20, step=1),
            "min_samples_leaf": W.IntSlider(value=1, min=1, max=20, step=1)
        }
    },
    "NaiveBayes": {
        "class": GaussianNB,
        "params": {
            # GaussianNB has few hyperparameters
            "var_smoothing": W.FloatLogSlider(value=1e-9, base=10, min=-12, max=-3, step=0.1)
        }
    }
}

# Register LightGBM only when the optional import succeeded.
if HAS_LGBM:
    MODEL_SPECS["LightGBM"] = {
        "class": LGBMClassifier,
        "params": {
            "n_estimators": W.IntSlider(value=400, min=50, max=2000, step=50),
            "learning_rate": W.FloatLogSlider(value=0.05, base=10, min=-3, max=0, step=0.05),
            "num_leaves": W.IntSlider(value=31, min=8, max=512, step=1),
            # The slider's lower bound of -1 is also its default value.
            "max_depth": W.IntSlider(value=-1, min=-1, max=64, step=1),
            # NOTE(review): LightGBM presumably applies row subsampling only
            # when subsample_freq > 0, which is not exposed here — confirm.
            "subsample": W.FloatSlider(value=1.0, min=0.5, max=1.0, step=0.05),
            "colsample_bytree": W.FloatSlider(value=1.0, min=0.5, max=1.0, step=0.05),
            "random_state": W.IntText(value=RANDOM_STATE),
            "n_jobs": W.IntText(value=-1)
        }
    }

# One checkbox per registered model; two of them start checked.
checkboxes = {
    model_name: W.Checkbox(
        description=model_name,
        value=model_name in ("RandomForest", "LogisticRegression"),
    )
    for model_name in MODEL_SPECS
}

checkbox_row = W.HBox([*checkboxes.values()])

# Dynamic tabs holding one form per selected model; `forms` and `titles`
# are rebuilt by refresh_tabs.
tab = W.Tab()
forms = dict()
titles = list()

def build_form_for(model_name):
    """Build the hyperparameter form for ``model_name``.

    Every entry of ``MODEL_SPECS[model_name]["params"]`` becomes one row: an
    HTML label showing the parameter name next to the parameter's widget.

    Returns:
        A bordered ``VBox`` stacking the rows in registry order.
    """
    rows = [
        W.HBox([
            W.HTML(f"<div class='label' style='min-width:160px'>{param}</div>"),
            control,
        ])
        for param, control in MODEL_SPECS[model_name]["params"].items()
    ]
    box_layout = W.Layout(border="1px solid #ddd", padding="10px", border_radius="12px")
    return W.VBox(rows, layout=box_layout)

def refresh_tabs(*args):
    """Rebuild the tab widget from the currently checked models.

    Accepts and ignores any positional arguments so it can be used both as a
    widget observer and as a plain call. Updates the module-level ``titles``
    and ``forms`` and replaces the children and titles of ``tab``.
    """
    global titles, forms
    titles = [name for name, box in checkboxes.items() if box.value]
    forms = {name: build_form_for(name) for name in titles}
    # An empty selection yields an empty tuple, which clears the tabs.
    tab.children = tuple(forms.values())
    for position, title in enumerate(titles):
        tab.set_title(position, title)

# Rebuild the tabs whenever any model checkbox changes its value.
for cb in checkboxes.values():
    cb.observe(refresh_tabs, names="value")

refresh_tabs()  # initialise with the default selection

# Render the model selection row first, then the hyperparameter tabs.
display(W.HTML("<h3>Model selection</h3>"))
display(W.VBox([W.HTML("<div class='card'>Selecione os modelos abaixo:</div>"), checkbox_row]))
display(W.HTML("<h3>Hyperparameters</h3>"))
display(tab)

HTML(value='<h3>Model selection</h3>')

VBox(children=(HTML(value="<div class='card'>Selecione os modelos abaixo:</div>"), HBox(children=(Checkbox(val…

HTML(value='<h3>Hyperparameters</h3>')

Tab(children=(VBox(children=(HBox(children=(HTML(value="<div class='label' style='min-width:160px'>n_estimator…

## 🚀 Treino, Avaliação e Exportação

In [6]:
# Run button
run_btn = W.Button(description="Don't Panic", button_style='danger', icon="rocket")
export_label = W.HTML("<b>Export selected models after training?</b>")
# Export checkboxes keyed by model name; filled in by on_click_run.
export_checkboxes = {}
# Container that receives the export label and checkboxes after a training run.
export_box = W.VBox()

# Output area used by on_click_run for its prints and plots.
output = W.Output()

def build_model_instance(name):
    """Build an unfitted pipeline for the model registered as ``name``.

    Hyperparameters are read from the current widget values in
    ``MODEL_SPECS[name]["params"]``. String values equal to "none"
    (case-insensitive) are converted to ``None``.

    Args:
        name: Key of the model in ``MODEL_SPECS``.

    Returns:
        tuple: ``(pipe, params)`` — a ``Pipeline`` with the steps ``"pre"``
        (a fresh copy of the shared preprocessor) and ``"clf"`` (the
        estimator), plus the dict of parameters passed to the estimator.
    """
    # Local import: the notebook's import cell does not bring `clone` in.
    from sklearn.base import clone

    spec = MODEL_SPECS[name]
    cls = spec["class"]

    # Read the current widget values.
    params = {}
    for p_name, widget in spec["params"].items():
        if not hasattr(widget, "value"):
            # Best-effort, as before: an entry without a value is skipped
            # explicitly instead of through a blanket `except: pass`.
            continue
        value = widget.value
        # Widgets cannot hold None, so the string "none"/"None" stands for it.
        if isinstance(value, str) and value.lower() == "none":
            value = None
        params[p_name] = value

    model = cls(**params)

    # Pipeline does not copy its steps: without a clone, every model would
    # refit the one shared preprocessor object in place.
    pre = clone(preprocessor)
    # GaussianNB rejects sparse input, which the one-hot branch can produce;
    # sparse_threshold=0 makes a ColumnTransformer always return dense output.
    if isinstance(model, GaussianNB) and "sparse_threshold" in pre.get_params(deep=False):
        pre.set_params(sparse_threshold=0.0)

    pipe = Pipeline([
        ("pre", pre),
        ("clf", model)
    ])
    return pipe, params

def plot_confusion(ax, y_true, y_pred):
    """Draw the confusion matrix of ``y_true`` against ``y_pred`` on ``ax``.

    The matrix is rendered as an image and every cell is annotated with its
    integer count.

    Returns:
        The image object returned by ``ax.imshow``.
    """
    matrix = confusion_matrix(y_true, y_pred)
    image = ax.imshow(matrix)
    ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="True")
    n_rows, n_cols = matrix.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # x is the column (predicted), y is the row (true).
            ax.text(col, row, int(matrix[row, col]), ha="center", va="center")
    return image

def plot_roc(ax, y_true, y_proba):
    """Plot the ROC curve for the scores ``y_proba`` against ``y_true`` on ``ax``.

    The legend shows the ROC AUC with three decimals, and a dashed diagonal
    is drawn from (0, 0) to (1, 1).
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_proba)
    area = roc_auc_score(y_true, y_proba)
    ax.plot(false_pos_rate, true_pos_rate, label=f"ROC AUC = {area:.3f}")
    ax.plot([0, 1], [0, 1], linestyle="--")
    ax.set(title="ROC Curve", xlabel="FPR", ylabel="TPR")
    ax.legend()

def on_click_run(_):
    """Train and evaluate every selected model, then offer them for export.

    For each checked model: build the pipeline, fit it on the training split,
    predict on the test split, print accuracy and F1, and plot the confusion
    matrix (plus the ROC curve when the target is binary and the estimator
    exposes ``predict_proba``). A model that fails to train is reported and
    skipped so the remaining models still run.

    Side effects: rewrites ``output``, rebuilds ``export_checkboxes`` and
    ``export_box``, and stores the results dict on ``run_btn.results``.

    Args:
        _: The clicked button (unused).
    """
    output.clear_output()
    with output:
        chosen = [name for name, cb in checkboxes.items() if cb.value]
        if not chosen:
            print("Nenhum modelo selecionado.")
            return

        # Loop-invariant: decides the F1 averaging and whether ROC applies.
        is_binary = len(np.unique(y_test)) == 2
        f1_average = "binary" if is_binary else "macro"

        results = {}
        for name in chosen:
            print(f"\n=== Training: {name} ===")
            try:
                pipe, params = build_model_instance(name)
                pipe.fit(X_train, y_train)
                y_pred = pipe.predict(X_test)
                metrics = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "f1": f1_score(y_test, y_pred, average=f1_average)
                }
            except Exception as exc:
                # Callback boundary: the form allows invalid hyperparameter
                # combinations, and one failing model must not discard the
                # models that trained successfully.
                print(f"[error] Falha ao treinar {name}: {type(exc).__name__}: {exc}")
                continue

            print("Params:", params)
            print("Accuracy:", f"{metrics['accuracy']:.4f}")
            print("F1:", f"{metrics['f1']:.4f}")

            # Plots
            fig1 = plt.figure()
            ax1 = fig1.add_subplot(111)
            plot_confusion(ax1, y_test, y_pred)
            plt.show()

            # ROC only when probabilities are available and the target is binary
            proba_available = hasattr(pipe.named_steps["clf"], "predict_proba")
            if proba_available and is_binary:
                y_proba = pipe.predict_proba(X_test)[:, 1]
                fig2 = plt.figure()
                ax2 = fig2.add_subplot(111)
                plot_roc(ax2, y_test, y_proba)
                plt.show()

            results[name] = {"pipeline": pipe, "metrics": metrics}

        # Build the export checkboxes (one per successfully trained model)
        export_options = []
        export_checkboxes.clear()
        for name in results:
            cb = W.Checkbox(description=f"Export {name}.joblib", value=True)
            export_checkboxes[name] = cb
            export_options.append(cb)
        export_box.children = tuple([export_label] + export_options)

        # Keep the results on the button instance so the export handler can read them
        run_btn.results = results

# Wire the run button, then render it, the training output area, a separator
# and the (initially empty) export box, in that order.
run_btn.on_click(on_click_run)

display(W.HBox([run_btn]))
display(output)
display(W.HTML("<hr/>"))
display(export_box)

# Button that confirms the export
confirm_export_btn = W.Button(description="Export selected", button_style='success', icon="save")
# Output area that shows the export messages
export_status = W.Output()

def on_click_export(_):
    """Save the checked trained pipelines as ``.joblib`` files.

    Reads the results stored on ``run_btn.results`` and writes one file per
    checked model to ``ART_MODELS`` as ``<name>_<YYYYMMDD_HHMMSS>.joblib``.
    Messages are written to ``export_status``.

    Args:
        _: The clicked button (unused).
    """
    export_status.clear_output()
    with export_status:
        results = getattr(run_btn, "results", None)
        if results is None:
            print("Nenhum resultado de treino disponível. Execute o treino primeiro.")
            return

        # Only export models that are checked and actually have a result.
        selected = [
            name for name, cb in export_checkboxes.items()
            if cb.value and name in results
        ]
        if not selected:
            # Previously a silent no-op; tell the user nothing was written.
            print("Nenhum modelo marcado para exportação.")
            return

        # The directory may have been removed since the configuration cell ran.
        ART_MODELS.mkdir(parents=True, exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        for name in selected:
            model_path = ART_MODELS / f"{name}_{ts}.joblib"
            joblib.dump(results[name]["pipeline"], model_path)
            print(f"Saved: {model_path}")

# Wire the export button, then render it above its status area.
confirm_export_btn.on_click(on_click_export)
display(confirm_export_btn)
display(export_status)

HBox(children=(Button(button_style='danger', description="Don't Panic", icon='rocket', style=ButtonStyle()),))

Output()

HTML(value='<hr/>')

VBox()

Button(button_style='success', description='Export selected', icon='save', style=ButtonStyle())

Output()

---

### Notas e Dicas

- **Pré‑processamento:** Ajuste o `ColumnTransformer` conforme seu dataset (dtypes no N1 ajudam muito).
- **Curvas ROC:** Só são exibidas quando `predict_proba` existe e o target é binário.
- **LightGBM:** Habilitado automaticamente se estiver instalado no ambiente.
- **Exportação:** Artefatos salvos em `artifacts/models/ModelName_YYYYMMDD_HHMMSS.joblib`.
- **Extensibilidade:** Para adicionar modelos, inclua uma entrada em `MODEL_SPECS` com `class` e `params`.

> Qualquer etapa visual pode ser personalizada com HTML + CSS dentro do notebook.