# Modelo preditivo com Pipeline e GridSearchCV (RandomForest)

Este notebook constrói uma pipeline de pré-processamento (numérica e categórica), faz busca de hiperparâmetros com GridSearchCV, avalia com validação cruzada e gera um arquivo de submissão para o conjunto de teste.


In [None]:
# Importações
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report

# Display settings: show every DataFrame column and use the seaborn plot style.
plt.style.use("seaborn-v0_8")
pd.options.display.max_columns = None


In [None]:
# Data loading: train/test splits plus the submission template.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

# Target first, then the feature matrices. errors="ignore" keeps the drop
# from failing when a listed column is absent from the frame.
y = train["labels"]
X = train.drop(
    columns=["id", "labels"],
    errors="ignore",
)
X_test = test.drop(
    columns=["id"],
    errors="ignore",
)

print(train.shape, test.shape)
X.head()


In [None]:
# Data preprocessing
# Split the feature columns by dtype. "number" selects every numeric dtype
# (not only int64/float64), so numeric columns stored as int32, float32, etc.
# are not left out of both lists and then discarded by remainder="drop".
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        # Numeric branch: fill missing values with the median, then standardize.
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), num_cols),
        # Categorical branch: fill missing values with the most frequent value,
        # then one-hot encode; handle_unknown="ignore" lets transform accept
        # categories that were not present when the encoder was fitted.
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]), cat_cols),
    ],
    # Columns that belong to neither list are not passed to the model.
    remainder="drop",
)

print({"num_cols": len(num_cols), "cat_cols": len(cat_cols)})


In [None]:
# Model + grid search
# The preprocessing is the first step of the pipeline, so it is fitted
# together with the classifier every time the pipeline is fitted.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", RandomForestClassifier(random_state=42, n_jobs=-1)),
    ]
)

# "clf__<param>" routes each value list to the "clf" step of the pipeline.
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10],
)

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    refit=True,
    return_train_score=False,
)
grid.fit(X, y)

# refit=True leaves a pipeline refitted with the best parameters on the grid.
best_model = grid.best_estimator_

print("Melhores hiperparâmetros:", grid.best_params_)
print("Acurácia média CV:", grid.best_score_)


In [None]:
# Cross-validated evaluation
# Uses the best estimator found by the search (preprocessing included).
# NOTE(review): the hyperparameters were selected on this same data, so these
# figures are probably somewhat optimistic for unseen data.
y_pred_cv = cross_val_predict(grid.best_estimator_, X, y, cv=3, n_jobs=-1)

# Pin the class order explicitly so the matrix rows/columns and the heatmap
# tick labels are guaranteed to agree; without tick labels the axes would show
# positional indices (0..k-1) instead of the actual class labels.
class_labels = grid.best_estimator_.classes_
cm = confusion_matrix(y, y_pred_cv, labels=class_labels)

plt.figure(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Matriz de Confusão (CV)")
plt.xlabel("Predito")
plt.ylabel("Real")
plt.show()

print("\nRelatório de Classificação (CV):\n")
print(classification_report(y, y_pred_cv))


In [None]:
# Predict on the test set and write the submission file
import os

output_path = "submissions/submission3.csv"

best_model = grid.best_estimator_
preds = best_model.predict(X_test)

# Work on a new frame so the loaded template is left untouched.
submission = sample_submission.assign(labels=preds)

# Make sure the output directory exists before writing.
os.makedirs("submissions", exist_ok=True)
submission.to_csv(output_path, index=False)

print(f"Arquivo salvo em: {output_path}")
submission.head()
