#  1. Importações e dados.

In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_recall_curve
)

from category_encoders import TargetEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [15]:
df = pd.read_csv(
    "/home/fause/ML-Transportes/PBIC/Etapa3/acidentes_pbic_2020_2025_Final.csv",
    encoding="utf-8",
    parse_dates=["data_inversa"],
    dayfirst=True,
    low_memory=False
)

print("Shape inicial:", df.shape)


Shape inicial: (1678326, 48)


In [16]:
df = df[df["Fabricante"] != "Não Informado"]
df = df[df["ano_fabricacao_veiculo"] > 1900]
df = df.dropna(subset=["gravidade_numerica"])

print("Shape após limpeza:", df.shape)


Shape após limpeza: (1549666, 48)


# 2) Criar target binário 'grave'

In [17]:
df["grave"] = (df["gravidade_numerica"] >= 2).astype(int)

print(df["grave"].value_counts(normalize=True))


grave
0    0.825981
1    0.174019
Name: proportion, dtype: float64


In [18]:
df["idade_veiculo"] = 2025 - df["ano_fabricacao_veiculo"]


3) Análises descritivas rápidas (identificar fabricantes/modelos/anos mais envolvidos em acidentes graves)

# 4) Preparar dados para ML

In [19]:
cat_features = [
    "tipo_acidente",
    "classificacao_acidente",
    "fase_dia",
    "condicao_metereologica",
    "tipo_pista",
    "tracado_via",
    "uso_solo",
    "dia_semana",
    "uf",
    "tipo_veiculo",
    "Fabricante",
    "Modelo"
]

num_features = ["idade_veiculo"]

features = cat_features + num_features


for col in cat_features:
    df[col] = df[col].fillna("DESCONHECIDO").astype(str)

df[num_features] = df[num_features].fillna(0)

X = df[features]
y = df["grave"]


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale = neg / pos

print("scale_pos_weight:", scale)


scale_pos_weight: 4.746496892049115


Xboost

In [21]:
preprocessador = ColumnTransformer(
    transformers=[
        ("cat", TargetEncoder(), cat_features),
        ("num", "passthrough", num_features)
    ]
)

modelo_xgb = Pipeline([
    ("prep", preprocessador),
    ("clf", XGBClassifier(
        n_estimators=800,
        learning_rate=0.02,
        max_depth=5,
        min_child_weight=3,
        subsample=0.85,
        colsample_bytree=0.85,
        gamma=2,
        reg_alpha=0.3,
        reg_lambda=1.2,
        scale_pos_weight=scale,
        objective="binary:logistic",
        eval_metric="aucpr",
        n_jobs=-1,
        random_state=42
    ))
])


In [22]:
modelo_xgb.fit(X_train, y_train)
y_proba_xgb = modelo_xgb.predict_proba(X_test)[:, 1]


In [23]:
def avaliar_modelo(y_test, y_proba, nome):
    precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-9)
    idx = np.argmax(f1)
    threshold = thresholds[idx]
    
    print(f"\n===== {nome} =====")
    print("Threshold ótimo:", threshold)
    print("F1:", f1[idx])
    print("Precision:", precision[idx])
    print("Recall:", recall[idx])

    y_pred = (y_proba >= threshold).astype(int)
    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_proba))


In [24]:
avaliar_modelo(y_test, y_proba_xgb, "XGBoost")



===== XGBoost =====
Threshold ótimo: 0.6432057
F1: 0.5474209222203594
Precision: 0.5045343673775721
Recall: 0.5982757022341708
              precision    recall  f1-score   support

           0       0.91      0.88      0.89    255999
           1       0.50      0.60      0.55     53935

    accuracy                           0.83    309934
   macro avg       0.71      0.74      0.72    309934
weighted avg       0.84      0.83      0.83    309934

ROC AUC: 0.8537311994104767


In [25]:
df[cat_features] = df[cat_features].fillna("Desconhecido")

# Garantir tipo string
for col in cat_features:
    df[col] = df[col].astype(str)

# Garante que todas as categóricas não tenham NaN e sejam string
for col in cat_features:
    X[col] = X[col].fillna("DESCONHECIDO").astype(str)

    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna("DESCONHECIDO").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna("DESCONHECIDO").astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].fillna("DESCONHECIDO").astype(str)
A value is trying to be set on a copy of a slice fro

catboost

In [26]:
cat_idx = [X.columns.get_loc(col) for col in cat_features]

modelo_cat = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=[1, scale],
    verbose=100,
    random_state=42
)

modelo_cat.fit(
    X_train,
    y_train,
    cat_features=cat_idx
)

y_proba_cat = modelo_cat.predict_proba(X_test)[:, 1]


0:	total: 1.47s	remaining: 19m 37s
100:	total: 2m 12s	remaining: 15m 15s
200:	total: 4m 33s	remaining: 13m 34s
300:	total: 6m 54s	remaining: 11m 26s
400:	total: 9m 19s	remaining: 9m 16s
500:	total: 11m 47s	remaining: 7m 2s
600:	total: 14m 24s	remaining: 4m 46s
700:	total: 16m 57s	remaining: 2m 23s
799:	total: 19m 28s	remaining: 0us


In [27]:
avaliar_modelo(y_test, y_proba_cat, "CatBoost")



===== CatBoost =====
Threshold ótimo: 0.7272776346137008
F1: 0.7639580938944169
Precision: 0.7954919513467946
Recall: 0.7348289607861315
              precision    recall  f1-score   support

           0       0.95      0.96      0.95    255999
           1       0.80      0.73      0.76     53935

    accuracy                           0.92    309934
   macro avg       0.87      0.85      0.86    309934
weighted avg       0.92      0.92      0.92    309934

ROC AUC: 0.9566382872095766


In [29]:
avaliar_modelo(y_test, y_proba_xgb, "XGBoost")
avaliar_modelo(y_test, y_proba_cat, "CatBoost")



===== XGBoost =====
Threshold ótimo: 0.6432057
F1: 0.5474209222203594
Precision: 0.5045343673775721
Recall: 0.5982757022341708
              precision    recall  f1-score   support

           0       0.91      0.88      0.89    255999
           1       0.50      0.60      0.55     53935

    accuracy                           0.83    309934
   macro avg       0.71      0.74      0.72    309934
weighted avg       0.84      0.83      0.83    309934

ROC AUC: 0.8537311994104767

===== CatBoost =====
Threshold ótimo: 0.7272776346137008
F1: 0.7639580938944169
Precision: 0.7954919513467946
Recall: 0.7348289607861315
              precision    recall  f1-score   support

           0       0.95      0.96      0.95    255999
           1       0.80      0.73      0.76     53935

    accuracy                           0.92    309934
   macro avg       0.87      0.85      0.86    309934
weighted avg       0.92      0.92      0.92    309934

ROC AUC: 0.9566382872095766
