In [1]:
# 04_modeling_optional.ipynb

# ============================================
# 0. SETUP
# ============================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Carregar parquet já limpos
df_acc = pd.read_parquet("datasets/cleaned/Acidentes_DadosAbertos_20250912.parquet")
df_vei = pd.read_parquet("datasets/cleaned/TipoVeiculo_DadosAbertos_20250912.parquet")
df_vit = pd.read_parquet("datasets/cleaned/Vitimas_DadosAbertos_20250912.parquet")

# ============================================
# 1. Construção de dataset para modelagem
# ============================================
df = df_vei.merge(
    df_acc[["num_acidente","dia_semana","fase_dia","cond_meteorologica","tp_rodovia","cond_pista"]],
    on="num_acidente", how="left"
).merge(
    df_vit.groupby("num_acidente")[["qtde_obitos"]].sum().reset_index(),
    on="num_acidente", how="left"
)

df["grave"] = (df["qtde_obitos"].fillna(0) > 0).astype(int)

# Features categóricas
X = pd.get_dummies(df[["tipo_veiculo","dia_semana","fase_dia","cond_meteorologica","tp_rodovia","cond_pista"]], drop_first=True)
y = df["grave"]

# ============================================
# 2. Treino/teste
# ============================================
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# ============================================
# 3. Avaliação
# ============================================
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99   1803123
           1       0.81      0.00      0.01     36213

    accuracy                           0.98   1839336
   macro avg       0.89      0.50      0.50   1839336
weighted avg       0.98      0.98      0.97   1839336

ROC AUC: 0.7946316039265433
