In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


In [2]:
df = pd.read_csv(
    "/home/fause/ML-Transportes/PBIC/Etapa3/acidentes_pbic_2020_2025_Final.csv",
    encoding="utf-8",
    parse_dates=["data_inversa"],
    dayfirst=True,
    low_memory=False
)

print(df.shape)
df.head()


(1678326, 48)


Unnamed: 0,id,pesid,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_principal,...,mes,dia_mes,hora,gravidade_numerica,total_vitimas,Marca_Principal,Modelo_Grupo,Nome_Modelo,Fabricante,Modelo
0,260031.0,578475.0,2020-01-01,quarta-feira,01:00:00,TO,153.0,6781,GURUPI,Sim,...,1,1,1,2,1.0,FIAT,SIENA ATTRACTIV 1.4/SIENA ATTRACTIV 1.4,SIENA,FIAT,SIENA
1,260031.0,578475.0,2020-01-01,quarta-feira,01:00:00,TO,153.0,6781,GURUPI,Sim,...,1,1,1,2,1.0,FIAT,SIENA ATTRACTIV 1.4/SIENA ATTRACTIV 1.4,SIENA,FIAT,SIENA
2,260036.0,582005.0,2020-01-01,quarta-feira,01:00:00,RJ,116.0,178,NOVA IGUACU,N√£o,...,1,1,1,0,1.0,N√£o Informado,N√£o Informado/N√£o Informado,NO,N√£o Informado,NO
3,260036.0,582005.0,2020-01-01,quarta-feira,01:00:00,RJ,116.0,178,NOVA IGUACU,Sim,...,1,1,1,0,1.0,N√£o Informado,N√£o Informado/N√£o Informado,NO,N√£o Informado,NO
4,260037.0,578473.0,2020-01-01,quarta-feira,01:52:00,SC,101.0,2069,SAO JOSE,Sim,...,1,1,1,0,1.0,VW,NOVA SAVEIRO RB MBVS/NOVA SAVEIRO RB MBVS,NOVA,VOLKSWAGEN,NOVA


In [3]:

df = df.dropna(subset=["gravidade_numerica"])

df["grave"] = (df["gravidade_numerica"] >= 2).astype(int)


In [4]:
df['data_inversa'] = pd.to_datetime(df['data_inversa'], format='%d/%m/%Y', errors='coerce')


df["hora"] = df["horario"].str.slice(0,2).astype(int)
df["periodo"] = pd.cut(
    df["hora"],
    bins=[0,6,12,18,24],
    labels=["madrugada", "manh√£", "tarde", "noite"],
    include_lowest=True
)

df['dia_semana'] = df['data_inversa'].dt.weekday
df['fim_semana'] = df['dia_semana'].isin([5,6]).map({True:"fim_semana", False:"dia_util"})


In [5]:
cols = [
    "grave",
    "tipo_veiculo",
    "tipo_pista",
    "fase_dia",
    "condicao_metereologica",
    "sentido_via",
    "periodo",
    "fim_semana",
    "classificacao_acidente",
]
df_rules = df[cols].copy()

df_rules.head()


Unnamed: 0,grave,tipo_veiculo,tipo_pista,fase_dia,condicao_metereologica,sentido_via,periodo,fim_semana,classificacao_acidente
0,1,Autom√≥vel,Simples,Plena Noite,Nublado,Crescente,madrugada,dia_util,Com V√≠timas Feridas
1,1,Autom√≥vel,Simples,Plena Noite,Nublado,Crescente,madrugada,dia_util,Com V√≠timas Feridas
2,0,Outros,Dupla,Plena Noite,C√©u Claro,Decrescente,madrugada,dia_util,Com V√≠timas Fatais
3,0,Outros,Dupla,Plena Noite,C√©u Claro,Decrescente,madrugada,dia_util,Com V√≠timas Fatais
4,0,Caminhonete,Simples,Plena Noite,Nublado,Crescente,madrugada,dia_util,Com V√≠timas Feridas


In [None]:
transactions = df_rules.apply(lambda x: [f"{col}={x[col]}" for col in df_rules.columns], axis=1)
transactions = transactions.tolist()


In [None]:
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_tf = pd.DataFrame(te_ary, columns=te.columns_)

df_tf.head()


Unnamed: 0,classificacao_acidente=Com V√≠timas Fatais,classificacao_acidente=Com V√≠timas Feridas,classificacao_acidente=Sem V√≠timas,classificacao_acidente=nan,condicao_metereologica=Chuva,condicao_metereologica=C√©u Claro,condicao_metereologica=Garoa/Chuvisco,condicao_metereologica=Granizo,condicao_metereologica=Ignorado,condicao_metereologica=Neve,...,tipo_veiculo=Outros,tipo_veiculo=Quadriciclo,tipo_veiculo=Reboque,tipo_veiculo=Semireboque,tipo_veiculo=Trator de esteira,tipo_veiculo=Trator de rodas,tipo_veiculo=Trator misto,tipo_veiculo=Triciclo,tipo_veiculo=Utilit√°rio,tipo_veiculo=√înibus
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Verificar tamanho dos dados
print(f"N√∫mero de transa√ß√µes: {len(df_tf)}")
print(f"N√∫mero de itens √∫nicos: {len(df_tf.columns)}")
print(f"Tamanho estimado em mem√≥ria: {df_tf.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# Se necess√°rio, reduzir a amostra
if len(df_tf) > 100000:
    print("\n‚ö†Ô∏è Dataset muito grande! Reduzindo para 100.000 amostras...")
    df_tf_sample = df_tf.sample(n=100000, random_state=42)
else:
    df_tf_sample = df_tf

print(f"\nUsando {len(df_tf_sample)} transa√ß√µes para an√°lise")

N√∫mero de transa√ß√µes: 1576318
N√∫mero de itens √∫nicos: 55
Tamanho estimado em mem√≥ria: 0.08 GB

‚ö†Ô∏è Dataset muito grande! Reduzindo para 100.000 amostras...

Usando 100000 transa√ß√µes para an√°lise


In [None]:
# Usar low_memory=True e aumentar min_support para reduzir uso de mem√≥ria
freq_items = apriori(df_tf_sample, min_support=0.01, use_colnames=True, low_memory=True, verbose=1)
freq_items.sort_values("support", ascending=False).head(10)

Processing 20 combinations | Sampling itemset size 9876


Unnamed: 0,support,itemsets
13,1.0,(fim_semana=dia_util)
242,0.82414,"(fim_semana=dia_util, grave=0)"
14,0.82414,(grave=0)
1,0.75411,(classificacao_acidente=Com V√≠timas Feridas)
64,0.75411,"(fim_semana=dia_util, classificacao_acidente=C..."
599,0.64173,"(fim_semana=dia_util, classificacao_acidente=C..."
65,0.64173,"(classificacao_acidente=Com V√≠timas Feridas, g..."
4,0.61318,(condicao_metereologica=C√©u Claro)
126,0.61318,"(condicao_metereologica=C√©u Claro, fim_semana=..."
224,0.55967,"(fase_dia=Pleno dia, fim_semana=dia_util)"


In [None]:
rules = association_rules(freq_items, metric="confidence", min_threshold=0.2)
rules = rules.sort_values("lift", ascending=False)
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
24155,"(periodo=noite, classificacao_acidente=Com V√≠t...","(fase_dia=Plena Noite, tipo_pista=Simples, gra...",0.03209,0.04579,0.0125,0.389529,8.506867,1.0,0.011031,1.563073,0.911705,0.19119,0.360235,0.331257
24145,"(fase_dia=Plena Noite, tipo_pista=Simples, gra...","(periodo=noite, classificacao_acidente=Com V√≠t...",0.04579,0.03209,0.0125,0.272985,8.506867,1.0,0.011031,1.331349,0.924794,0.19119,0.248882,0.331257
63375,"(fase_dia=Plena Noite, tipo_pista=Simples, fim...","(periodo=noite, classificacao_acidente=Com V√≠t...",0.04579,0.03209,0.0125,0.272985,8.506867,1.0,0.011031,1.331349,0.924794,0.19119,0.248882,0.331257
63402,"(periodo=noite, classificacao_acidente=Com V√≠t...","(fase_dia=Plena Noite, tipo_pista=Simples, fim...",0.03209,0.04579,0.0125,0.389529,8.506867,1.0,0.011031,1.563073,0.911705,0.19119,0.360235,0.331257
63398,"(periodo=noite, fim_semana=dia_util, classific...","(fase_dia=Plena Noite, tipo_pista=Simples, gra...",0.03209,0.04579,0.0125,0.389529,8.506867,1.0,0.011031,1.563073,0.911705,0.19119,0.360235,0.331257


In [None]:
rules_graves = rules[rules["consequents"].astype(str).str.contains("grave=1")]
rules_graves = rules_graves.sort_values(["lift", "confidence"], ascending=False)

rules_graves.head(20)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
24155,"(periodo=noite, classificacao_acidente=Com V√≠t...","(fase_dia=Plena Noite, tipo_pista=Simples, gra...",0.03209,0.04579,0.0125,0.389529,8.506867,1.0,0.011031,1.563073,0.911705,0.19119,0.360235,0.331257
63402,"(periodo=noite, classificacao_acidente=Com V√≠t...","(fase_dia=Plena Noite, tipo_pista=Simples, fim...",0.03209,0.04579,0.0125,0.389529,8.506867,1.0,0.011031,1.563073,0.911705,0.19119,0.360235,0.331257
63398,"(periodo=noite, fim_semana=dia_util, classific...","(fase_dia=Plena Noite, tipo_pista=Simples, gra...",0.03209,0.04579,0.0125,0.389529,8.506867,1.0,0.011031,1.563073,0.911705,0.19119,0.360235,0.331257
24153,"(fase_dia=Plena Noite, classificacao_acidente=...","(grave=1, tipo_pista=Simples, periodo=noite)",0.05297,0.02885,0.0125,0.235983,8.179641,1.0,0.010972,1.27111,0.92684,0.180323,0.213286,0.334629
63391,"(fase_dia=Plena Noite, fim_semana=dia_util, cl...","(grave=1, tipo_pista=Simples, periodo=noite)",0.05297,0.02885,0.0125,0.235983,8.179641,1.0,0.010972,1.27111,0.92684,0.180323,0.213286,0.334629
63400,"(fase_dia=Plena Noite, classificacao_acidente=...","(grave=1, tipo_pista=Simples, periodo=noite, f...",0.05297,0.02885,0.0125,0.235983,8.179641,1.0,0.010972,1.27111,0.92684,0.180323,0.213286,0.334629
125002,"(fase_dia=Plena Noite, tipo_pista=Simples, tip...","(grave=1, periodo=noite, fim_semana=dia_util, ...",0.04356,0.02868,0.01005,0.230716,8.0445,1.0,0.008801,1.262629,0.915574,0.161602,0.208002,0.290567
93696,"(fase_dia=Plena Noite, tipo_pista=Simples, tip...","(grave=1, periodo=noite, condicao_metereologic...",0.04356,0.02868,0.01005,0.230716,8.0445,1.0,0.008801,1.262629,0.915574,0.161602,0.208002,0.290567
124984,"(fase_dia=Plena Noite, tipo_pista=Simples, tip...","(grave=1, periodo=noite, condicao_metereologic...",0.04356,0.02868,0.01005,0.230716,8.0445,1.0,0.008801,1.262629,0.915574,0.161602,0.208002,0.290567
63392,"(fase_dia=Plena Noite, tipo_pista=Simples, cla...","(grave=1, periodo=noite, fim_semana=dia_util)",0.03763,0.04279,0.0125,0.332182,7.76307,1.0,0.01089,1.433339,0.90525,0.18404,0.302328,0.312153


In [None]:
def interpret_rules(r):
    for _, row in r.iterrows():
        print(f"\nüìå Regra:")
        print(f"SE {set(row['antecedents'])} ENT√ÉO {set(row['consequents'])}")
        print(f"‚Üí Suporte: {row['support']:.4f}")
        print(f"‚Üí Confian√ßa: {row['confidence']:.4f}")
        print(f"‚Üí Lift: {row['lift']:.4f} (>=1 indica risco acima da m√©dia)")
        
interpret_rules(rules_graves.head(10))



üìå Regra:
SE {'periodo=noite', 'classificacao_acidente=Com V√≠timas Fatais'} ENT√ÉO {'fase_dia=Plena Noite', 'tipo_pista=Simples', 'grave=1'}
‚Üí Suporte: 0.0125
‚Üí Confian√ßa: 0.3895
‚Üí Lift: 8.5069 (>=1 indica risco acima da m√©dia)

üìå Regra:
SE {'periodo=noite', 'classificacao_acidente=Com V√≠timas Fatais'} ENT√ÉO {'fase_dia=Plena Noite', 'tipo_pista=Simples', 'fim_semana=dia_util', 'grave=1'}
‚Üí Suporte: 0.0125
‚Üí Confian√ßa: 0.3895
‚Üí Lift: 8.5069 (>=1 indica risco acima da m√©dia)

üìå Regra:
SE {'periodo=noite', 'fim_semana=dia_util', 'classificacao_acidente=Com V√≠timas Fatais'} ENT√ÉO {'fase_dia=Plena Noite', 'tipo_pista=Simples', 'grave=1'}
‚Üí Suporte: 0.0125
‚Üí Confian√ßa: 0.3895
‚Üí Lift: 8.5069 (>=1 indica risco acima da m√©dia)

üìå Regra:
SE {'fase_dia=Plena Noite', 'classificacao_acidente=Com V√≠timas Fatais'} ENT√ÉO {'grave=1', 'tipo_pista=Simples', 'periodo=noite'}
‚Üí Suporte: 0.0125
‚Üí Confian√ßa: 0.2360
‚Üí Lift: 8.1796 (>=1 indica risco acima da m√