In [9]:
import pandas as pd
import re
import warnings
import logging
from pathlib import Path

# Make log records visible in the notebook output, INFO and above, as "LEVEL:message"
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

# Silence one specific pandas warning: the "chained assignment" SettingWithCopyWarning
warnings.filterwarnings(action="ignore", category=pd.errors.SettingWithCopyWarning)

Tratamento do Arquivo Accidents0515.csv

In [10]:
# Path to the raw accidents CSV
raw_accident_file = "../../raw/Accidents0515.csv"

# Read the whole file, then keep a reproducible random sample of 100,000 rows
df_raw_accident = pd.read_csv(filepath_or_buffer=raw_accident_file)
df_accident = df_raw_accident.sample(n=100_000, random_state=42)
logging.info("Amostra de 100.000 linhas de dados brutos selecionada.")

# Preview the sampled rows
df_accident.head()

INFO:Amostra de 100.000 linhas de dados brutos selecionada.


Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
1454743,2013440420297,464593.0,105760.0,-1.083863,50.847699,44,2,1,1,06/11/2013,...,0,0,1,2,2,0,0,1,1,E01017112
59836,2005121620018,432700.0,455830.0,-1.502665,53.997476,12,3,1,1,11/06/2005,...,0,8,4,1,1,0,0,1,1,E01027727
1180041,2011471105667,531360.0,107140.0,-0.13536,50.848807,47,3,2,2,09/09/2011,...,0,4,1,1,1,0,0,1,1,E01016939
187735,2005930002099,340040.0,732760.0,-2.975172,56.483166,93,3,2,1,27/05/2005,...,0,0,1,1,1,0,0,1,1,
1329139,201250CD3D019,245382.0,126930.0,-4.205966,51.020841,50,3,2,1,28/09/2012,...,0,8,1,1,1,0,0,1,1,E01020282


In [11]:
# Normalise column names: lower case, spaces and hyphens replaced by underscores
df_accident.columns = [col.lower().replace(" ", "_").replace("-", "_") for col in df_accident.columns]

# Fail fast with a descriptive error when a column this cell relies on is missing.
# "time" is listed because the timestamp below is built from it; without the check
# a missing column would surface as a bare KeyError instead of this ValueError.
required_cols = ["accident_index", "date", "time", "longitude", "latitude"]
for col in required_cols:
    if col not in df_accident.columns:
        raise ValueError(f"Coluna obrigatória ausente: {col}")

# Type adjustments: parse the day-first date, then combine it with the time string
# into a single timestamp column.
df_accident['date'] = pd.to_datetime(df_accident['date'], dayfirst=True)
df_accident['accident_timestamp'] = pd.to_datetime(df_accident['date'].dt.strftime('%Y-%m-%d') + ' ' + df_accident['time'])

# Columns to discard because they do not exist in the DDL.
# "date" and "time" are dropped as well, now that accident_timestamp has been built.
columns_to_drop = [
    "location_easting_osgr", "location_northing_osgr", "police_force",
    "local_authority_(district)", "local_authority_(highway)",
    "1st_road_class", "1st_road_number", "2nd_road_class",
    "2nd_road_number", "pedestrian_crossing_human_control",
    "did_police_officer_attend_scene_of_accident",
    "lsoa_of_accident_location","date", "time"
]
df_accident = df_accident.drop(columns=columns_to_drop)

# Data cleaning: remove every row that still has a missing value in any remaining column
df_accident = df_accident.dropna()

# Geographic bounds (United Kingdom)
df_accident = df_accident[df_accident["longitude"].between(-10, 5)]
df_accident = df_accident[df_accident["latitude"].between(49, 61)]

# Keep only reasonable speed limits
if "speed_limit" in df_accident.columns:
    df_accident = df_accident[df_accident["speed_limit"].between(10, 200)]

# Keep only rows with at least one vehicle and a non-negative casualty count
if "number_of_vehicles" in df_accident.columns:
    df_accident = df_accident[df_accident["number_of_vehicles"] >= 1]
if "number_of_casualties" in df_accident.columns:
    df_accident = df_accident[df_accident["number_of_casualties"] >= 0]

# Remove duplicates: keep one row per accident_index
df_accident = df_accident.drop_duplicates(subset=["accident_index"])

df_silver_accident = df_accident

# Preview as the cell's final expression so the notebook actually renders it
df_accident.head()

In [None]:
# Destination of the cleaned (silver) accidents dataset
silver_accident_file = "../accidents.parquet"

# Write the frame without its pandas index, then log the destination
df_silver_accident.to_parquet(path=silver_accident_file, index=False)
logging.info(f"Silver salvo em {silver_accident_file}")

INFO:Silver salvo em ../accidents.parquet


Tratamento do Arquivo Casualties0515.csv

In [18]:
# Path to the raw casualties CSV
raw_casualties_file = "../../raw/Casualties0515.csv"

# Malformed lines are skipped on purpose: the file has some rows with extra columns.
# After reading, keep a reproducible random sample of 100,000 rows.
df_raw_casualties = pd.read_csv(filepath_or_buffer=raw_casualties_file, on_bad_lines='skip')
df_casualties = df_raw_casualties.sample(n=100_000, random_state=42)
logging.info("Amostra de 100.000 linhas de dados brutos selecionada.")

# Preview the sampled rows
df_casualties.head()

INFO:Amostra de 100.000 linhas de dados brutos selecionada.


Unnamed: 0,Accident_Index,Vehicle_Reference,Casualty_Reference,Casualty_Class,Sex_of_Casualty,Age_of_Casualty,Age_Band_of_Casualty,Casualty_Severity,Pedestrian_Location,Pedestrian_Movement,Car_Passenger,Bus_or_Coach_Passenger,Pedestrian_Road_Maintenance_Worker,Casualty_Type,Casualty_Home_Area_Type
1114716,2009230991332,2,1,1,1,20,4,2,0,0,0,0,-1,2,1
450129,2006440089428,1,1,3,2,15,3,2,5,1,0,0,-1,0,1
117177,2005215130075,2,1,1,2,35,6,3,0,0,0,0,-1,9,1
197956,200545RE73241,2,2,1,1,46,8,3,0,0,0,0,-1,9,1
2194543,201454A137514,1,1,1,2,24,5,3,0,0,0,0,0,9,1


In [20]:
# Normalise column names: lower case, with spaces and hyphens mapped to underscores
separators_to_underscore = str.maketrans(" -", "__")
df_casualties.columns = [
    col.lower().translate(separators_to_underscore) for col in df_casualties.columns
]

# Abort with a descriptive error on the first required column that is absent
required_cols = ["accident_index", "vehicle_reference", "casualty_reference"]
for col in required_cols:
    if col in df_casualties.columns:
        continue
    raise ValueError(f"Coluna obrigatória ausente: {col}")

# Columns to discard because they do not exist in the DDL
columns_to_drop = [
    "pedestrian_location",
    "pedestrian_road_maintenance_worker",
    "casualty_home_area_type",
]

# Data cleaning: first remove rows with any missing value, then drop the unused columns
df_casualties = df_casualties.dropna().drop(columns=columns_to_drop)

df_silver_casualties = df_casualties

# Preview the cleaned rows
df_casualties.head()

Unnamed: 0,accident_index,vehicle_reference,casualty_reference,casualty_class,sex_of_casualty,age_of_casualty,age_band_of_casualty,casualty_severity,pedestrian_movement,car_passenger,bus_or_coach_passenger,casualty_type
1114716,2009230991332,2,1,1,1,20,4,2,0,0,0,2
450129,2006440089428,1,1,3,2,15,3,2,1,0,0,0
117177,2005215130075,2,1,1,2,35,6,3,0,0,0,9
197956,200545RE73241,2,2,1,1,46,8,3,0,0,0,9
2194543,201454A137514,1,1,1,2,24,5,3,0,0,0,9


In [21]:
# Destination of the cleaned (silver) casualties dataset
silver_casualties_file = "../casualties.parquet"

# Write the frame without its pandas index, then log the destination
df_silver_casualties.to_parquet(path=silver_casualties_file, index=False)
logging.info(f"Silver salvo em {silver_casualties_file}")

INFO:Silver salvo em ../casualties.parquet


In [28]:
# Path to the raw vehicles CSV
raw_vehicles_file = "../../raw/Vehicles0515.csv"

# Read the file (malformed lines are skipped), then keep a reproducible
# random sample of 100,000 rows
df_raw_vehicles = pd.read_csv(filepath_or_buffer=raw_vehicles_file, on_bad_lines='skip')
df_vehicles = df_raw_vehicles.sample(n=100_000, random_state=42)
logging.info("Amostra de 100.000 linhas de dados brutos selecionada.")

# Preview the sampled rows
df_vehicles.head()

INFO:Amostra de 100.000 linhas de dados brutos selecionada.


Unnamed: 0,Accident_Index,Vehicle_Reference,Vehicle_Type,Towing_and_Articulation,Vehicle_Manoeuvre,Vehicle_Location-Restricted_Lane,Junction_Location,Skidding_and_Overturning,Hit_Object_in_Carriageway,Vehicle_Leaving_Carriageway,...,Was_Vehicle_Left_Hand_Drive?,Journey_Purpose_of_Driver,Sex_of_Driver,Age_of_Driver,Age_Band_of_Driver,Engine_Capacity_(CC),Propulsion_Code,Age_of_Vehicle,Driver_IMD_Decile,Driver_Home_Area_Type
1899420,2010521003424,1,9,0,18,0,1,0,0,0,...,1,15,2,23,5,1910,2,3,1,1
13598,200501JI40818,1,21,0,17,0,8,5,10,1,...,1,15,1,31,6,5900,2,2,2,1
1673144,201001KD50543,2,9,0,16,0,8,0,0,0,...,1,15,2,43,7,1973,1,12,-1,-1
881323,20073102B4193,1,9,0,18,0,1,1,0,0,...,1,15,1,17,4,1242,1,9,7,1
1242375,20084100F4571,2,1,0,18,0,8,0,0,0,...,1,2,1,59,9,-1,-1,-1,10,1


In [None]:
# Normalise column names: lower case, spaces and hyphens replaced by underscores
df_vehicles.columns = [col.lower().replace(" ", "_").replace("-", "_") for col in df_vehicles.columns]

# Fail fast with a descriptive error when a required column is missing
required_cols = ["accident_index", "vehicle_reference", "vehicle_type"]
for col in required_cols:
    if col not in df_vehicles.columns:
        raise ValueError(f"Coluna obrigatória ausente: {col}")

# Data cleaning: remove every row that has a missing value in any column
df_vehicles = df_vehicles.dropna()

# Columns to discard because they do not exist in the DDL
columns_to_drop = [
    "towing_and_articulation", "junction_location", "skidding_and_overturning", "hit_object_in_carriageway",
    "vehicle_leaving_carriageway", "journey_purpose_of_driver", "engine_capacity_(cc)",
    "driver_imd_decile", "driver_home_area_type"
]
df_vehicles = df_vehicles.drop(columns=columns_to_drop)

# Bind the result to its own name. Assigning it to df_silver_casualties (as the
# copy-pasted original did) would overwrite the cleaned casualties frame with vehicle data.
df_silver_vehicles = df_vehicles

# Preview the cleaned rows
df_silver_vehicles.head()

KeyError: "['driver_imd_secile'] not found in axis"