In [21]:
import pandas as pd
import re
import warnings
import logging
from pathlib import Path

# Configurar o logger para mostrar mensagens no notebook
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Desativar um aviso específico do pandas sobre 'chained assignment'
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

Tratamento do Arquivo Accidents0515.csv

In [22]:
# Location of the raw accidents extract, relative to the notebook directory.
raw_accident_file = "../../raw/Accidents0515.csv"

# Read the join key as text. With the default chunked parser, dtype inference
# can differ from chunk to chunk, and the isin() joins performed when loading
# the vehicles and casualties files need Accident_Index to be a string in
# every row.
df_raw_accident = pd.read_csv(raw_accident_file, dtype={"Accident_Index": str})

# Fixed seed so the sample (and everything derived from it) is reproducible.
df_accident = df_raw_accident.sample(100_000, random_state=42)
logging.info("Amostra de 100.000 linhas de dados brutos selecionada.")

df_accident.head()

INFO:Amostra de 100.000 linhas de dados brutos selecionada.


Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
1454743,2013440420297,464593.0,105760.0,-1.083863,50.847699,44,2,1,1,06/11/2013,...,0,0,1,2,2,0,0,1,1,E01017112
59836,2005121620018,432700.0,455830.0,-1.502665,53.997476,12,3,1,1,11/06/2005,...,0,8,4,1,1,0,0,1,1,E01027727
1180041,2011471105667,531360.0,107140.0,-0.13536,50.848807,47,3,2,2,09/09/2011,...,0,4,1,1,1,0,0,1,1,E01016939
187735,2005930002099,340040.0,732760.0,-2.975172,56.483166,93,3,2,1,27/05/2005,...,0,0,1,1,1,0,0,1,1,
1329139,201250CD3D019,245382.0,126930.0,-4.205966,51.020841,50,3,2,1,28/09/2012,...,0,8,1,1,1,0,0,1,1,E01020282


In [23]:
# Standardise column names (lower case; spaces and hyphens become underscores)
# on a renamed copy. df_accident itself is left untouched, so this cell can be
# re-run without tripping over columns it has already renamed or dropped.
accident_named = df_accident.rename(
    columns=lambda col: col.lower().replace(" ", "_").replace("-", "_")
)

# Fail early on every column this cell dereferences unconditionally. "time" is
# part of the check because accident_timestamp below is built from it.
required_cols = ["accident_index", "date", "time", "longitude", "latitude"]
for col in required_cols:
    if col not in accident_named.columns:
        raise ValueError(f"Coluna obrigatória ausente: {col}")

# Dates are parsed day-first (the sample shows values such as 06/11/2013) and
# then combined with the time column into a single timestamp.
accident_date = pd.to_datetime(accident_named["date"], dayfirst=True)
accident_timestamp = pd.to_datetime(
    accident_date.dt.strftime("%Y-%m-%d") + " " + accident_named["time"]
)

# Columns that do not exist in the target DDL. The raw date/time pair is
# dropped as well, since accident_timestamp replaces it.
columns_to_drop = [
    "location_easting_osgr", "location_northing_osgr", "police_force",
    "local_authority_(district)", "local_authority_(highway)",
    "1st_road_class", "1st_road_number", "2nd_road_class",
    "2nd_road_number", "pedestrian_crossing_human_control",
    "did_police_officer_attend_scene_of_accident",
    "lsoa_of_accident_location", "date", "time"
]

# Rows with any remaining missing value are discarded only AFTER the unused
# columns are gone, so gaps in dropped columns do not cost rows.
df_silver_accident = (
    accident_named
    .assign(accident_timestamp=accident_timestamp)
    .drop(columns=columns_to_drop)
    .dropna()
)

# Geographic bounds (United Kingdom).
df_silver_accident = df_silver_accident[
    df_silver_accident["longitude"].between(-10, 5)
    & df_silver_accident["latitude"].between(49, 61)
]

# Reasonable speed limits.
if "speed_limit" in df_silver_accident.columns:
    df_silver_accident = df_silver_accident[df_silver_accident["speed_limit"].between(10, 200)]

# At least one vehicle and a non-negative casualty count.
if "number_of_vehicles" in df_silver_accident.columns:
    df_silver_accident = df_silver_accident[df_silver_accident["number_of_vehicles"] >= 1]
if "number_of_casualties" in df_silver_accident.columns:
    df_silver_accident = df_silver_accident[df_silver_accident["number_of_casualties"] >= 0]

# One row per accident.
df_silver_accident = df_silver_accident.drop_duplicates(subset=["accident_index"])

df_silver_accident.head()


Unnamed: 0,accident_index,longitude,latitude,accident_severity,number_of_vehicles,number_of_casualties,day_of_week,road_type,speed_limit,junction_detail,junction_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area,accident_timestamp
1454743,2013440420297,-1.083863,50.847699,2,1,1,4,6,20,0,-1,0,1,2,2,0,0,1,2013-11-06 15:45:00
59836,2005121620018,-1.502665,53.997476,3,1,1,7,6,30,3,4,8,4,1,1,0,0,1,2005-06-11 00:20:00
1180041,2011471105667,-0.13536,50.848807,3,2,2,6,6,30,3,4,4,1,1,1,0,0,1,2011-09-09 08:49:00
187735,2005930002099,-2.975172,56.483166,3,2,1,6,1,30,1,4,0,1,1,1,0,0,1,2005-05-27 10:50:00
1329139,201250CD3D019,-4.205966,51.020841,3,2,1,6,6,30,3,4,8,1,1,1,0,0,1,2012-09-28 14:00:00


In [24]:
# Destination of the cleaned ("silver") accidents table, relative to the
# notebook directory.
silver_accident_file = "../accidents.parquet"

# index=False: the pandas index is not written to the file.
df_silver_accident.to_parquet(path=silver_accident_file, index=False)
logging.info("Silver salvo em %s", silver_accident_file)

INFO:Silver salvo em ../accidents.parquet


In [25]:
# Location of the raw vehicles extract, relative to the notebook directory.
raw_vehicles_file = "../../raw/Vehicles0515.csv"

# Malformed lines are skipped. The join key is read as text so that the isin()
# match below compares strings with strings regardless of how the parser would
# have inferred each chunk.
df_raw_vehicles = pd.read_csv(
    raw_vehicles_file,
    on_bad_lines='skip',
    dtype={"Accident_Index": str},
)

# Fixed-seed sample; the message is logged here, while it is still accurate.
df_vehicles = df_raw_vehicles.sample(100_000, random_state=42)
logging.info("Amostra de 100.000 linhas de dados brutos selecionada.")

# Keep only vehicles whose accident is present in the silver accidents table.
# NOTE(review): the sample is drawn BEFORE this filter, so the retained frame
# can be much smaller than 100,000 rows — confirm that sampling first (rather
# than filtering first) is intended.
df_vehicles = df_vehicles[
    df_vehicles["Accident_Index"].isin(df_silver_accident["accident_index"].unique())
]
logging.info("Veículos vinculados a acidentes da camada silver: %d linhas.", len(df_vehicles))

df_vehicles.head()

INFO:Amostra de 100.000 linhas de dados brutos selecionada.


Unnamed: 0,Accident_Index,Vehicle_Reference,Vehicle_Type,Towing_and_Articulation,Vehicle_Manoeuvre,Vehicle_Location-Restricted_Lane,Junction_Location,Skidding_and_Overturning,Hit_Object_in_Carriageway,Vehicle_Leaving_Carriageway,...,Was_Vehicle_Left_Hand_Drive?,Journey_Purpose_of_Driver,Sex_of_Driver,Age_of_Driver,Age_Band_of_Driver,Engine_Capacity_(CC),Propulsion_Code,Age_of_Vehicle,Driver_IMD_Decile,Driver_Home_Area_Type
1579707,200945RE40475,4,9,0,4,0,0,0,0,0,...,1,2,2,26,6,2198,1,6,3,1
1379939,200901PL60314,2,9,0,18,0,0,0,0,0,...,1,15,3,-1,-1,-1,-1,-1,-1,-1
1828595,201037G030338,1,11,0,3,0,1,0,0,0,...,1,1,1,-1,-1,14618,2,12,-1,-1
2541705,201306K078584,1,9,0,18,0,1,1,0,0,...,1,2,1,30,6,-1,-1,-1,3,1
1028411,200763DP36107,1,9,0,18,0,0,0,0,1,...,1,1,2,63,9,1868,2,6,-1,3


In [26]:
# Standardise column names (lower case; spaces and hyphens become underscores)
# on a renamed copy. df_vehicles itself is left untouched, so this cell can be
# re-run without failing on columns it has already renamed or dropped.
vehicles_named = df_vehicles.rename(
    columns=lambda col: col.lower().replace(" ", "_").replace("-", "_")
)

# Fail early on every column this cell dereferences unconditionally, including
# the "?"-suffixed source column that is copied below.
required_cols = [
    "accident_index", "vehicle_reference", "vehicle_type",
    "was_vehicle_left_hand_drive?",
]
for col in required_cols:
    if col not in vehicles_named.columns:
        raise ValueError(f"Coluna obrigatória ausente: {col}")

# Columns that do not exist in the target DDL. The "?"-suffixed column is
# dropped once its values have been copied to the clean name.
columns_to_drop = [
    "towing_and_articulation", "junction_location", "skidding_and_overturning", "hit_object_in_carriageway", "hit_object_off_carriageway",
    "vehicle_leaving_carriageway", "journey_purpose_of_driver", "engine_capacity_(cc)",
    "driver_imd_decile", "driver_home_area_type", "was_vehicle_left_hand_drive?", "1st_point_of_impact"
]

# dropna() runs before the column drop, so a gap in any column (kept or not)
# removes the row. The clean left-hand-drive column is appended (not renamed in
# place) so it stays the last column of the silver table.
df_silver_vehicles = (
    vehicles_named
    .dropna()
    .assign(was_vehicle_left_hand_drive=lambda frame: frame["was_vehicle_left_hand_drive?"])
    .drop(columns=columns_to_drop)
)

df_silver_vehicles.head()

Unnamed: 0,accident_index,vehicle_reference,vehicle_type,vehicle_manoeuvre,vehicle_location_restricted_lane,sex_of_driver,age_of_driver,age_band_of_driver,propulsion_code,age_of_vehicle,was_vehicle_left_hand_drive
1579707,200945RE40475,4,9,4,0,2,26,6,1,6,1
1379939,200901PL60314,2,9,18,0,3,-1,-1,-1,-1,1
1828595,201037G030338,1,11,3,0,1,-1,-1,2,12,1
2541705,201306K078584,1,9,18,0,1,30,6,-1,-1,1
1028411,200763DP36107,1,9,18,0,2,63,9,2,6,1


In [27]:
# Destination of the cleaned ("silver") vehicles table, relative to the
# notebook directory.
silver_vehicles_file = "../vehicles.parquet"

# index=False: the pandas index is not written to the file.
df_silver_vehicles.to_parquet(path=silver_vehicles_file, index=False)
logging.info("Silver salvo em %s", silver_vehicles_file)

INFO:Silver salvo em ../vehicles.parquet


Tratamento do Arquivo Casualties0515.csv

In [28]:
# Location of the raw casualties extract, relative to the notebook directory.
raw_casualties_file = "../../raw/Casualties0515.csv"

# Lines with errors are skipped because the file has some rows with extra
# columns. The join key is read as text so the string concatenation and the
# isin() match below behave the same for every row.
df_raw_casualties = pd.read_csv(
    raw_casualties_file,
    on_bad_lines='skip',
    dtype={"Accident_Index": str},
)

# Fixed-seed sample, tagged with an "<accident>_<vehicle>" key. The key column
# stays on df_casualties; the cleaning cell removes it.
df_casualties = df_raw_casualties.sample(100_000, random_state=42).assign(
    compound_key=lambda frame: frame['Accident_Index'] + '_' + frame['Vehicle_Reference'].astype(str)
)
logging.info("Amostra de 100.000 linhas de dados brutos selecionada.")

# Build the set of valid keys WITHOUT adding a column to df_silver_vehicles:
# that frame has already been written to parquet, and mutating it here would
# change what a re-run of the save cell writes.
silver_vehicle_keys = (
    df_silver_vehicles['accident_index'] + '_' + df_silver_vehicles['vehicle_reference'].astype(str)
).unique()

# Keep only casualties whose (accident, vehicle) pair exists in the silver
# vehicles table.
df_casualties = df_casualties[df_casualties["compound_key"].isin(silver_vehicle_keys)]
logging.info("Vítimas vinculadas a veículos da camada silver: %d linhas.", len(df_casualties))

df_casualties.head()

INFO:Amostra de 100.000 linhas de dados brutos selecionada.


Unnamed: 0,Accident_Index,Vehicle_Reference,Casualty_Reference,Casualty_Class,Sex_of_Casualty,Age_of_Casualty,Age_Band_of_Casualty,Casualty_Severity,Pedestrian_Location,Pedestrian_Movement,Car_Passenger,Bus_or_Coach_Passenger,Pedestrian_Road_Maintenance_Worker,Casualty_Type,Casualty_Home_Area_Type,compound_key
1880721,201306P071391,2,2,2,2,9,2,3,0,0,2,0,0,9,1,201306P071391_2
1901791,201314A063413,1,1,3,2,82,11,3,5,1,0,0,0,0,1,201314A063413_1
715174,2007460139360,2,2,2,2,78,11,3,0,0,2,0,-1,9,1,2007460139360_2
1129546,2009330904190,1,1,1,2,21,5,2,0,0,0,0,-1,9,2,2009330904190_1
2088245,2014131AD1283,2,1,1,1,28,6,3,0,0,0,0,0,1,1,2014131AD1283_2


In [None]:
# Standardise column names (lower case; spaces and hyphens become underscores)
# on a renamed copy. df_casualties itself is left untouched, so this cell can
# be re-run without failing on the already-dropped compound_key column.
casualties_named = df_casualties.rename(
    columns=lambda col: col.lower().replace(" ", "_").replace("-", "_")
)

# Fail early if a mandatory key column is absent.
required_cols = ["accident_index", "vehicle_reference", "casualty_reference"]
for col in required_cols:
    if col not in casualties_named.columns:
        raise ValueError(f"Coluna obrigatória ausente: {col}")

# Discard rows with any missing value.
df_silver_casualties = casualties_named.dropna()

# Keep rows whose bus_or_coach_passenger code lies in 1..9 (both inclusive).
# NOTE(review): every row displayed by the loading cell has 0 in this column,
# so this filter may discard most casualties — confirm the intended range
# against the DDL.
if "bus_or_coach_passenger" in df_silver_casualties.columns:
    df_silver_casualties = df_silver_casualties[
        df_silver_casualties["bus_or_coach_passenger"].between(1, 9)
    ]

# Columns that do not exist in the target DDL, plus the helper join key.
columns_to_drop = [
    "pedestrian_location", "pedestrian_road_maintenance_worker", "casualty_home_area_type", "compound_key"
]
df_silver_casualties = df_silver_casualties.drop(columns=columns_to_drop)

df_silver_casualties.head()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [20]:
# Destination of the cleaned ("silver") casualties table, relative to the
# notebook directory.
silver_casualties_file = "../casualties.parquet"

# index=False: the pandas index is not written to the file.
df_silver_casualties.to_parquet(path=silver_casualties_file, index=False)
logging.info("Silver salvo em %s", silver_casualties_file)

INFO:Silver salvo em ../casualties.parquet
