In [2]:
import functools
import time
from datetime import timedelta

import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from tqdm.notebook import tqdm

In [2]:
# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [3]:
def timed_function(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``Execution time: <seconds> seconds`` once the call
        has returned, and passes the result through unchanged. If ``func``
        raises, the exception propagates and nothing is printed.
    """
    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted, skewing the measurement.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"Execution time: {execution_time:.6f} seconds")
        return result
    return wrapper

## Merging

In [None]:
# Load the four cleaned input tables, in the same order as they are named.
accidents, traffic, pedestrian, weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/clean/accidents_clean.csv',
        '../data/clean/traffic_clean.csv',
        '../data/clean/pedestrian_clean.csv',
        '../data/clean/weather_clean.csv',
    )
)

In [5]:
# Parse the `dateTime` columns into pandas datetimes: the merge functions
# below subtract hour offsets from them and match rows on exact timestamps.
accidents['dateTime'] = pd.to_datetime(accidents['dateTime'])
traffic['dateTime'] = pd.to_datetime(traffic['dateTime'])
pedestrian['dateTime'] = pd.to_datetime(pedestrian['dateTime'])
weather['dateTime'] = pd.to_datetime(weather['dateTime'])

In [6]:
@timed_function
def assign_closest_volume(A, B, name, number_of_stations, number_of_periods):
    """Attach the volumes of the nearest rows of ``B`` to every row of ``A``.

    For each lag ``period`` in ``range(number_of_periods)`` the rows of ``A``
    are matched against the rows of ``B`` whose ``dateTime`` equals the row's
    ``dateTime`` minus ``period`` hours. Among those rows of ``B`` the
    ``number_of_stations`` nearest ones in the ``(x, y)`` plane are found with
    a k-d tree, and their ``volume`` values are written to the columns
    ``{name}_volume_{station}_period_{period}`` in the order returned by
    ``cKDTree.query`` (nearest first).

    Args:
        A: DataFrame with ``dateTime``, ``x`` and ``y`` columns. It is copied,
            never modified. Rows are addressed by index label via ``.loc``.
            NOTE(review): this presumes a unique index — confirm for callers.
        B: DataFrame with ``dateTime``, ``x``, ``y`` and ``volume`` columns.
        name: Prefix used in the names of the generated columns.
        number_of_stations: How many nearest rows of ``B`` to record per row
            of ``A`` and period.
        number_of_periods: How many hourly lags (0, 1, ...) to process.

    Returns:
        A copy of ``A`` with ``number_of_stations * number_of_periods`` extra
        float columns. Entries without data (timestamp absent from ``B``, or
        fewer than ``number_of_stations`` rows of ``B`` for that timestamp)
        are set to 0. The number of such entries and the number of periods
        without any matching timestamp are printed.
    """
    A = A.copy()
    errors = 0

    volume_columns = [f'{name}_volume_{station}_period_{period}'
                      for period in range(number_of_periods)
                      for station in range(number_of_stations)]
    for col in volume_columns:
        A[col] = np.full(len(A), np.nan, dtype='float64')

    B_grouped = B.groupby('dateTime')

    for period in tqdm(range(number_of_periods), desc="Processing periods"):
        adjusted_timestamps = A['dateTime'] - pd.to_timedelta(period, unit='h')

        # Only timestamps that occur in both frames can be matched.
        unique_timestamps = set(adjusted_timestamps.unique())
        valid_timestamps = unique_timestamps.intersection(B_grouped.groups.keys())

        if len(valid_timestamps) == 0:
            errors += 1
            continue

        # Lagged timestamp -> index labels of the rows of A that share it.
        timestamp_to_indices = adjusted_timestamps.groupby(adjusted_timestamps).groups
        period_columns = [f'{name}_volume_{station}_period_{period}'
                          for station in range(number_of_stations)]

        for timestamp in tqdm(valid_timestamps, desc=f"Period {period}: Processing timestamps", leave=False):
            # valid_timestamps is a subset of A's lagged timestamps, so the
            # key is always present; groupby groups are never empty either.
            idxs = timestamp_to_indices[timestamp]
            B_filtered = B_grouped.get_group(timestamp)

            kd_tree = cKDTree(B_filtered[['x', 'y']].values)
            points = A.loc[idxs, ['x', 'y']].values

            _, indices = kd_tree.query(points, k=number_of_stations)

            # With k == 1 the query returns 1-D arrays; restore the station axis.
            if indices.ndim == 1:
                indices = indices[:, np.newaxis]

            # When B has fewer than k rows for this timestamp, cKDTree.query
            # reports the missing neighbours with index == kd_tree.n, which is
            # out of bounds for the volume array. A NaN sentinel at exactly
            # that position turns them into NaN instead of an IndexError.
            station_volumes = B_filtered['volume'].to_numpy(dtype='float64')
            padded_volumes = np.append(station_volumes, np.nan)
            volumes = padded_volumes[indices]

            # One 2-D write per timestamp instead of one write per station.
            A.loc[idxs, period_columns] = volumes

    # Report only the generated columns: they are the only ones filled below.
    missing_values = A[volume_columns].isna().sum().sum()
    print(f'{missing_values} values are nan and thus set to 0.')
    A[volume_columns] = A[volume_columns].fillna(0)
    print(f'Number of errors encountered: {errors}')

    return A

In [7]:
# Traffic volumes of the 3 nearest stations, for the accident hour and the
# hour before it (periods 0 and 1).
accidents_traffic = assign_closest_volume(
    A=accidents,
    B=traffic,
    name='traffic',
    number_of_stations=3,
    number_of_periods=2,
)

Processing periods:   0%|          | 0/2 [00:00<?, ?it/s]

Period 0: Processing timestamps:   0%|          | 0/35750 [00:00<?, ?it/s]

Period 1: Processing timestamps:   0%|          | 0/35748 [00:00<?, ?it/s]

12 values are nan and thus set to 0.
Number of errors encountered: 0
Execution time: 108.377295 seconds


In [9]:
# Sanity check: per-column count of missing values after the traffic merge.
accidents_traffic.isna().sum()

AccidentType                   0
AccidentSeverityCategory       0
AccidentInvolvingPedestrian    0
AccidentInvolvingBicycle       0
AccidentInvolvingMotorcycle    0
RoadType                       0
x                              0
y                              0
year                           0
month                          0
weekday                        0
hour                           0
dateTime                       0
traffic_volume_0_period_0      0
traffic_volume_1_period_0      0
traffic_volume_2_period_0      0
traffic_volume_0_period_1      0
traffic_volume_1_period_1      0
traffic_volume_2_period_1      0
dtype: int64

In [10]:
# Pedestrian volumes of the 3 nearest counters, for the accident hour and the
# hour before it (periods 0 and 1).
accidents_traffic_pedestrian = assign_closest_volume(
    A=accidents_traffic,
    B=pedestrian,
    name='pedestrian',
    number_of_stations=3,
    number_of_periods=2,
)

Processing periods:   0%|          | 0/2 [00:00<?, ?it/s]

Period 0: Processing timestamps:   0%|          | 0/32701 [00:00<?, ?it/s]

Period 1: Processing timestamps:   0%|          | 0/32700 [00:00<?, ?it/s]

30369 values are nan and thus set to 0.
Number of errors encountered: 0
Execution time: 112.499400 seconds


In [11]:
# Sanity check: per-column count of missing values after the pedestrian merge.
accidents_traffic_pedestrian.isna().sum()

AccidentType                    0
AccidentSeverityCategory        0
AccidentInvolvingPedestrian     0
AccidentInvolvingBicycle        0
AccidentInvolvingMotorcycle     0
RoadType                        0
x                               0
y                               0
year                            0
month                           0
weekday                         0
hour                            0
dateTime                        0
traffic_volume_0_period_0       0
traffic_volume_1_period_0       0
traffic_volume_2_period_0       0
traffic_volume_0_period_1       0
traffic_volume_1_period_1       0
traffic_volume_2_period_1       0
pedestrian_volume_0_period_0    0
pedestrian_volume_1_period_0    0
pedestrian_volume_2_period_0    0
pedestrian_volume_0_period_1    0
pedestrian_volume_1_period_1    0
pedestrian_volume_2_period_1    0
dtype: int64

In [12]:
@timed_function
def assign_weather(A, B, number_of_periods, features):
    """Attach lagged weather features from ``B`` to every row of ``A``.

    For each ``period`` in ``range(number_of_periods)`` and each ``feature``
    in ``features``, the column ``{feature}_period_{period}`` receives the
    value of ``B[feature]`` from the row of ``B`` whose ``dateTime`` equals
    the row's ``dateTime`` minus ``period`` hours.

    Args:
        A: DataFrame with a ``dateTime`` column. It is copied, never modified.
        B: DataFrame with a ``dateTime`` column and one column per entry of
            ``features``. If a timestamp occurs more than once, its first row
            is used.
        number_of_periods: How many hourly lags (0, 1, ...) to attach.
        features: Names of the columns of ``B`` to attach.

    Returns:
        A copy of ``A`` with ``len(features) * number_of_periods`` extra
        columns. Values without a matching row in ``B`` are set to 0; how many
        were filled is printed.
    """
    # Work on a copy so the caller's frame is untouched and re-running the
    # calling cell always starts from the same input.
    A = A.copy()

    weather_columns = [f'{feature}_period_{period}'
                       for period in range(number_of_periods)
                       for feature in features]

    # Keep only the columns that are needed and one row per timestamp, so the
    # left merge below returns exactly one row per row of A, in A's order.
    weather_lookup = B[['dateTime', *features]].drop_duplicates(subset='dateTime')

    for period in tqdm(range(number_of_periods), desc="Processing periods"):
        # Merge just the lagged key column: this avoids copying all of A per
        # period and any clash between feature names and columns of A.
        lagged_keys = pd.DataFrame({'dateTime': A['dateTime'] - timedelta(hours=period)})
        merged = lagged_keys.merge(weather_lookup, how='left', on='dateTime')

        for feature in features:
            # The merge resets the index, so assign by position, not by label.
            A[f'{feature}_period_{period}'] = merged[feature].to_numpy()

    # Count the gaps before filling them; counting afterwards always yields 0.
    total_nans = A[weather_columns].isna().sum().sum()
    A[weather_columns] = A[weather_columns].fillna(0)
    print(f'{total_nans} values are nan and thus set to 0.')

    return A


In [13]:
# Weather columns to attach, each for the accident hour and the three hours
# before it (periods 0-3).
weather_features = [
    'temperature_2m',
    'precipitation',
    'snowfall',
    'snow_depth',
    'surface_pressure',
    'cloud_cover',
]
accidents_traffic_pedestrian_weather = assign_weather(
    A=accidents_traffic_pedestrian,
    B=weather,
    number_of_periods=4,
    features=weather_features,
)

Processing periods:   0%|          | 0/4 [00:00<?, ?it/s]

0 values are nan and thus set to 0.
Execution time: 0.273830 seconds


In [15]:
# Write the fully merged table twice, without the row index: once next to the
# other cleaned data sets and once under data/inference (presumably read by
# the inference code — confirm against its loader).
accidents_traffic_pedestrian_weather.to_csv('../data/clean/merged.csv', index=False)
accidents_traffic_pedestrian_weather.to_csv('../data/inference/historic_data.csv', index=False)

In [8]:
# List the final column set of the merged table.
accidents_traffic_pedestrian_weather.columns

Index(['AccidentType', 'AccidentSeverityCategory',
       'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
       'AccidentInvolvingMotorcycle', 'RoadType', 'x', 'y', 'year', 'month',
       'weekday', 'hour', 'dateTime', 'traffic_volume_0_period_0',
       'traffic_volume_1_period_0', 'traffic_volume_2_period_0',
       'traffic_volume_0_period_1', 'traffic_volume_1_period_1',
       'traffic_volume_2_period_1', 'pedestrian_volume_0_period_0',
       'pedestrian_volume_1_period_0', 'pedestrian_volume_2_period_0',
       'pedestrian_volume_0_period_1', 'pedestrian_volume_1_period_1',
       'pedestrian_volume_2_period_1', 'temperature_2m_period_0',
       'precipitation_period_0', 'snowfall_period_0', 'snow_depth_period_0',
       'surface_pressure_period_0', 'cloud_cover_period_0',
       'temperature_2m_period_1', 'precipitation_period_1',
       'snowfall_period_1', 'snow_depth_period_1', 'surface_pressure_period_1',
       'cloud_cover_period_1', 'temperature_2m_period_2'