In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


# index,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),City,County,State,Weather_Timestamp,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Accident_Date,Accident_Year,Accident_Month,Accident_Day,accident_weekday,is_weekend,is_holiday,Accident_Time

# Show every column when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns', None)

# Columns discarded immediately after loading.
columns_to_drop = ['index', 'Start_Time', 'End_Time', 'Distance(mi)', 'City', 'County', 'State', 'Accident_Date']

# Read the processed Los Angeles accidents CSV and remove the columns listed above.
data = pd.read_csv('../data/processed/Los_Angeles_Accidents_Complete_2016_2023.csv').drop(columns=columns_to_drop)

2024-08-09 17:55:00.387107: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-09 17:55:00.405920: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def get_years(data_frame, column):
  """Return the year parsed from a column of timestamp strings.

  Args:
    data_frame: DataFrame that contains ``column``.
    column: Name of a string column whose first four characters are the
      year (e.g. ``2016-06-21 10:47:00``).

  Returns:
    Series aligned with ``data_frame`` holding characters 0-3 of every
    entry converted to a number. Missing entries yield NaN instead of
    raising.
  """
  # Slice and convert the whole column in one vectorized pass rather than
  # calling pd.to_numeric once per row through ``apply``.
  return pd.to_numeric(data_frame[column].str[0:4])

def get_months(data_frame, column):
  """Return the month parsed from a column of timestamp strings.

  Args:
    data_frame: DataFrame that contains ``column``.
    column: Name of a string column whose characters 5-6 are the month
      (e.g. ``2016-06-21 10:47:00``).

  Returns:
    Series aligned with ``data_frame`` holding characters 5-6 of every
    entry converted to a number. Missing entries yield NaN instead of
    raising.
  """
  # Slice and convert the whole column in one vectorized pass rather than
  # calling pd.to_numeric once per row through ``apply``.
  return pd.to_numeric(data_frame[column].str[5:7])

def get_days(data_frame, column):
  """Return the day of month parsed from a column of timestamp strings.

  Args:
    data_frame: DataFrame that contains ``column``.
    column: Name of a string column whose characters 8-9 are the day
      (e.g. ``2016-06-21 10:47:00``).

  Returns:
    Series aligned with ``data_frame`` holding characters 8-9 of every
    entry converted to a number. Missing entries yield NaN instead of
    raising.
  """
  # Slice and convert the whole column in one vectorized pass rather than
  # calling pd.to_numeric once per row through ``apply``.
  return pd.to_numeric(data_frame[column].str[8:10])

def get_hours(data_frame, column):
  """Return the hour parsed from a column of timestamp strings.

  Args:
    data_frame: DataFrame that contains ``column``.
    column: Name of a string column whose characters 11-12 are the hour
      (e.g. ``2016-06-21 10:47:00``).

  Returns:
    Series aligned with ``data_frame`` holding characters 11-12 of every
    entry converted to a number. Missing entries yield NaN instead of
    raising.
  """
  # Slice and convert the whole column in one vectorized pass rather than
  # calling pd.to_numeric once per row through ``apply``.
  return pd.to_numeric(data_frame[column].str[11:13])

# Break the Weather_Timestamp string into numeric year / month / day / hour
# columns (the "_Time" column holds the hour).
data = data.assign(
  Weather_Timestamp_Year=get_years(data, 'Weather_Timestamp'),
  Weather_Timestamp_Month=get_months(data, 'Weather_Timestamp'),
  Weather_Timestamp_Day=get_days(data, 'Weather_Timestamp'),
  Weather_Timestamp_Time=get_hours(data, 'Weather_Timestamp'),
)

data

Unnamed: 0,Severity,Start_Lat,Start_Lng,Weather_Timestamp,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Weather_Condition,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Accident_Year,Accident_Month,Accident_Day,accident_weekday,is_weekend,is_holiday,Accident_Time,Weather_Timestamp_Year,Weather_Timestamp_Month,Weather_Timestamp_Day,Weather_Timestamp_Time
0,2,34.078926,-118.289040,2016-06-21 10:47:00,82.9,64.925012,47.0,29.95,10.0,Variable,4.6,0.003435,Clear,False,False,False,False,True,False,False,False,False,False,False,False,False,Day,2016,6,21,1,False,False,10,2016,6,21,10
1,3,34.091179,-118.239471,2016-06-21 10:47:00,82.9,64.925012,47.0,29.95,10.0,Variable,4.6,0.003435,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,2016,6,21,1,False,False,10,2016,6,21,10
2,3,34.037239,-118.309074,2016-06-21 10:47:00,82.9,64.925012,47.0,29.95,10.0,Variable,4.6,0.003435,Clear,False,False,False,False,False,False,False,False,True,False,False,False,False,Day,2016,6,21,1,False,False,10,2016,6,21,10
3,3,34.027458,-118.274490,2016-06-21 10:47:00,82.9,64.925012,47.0,29.95,10.0,Variable,4.6,0.003435,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,2016,6,21,1,False,False,10,2016,6,21,10
4,3,33.947544,-118.279434,2016-06-21 11:53:00,80.1,64.925012,52.0,29.96,10.0,ESE,9.2,0.003435,Clear,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,2016,6,21,1,False,False,11,2016,6,21,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156486,2,34.036930,-118.438770,2019-08-22 16:51:00,72.0,72.000000,76.0,29.57,10.0,SW,8.0,0.000000,Fair,False,False,False,False,True,False,False,False,False,False,False,False,False,Day,2019,8,22,3,False,False,17,2019,8,22,16
156487,3,34.075790,-118.276680,2019-08-23 03:52:00,67.0,67.000000,79.0,29.62,7.0,CALM,0.0,0.000000,Cloudy,False,False,False,False,False,False,False,False,False,False,False,False,False,Night,2019,8,23,4,False,False,4,2019,8,23,3
156488,2,34.023790,-118.276390,2019-08-23 12:52:00,81.0,81.000000,49.0,29.66,8.0,CALM,0.0,0.000000,Fair,False,False,False,False,False,False,False,False,False,False,False,False,False,Day,2019,8,23,4,False,False,12,2019,8,23,12
156489,2,34.070610,-118.263910,2019-08-23 13:52:00,82.0,82.000000,47.0,29.65,9.0,CALM,0.0,0.000000,Fair,False,False,False,False,True,False,False,False,False,False,False,False,False,Day,2019,8,23,4,False,False,13,2019,8,23,13


In [3]:
# The raw timestamp string is no longer needed once its parts are extracted.
data = data.drop(columns=['Weather_Timestamp'])

In [4]:
def onehot_encode(dataframe, columns, prefixies):
  """Return a copy of ``dataframe`` with the given columns one-hot encoded.

  Args:
    dataframe: Source DataFrame; it is copied, not modified.
    columns: Names of the columns to encode.
    prefixies: Prefixes for the generated indicator columns, paired with
      ``columns`` by position.

  Returns:
    A new DataFrame in which every listed column has been removed and its
    integer indicator columns appended on the right.
  """
  encoded = dataframe.copy()
  for source_name, label in zip(columns, prefixies):
    indicators = pd.get_dummies(encoded[source_name], prefix=label, dtype='int')
    # Append the indicator columns, then discard the column they came from.
    encoded = pd.concat([encoded, indicators], axis=1).drop(columns=source_name)
  return encoded

In [5]:
# Count the distinct values in every object-dtype column.
{
  name: len(data[name].unique())
  for name, kind in data.dtypes.items()
  if kind == 'object'
}

{'Wind_Direction': 24, 'Weather_Condition': 33, 'Sunrise_Sunset': 2}

In [6]:
# One-hot encode the two multi-valued text columns: WD_* for wind direction,
# WC_* for weather condition.
data = onehot_encode(data, columns=['Wind_Direction', 'Weather_Condition'], prefixies=['WD', 'WC'])

In [8]:
def convert_binary_to_boolean(dataframe, column):
  """Encode a two-valued column as integers: 1 where it equals 'Day', else 0.

  Args:
    dataframe: DataFrame that contains ``column``.
    column: Name of the column to encode.

  Returns:
    An int64 Series aligned with ``dataframe``. Any entry other than the
    exact string 'Day' (including missing values) maps to 0. Despite the
    function name, the result holds integers rather than booleans.
  """
  # Vectorized comparison instead of a per-row Python lambda; the explicit
  # int64 cast keeps the result dtype fixed.
  return (dataframe[column] == 'Day').astype('int64')

In [11]:
# Replace Sunrise_Sunset with its integer encoding (1 for 'Day', 0 otherwise).
data = data.assign(Sunrise_Sunset=convert_binary_to_boolean(data, 'Sunrise_Sunset'))

In [14]:
# Separate the target (Severity) from the feature columns; both are copies of `data`.
X = data.drop(columns=['Severity']).copy()
y = data.loc[:, 'Severity'].copy()

In [15]:
# Shift the severity labels down by one so that they run from 0 to 3, the
# integer class indices that sparse_categorical_crossentropy expects.
# Computed from `data` rather than from `y` itself so that re-running this
# cell cannot subtract 1 a second time.
y = data['Severity'] - 1

In [17]:
# Cast every feature column to float64.
X = X.astype('float64')

In [18]:
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# mean/variance of the held-out rows leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows. `X` itself is left unscaled, which
# also keeps this cell safe to re-run.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
# Fully connected classifier: two 64-unit ReLU layers feeding a 4-unit
# softmax, one unit per severity class (labels 0-3).
# `shape` must be a tuple: `(n)` without the trailing comma is just the int n,
# which newer Keras releases refuse to interpret as a shape.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(4, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)

# Sparse loss because `y` holds integer class indices rather than one-hot rows.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 32
epochs = 20

# The last 20% of the training rows are held out for validation, and training
# stops once val_loss has failed to improve for 3 consecutive epochs.
# NOTE(review): ReduceLROnPlateau is left at its defaults; if its default
# patience is longer than the EarlyStopping patience below, training will
# usually stop before the learning rate is ever reduced — confirm intent.
history = model.fit(X_train, y_train, validation_split=0.2, batch_size=batch_size, epochs=epochs, callbacks=[
  tf.keras.callbacks.ReduceLROnPlateau(),
  tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3
  )
])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
