In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

### Read Data

In [2]:
df = pd.read_csv('C:/Users/anton/dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,date,air_temperature,water_temperature,wind_gust_max_10min,wind_speed_avg_10min,wind_force_avg_10min,wind_direction,windchill,barometric_pressure_qfe,...,Hour_14.0,Hour_15.0,Hour_16.0,Hour_17.0,Hour_18.0,Hour_19.0,Hour_20.0,Hour_21.0,Hour_22.0,Hour_23.0
0,0,2011-01-01 00:30:00,2.233333,5.2,2.4,1.216667,1.216667,1785,2.2,974.55,...,0,0,0,0,0,0,0,0,0,0
1,1,2011-01-01 01:30:00,2.38,5.2,2.8,0.86,0.86,1076,2.16,973.98,...,0,0,0,0,0,0,0,0,0,0
2,2,2011-01-01 02:30:00,2.58,5.14,1.2,0.34,0.34,1159,2.58,973.64,...,0,0,0,0,0,0,0,0,0,0
3,3,2011-01-01 02:30:00,2.58,5.14,1.2,0.34,0.34,1159,2.58,973.64,...,0,0,0,0,0,0,0,0,0,0
4,4,2011-01-01 03:30:00,2.5,5.16,1.9,0.52,0.52,1122,2.54,973.42,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,date,air_temperature,water_temperature,wind_gust_max_10min,wind_speed_avg_10min,wind_force_avg_10min,wind_direction,windchill,barometric_pressure_qfe,...,Hour_14.0,Hour_15.0,Hour_16.0,Hour_17.0,Hour_18.0,Hour_19.0,Hour_20.0,Hour_21.0,Hour_22.0,Hour_23.0
0,0,2011-01-01 00:30:00,2.233333,5.2,2.4,1.216667,1.216667,1785,2.2,974.55,...,0,0,0,0,0,0,0,0,0,0
1,1,2011-01-01 01:30:00,2.38,5.2,2.8,0.86,0.86,1076,2.16,973.98,...,0,0,0,0,0,0,0,0,0,0
2,2,2011-01-01 02:30:00,2.58,5.14,1.2,0.34,0.34,1159,2.58,973.64,...,0,0,0,0,0,0,0,0,0,0
3,3,2011-01-01 02:30:00,2.58,5.14,1.2,0.34,0.34,1159,2.58,973.64,...,0,0,0,0,0,0,0,0,0,0
4,4,2011-01-01 03:30:00,2.5,5.16,1.9,0.52,0.52,1122,2.54,973.42,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.columns

Index(['Unnamed: 0', 'date', 'air_temperature', 'water_temperature',
       'wind_gust_max_10min', 'wind_speed_avg_10min', 'wind_force_avg_10min',
       'wind_direction', 'windchill', 'barometric_pressure_qfe',
       'precipitation', 'dew_point', 'global_radiation', 'humidity',
       'water_level', 'AccidentInvolvingPedestrian',
       'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle', 'Year',
       'Accident', 'AccidentType_at00', 'AccidentType_at1', 'AccidentType_at2',
       'AccidentType_at3', 'AccidentType_at4', 'AccidentType_at5',
       'AccidentType_at6', 'AccidentType_at7', 'AccidentType_at8',
       'AccidentType_at9', 'AccidentSeverityCategory_as2',
       'AccidentSeverityCategory_as3', 'AccidentSeverityCategory_as4',
       'WeekDay_Monday', 'WeekDay_Saturday', 'WeekDay_Sunday',
       'WeekDay_Thursday', 'WeekDay_Tuesday', 'WeekDay_Wednesday', 'Month_2.0',
       'Month_3.0', 'Month_4.0', 'Month_5.0', 'Month_6.0', 'Month_7.0',
       'Month_8.0', 'Month_9.0', 

## I) Binary Classification

**Create new dataset containing the relevant variables:**

In [5]:
df = df[df.columns[~df.columns.isin([col for col in df.columns if col.startswith('Accident') and col != 'Accident'])]]

### a) Neural Network

**Create Input and Output Data:**

In [6]:
X = df[df.columns[~df.columns.isin(['date', 'Accident', 'Unnamed: 0', 'Year'])]]
y = df['Accident']

**Create train and test datasets:**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=69)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Model Parameters:**

In [8]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

**Define Custom Dataloaders:**

In [9]:
## train data
class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = trainData(torch.FloatTensor(X_train), 
                       torch.FloatTensor(y_train))
## test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = testData(torch.FloatTensor(X_test))

In [10]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

**Define Neural Net Architecture**
<img src="https://miro.medium.com/max/1400/0*CLjAAd7s6o0yfEYZ.jpg"
     alt="NN"
     style="float: left; margin-right: 10px;" />

In [11]:
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        # Number of input features is 53.
        self.layer_1 = nn.Linear(53, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [12]:
model = binaryClassification()
device = torch.device('cpu')
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

binaryClassification(
  (layer_1): Linear(in_features=53, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


**Accuracy function:**

In [13]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [14]:
y_train.shape

(61694,)

In [15]:
for X_batch, y_batch in train_loader:
    print(y_batch.unsqueeze(1).shape)

torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])


torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])


**Train the model:**

In [17]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.58663 | Acc: 69.546
Epoch 002: | Loss: 0.56832 | Acc: 70.836
Epoch 003: | Loss: 0.56603 | Acc: 71.051
Epoch 004: | Loss: 0.56336 | Acc: 71.175
Epoch 005: | Loss: 0.56173 | Acc: 71.376
Epoch 006: | Loss: 0.55938 | Acc: 71.400
Epoch 007: | Loss: 0.55871 | Acc: 71.527
Epoch 008: | Loss: 0.55723 | Acc: 71.492
Epoch 009: | Loss: 0.55588 | Acc: 71.612
Epoch 010: | Loss: 0.55366 | Acc: 71.800
Epoch 011: | Loss: 0.55303 | Acc: 71.744
Epoch 012: | Loss: 0.55084 | Acc: 72.076
Epoch 013: | Loss: 0.55027 | Acc: 71.941
Epoch 014: | Loss: 0.54873 | Acc: 72.020
Epoch 015: | Loss: 0.54716 | Acc: 72.186
Epoch 016: | Loss: 0.54589 | Acc: 72.200
Epoch 017: | Loss: 0.54529 | Acc: 72.460
Epoch 018: | Loss: 0.54353 | Acc: 72.563
Epoch 019: | Loss: 0.54233 | Acc: 72.544
Epoch 020: | Loss: 0.54162 | Acc: 72.638
Epoch 021: | Loss: 0.54004 | Acc: 72.657
Epoch 022: | Loss: 0.53932 | Acc: 72.565
Epoch 023: | Loss: 0.53813 | Acc: 72.817
Epoch 024: | Loss: 0.53736 | Acc: 72.835
Epoch 025: | Los

**Test the model:**

In [18]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

**Classification Report**

In [19]:
print(classification_report(y_test, y_pred_list))

              precision    recall  f1-score   support

           0       0.74      0.70      0.72     16332
           1       0.67      0.71      0.69     14055

    accuracy                           0.70     30387
   macro avg       0.70      0.70      0.70     30387
weighted avg       0.70      0.70      0.70     30387



### b) Random Forest

In [22]:
#output label
target = np.array(df['Accident'])

#input features
features = df[df.columns[~df.columns.isin(['date','Accident', 'Unnamed: 0', 'Year'])]]
feature_list = list(features.columns)
features = np.array(features)

#split dataset into training and testing
train_features, test_features, train_target, test_target = train_test_split(features, 
target, test_size = 0.25, random_state = 10)


# Instantiate model with 100 decision trees
clf = RandomForestClassifier(n_estimators= 100)

# Train the model on training data
clf.fit(train_features, train_target)

predictions = clf.predict(test_features)
print("Accuracy:", metrics.accuracy_score(test_target, predictions))

#check contributions to prediction
feature_names = df.columns[~df.columns.isin(['date','Accident', 'Unnamed: 0', 'Year'])]
feature_imp = pd.Series(clf.feature_importances_, index=feature_names.sort_values(ascending=False))
feature_imp

Accuracy: 0.7466660874853395


windchill                  0.062942
wind_speed_avg_10min       0.059713
wind_gust_max_10min        0.054272
wind_force_avg_10min       0.055062
wind_direction             0.054914
water_temperature          0.063789
water_level                0.061920
precipitation              0.063550
humidity                   0.011462
global_radiation           0.059450
dew_point                  0.117756
barometric_pressure_qfe    0.069967
air_temperature            0.050164
WeekDay_Wednesday          0.006634
WeekDay_Tuesday            0.009059
WeekDay_Thursday           0.017588
WeekDay_Sunday             0.006671
WeekDay_Saturday           0.006321
WeekDay_Monday             0.006661
Month_9.0                  0.002905
Month_8.0                  0.003373
Month_7.0                  0.003845
Month_6.0                  0.004040
Month_5.0                  0.003722
Month_4.0                  0.003451
Month_3.0                  0.003316
Month_2.0                  0.003788
Month_12.0                 0