In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import StandardScaler, MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

### Read Data

In [None]:
df = pd.read_csv('datasets/dataset.csv')
df.head()

In [None]:
df.head()

In [None]:
df.columns

**Class Distribution**

In [None]:
plt.title('Fig.1) Accident Distribution', fontweight = 'bold')
sns.countplot(x = 'Accident', data=df, facecolor=(0, 0, 0, 0),
                   linewidth=5, edgecolor=sns.color_palette("dark", 3))
plt.ylabel('Count', fontweight = 'bold')
plt.xlabel('Accident', fontweight = 'bold')
plt.show()

## I) Binary Classification

**Create new dataset containing the relevant variables:**

In [None]:
df = df[df.columns[~df.columns.isin([col for col in df.columns if col.startswith('Accident') and col != 'Accident'])]]

### a) Neural Network

**Create Input and Output Data:**

In [None]:
X = df[df.columns[~df.columns.isin(['date', 'Accident', 'Unnamed: 0'])]]
y = df['Accident']

**Create train and test datasets:**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=69)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

**Model Parameters:**

In [None]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

**Define Custom Dataloaders:**

In [None]:
## train data
class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = trainData(torch.FloatTensor(X_train), 
                       torch.FloatTensor(y_train))
## test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = testData(torch.FloatTensor(X_test))

In [None]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

**Define Neural Net Architecture**
<img src="https://miro.medium.com/max/1400/0*CLjAAd7s6o0yfEYZ.jpg"
     alt="NN"
     style="float: left; margin-right: 10px;" />

In [None]:
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        # Number of input features is 53.
        self.layer_1 = nn.Linear(53, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [None]:
model = binaryClassification()
device = torch.device('cpu')
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

**Accuracy function:**

In [None]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [None]:
y_train.shape

In [None]:
for X_batch, y_batch in train_loader:
    print(y_batch.unsqueeze(1).shape)

**Train the model:**

In [None]:
y_pred.shape

In [None]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

**Test the model:**

In [None]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

**Classification Report**

In [None]:
print(classification_report(y_test, y_pred_list))

### b) Random Forest

In [None]:
#output label
target = np.array(df['Accident'])

#input features
features = df[df.columns[~df.columns.isin(['date','Accident', 'Unnamed: 0'])]]
feature_list = list(features.columns)
features = np.array(features)

#split dataset into training and testing
train_features, test_features, train_target, test_target = train_test_split(features, 
target, test_size = 0.25, random_state = 10)


# Instantiate model with 100 decision trees
clf = RandomForestClassifier(n_estimators= 100)

# Train the model on training data
clf.fit(train_features, train_target)

predictions = clf.predict(test_features)
print("Accuracy:", metrics.accuracy_score(test_target, predictions))

#check contributions to prediction
feature_names = df.columns[~df.columns.isin(['date','Accident', 'Unnamed: 0'])]
feature_imp = pd.Series(clf.feature_importances_, index=feature_names.sort_values(ascending=False))
feature_imp

## II) Mulit-Class Classification

**Notes:**
<br>
***a)***
<br>
Multi-class prediction for AccidentSeverity
<br>
***b)***
<br>
Multi-class prediction for AccidentType

In [2]:
df = pd.read_csv('datasets/dataset.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'date', 'air_temperature', 'water_temperature',
       'wind_gust_max_10min', 'wind_speed_avg_10min', 'wind_force_avg_10min',
       'wind_direction', 'windchill', 'barometric_pressure_qfe',
       'precipitation', 'dew_point', 'global_radiation', 'humidity',
       'water_level', 'AccidentInvolvingPedestrian',
       'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle', 'Accident',
       'AccidentType_at00', 'AccidentType_at1', 'AccidentType_at2',
       'AccidentType_at3', 'AccidentType_at4', 'AccidentType_at5',
       'AccidentType_at6', 'AccidentType_at7', 'AccidentType_at8',
       'AccidentType_at9', 'AccidentSeverityCategory_as2',
       'AccidentSeverityCategory_as3', 'AccidentSeverityCategory_as4',
       'WeekDay_Monday', 'WeekDay_Saturday', 'WeekDay_Sunday',
       'WeekDay_Thursday', 'WeekDay_Tuesday', 'WeekDay_Wednesday', 'Month_2.0',
       'Month_3.0', 'Month_4.0', 'Month_5.0', 'Month_6.0', 'Month_7.0',
       'Month_8.0', 'Month_9.0', 'Month_1

### - Accident Severity prediction

### i) Neural Network

**Create Input and Output Data:**

In [4]:
df.columns

Index(['Unnamed: 0', 'date', 'air_temperature', 'water_temperature',
       'wind_gust_max_10min', 'wind_speed_avg_10min', 'wind_force_avg_10min',
       'wind_direction', 'windchill', 'barometric_pressure_qfe',
       'precipitation', 'dew_point', 'global_radiation', 'humidity',
       'water_level', 'AccidentInvolvingPedestrian',
       'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle', 'Accident',
       'AccidentType_at00', 'AccidentType_at1', 'AccidentType_at2',
       'AccidentType_at3', 'AccidentType_at4', 'AccidentType_at5',
       'AccidentType_at6', 'AccidentType_at7', 'AccidentType_at8',
       'AccidentType_at9', 'AccidentSeverityCategory_as2',
       'AccidentSeverityCategory_as3', 'AccidentSeverityCategory_as4',
       'WeekDay_Monday', 'WeekDay_Saturday', 'WeekDay_Sunday',
       'WeekDay_Thursday', 'WeekDay_Tuesday', 'WeekDay_Wednesday', 'Month_2.0',
       'Month_3.0', 'Month_4.0', 'Month_5.0', 'Month_6.0', 'Month_7.0',
       'Month_8.0', 'Month_9.0', 'Month_1

In [5]:
cols = ['air_temperature', 'water_temperature',
       'wind_gust_max_10min', 'wind_speed_avg_10min', 'wind_force_avg_10min',
       'wind_direction', 'windchill', 'barometric_pressure_qfe',
       'precipitation', 'dew_point', 'global_radiation', 'humidity',
       'water_level', 'AccidentType_at00', 'AccidentType_at1', 'AccidentType_at2',
       'AccidentType_at3', 'AccidentType_at4', 'AccidentType_at5',
       'AccidentType_at6', 'AccidentType_at7', 'AccidentType_at8',
       'AccidentType_at9', 'WeekDay_Monday', 'WeekDay_Saturday', 'WeekDay_Sunday',
       'WeekDay_Thursday', 'WeekDay_Tuesday', 'WeekDay_Wednesday', 'Month_2.0',
       'Month_3.0', 'Month_4.0', 'Month_5.0', 'Month_6.0', 'Month_7.0',
       'Month_8.0', 'Month_9.0', 'Month_10.0', 'Month_11.0', 'Month_12.0',
       'Hour_1.0', 'Hour_2.0', 'Hour_3.0', 'Hour_4.0', 'Hour_5.0', 'Hour_6.0',
       'Hour_7.0', 'Hour_8.0', 'Hour_9.0', 'Hour_10.0', 'Hour_11.0',
       'Hour_12.0', 'Hour_13.0', 'Hour_14.0', 'Hour_15.0', 'Hour_16.0',
       'Hour_17.0', 'Hour_18.0', 'Hour_19.0', 'Hour_20.0', 'Hour_21.0',
       'Hour_22.0', 'Hour_23.0']

#create data frame for prediction
df_nn_acc_type = df[cols]

In [7]:
pred_cols = list(df_nn_acc_type.columns[[col.startswith('Accident') for col in df_nn_acc_type.columns]])

X = df_nn_acc_type[df_nn_acc_type.columns[~df_nn_acc_type.columns.isin(pred_cols)]]
y = df_nn_acc_type[pred_cols]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)

X_train, X_test = X_train.astype(float), X_test.astype(float)
y_train, y_test = y_train.astype(float), y_test.astype(float)

In [10]:
## train data
class BinaryDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __len__ (self):
        return len(self.X_data)
    
    def __getitem__(self, index):
        
        features = self.X_data[index,:]
        labels = self.y_data[index,:]
        
        features = torch.tensor(features, dtype=torch.float32)
        # there are 10 classes and each class can have a binary value ...
        # ... either 0 or 1
        label1 = torch.tensor(labels[0], dtype=torch.float32)
        label2 = torch.tensor(labels[1], dtype=torch.float32)
        label3 = torch.tensor(labels[2], dtype=torch.float32)
        label4 = torch.tensor(labels[3], dtype=torch.float32)
        label5 = torch.tensor(labels[4], dtype=torch.float32)
        label6 = torch.tensor(labels[5], dtype=torch.float32)
        label7 = torch.tensor(labels[6], dtype=torch.float32)
        label8 = torch.tensor(labels[7], dtype=torch.float32)
        label9 = torch.tensor(labels[8], dtype=torch.float32)
        label10 = torch.tensor(labels[9], dtype=torch.float32)
        
        return {
            'features': features,
            'label1': label1,
            'label2': label2,
            'label3': label3,
            'label4': label4,
            'label5': label5,
            'label6': label6,
            'label7': label7,
            'label8': label8,
            'label9': label9,
            'label10': label10,
        
        }
        

In [18]:
class MultiHeadBinaryModel(nn.Module):
    def __init__(self):
        super(MultiHeadBinaryModel, self).__init__()
        self.fc1 = nn.Linear(53, 32) # 12 is the number of features
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 10)
        
        # we will treat each head as a binary classifier ...
        # ... so the output features will be 1
        self.out1 = nn.Linear(10, 1)
        self.out2 = nn.Linear(10, 1)
        self.out3 = nn.Linear(10, 1)
        self.out4 = nn.Linear(10, 1)
        self.out5 = nn.Linear(10, 1)
        self.out6 = nn.Linear(10, 1)
        self.out7 = nn.Linear(10, 1)
        self.out8 = nn.Linear(10, 1)
        self.out9 = nn.Linear(10, 1)
        self.out10 = nn.Linear(10, 1)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        
        # each binary classifier head will have its own output
        out1 = torch.sigmoid(self.out1(x))
        out2 = torch.sigmoid(self.out2(x))
        out3 = torch.sigmoid(self.out3(x))
        out4 = torch.sigmoid(self.out4(x))
        out5 = torch.sigmoid(self.out5(x))
        out6 = torch.sigmoid(self.out6(x))
        out7 = torch.sigmoid(self.out7(x))
        out8 = torch.sigmoid(self.out8(x))
        out9 = torch.sigmoid(self.out9(x))
        out10 = torch.sigmoid(self.out10(x))
        
        return out1, out2, out3, out4, out5, out6, out7, out8, out9, out10

In [19]:
def binary_loss_fn(outputs, targets):
    o1, o2, o3, o4, o5, o6, o7, o8, o9, o10 = outputs
    t1, t2, t3, t4, t5, t6, t7, t8, t9, t10 = targets
    l1 = nn.BCELoss()(o1, t1)
    l2 = nn.BCELoss()(o2, t2)
    l3 = nn.BCELoss()(o3, t3)
    l4 = nn.BCELoss()(o4, t4)
    l5 = nn.BCELoss()(o5, t5)
    l6 = nn.BCELoss()(o6, t6)
    l7 = nn.BCELoss()(o7, t7)
    l8 = nn.BCELoss()(o8, t8)
    l9 = nn.BCELoss()(o9, t9)
    l10 = nn.BCELoss()(o10, t10)
    return (l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + l9 + l10) / 10

In [20]:
train_dataset = BinaryDataset(X_train, y_train)
# train data loader
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
model = MultiHeadBinaryModel()

In [21]:
# training function
def train(model, dataloader, optimizer, loss_fn, train_dataset, device):
    model.train()
    counter = 0
    train_running_loss = 0.0
    for i, data in tqdm(enumerate(dataloader), total=int(len(train_dataset)/dataloader.batch_size)):
        counter += 1
        
        # extract the features and labels
        features = data['features'].to(device)
        target1 = data['label1'].to(device)
        target2 = data['label2'].to(device)
        target3 = data['label3'].to(device)
        target4 = data['label4'].to(device)
        target5 = data['label5'].to(device)
        target6 = data['label6'].to(device)
        target7 = data['label7'].to(device)
        target8 = data['label8'].to(device)
        target9 = data['label9'].to(device)
        target10 = data['label10'].to(device)
        
        # zero-out the optimizer gradients
        optimizer.zero_grad()
        
        outputs = model(features)
        targets = (target1, target2, target3, target4, target5, target6, target7, target8, target9, target10)
        loss = loss_fn(outputs, targets)
        train_running_loss += loss.item()
        
        # backpropagation
        loss.backward()
        # update optimizer parameters
        optimizer.step()
        
    train_loss = train_running_loss / counter
    return train_loss

In [22]:
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 100
# load the model on to the computation device
model.to(device)

MultiHeadBinaryModel(
  (fc1): Linear(in_features=53, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=10, bias=True)
  (out1): Linear(in_features=10, out_features=1, bias=True)
  (out2): Linear(in_features=10, out_features=1, bias=True)
  (out3): Linear(in_features=10, out_features=1, bias=True)
  (out4): Linear(in_features=10, out_features=1, bias=True)
  (out5): Linear(in_features=10, out_features=1, bias=True)
  (out6): Linear(in_features=10, out_features=1, bias=True)
  (out7): Linear(in_features=10, out_features=1, bias=True)
  (out8): Linear(in_features=10, out_features=1, bias=True)
  (out9): Linear(in_features=10, out_features=1, bias=True)
  (out10): Linear(in_features=10, out_features=1, bias=True)
)

In [23]:
loss_fn = binary_loss_fn

In [24]:
train_loss = []
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss = train(
        model, train_dataloader, optimizer, binary_loss_fn, train_dataset, device
    )
    train_loss.append(train_epoch_loss)
    print(f"Train Loss: {train_epoch_loss:.4f}")
torch.save(model.state_dict(), 'outputs/multi_head_binary.pth')

  0%|                                                                                          | 0/963 [00:00<?, ?it/s]

Epoch 1 of 100





ValueError: Using a target size (torch.Size([64])) that is different to the input size (torch.Size([64, 1])) is deprecated. Please ensure they have the same size.

In [None]:
model = MultiHeadBinaryModel()
device = torch.device('cpu')
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
class multiClassification(nn.Module):
    def __init__(self):
        super(multiClassification, self).__init__()
        # Number of input features is 53.
        self.layer_1 = nn.Linear(53, 64)
        self.layer_2 = nn.Linear(64, 32)
        self.layer_out = nn.Linear(32, 10)
        
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
        
    def forward(self, inputs):
        x = self.layer_1(inputs)
        x = self.relu(x)
        x = self.layer_2(x)
        x = self.relu(x)
        x = self.layer_out(x)
        x = self.softmax(x)
        
        return x

In [None]:
def multi_acc(y_pred, y_test):
    y_pred_tag = torch.round(y_pred)

    correct_results_sum = (y_pred_tag == y_test).sum()
    acc = correct_results_sum/(y_test.shape[0]*y_test.shape[1]*100)
    
    return acc

In [None]:
len(train_loader)

In [None]:
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch)
        acc = multi_acc(y_pred, y_batch)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.5f}')

In [None]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [None]:
multi_acc(torch.Tensor(np.array(y_pred_list)), torch.Tensor(y_test))

### ii) Random Forest

In [None]:
#list of columns that are not necessary for prediction
cols = ['Unnamed: 0', 'date', 'AccidentInvolvingPedestrian',
       'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle', 'Accident',
        'AccidentType_at00', 'AccidentType_at1', 'AccidentType_at2',
       'AccidentType_at3', 'AccidentType_at4', 'AccidentType_at5',
       'AccidentType_at6', 'AccidentType_at7', 'AccidentType_at8',
       'AccidentType_at9']


#create data frame for Random Forest prediction
df_rf_acc_type = df[df.columns[~df.columns.isin(cols)]]
df_rf_acc_type.columns

**Define targets and labels and run the model:**

In [None]:
#output label
output_class = df_rf_acc_type[[col for col in df_rf_acc_type.columns if col.startswith('Accident')]]
target = np.array(output_class)

#input features
features = df_rf_acc_type[df_rf_acc_type.columns[~df_rf_acc_type.columns.isin(output_class)]]
feature_list = list(features.columns)
features = np.array(features)

#split dataset into training and testing
train_features, test_features, train_target, test_target = train_test_split(features, 
target, test_size = 0.33, random_state = 32)


# Instantiate model with 100 decision trees
clf = RandomForestClassifier(n_estimators= 250)

# Train the model on training data
clf.fit(train_features, train_target)

predictions = clf.predict(test_features)
print("Accuracy:", metrics.accuracy_score(test_target, predictions))

In [None]:
predictions, test_target

### - Accident Type prediction

### i) Neural Network

In [None]:
df.columns


In [None]:
cols = ['Unnamed: 0', 'date', 'AccidentInvolvingPedestrian',
       'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle', 'Accident',
        'AccidentType_at00', 'AccidentType_at1', 'AccidentType_at2',
       'AccidentType_at3', 'AccidentType_at4', 'AccidentType_at5',
       'AccidentType_at6', 'AccidentType_at7', 'AccidentType_at8',
       'AccidentType_at9']

In [None]:
#create data frame for Random Forest prediction
df_nn_acc_type = df[df.columns[~df.columns.isin(cols)]]

X = df_nn_acc_type[df_nn_acc_type.columns[~df_nn_acc_type.columns.isin(cols)]]
y = df_nn_acc_type[['AccidentSeverityCategory_as2', 'AccidentSeverityCategory_as3', 'AccidentSeverityCategory_as4']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=69)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [None]:
## train data
class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = trainData(torch.FloatTensor(X_train), 
                       torch.FloatTensor(y_train))
## test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = testData(torch.FloatTensor(X_test))


In [None]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)


In [None]:
class multiClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        # Number of input features is 53.
        self.layer_1 = nn.Linear(53, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [None]:
model = binaryClassification()
device = torch.device('cpu')
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc


In [None]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

### ii) Random Forest

In [None]:
#List of columns that are necessary for prediction
cols = ['air_temperature', 'water_temperature',
       'wind_gust_max_10min', 'wind_speed_avg_10min', 'wind_force_avg_10min',
       'wind_direction', 'windchill', 'barometric_pressure_qfe',
       'precipitation', 'dew_point', 'global_radiation', 'humidity',
       'water_level', 'AccidentType_at00', 'AccidentType_at1', 'AccidentType_at2',
       'AccidentType_at3', 'AccidentType_at4', 'AccidentType_at5',
       'AccidentType_at6', 'AccidentType_at7', 'AccidentType_at8',
       'AccidentType_at9',
       'WeekDay_Monday', 'WeekDay_Saturday', 'WeekDay_Sunday',
       'WeekDay_Thursday', 'WeekDay_Tuesday', 'WeekDay_Wednesday', 'Month_2.0',
       'Month_3.0', 'Month_4.0', 'Month_5.0', 'Month_6.0', 'Month_7.0',
       'Month_8.0', 'Month_9.0', 'Month_10.0', 'Month_11.0', 'Month_12.0',
       'Hour_1.0', 'Hour_2.0', 'Hour_3.0', 'Hour_4.0', 'Hour_5.0', 'Hour_6.0',
       'Hour_7.0', 'Hour_8.0', 'Hour_9.0', 'Hour_10.0', 'Hour_11.0',
       'Hour_12.0', 'Hour_13.0', 'Hour_14.0', 'Hour_15.0', 'Hour_16.0',
       'Hour_17.0', 'Hour_18.0', 'Hour_19.0', 'Hour_20.0', 'Hour_21.0',
       'Hour_22.0', 'Hour_23.0']

#Create data frame for Random Forest prediction
df_rf_acc_sev = df[df.columns[df.columns.isin(cols)]]

**Define targets and labels and run the model:**

In [None]:
#output label
output_class = df_rf_acc_sev[[col for col in df_rf_acc_sev.columns if col.startswith('Accident')]]
target = np.array(output_class)

#input features
features = df_rf_acc_sev[df_rf_acc_sev.columns[~df_rf_acc_sev.columns.isin(output_class)]]
feature_list = list(features.columns)
features = np.array(features)

#split dataset into training and testing
train_features, test_features, train_target, test_target = train_test_split(features, 
target, test_size = 0.25, random_state = 11)


# Instantiate model with 100 decision trees
clf = RandomForestClassifier(n_estimators= 130)

# Train the model on training data
clf.fit(train_features, train_target)

predictions = clf.predict(test_features)
print("Accuracy:", metrics.accuracy_score(test_target, predictions))