In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

### Read Data

In [None]:
# Load the raw dataset from its CSV file and preview the first rows.
DATA_PATH = 'datasets/dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Report the dataset dimensions as (rows, columns). The first rows are already
# previewed by the loading cell, so repeating `df.head()` here added nothing.
df.shape

In [None]:
# List every column name of the loaded frame.
df.columns

## I) Binary Classification

**Create a new dataset containing only the relevant variables (every `Accident*` column other than the target `Accident` is dropped):**

In [None]:
# Columns whose name starts with 'Accident' but is not exactly 'Accident' are
# removed; the plain 'Accident' column is kept.
accident_variants = [
    name
    for name in df.columns
    if name.startswith('Accident') and name != 'Accident'
]
keep_mask = ~df.columns.isin(accident_variants)
df = df[df.columns[keep_mask]]

### a) Neural Network

**Create Input and Output Data:**

In [None]:
# Every column except these three is used as a model input.
# ('Unnamed: 0' is presumably a leftover CSV index column - TODO confirm.)
NON_FEATURE_COLUMNS = ['date', 'Accident', 'Unnamed: 0']
feature_mask = ~df.columns.isin(NON_FEATURE_COLUMNS)
X = df[df.columns[feature_mask]]

# Prediction target.
y = df['Accident']

**Create train and test datasets:**

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=69,
)

# Standardisation statistics are estimated on the training rows only and then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**Model Parameters:**

In [None]:
EPOCHS = 50  # number of full passes over the training loader in the training loop
BATCH_SIZE = 64  # mini-batch size of the training DataLoader
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer

**Define Custom Dataloaders:**

In [None]:
## train data
class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    ``X_data`` and ``y_data`` are stored as given and indexed in parallel, so
    both containers are expected to have the same length.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The sample count is taken from the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label


# `y_train` comes out of `train_test_split` as a pandas Series with a shuffled
# index. Handing the Series straight to `torch.FloatTensor` is unreliable (it
# may be indexed by label instead of by position), so convert it to a plain
# float32 array first.
train_data = trainData(torch.FloatTensor(X_train),
                       torch.FloatTensor(np.asarray(y_train, dtype=np.float32)))
## test data
class testData(Dataset):
    """Map-style dataset holding features only (no labels)."""

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__(self):
        # One sample per entry of the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        sample = self.X_data[index]
        return sample
    

# Wrap the scaled test features as a float tensor; this dataset carries no labels.
test_data = testData(torch.FloatTensor(X_test))

In [None]:
# Training batches are reshuffled on every pass; test samples are served one
# at a time without shuffling.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_data,
    batch_size=1,
)

**Define Neural Net Architecture**
<img src="https://miro.medium.com/max/1400/0*CLjAAd7s6o0yfEYZ.jpg"
     alt="NN"
     style="float: left; margin-right: 10px;" />

In [None]:
class binaryClassification(nn.Module):
    """Small feed-forward network that emits one raw logit per sample.

    Layer order: Linear -> ReLU -> BatchNorm -> Linear -> ReLU -> BatchNorm
    -> Dropout -> Linear. No sigmoid is applied in ``forward``; the output is
    meant for a logit-based loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        n_features: Width of each input vector. Defaults to 53, the value that
            was previously hard-coded.
        hidden_dim: Width of both hidden layers. Defaults to 64.
        dropout: Drop probability applied before the output layer.
            Defaults to 0.1.
    """

    def __init__(self, n_features=53, hidden_dim=64, dropout=0.1):
        super(binaryClassification, self).__init__()
        # Attribute names and creation order are unchanged, so existing
        # state dicts and seeded initialisation stay compatible.
        self.layer_1 = nn.Linear(n_features, hidden_dim)
        self.layer_2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer_out = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_dim)
        self.batchnorm2 = nn.BatchNorm1d(hidden_dim)

    def forward(self, inputs):
        """Map a ``(batch, n_features)`` tensor to ``(batch, 1)`` logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Everything runs on the CPU.
device = torch.device('cpu')

# Build the network, place it on the device and show its layers.
model = binaryClassification().to(device)
print(model)

# The loss works on raw logits, which is what the network outputs.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

**Accuracy function:**

In [None]:
def binary_acc(y_pred, y_test):
    """Return the batch accuracy as a whole-number percentage.

    Args:
        y_pred: Tensor of raw logits; a sigmoid and a 0.5 threshold are
            applied here.
        y_test: Tensor of 0/1 labels with the same number of elements as
            ``y_pred``.

    Returns:
        A 0-dim float tensor holding the accuracy in percent, rounded to the
        nearest integer.

    Raises:
        ValueError: If the two tensors hold a different number of elements.
    """
    # Flatten both sides before comparing. Comparing a (N, 1) prediction with
    # a (N,) label would otherwise broadcast to (N, N) and silently produce a
    # wrong count.
    y_pred_tag = torch.round(torch.sigmoid(y_pred)).reshape(-1)
    y_true = y_test.reshape(-1)
    if y_pred_tag.numel() != y_true.numel():
        raise ValueError(
            f'binary_acc: got {y_pred_tag.numel()} predictions '
            f'but {y_true.numel()} labels'
        )

    correct_results_sum = (y_pred_tag == y_true).sum().float()
    acc = correct_results_sum / y_true.numel()
    acc = torch.round(acc * 100)

    return acc

In [None]:
# Show the shape of the training labels.
y_train.shape

In [None]:
# Check the label shape that the loss function will receive. One batch is
# enough for this; looping over the whole loader printed a line for every
# mini-batch.
X_batch, y_batch = next(iter(train_loader))
print(y_batch.unsqueeze(1).shape)

**Train the model:**

In [None]:
# Sanity-check the shape of the model output on a single batch.
# `y_pred` is computed here so this cell also works on a fresh kernel, where
# the training loop further down has not run yet.
model.eval()  # eval mode so this probe does not update the BatchNorm statistics
with torch.no_grad():
    X_probe, _ = next(iter(train_loader))
    y_pred = model(X_probe.to(device))
y_pred.shape

In [None]:
# Switch on training behaviour (dropout active, batch-norm uses batch statistics).
model.train()
for e in range(1, EPOCHS+1):
    # Running totals over this epoch's mini-batches.
    epoch_loss, epoch_acc = 0, 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # Add a trailing dimension so the labels line up with the model's
        # one-column output.
        targets = y_batch.unsqueeze(1)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()

        # Accuracy is measured on the predictions made before this update.
        acc = binary_acc(y_pred, targets)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    # Report the per-batch averages for the epoch.
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

**Test the model:**

In [None]:
# Collect the hard 0/1 predictions for every test sample.
y_pred_list = []
model.eval()  # disable dropout and use the stored batch-norm statistics
with torch.no_grad():  # no gradients are needed for inference
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        # Logits -> probabilities -> labels thresholded at 0.5.
        y_test_pred = torch.sigmoid(model(X_batch))
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

# Flatten to one float per sample. Concatenating and ravelling gives the same
# list as the per-batch squeeze for batch_size=1, and stays correct if the
# test loader is given a larger batch size.
y_pred_list = np.concatenate(y_pred_list).ravel().tolist() if y_pred_list else []

**Classification Report**

In [None]:
# Print scikit-learn's classification report for the network's test predictions
# against the held-out labels.
print(classification_report(y_test, y_pred_list))

### b) Random Forest

In [None]:
#output label
target = np.array(df['Accident'])

#input features
feature_names = df.columns[~df.columns.isin(['date','Accident', 'Unnamed: 0'])]
features = df[feature_names]
feature_list = list(features.columns)
features = np.array(features)

#split dataset into training and testing (25% held out, fixed seed)
train_features, test_features, train_target, test_target = train_test_split(
    features, target, test_size=0.25, random_state=10)

# Instantiate model with 100 decision trees; the seed makes the fitted forest
# (and therefore the accuracy and importances below) reproducible across runs
clf = RandomForestClassifier(n_estimators=100, random_state=10)

# Train the model on training data
clf.fit(train_features, train_target)

predictions = clf.predict(test_features)
print("Accuracy:", metrics.accuracy_score(test_target, predictions))

#check contributions to prediction
# `feature_importances_` follows the column order used for fitting, so label
# the values with the names in that same order first and only then sort the
# Series by importance. Sorting the names themselves would attach the
# importances to the wrong features.
feature_imp = pd.Series(clf.feature_importances_, index=feature_list).sort_values(ascending=False)
feature_imp