# Load libraries and import data

In [1]:
# Load numpy, pandas, sklearn, torch, etc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch import *
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset

import sklearn
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.cross_validation import train_test_split

from dateutil.parser import parse




  from numpy.core.umath_tests import inner1d


In [None]:
# Load data
train_df = pd.read_csv('/home/fhell/Desktop/verkehrsunfaelle_train.csv',encoding ='latin1')
test_df = pd.read_csv('/home/fhell/Desktop/ittalents/verkehrsunfaelle_test.csv',encoding ='latin1')

#make copy of original df
X_train_1=train_df
X_test_1=test_df

# Handle outlier
29 of Feb which parser cant handle, year is not given in most entries.


In [None]:
for idx in range(0,X_train_1.shape[0]):
    if X_train_1['Unfalldatum'][idx] ==  '29. Feb.':
        X_train_1['Unfalldatum'][idx] ='28. Feb.'

for idx in range(0,X_test_1.shape[0]):
    if X_test_1['Unfalldatum'][idx] ==  '29. Feb.':
        X_test_1['Unfalldatum'][idx] ='28. Feb.'


# Parse datetime from dataframe 
using dateutil.parser, dateutil parser has to be edited to account for german date names


In [None]:
for idx in range(0,train_df.shape[0]):
    X_train_1['Unfalldatum'][idx] = parse(X_train_1['Unfalldatum'][idx])
    
for idx in range(0,X_test_1.shape[0]):
    X_test_1['Unfalldatum'][idx] = parse(X_test_1['Unfalldatum'][idx])
    
# extract month, as most year values are missing in the original dataset, we neglect year (we could also create a distribution from the not missing data and fill the missing values)
X_train_1['Monat'] = X_train_1['Unfalldatum'].dt.month
X_test_1['Monat'] = X_test_1['Unfalldatum'].dt.month

In [None]:
#save data after parsing

X_train_1.to_csv('train.csv',encoding ='latin1')
X_test_1.to_csv('test.csv',encoding ='latin1')

In [2]:
# Load parsed data again
train_df = pd.read_csv('/home/fhell/Desktop/train.csv',encoding ='latin1')
test_df = pd.read_csv('/home/fhell/Desktop/test.csv',encoding ='latin1')

In [3]:
#make copy of original df
X_train_1=train_df
X_test_1=test_df

# Drop useless variables 
X_train_1.drop(labels = ['Unnamed: 0','Unfalldatum'], axis = 1, inplace = True)
X_test_1.drop(labels = ['Unnamed: 0','Unfalldatum'], axis = 1, inplace = True)
X_train_1.head()

Unnamed: 0,Strassenklasse,Alter,Unfallklasse,Unfallschwere,Lichtverhältnisse,Verletzte Personen,Anzahl Fahrzeuge,Bodenbeschaffenheit,Geschlecht,Zeit (24h),Fahrzeugtyp,Wetterlage,Monat
0,Bundesstrasse,59,Fahrer,1,Tageslicht: Strassenbeleuchtung vorhanden,2,2,trocken,männlich,1330,LKW ab 7.5t,Gut,1
1,Autobahn,48,Fahrer,1,Tageslicht: Strassenbeleuchtung vorhanden,2,4,trocken,weiblich,1724,Auto,Gut,5
2,nicht klassifiziert,56,Fahrer,2,Tageslicht: Strassenbeleuchtung vorhanden,1,1,nass / feucht,männlich,1345,Mottorrad (500cc),Gut,2
3,Bundesstrasse,66,Fahrer,1,Dunkelheit: Strassenbeleuchtung vorhanden und ...,3,2,nass / feucht,weiblich,1830,Auto,Regen,10
4,Bundesstrasse,33,Fahrer,2,Dunkelheit: keine Strassenbeleuchtung,1,1,nass / feucht,männlich,15,Auto,Gut,11


In [4]:
#parse 24h and extract hour
X_train_1['Zeit (24h)']=X_train_1['Zeit (24h)'].apply(lambda x: '{0:0>4}'.format(x))
X_test_1['Zeit (24h)']=X_test_1['Zeit (24h)'].apply(lambda x: '{0:0>4}'.format(x))
X_test_1['Zeit (24h)'] = pd.to_datetime(X_test_1['Zeit (24h)'], format = '%H%M')
X_test_1['Zeit (24h)'] = X_test_1['Zeit (24h)'].dt.hour
X_train_1['Zeit (24h)'] = pd.to_datetime(X_train_1['Zeit (24h)'], format = '%H%M')
X_train_1['Zeit (24h)'] = X_train_1['Zeit (24h)'].dt.hour

In [5]:
## month is a cyclic feature. hence some cyclic feature engineering, see https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
m_per_year = 12

X_train_1['sin Monat'] = np.sin(2*np.pi*X_train_1['Monat']/m_per_year)
X_train_1['cos Monat'] = np.cos(2*np.pi*X_train_1['Monat']/m_per_year)
X_test_1['sin Monat'] = np.sin(2*np.pi*X_test_1['Monat']/m_per_year)
X_test_1['cos Monat'] = np.cos(2*np.pi*X_test_1['Monat']/m_per_year)

# Label encoding
we chose label encoding on the original data, we do not create additional features from e.g. 'Lichtervältnisse'


In [6]:
le=LabelEncoder()

columns = [
 'Strassenklasse',
 'Unfallklasse',
 'Lichtverhältnisse',
 'Bodenbeschaffenheit',
 'Geschlecht',
 'Fahrzeugtyp',
 'Wetterlage']

for col in columns:

       if train_df[col].dtypes=='object':
        data=train_df[col].append(test_df[col])
        le.fit(data.values)
        train_df[col]=le.transform(train_df[col])
        test_df[col]=le.transform(test_df[col])

# One hot encoding for labels

In [7]:

columns = ['Unfallschwere']

enc=OneHotEncoder(sparse=False)

for col in columns:
    data=X_train_1[[col]]
    enc.fit(data)

    temp = enc.transform(X_train_1[[col]])

    temp=pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col]
            .value_counts().index])

    temp=temp.set_index(X_train_1.index.values) 
    Y_train_1=pd.concat([temp],axis=1)
    

# Data normalization 
for Age and other columns, we do not bin age groups, which might be useful: https://arxiv.org/pdf/1702.04415.pdf
could use embeddings instead

In [8]:

columns = ['Alter', 'Verletzte Personen',
 'Anzahl Fahrzeuge']

for col in columns:
    data=X_train_1[[col]].append(X_test_1[[col]])
    scaler = StandardScaler()

    scaler.fit(data)
    
    temp = scaler.transform(X_train_1[[col]])

    temp=pd.DataFrame(temp,columns=[(col+"_"+str('scaled'))])

    temp=temp.set_index(X_train_1.index.values)
       
    X_train_1=pd.concat([X_train_1,temp],axis=1)

    temp = scaler.transform(X_test_1[[col]])
       
    temp=pd.DataFrame(temp,columns=[(col+"_"+str('scaled'))])


    temp=temp.set_index(X_test_1.index.values)

    X_test_1=pd.concat([X_test_1,temp],axis=1)

# Seperate features and labels

In [9]:

Y_traintemp = Y_train_1
X_traintemp = X_train_1.drop(labels = ["Unfallschwere"],axis = 1)

# Define categorial and continous features
we chose to embed daytime, while we use cyclic feature engineering for month - out of curiosity.



In [10]:
categorical_features = ['Strassenklasse',

 'Unfallklasse',
 'Lichtverhältnisse',
 'Zeit (24h)',
 'Geschlecht',

 'Bodenbeschaffenheit',

 'Fahrzeugtyp',
 'Wetterlage']

cont_features = [
 'Verletzte Personen_scaled',
 'Anzahl Fahrzeuge_scaled',

 'Alter_scaled',
 'sin Monat',
 'cos Monat',

                       ]


# Construct embedding
how many unique values are in training and test dataset per feature, construct embedding dimensionality


In [11]:
#how many unique values are in training and test dataset per categorial feature, construct embedding matrix
tempc = pd.concat([X_traintemp,X_test_1])

cat_dims = [int(tempc[col].nunique()) for col in categorical_features]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

cat_dims, emb_dims

([6, 3, 5, 24, 2, 7, 17, 9],
 [(6, 3), (3, 2), (5, 3), (24, 12), (2, 1), (7, 4), (17, 9), (9, 5)])

# Split train data set into training and validation set


In [12]:
X_train, x_test, Y_train, y_test = train_test_split(X_traintemp, Y_traintemp, test_size=0.2, random_state=107)

# Upsample minority class
a more advance data augmentation technique would be useful here such as using GANs (https://stats.stackexchange.com/questions/358970/can-a-gan-be-used-for-tabular-vector-data-augmentation) 
or Synthetic Minority Over-sampling Technique (SMOTE) (Chawla et al., 2012),

In [13]:
datatemp = pd.concat([pd.Series.astype(X_train,dtype = np.float64),pd.Series.astype(Y_train,dtype = np.float64)],axis=1)

# Separate majority and minority classes
df_1 = datatemp[datatemp['Unfallschwere_1'].values==1]
df_2 = datatemp[datatemp['Unfallschwere_2'].values==1]
df_3 = datatemp[datatemp['Unfallschwere_3'].values==1]

 
# Upsample minority class 2
df_2_upsampled = resample(df_2, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_1.shape[0],    # to match majority class
                                 random_state=123) # reproducible results

# Upsample minority class 3
df_3_upsampled = resample(df_3, 
                                 replace=True,     # sample with replacement
                                 n_samples=df_1.shape[0],    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_1, df_2_upsampled, df_3_upsampled])
 
# Display new class counts
df_upsampled[df_upsampled['Unfallschwere_1'].values==1].shape, df_upsampled[df_upsampled['Unfallschwere_3'].values==1].shape

## seperate features from labels again
Y_train = df_upsampled[["Unfallschwere_1","Unfallschwere_2","Unfallschwere_3"]]
X_train = df_upsampled.drop(labels = ["Unfallschwere_1","Unfallschwere_2","Unfallschwere_3"],axis = 1)
Y_train.shape,X_train.shape

((32289, 3), (32289, 17))

# Seperate continous and categorial features, construct dataloader


In [14]:
#seperate continous and categorial features for dataloader
cont = X_train[cont_features]
cat = X_train[categorical_features]

#change datatype to torch tensor, create dataset and dataloader with batches

#prepare train data for pytorch (categorial features are int, while continous are float)
Y_tensor_train  = torch.tensor(np.asarray(Y_train.values,dtype=np.float32))
X_tensor_train  = torch.tensor(np.asarray(cat.values,dtype=np.int64))
cont_data = torch.tensor(np.asarray(cont.values,dtype=np.float32))

dataset = TensorDataset(X_tensor_train, Y_tensor_train, cont_data)
dataloader = DataLoader(dataset, batch_size=100,
                        shuffle=True, num_workers=4)

#prepare validation data
cont_val = x_test[cont_features]
cat_val = x_test[categorical_features]
X_tensor_val  = torch.tensor(np.asarray(cat_val.values,dtype=np.int64))
Y_tensor_val  = torch.tensor(np.asarray(y_test.values,dtype=np.float32))
cont_data_val = torch.tensor(np.asarray(cont_val.values,dtype=np.float32))  

#prepare test data
cont_test = X_test_1[cont_features]
cat_test = X_test_1[categorical_features]
X_tensor_test  = torch.tensor(np.asarray(cat_test.values,dtype=np.int64))
cont_data_test = torch.tensor(np.asarray(cont_test.values,dtype=np.float32))  


# Specify neural network parameters 


In [15]:

N_FEATURES =  X_train_1.shape[1]
LR = 0.001
#different dropout for different layers, more dropout for later layers
dropout = torch.nn.Dropout(p=1 - (0.5))
dropout1 = torch.nn.Dropout(p=1 - (0.9))
dropout2 = torch.nn.Dropout(p=1 - (0.75))
no_of_cont = cont_data.shape[1]

N_LABELS = Y_train_1.shape[1]   #3 #n classes


hiddenLayer1Size=512
hiddenLayer2Size=int(hiddenLayer1Size/2)
hiddenLayer3Size=int(hiddenLayer1Size/4)
hiddenLayer4Size=int(hiddenLayer1Size/8)
hiddenLayer5Size=int(hiddenLayer1Size/16)

emb_layers = nn.ModuleList([nn.Embedding(x, y)
                                     for x, y in emb_dims])
no_of_embs = sum([y for x, y in emb_dims])
bn1 = nn.BatchNorm1d(no_of_cont)

linear1=torch.nn.Linear(no_of_embs+no_of_cont, hiddenLayer1Size, bias=True) 
torch.nn.init.xavier_uniform(linear1.weight)

linear2=torch.nn.Linear(hiddenLayer1Size, hiddenLayer2Size)
torch.nn.init.xavier_uniform(linear2.weight)

linear3=torch.nn.Linear(hiddenLayer2Size, hiddenLayer3Size)
torch.nn.init.xavier_uniform(linear3.weight)

linear4=torch.nn.Linear(hiddenLayer3Size, hiddenLayer4Size)
torch.nn.init.xavier_uniform(linear4.weight)

linear5=torch.nn.Linear(hiddenLayer4Size, N_LABELS)
torch.nn.init.xavier_uniform(linear5.weight)



sigmoid = torch.nn.Sigmoid()
sftmx = torch.nn.Softmax()
tanh=torch.nn.Tanh()
leakyrelu=torch.nn.LeakyReLU()






# Construct classifier, forward pass, run training and validation

In [None]:

#define classifier class, architecture of nn
class _classifier(nn.Module):
    def __init__(self):
        super(_classifier, self).__init__()
        self.emb_layers = emb_layers
        self.first_bn_layer = bn1
        self.main = nn.Sequential(

            linear1,leakyrelu,nn.BatchNorm1d(hiddenLayer1Size),dropout2,
            linear2,leakyrelu,nn.BatchNorm1d(hiddenLayer2Size),dropout,          
            linear3,leakyrelu,nn.BatchNorm1d(hiddenLayer3Size),dropout,
            linear4,leakyrelu,nn.BatchNorm1d(hiddenLayer4Size),dropout,
            linear5,sigmoid
            
        )
#define pytorch forward pass        
    def forward(self, cat_data, cont_data):
        x = [emb_layer(cat_data[:, i]) for i,emb_layer in enumerate(self.emb_layers)]
        x = torch.cat(x, 1)
        x = dropout1(x)
        normalized_cont_data = self.first_bn_layer(cont_data)
        mainin = torch.cat([x, normalized_cont_data], 1) 
        
        return self.main(mainin)

classifier = _classifier().cuda()
#define optimizer and criterion, we dont use a LR sheduler for now
optimizer = optim.Adam(classifier.parameters())
#lr=LR,weight_decay=5e-3 learning rate and weight decay not implemented yet
criterion = nn.BCELoss()
#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[60,100,150,400], gamma = 0.1)

#train network with n epochs and minibatches
epochs = 1500
for epoch in range(epochs):
    losses = []
    tu = [] 
    
    for sample_batched, labels_batched, cont_data in dataloader:
          
        output = classifier(sample_batched.cuda(),cont_data.cuda()) # predict labels from input
        loss = criterion(output.cuda(), labels_batched.cuda()) #compute loss

        optimizer.zero_grad()  # clear gradients for next train
        loss.backward() # backpropagation, compute gradients
        optimizer.step() # apply gradients
        losses.append(loss.data.mean())
    #scheduler.step() #apply scheduler after each epoch
        
    print('[%d/%d] Loss: %.3f' % (epoch+1, epochs, np.mean(losses)))

     
    if epoch % 10 == 0:
  #check validation log loss every 10 epoch  
        cl1 = classifier
        prediction = (cl1(X_tensor_val.cuda(),cont_data_val.cuda()).data > 0.5).float() # zero or one
        pred_y = prediction.cpu().numpy().squeeze()
        target_y = Y_tensor_val.cpu().data.numpy()
        tu.append(log_loss(target_y, pred_y))
        print('[%d/%d] Validation log loss: %.3f' % (epoch+1, epochs, np.mean(tu)))


[1/1500] Loss: 0.642
[1/1500] Validation log loss: 12.595
[2/1500] Loss: 0.493
[3/1500] Loss: 0.416
[4/1500] Loss: 0.371
[5/1500] Loss: 0.343
[6/1500] Loss: 0.328
[7/1500] Loss: 0.314
[8/1500] Loss: 0.303
[9/1500] Loss: 0.293
[10/1500] Loss: 0.285
[11/1500] Loss: 0.284
[11/1500] Validation log loss: 18.508
[12/1500] Loss: 0.273
[13/1500] Loss: 0.265
[14/1500] Loss: 0.263
[15/1500] Loss: 0.256
[16/1500] Loss: 0.254
[17/1500] Loss: 0.251
[18/1500] Loss: 0.246
[19/1500] Loss: 0.242
[20/1500] Loss: 0.241
[21/1500] Loss: 0.235
[21/1500] Validation log loss: 19.322
[22/1500] Loss: 0.232
[23/1500] Loss: 0.229
[24/1500] Loss: 0.225
[25/1500] Loss: 0.223
[26/1500] Loss: 0.222
[27/1500] Loss: 0.218
[28/1500] Loss: 0.217
[29/1500] Loss: 0.215
[30/1500] Loss: 0.213
[31/1500] Loss: 0.209
[31/1500] Validation log loss: 17.999
[32/1500] Loss: 0.206
[33/1500] Loss: 0.203
[34/1500] Loss: 0.205
[35/1500] Loss: 0.204
[36/1500] Loss: 0.198
[37/1500] Loss: 0.199
[38/1500] Loss: 0.195
[39/1500] Loss: 0.194


[310/1500] Loss: 0.111
[311/1500] Loss: 0.106
[311/1500] Validation log loss: 14.942
[312/1500] Loss: 0.107
[313/1500] Loss: 0.108
[314/1500] Loss: 0.106
[315/1500] Loss: 0.107
[316/1500] Loss: 0.107
[317/1500] Loss: 0.107
[318/1500] Loss: 0.106
[319/1500] Loss: 0.105
[320/1500] Loss: 0.108
[321/1500] Loss: 0.107
[321/1500] Validation log loss: 14.610
[322/1500] Loss: 0.106
[323/1500] Loss: 0.103
[324/1500] Loss: 0.107
[325/1500] Loss: 0.109
[326/1500] Loss: 0.107
[327/1500] Loss: 0.103
[328/1500] Loss: 0.105
[329/1500] Loss: 0.104
[330/1500] Loss: 0.106
[331/1500] Loss: 0.104
[331/1500] Validation log loss: 14.404
[332/1500] Loss: 0.105
[333/1500] Loss: 0.106
[334/1500] Loss: 0.106
[335/1500] Loss: 0.107
[336/1500] Loss: 0.105
[337/1500] Loss: 0.103
[338/1500] Loss: 0.103
[339/1500] Loss: 0.104
[340/1500] Loss: 0.103
[341/1500] Loss: 0.104
[341/1500] Validation log loss: 14.536
[342/1500] Loss: 0.103
[343/1500] Loss: 0.105
[344/1500] Loss: 0.104
[345/1500] Loss: 0.107


# Predicitons for whole validation set

In [None]:
cl1 = classifier

#prediction = (cl1(X_tensor_val).data).float() # probabilities 
prediction = (cl1(X_tensor_val.cuda(),cont_data_val.cuda()).data > 0.5).float() # zero or one

pred_y = prediction.cpu().numpy().squeeze()

target_y = Y_tensor_val.cpu().data.numpy()

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
labels = ['0', '1','2']
cm = confusion_matrix(
    target_y.argmax(axis=1), pred_y.argmax(axis=1))
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
#cplt.show()

# Save prediction for test set

In [None]:
prediction_test = (cl1(X_tensor_test.cuda(),cont_data_test.cuda()).data > 0.5).float() # zero or one
pred_y_test = prediction_test.cpu().numpy().squeeze()

IDtest = pd.DataFrame(data=X_test_1.index.values,columns = ['Unfall_ID'])
pred = pd.Series(pred_y_test.argmax(axis=1), name="Unfallschwere")

results = pd.concat([IDtest,pred],axis=1)

results.to_csv("MLP_pytorch_embed.csv",index=False)

# Classification using different classical models

In [None]:
#labels instead of one-hot encoding
Y_train1 = Y_train.values
Y_train1  = np.argmax(Y_train1, axis=1)
Y_train1

# Try different models, combine them later in ensemble learning, use 10 folds
kfold = StratifiedKFold(n_splits=10)

# Gradient boosting

In [None]:
# Gradient boosting
GBC = GradientBoostingClassifier()
gb_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1] }
gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
gsGBC.fit(X_train,Y_train1)
GBC_best = gsGBC.best_estimator_
gsGBC.best_score_

In [None]:
pred_y = pd.Series(GBC_best.predict(X_train), name="GBC")
target_y = Y_train1

labels = ['0', '1','2']
cm = confusion_matrix(
    target_y, pred_y)
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
pred_y = GBC_best.predict(x_test)
target_y = np.argmax(y_test.values, axis=1)
cm = confusion_matrix(
    target_y, pred_y)

print(cm)
labels = ['0', '1','2']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ExtraTrees 

In [None]:
ExtC = ExtraTreesClassifier()
ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 7],
              "min_samples_split": [2, 3, 7],
              "min_samples_leaf": [1, 3, 7],
              "bootstrap": [False],
              "n_estimators" :[300,600],
              "criterion": ["gini"]}
gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
gsExtC.fit(X_train,Y_train1)
ExtC_best = gsExtC.best_estimator_
gsExtC.best_score_

# Random Forest

In [None]:
random_forest = RandomForestClassifier(n_estimators=100)
rf_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 7],
              "min_samples_split": [2, 3, 7],
              "min_samples_leaf": [1, 3, 7],
              "bootstrap": [False],
              "n_estimators" :[300,600],
              "criterion": ["gini"]}
gsrandom_forest = GridSearchCV(random_forest,param_grid = rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
gsrandom_forest.fit(X_train,Y_train1)
# Best score
random_forest_best = gsrandom_forest.best_estimator_
gsrandom_forest.best_score_

In [None]:
pred_y = pd.Series(random_forest_best.predict(X_train), name="random_forest")
target_y = Y_train1
labels = ['0', '1','2']
cm = confusion_matrix(
    target_y, pred_y)
print(cm)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
pred_y = pd.Series(random_forest_best.predict(x_test), name="random_forest")
target_y = np.argmax(y_test.values, axis=1)
cm = confusion_matrix(
    target_y, pred_y)

print(cm)
labels = ['0', '1','2']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Adaboost

In [None]:
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state=7)
ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "algorithm" : ["SAMME","SAMME.R"],
              "n_estimators" :[1,2],
              "learning_rate":  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}
gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
gsadaDTC.fit(X_train,Y_train1)
adaDTC_best = gsadaDTC.best_estimator_
gsadaDTC.best_score_

# SVC classifier

In [None]:
SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.1],
                  'C': [10,200]}
gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1)
gsSVMC.fit(X_train,Y_train1)
SVMC_best = gsSVMC.best_estimator_
# Best score
gsSVMC.best_score_

# Ensemble voting

In [None]:

test_Unfallschwere_AdaDTC = pd.Series(adaDTC_best.predict(X_test_1), name="AdaDTC")
test_Unfallschwere_ExtC = pd.Series(ExtC_best.predict(X_test_1), name="ExtC")
test_Unfallschwere_GBC = pd.Series(GBC_best.predict(X_test_1), name="GBC")
test_Unfallschwere_SVMC = pd.Series(SVMC_best.predict(X_test_1), name="SVMC")
test_Unfallschwere_random_forest = pd.Series(random_forest_best.predict(X_test_1), name="random_forest")


# Concatenate all classifier results
ensemble_results = pd.concat([test_Unfallschwere_AdaDTC, test_Unfallschwere_ExtC, test_Unfallschwere_GBC,test_Unfallschwere_SVMC,test_Unfallschwere_random_forest],axis=1)

VotingPredictor = VotingClassifier(estimators=[('ExtC', ExtC_best), ('GBC',GBC_best),
('SVMC', SVMC_best), ('random_forest', random_forest_best)], voting='soft', n_jobs=4)
VotingPredictor = VotingPredictor.fit(X_train, Y_train1)

# Save prediction

In [None]:
#Save prediciton from ensemble voting
IDtest = pd.DataFrame(data=X_test.index.values,columns = ['Unfall_ID'])
test = pd.Series(VotingPredictor.predict(X_test_1), name="Unfallschwere")

results = pd.concat([IDtest,test],axis=1)

results.to_csv("ensemble_python_voting.csv",index=False)