In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from /content/drive
# (the CSV is obtained through files.upload()) — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

## Preprocess data

In [None]:
from google.colab import files


# Prompt for a file upload in Colab. `uploaded` is indexed by file name further
# down to obtain the raw bytes of the event-data CSV.
uploaded = files.upload()


In [None]:
import pandas as pd
import io

# Name the CSV is expected to be uploaded under.
DATA_FILENAME = 'event_data_50k_used_for_first_training.csv'

# Pick the bytes of the uploaded CSV. If the expected name is missing but exactly
# one file was uploaded (e.g. the file was renamed before or during upload), use
# that file; otherwise fail with a message that lists what was actually uploaded.
if DATA_FILENAME in uploaded:
    csv_bytes = uploaded[DATA_FILENAME]
elif len(uploaded) == 1:
    (csv_bytes,) = uploaded.values()
else:
    raise KeyError(
        f"Expected an upload named {DATA_FILENAME!r}; got {sorted(uploaded)}"
    )

# Parse the in-memory bytes as CSV.
df_data = pd.read_csv(io.BytesIO(csv_bytes))



In [None]:
# Shuffle all rows once. SampleDataset splits train/test by row position, so this
# shuffle determines which rows land in each split. A fixed random_state makes the
# split (and the metrics computed from it) reproducible across kernel restarts.
df_data = df_data.sample(frac=1, random_state=42)

In [None]:
# Preview the first five rows of the shuffled frame.
df_data.head()

Unnamed: 0,NE_1,NE_2,NE_3,NE_4,HE_1,HE_2,HE_3,HE_4,HE_5,CPU,...,DISK,LOAD,NIC_U,NIC_S,SQ_1,SQ_2,SQ_3,HTTP_1,HTTP_2,ROOT_CAUSE
24226,0,0,0,0,1,0,0,0,0,0.08,...,0.64,0.85,0.16,0,3.06,0.48,0.52,0.17,0.87,HARDWARE_ERROR_1_1005
18375,1,0,0,0,0,0,0,0,0,0.22,...,0.19,0.86,0.24,0,0.77,0.72,0.55,0.73,0.13,NETWORK_ERROR_1001
45872,0,0,0,0,0,0,0,0,0,0.12,...,0.3,0.36,0.38,1,0.66,0.77,0.37,0.47,0.45,NIC_ERROR_1015
20958,0,0,0,0,0,0,0,0,0,0.05,...,0.94,0.57,0.22,0,0.06,0.91,0.34,0.7,0.57,DISK_USAGE_ERROR_1012
13388,0,0,0,0,0,0,0,0,0,0.06,...,0.95,0.52,0.21,0,0.38,0.45,0.46,0.69,0.18,DISK_USAGE_ERROR_1012


In [None]:
# Integer-encode the ROOT_CAUSE target column in place.
# NOTE: the column is overwritten, so re-running this cell would fit the encoder on
# the integer codes and `classes` would no longer contain the original label names.
label_encoder = preprocessing.LabelEncoder() # Encodes target labels with values between 0 and n_labels-1.
df_data["ROOT_CAUSE"] = label_encoder.fit_transform(df_data["ROOT_CAUSE"])
# To print 5 rows of the data:
#df_data.head()
# To print the columns of the CSV:
#df_data.columns
#df_data.drop("ID", axis=1, inplace=True)
# `classes` holds the original root-cause names; position i is the name of encoded label i.
classes = label_encoder.classes_

In [None]:
# Display the root-cause class names in encoded-label order.
classes

array(['DISK_USAGE_ERROR_1012', 'HARDWARE_ERROR_1_1005',
       'NETWORK_ERROR_1001', 'NIC_ERROR_1015', 'SLOW_QUERY_ERROR_1_1016'],
      dtype=object)

### Dataset and Dataloader class

In [None]:
class SampleDataset(Dataset):
    """Positional train/test split of a DataFrame, exposed as a torch ``Dataset``.

    The first ``train_size`` rows form the training split and the remaining rows
    form the test split. Features are every column up to and including
    ``"HTTP_2"`` (label-based ``.loc`` slices are inclusive); the target is the
    ``"ROOT_CAUSE"`` column.

    Args:
        data_frame: Source DataFrame containing the feature columns and a
            ``"ROOT_CAUSE"`` column.
        is_train: If True expose the training split, otherwise the test split.
        train_size: Number of leading rows assigned to the training split.
            Defaults to 35000, the value previously hard-coded.
    """

    def __init__(self, data_frame, is_train=True, train_size=35000):
        self.df = data_frame
        self.train_size = train_size
        # Split purely by row position; any shuffling must happen before this.
        self.train = self.df.iloc[:train_size, :]
        self.test = self.df.iloc[train_size:, :]
        # Both splits are processed identically, so select once and slice once.
        self.data = self.train if is_train else self.test
        self.feat = self.data.loc[:, :"HTTP_2"]
        self.labels = self.data.loc[:, "ROOT_CAUSE"]

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx`` of the split.

        ``features`` is the feature row as a NumPy array; ``label`` is the
        corresponding ``ROOT_CAUSE`` value.
        """
        inp = self.feat.iloc[idx].to_numpy()
        gt = self.labels.iloc[idx]
        return inp, gt

### Create Model

In [None]:
class SampleModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        in_feat: Size of each input feature vector.
        hidden_feat: Width of the hidden layer.
        num_labels: Number of output scores produced per sample.
    """

    def __init__(self, in_feat, hidden_feat, num_labels):
        super().__init__()
        # Keep the constructor arguments around for later inspection.
        self.in_feat, self.hidden_feat, self.num_labels = in_feat, hidden_feat, num_labels
        # Layers are created in the same order as before so parameter order and
        # random initialisation are unchanged.
        self.linear_1 = nn.Linear(in_feat, hidden_feat)
        self.linear_2 = nn.Linear(hidden_feat, num_labels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw (un-normalised) output scores for ``x``."""
        hidden = self.relu(self.linear_1(x))
        return self.linear_2(hidden)

### Instantiate train dataset, train dataloader, test dataset and test dataloader

In [None]:
# Training split: reshuffle every epoch so SGD does not see the exact same batches
# in the exact same order each time.
train_dataset = SampleDataset(df_data, is_train=True)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, drop_last=False)
# Test split: keep a fixed order; shuffling has no effect on evaluation counts.
test_dataset = SampleDataset(df_data, is_train=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, drop_last=False)

In [None]:
# Pull a single batch from the training loader to inspect what it yields.
sample_iter = iter(train_dataloader)
x,y = next(sample_iter)

In [None]:
# Shape of the label batch taken from the training loader.
y.shape

torch.Size([64])

In [None]:
# Derive the layer sizes from the data instead of hard-coding them, so the model
# stays in sync if feature columns or root-cause classes are added or removed.
input_feat = train_dataset.feat.shape[1]  # number of feature columns fed to the model
num_labels = len(classes)                 # one output score per root-cause class
hidden_feat = 64                          # hidden-layer width (tunable)
model = SampleModel(input_feat, hidden_feat, num_labels)

In [None]:
# SGD with momentum over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# Cross-entropy between the model's raw output scores and the integer-encoded labels.
criterion = nn.CrossEntropyLoss()

In [None]:
# Train for a fixed number of epochs, printing the mean batch loss per epoch, then
# save the learned weights to disk.
epochs = 100
for epoch in range(epochs):
    # Explicitly (re-)enter training mode: the evaluation cells switch the model to
    # eval mode, so re-running this cell afterwards would otherwise train in eval mode.
    model.train()
    running_loss = 0.
    for inp, gt in train_dataloader:
        # Features arrive as a non-float32 tensor; the linear layers expect float32.
        inp = inp.type(torch.float)
        optimizer.zero_grad()
        out = model(inp)
        loss = criterion(out, gt)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    print(f"Epoch{epoch}: Loss: {running_loss/len(train_dataloader)}")
torch.save(model.state_dict(), "./event-data-weights.pt")

Epoch0: Loss: 1.0847927040984033
Epoch1: Loss: 0.5263194895095755
Epoch2: Loss: 0.23383336376664407
Epoch3: Loss: 0.12426989178226025
Epoch4: Loss: 0.07967885559802736
Epoch5: Loss: 0.057596490229777926
Epoch6: Loss: 0.04487185195851152
Epoch7: Loss: 0.03672415978348451
Epoch8: Loss: 0.031103614782420765
Epoch9: Loss: 0.027008216283275616
Epoch10: Loss: 0.023897749232213394
Epoch11: Loss: 0.021457594827879935
Epoch12: Loss: 0.019493021326347285
Epoch13: Loss: 0.01787768937885707
Epoch14: Loss: 0.016526106916967324
Epoch15: Loss: 0.015378319445513938
Epoch16: Loss: 0.014391173046742867
Epoch17: Loss: 0.013532913936101159
Epoch18: Loss: 0.012779614797042482
Epoch19: Loss: 0.01211295144247671
Epoch20: Loss: 0.011518627683541838
Epoch21: Loss: 0.010985319079555422
Epoch22: Loss: 0.010503937063782972
Epoch23: Loss: 0.010067153689272742
Epoch24: Loss: 0.009668927957257852
Epoch25: Loss: 0.009304258416081613
Epoch26: Loss: 0.008968985633979087
Epoch27: Loss: 0.008659645240801562
Epoch28: Loss

### Validation

In [None]:
# Reload the state dict saved by the training cell (same file name, resolved
# relative to the current working directory) and copy it into the model.
checkpoint = torch.load("event-data-weights.pt")
model.load_state_dict(checkpoint) #Load model weights

<All keys matched successfully>

In [None]:
# Per-class tallies keyed by class name, every count starting at zero.
correct_pred = dict.fromkeys(classes, 0)
total_pred = dict.fromkeys(classes, 0)

In [None]:
# Evaluate on the test split and tally, per class, how many samples were seen and
# how many were predicted correctly.

# Start from fresh tallies so that re-running this cell does not double count.
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

model.eval()
with torch.no_grad():
    for inp, gt in test_dataloader:
        inp = inp.type(torch.float)
        out = model(inp)
        # Index of the highest score along dim 1 is the predicted class.
        _, predictions = torch.max(out, 1)
        # Convert to plain Python ints before indexing `classes` / comparing.
        for label, prediction in zip(gt.tolist(), predictions.tolist()):
            classname = classes[label]
            if label == prediction:
                correct_pred[classname] += 1
            total_pred[classname] += 1

In [None]:
# Number of correctly predicted test samples per class.
correct_pred

{'DISK_USAGE_ERROR_1012': 3001,
 'HARDWARE_ERROR_1_1005': 2969,
 'NETWORK_ERROR_1001': 3016,
 'NIC_ERROR_1015': 3002,
 'SLOW_QUERY_ERROR_1_1016': 2996}

In [None]:
# Number of test samples seen per (true) class.
total_pred

{'DISK_USAGE_ERROR_1012': 3001,
 'HARDWARE_ERROR_1_1005': 2969,
 'NETWORK_ERROR_1001': 3016,
 'NIC_ERROR_1015': 3002,
 'SLOW_QUERY_ERROR_1_1016': 3007}

In [None]:
# Report per-class accuracy as a percentage of the test samples of that class.
for classname, correct_count in correct_pred.items():
    total_count = total_pred[classname]
    if total_count == 0:
        # A class with no samples in the test split has no defined accuracy;
        # report that instead of raising ZeroDivisionError.
        print(f'Accuracy for class: {classname:5s} is n/a (no test samples)')
        continue
    accuracy = 100 * float(correct_count) / total_count
    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')

Accuracy for class: DISK_USAGE_ERROR_1012 is 100.0 %
Accuracy for class: HARDWARE_ERROR_1_1005 is 100.0 %
Accuracy for class: NETWORK_ERROR_1001 is 100.0 %
Accuracy for class: NIC_ERROR_1015 is 100.0 %
Accuracy for class: SLOW_QUERY_ERROR_1_1016 is 99.6 %


In [None]:
# Classify one hand-written feature vector and print the predicted root cause.
sample_arr = torch.tensor(np.array([0,0,0,0,0,0,0,0,0,0.34,0.25,0.88,0.81,0.90,1,4.45,0.81,0.9,0.83,0.94]))
# Add a leading batch dimension so the model receives a batch of one.
sample_arr = sample_arr.unsqueeze(0)
model.eval()
with torch.no_grad():
    inp = sample_arr.float()
    out = model(inp)
    # The index of the largest score is the predicted encoded label.
    _, predictions = torch.max(out, 1)
    predicted_name = list(classes)[predictions.item()]
    print(f'Error: {predicted_name}')

Error: SLOW_QUERY_ERROR_1_1016
