In [3]:
!pip install ray

Collecting ray
  Downloading ray-1.12.0-cp37-cp37m-manylinux2014_x86_64.whl (53.2 MB)
[K     |████████████████████████████████| 53.2 MB 160 kB/s 
[?25hCollecting virtualenv
  Downloading virtualenv-20.14.1-py2.py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 24.5 MB/s 
Collecting frozenlist
  Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[K     |████████████████████████████████| 144 kB 47.8 MB/s 
[?25hCollecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 31.4 MB/s 
[?25hCollecting aiosignal
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting distlib<1,>=0.3.1
  Downloading distlib-0.3.4-py2.py3-none-any.whl (461 kB)
[K     |████████████████████████████████| 461 kB 59.7 MB/s 
[?25hCollecting platformdirs<3,>=2
  Downloading pla

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss
import pickle
from collections import defaultdict
from torch.optim import AdamW
import itertools
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
import csv
from sklearn.metrics import precision_recall_fscore_support
import os.path

## ***Basic Neural Network Model***

In [7]:
class Net(nn.Module):
    """Feed-forward binary classifier that emits one raw logit per example.

    The first linear layer maps ``input_size`` features to ``max_layer_size``
    units, every further hidden layer halves the width, and the last layer
    maps to a single output. No sigmoid is applied in ``forward``.

    Args:
        input_size: Number of input features per example.
        num_layers: Total number of linear layers. Values below 2 still
            produce the input layer plus the output layer.
        max_layer_size: Width of the first hidden layer. Truncated to ``int``
            so that float-valued hyperparameter samples are accepted.
        drop_out: Dropout probability applied to every hidden layer.
    """

    def __init__(self, input_size, num_layers, max_layer_size, drop_out):
        super(Net, self).__init__()
        self.input_size = input_size
        self.num_layers = num_layers
        self.max_layer_size = max_layer_size
        in_size = max(1, int(max_layer_size))

        # nn.ModuleList (not a plain list) registers the layers as submodules,
        # so they appear in parameters()/state_dict() and follow .to(device).
        self.layers = nn.ModuleList([nn.Linear(input_size, in_size)])
        for _ in range(1, num_layers - 1):
            out_size = max(1, in_size // 2)  # never shrink to a zero-width layer
            self.layers.append(nn.Linear(in_size, out_size))
            in_size = out_size
        self.layers.append(nn.Linear(in_size, 1))
        self.drop_out = nn.Dropout(drop_out)

    def forward(self, x, num_layers=None):
        """Return the logits for a batch of inputs.

        Args:
            x: Float tensor whose last dimension equals ``input_size``.
            num_layers: Ignored; accepted only so that callers passing it
                positionally keep working.

        Returns:
            Tensor with a trailing dimension of size 1 holding raw logits.
        """
        for layer in self.layers[:-1]:
            x = F.relu(self.drop_out(layer(x)))
        # Apply the output layer (the original returned the layer object itself).
        return self.layers[-1](x)

## ***Dataset Loader***

In [None]:
from torch.utils import data
class HatespeechDataset(Dataset):
    """Dataset backed by a semicolon-delimited CSV file.

    NOTE(review): the file format is not documented in this notebook. Based
    on the slicing in ``__getitem__``, every row is assumed to look like
    ``id;id;id;feature_1;...;feature_n;label`` with numeric features, a
    numeric label and no header row -- TODO confirm against the real files.

    Args:
        filename: Path of the CSV file to load.

    Raises:
        ValueError: If a row has fewer than five columns or a non-numeric
            feature/label value.
    """

    def __init__(self, filename):
        self.data = []
        # Open the path passed in (the original opened the literal 'filename').
        with open(filename, newline='') as csvfile:
            data_reader = csv.reader(csvfile, delimiter=';')
            for line_no, line in enumerate(data_reader, start=1):
                if not line:
                    continue  # skip blank lines
                if len(line) < 5:
                    raise ValueError(
                        f"{filename}:{line_no}: expected at least 5 columns, got {len(line)}"
                    )
                # Keep the three ids as strings; parse features and label once.
                self.data.append(line[:3] + [float(value) for value in line[3:]])

    def __len__(self):
        """Return the number of loaded rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(ids, representations, label)`` for row ``idx``.

        ``ids`` is a list of three strings; ``representations`` is a 1-D
        float32 tensor and ``label`` a 0-D float32 tensor, so batches can be
        collated by a ``DataLoader``.
        """
        row = self.data[idx]
        ids = row[0:3]
        representations = torch.tensor(row[3:-1], dtype=torch.float32)
        label = torch.tensor(row[-1], dtype=torch.float32)
        return ids, representations, label


## ***Hyperparameter Optimisation***

In [None]:
# Search space sampled by Ray Tune: each trial receives one concrete value per key.
config = {
    "lr": tune.loguniform(1e-3, 1e-1),
    "batch_size": tune.choice([4, 8, 12, 16]),
    "num_layers": tune.choice([2, 3, 4]),
    "drop_out" : tune.uniform(0.1, 0.8),
    # Layer widths must be integers; randint's upper bound is exclusive.
    "max_layer_size": tune.randint(80, 513),
}

# Early-stopping scheduler; trials are compared on the reported "accuracy".
scheduler = ASHAScheduler(
    metric="accuracy",
    mode="max",  # higher accuracy is better (was "min", which keeps the worst trials)
    max_t=10000, #No time restrictions
    grace_period=2,
    reduction_factor=2)

# Console progress table: sampled hyperparameters plus the reported metrics.
reporter = CLIReporter(
    parameter_columns=["lr", "batch_size", "num_layers", "drop_out", "max_layer_size"],
    metric_columns=["accuracy", "training_iteration"])

## ***Train***

In [None]:
def train(config, filename, epochs=2):
    """Train a ``Net`` with one hyperparameter configuration and report to Ray Tune.

    After every epoch the model is evaluated on the dev set, a checkpoint is
    written through ``tune.checkpoint_dir`` and the metrics are sent with
    ``tune.report``. When training ends the weights are saved to
    ``./nn_classification.model``.

    Args:
        config: Mapping with concrete values for "lr", "batch_size",
            "num_layers", "max_layer_size" and "drop_out".
        filename: Currently unused; the datasets are read from the hard-coded
            paths below. Kept so that existing callers keep working.
        epochs: Number of passes over the training set.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    train_dataset = HatespeechDataset("220502_train_data_preprocessed.csv")
    dev_dataset = HatespeechDataset("220502_dev_data_preprocessed.csv")
    train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    dev_dataloader = DataLoader(dev_dataset, batch_size=config["batch_size"])

    # __getitem__ returns (ids, representations, label); the length of the
    # representation is the input width of the network.
    vector_size = len(train_dataset[0][1])
    net = Net(
        vector_size,
        config["num_layers"],
        int(config["max_layer_size"]),  # layer widths must be integers
        config["drop_out"],
    ).to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = AdamW(net.parameters(), lr=config["lr"])

    for epoch in range(epochs):  # loop over the dataset multiple times
        net.train()
        running_loss = 0.0
        for i, data in enumerate(train_dataloader):
            _, inputs, labels = data
            inputs = inputs.float().to(device)
            # BCEWithLogitsLoss needs float targets shaped like the logits.
            labels = labels.float().view(-1).to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            outputs = net(inputs).view(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
                running_loss = 0.0

        net.eval()
        preds = []
        gold = []
        with torch.no_grad():
            for data in dev_dataloader:
                _, inputs, labels = data
                inputs = inputs.float().to(device)

                logits = net(inputs).view(-1)
                # logit > 0 is equivalent to sigmoid(logit) > 0.5
                preds.extend((logits > 0).long().tolist())
                # assumes labels are encoded as 0/1 -- TODO confirm
                gold.extend(labels.view(-1).long().tolist())

        correct = sum(1 for pred, label in zip(preds, gold) if pred == label)
        accuracy = correct / len(gold) if gold else 0.0
        precision, recall, f1, _ = precision_recall_fscore_support(
            gold, preds, average='binary', zero_division=0)

        # Checkpoint the model that is actually being trained.
        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(
            accuracy=accuracy,
            precision=float(precision),
            recall=float(recall),
            f1=float(f1),
        )

    # torch.save takes the target as its second positional argument.
    torch.save(net.state_dict(), './nn_classification.model')



## ***Test***

In [None]:
def test(representation, modelfile, model_config=None):
    """Run a saved model on the test set and return predictions and labels.

    Args:
        representation: Currently unused; kept so that existing callers keep
            working.
        modelfile: Path of a state dict written with ``torch.save``.
        model_config: Mapping with concrete values for "batch_size",
            "num_layers", "max_layer_size" and "drop_out" describing the
            saved model. Defaults to the module-level ``config``, which must
            then hold concrete values (not search-space samplers).

    Returns:
        Tuple ``(preds, gold)`` of two equally long lists holding the
        predicted and the reference labels as 0/1 integers.
    """
    hyperparams = config if model_config is None else model_config
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    test_dataset = HatespeechDataset("220502_test_data_preprocessed.csv")
    test_dataloader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"])

    # __getitem__ returns (ids, representations, label); the length of the
    # representation is the input width of the network.
    vector_size = len(test_dataset[0][1])
    net = Net(
        vector_size,
        hyperparams["num_layers"],
        int(hyperparams["max_layer_size"]),
        hyperparams["drop_out"],
    )
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
    net.load_state_dict(torch.load(modelfile, map_location=device))
    net.to(device)

    net.eval()
    preds = []
    gold = []
    with torch.no_grad():
        for data in test_dataloader:
            _, inputs, labels = data
            inputs = inputs.float().to(device)

            logits = net(inputs).view(-1)
            # logit > 0 is equivalent to sigmoid(logit) > 0.5
            preds.extend((logits > 0).long().tolist())
            # assumes labels are encoded as 0/1 -- TODO confirm
            gold.extend(labels.view(-1).long().tolist())

    return preds, gold