In [None]:
import datasets 
import numpy as np
import os
import time
import torch
import tensorflow as tf
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
import torch.optim as optim
import torchvision
import torchtext
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from langdetect import detect_langs
from torch.nn.utils.rnn import pad_sequence
from pandarallel import pandarallel 
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

# Flag consulted later when deciding whether tensors/models are moved to the GPU.
use_cuda = torch.cuda.is_available()

# Data Loading

In [None]:
# loading GLOVE embeddings
# Pretrained GloVe "6B" vectors with 50 dimensions per token; max_vectors caps the
# table at 10,000 entries.
# NOTE(review): presumably the vector file is ordered by word frequency, which is
# what would make these "the 10k most common words" -- confirm in the torchtext docs.
GLOVE = torchtext.vocab.GloVe(name="6B", dim=50, max_vectors=10000)  # use 10k most common words

In [None]:
# loading dataset
# Fetch the 'binary' configuration of ucberkeley-dlab/measuring-hate-speech through
# the `datasets` library and convert its 'train' split to a pandas DataFrame.
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
# Last expression of the cell, so the summary table is displayed as the cell output.
df.describe()

In [None]:
# Load the pre-computed array from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code -- only load 'hate_speech.npy' when it comes from a trusted source.
tmp_np_arr = np.load('hate_speech.npy', allow_pickle=True)

# Convert to a pandas DataFrame that reuses the dataset's column names.
# Columns at positions 15..130 and 'annotator_id' are removed on copies, so `df`
# stays intact and re-running this cell always yields the same column set (the
# previous in-place drop removed additional columns on every re-execution).
df_tmp = df.drop(columns=df.columns[15:131]).drop(columns=["annotator_id"])
df_norm = pd.DataFrame(tmp_np_arr, columns=df_tmp.columns)

In [None]:
df_norm

In [None]:
# Remove, in one call, the columns that are used neither as labels nor as text.
df_spliced = df_norm.drop(
    columns=['comment_id', 'platform', 'sentiment', 'hatespeech', 'hate_speech_score']
)

In [None]:
df_spliced

In [None]:
# One-column DataFrame holding the last column of df_spliced (the column that
# get_tweet_words reads as the tweet text).
tweets = df_spliced.iloc[:,-1:]

In [None]:
# Every column except the last one (the columns get_tweet_words turns into the label vector).
labels = df_spliced.iloc[:,:-1]

In [None]:
# Label columns (all but the last) as a NumPy array. Derived from df_spliced rather
# than from `labels` itself so the cell still works when it is executed a second time.
labels = df_spliced.iloc[:, :-1].to_numpy()

In [None]:
# Names of the label columns, i.e. every column header except the last.
label_names = list(df_spliced.columns[:-1])

In [None]:
# 70% of the rows go to training; the remainder is halved, with the test set
# receiving the extra row when that remainder is odd.
train_size = int(len(df_spliced) * 0.7)
val_size = (len(df_spliced) - train_size) // 2
test_size = len(df_spliced) - (train_size + val_size)

In [None]:
train_size

In [None]:
val_size

In [None]:
test_size

In [None]:
def split_tweet(tweet):
    """Lower-case a tweet and split it into whitespace-delimited tokens.

    The marks '.', ',', ';' and '?' are surrounded with spaces first, so each of
    them ends up as a token of its own.
    """
    # translation table: every handled mark -> the same mark padded with spaces
    padded_marks = {ord(mark): " {} ".format(mark) for mark in ".,;?"}
    return tweet.translate(padded_marks).lower().split()

In [None]:
def get_tweet_words(glove_vector):
    """Convert every row of df_spliced into an (indices, label) pair and split them.

    Args:
        glove_vector: Vocabulary object exposing ``stoi`` (token -> index).
    Returns:
        (train, valid, test): three lists of ``(idxs, label)`` tuples, where
        ``idxs`` is a 1-D tensor of vocabulary indices and ``label`` is a float32
        tensor built from all columns except the last.

    Rows whose text contains no token known to ``glove_vector`` are skipped.
    Rows that cannot be converted are reported and skipped.
    """
    train, valid, test = [], [], []
    for index, row in df_spliced.iterrows():
        try:
            # .iloc: explicit positional access (the last column holds the text)
            tweet = row.iloc[-1]
            idxs = [glove_vector.stoi[w]        # lookup the index of word
                    for w in split_tweet(tweet)
                    if w in glove_vector.stoi]  # keep words that have an embedding
            if not idxs:  # ignore tweets without any word with an embedding
                continue
            idxs = torch.tensor(idxs)
            label = torch.tensor(row.iloc[:-1].to_numpy().astype(np.float32))
        except Exception as err:
            # `Exception` rather than a bare except, so KeyboardInterrupt still
            # stops the loop; the cause is printed instead of being hidden.
            print("Error at index: ", index, "-", repr(err))
            continue
        # `index` is the row label; df_spliced keeps the default 0..n-1 index of the
        # DataFrame built from the NumPy array, so it doubles as the row position.
        if index < train_size:
            train.append((idxs, label))
        elif index < train_size + val_size:
            valid.append((idxs, label))
        else:
            test.append((idxs, label))
    return train, valid, test

train, valid, test = get_tweet_words(GLOVE)

In [None]:
# train # uncomment to see the train set

In [None]:
from torch import nn
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    """Collate a list of (indices, label) pairs into two batch-first tensors.

    Args:
        batch: sequence of ``(x, y)`` tensor pairs, as stored by get_tweet_words.
    Returns:
        (xx_pad, yy_pad): the x tensors and the y tensors, each right-padded with 0
        to the longest item in the batch and stacked along dimension 0.

    NOTE(review): 0 is also a valid vocabulary index, so padded positions cannot be
    told apart from that token downstream -- confirm this is acceptable.
    """
    xx, yy = zip(*batch)
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
    yy_pad = pad_sequence(yy, batch_first=True, padding_value=0)
    return xx_pad, yy_pad
  
# Batch loaders (128 items per batch) over the full train/validation/test lists;
# pad_collate pads each batch to its longest tweet.
# NOTE(review): shuffle=True is also set for the validation and test loaders, and
# train_net builds its own loaders -- confirm these are still needed as written.
train_loader = torch.utils.data.DataLoader(train, batch_size=128, shuffle=True, collate_fn=pad_collate)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=128, shuffle=True, collate_fn=pad_collate)
test_loader = torch.utils.data.DataLoader(test, batch_size=128, shuffle=True, collate_fn=pad_collate)

# Helpers

In [None]:
###############################################################################
# For Training
def get_model_name(name, batch_size, learning_rate, epoch):
    """ Build the base name that identifies one model run by its hyperparameters.

    Args:
        name: Name of the model
        batch_size: Batch size used for the run
        learning_rate: Learning rate used for the run
        epoch: Epoch number (or epoch count) of the run
    Returns:
        path: A string of the form
              "model_<name>_bs<batch_size>_lr<learning_rate>_epoch<epoch>"
    """
    return f"model_{name}_bs{batch_size}_lr{learning_rate}_epoch{epoch}"


###############################################################################
# Training Curve
def plot_training_curve(path):
    """ Plots the training curve for a model run, given the csv files
    containing the train/validation error/loss.

    Reads "<path>_train_err.csv", "<path>_val_err.csv", "<path>_train_loss.csv"
    and "<path>_val_loss.csv" (one value per epoch) and shows two figures:
    error vs. epoch and loss vs. epoch.

    Args:
        path: The base path of the csv files produced during training
    Raises:
        OSError: if one of the four files cannot be opened (raised by np.loadtxt)
    """
    import matplotlib.pyplot as plt

    def _load(suffix):
        # ndmin=1 keeps a single-epoch file a length-1 array, so len() below works
        return np.loadtxt("{}_{}.csv".format(path, suffix), ndmin=1)

    train_err, val_err = _load("train_err"), _load("val_err")
    train_loss, val_loss = _load("train_loss"), _load("val_loss")
    epochs = range(1, len(train_err) + 1)  # one point per epoch

    curves = (
        ("Train vs Validation Error", "Error", train_err, val_err),
        ("Train vs Validation Loss", "Loss", train_loss, val_loss),
    )
    for title, ylabel, train_values, val_values in curves:
        plt.title(title)
        plt.plot(epochs, train_values, label="Train")
        plt.plot(epochs, val_values, label="Validation")
        plt.xlabel("Epoch")
        plt.ylabel(ylabel)
        plt.legend(loc='best')
        plt.show()

# Models

In [None]:
""" EPAI Class Models
This file contains the class objects for RNN, BiRNN, LSTM, BiLSTM and GRU.
"""
# import gensim.downloader
import torch
from torch import nn
import torchtext

# global variables
# GLOVE = torchtext.vocab.GloVe(name="6B", dim=50, max_vectors=10000)  # use 10k most common words
# WORD2VEC = gensim.downloader.load("word2vec-google-news-300")


class RNN(nn.Module):
    """Trainable embedding -> LSTM -> stack of ReLU-activated linear layers.

    Args:
        vocab_size: number of rows of the embedding table
        embedding_dim: width of each embedding vector (LSTM input size)
        hidden_dim: LSTM hidden state size per direction
        output_dim: number of values produced per input sequence
        n_layers: number of stacked LSTM layers
        bidirectional: whether the LSTM runs in both directions
        dropout: dropout between LSTM layers (only applied when n_layers > 1)

    forward() returns raw scores of shape (batch, output_dim); no final activation
    is applied.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bidirectional = bool(bidirectional)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=self.bidirectional,
            # inter-layer dropout has no effect on a single layer and only
            # triggers a warning there, so it is disabled in that case
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )
        # The head consumes the final hidden state of the top layer: two
        # concatenated directions when bidirectional, one otherwise. (It used to be
        # fixed at hidden_dim*2, which failed for a unidirectional 1-layer LSTM.)
        rnn_features = hidden_dim * (2 if self.bidirectional else 1)
        self.fc = nn.Linear(rnn_features, 1024)
        self.fc6 = nn.Linear(1024, 512)
        self.fc7 = nn.Linear(512, 256)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 16)
        self.fc5 = nn.Linear(16, output_dim)
        self.relu = nn.ReLU()

    def forward(self, text):
        """Map a (batch, seq_len) tensor of token indices to (batch, output_dim) scores."""
        embedded = self.embedding(text)
        _, (hidden, _) = self.rnn(embedded)
        if self.bidirectional:
            # last two entries = forward and backward state of the top layer
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        for layer in (self.fc, self.fc6, self.fc7, self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = self.relu(layer(hidden))
        # no squeeze(0): it dropped the batch dimension whenever the batch size was 1
        return self.fc5(hidden)

class Tweet_RNN(nn.Module):
    """
    Single-layer unidirectional RNN tweet classifier.
    Attributes:
    name: model name used when building file names
    emb: the embedding layer
    hidden_size: the size of the RNN hidden state
    nn: the recurrent layer
    linear: projection from the hidden state to num_classes values
    sigmoid: activation applied to the projected values
    """

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, embedding: str) -> None:
        """Initialize the RNN.

        Args:
            input_size: number of rows of the learned embedding when `embedding` is
                neither "glove" nor "word2vec"; not used for pretrained embeddings
            hidden_size: size of the recurrent hidden state
            num_classes: number of outputs per tweet
            embedding: "glove", "word2vec" or anything else ("none") for a learned
                embedding
        """
        super(Tweet_RNN, self).__init__()
        self.name = 'Tweet_RNN'
        if embedding == "glove":
            self.emb = nn.Embedding.from_pretrained(GLOVE.vectors)
        elif embedding == "word2vec":
            # NOTE(review): WORD2VEC is not defined in this notebook (its loading
            # line is commented out), so this branch raises NameError as it stands.
            self.emb = nn.Embedding.from_pretrained(WORD2VEC)
        else:
            # NOTE(review): a table of input_size rows with num_classes-wide vectors
            # looks like a placeholder -- confirm the intended vocabulary/width.
            self.emb = nn.Embedding(input_size, num_classes)
        self.hidden_size = hidden_size
        # The recurrent input width must equal the embedding width, so it is read
        # from the embedding instead of trusting input_size to match it.
        self.nn = nn.RNN(self.emb.embedding_dim, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the classifier.

        Args:
            x: (batch, seq_len) tensor of token indices
        Returns:
            (batch, num_classes) tensor of sigmoid outputs
        """
        # Look up the embedding
        x = self.emb(x)
        # No explicit h0: the layer then starts from zeros on the input's device
        # (the hand-built zeros tensor always lived on the CPU).
        out, _ = self.nn(x)
        # Classify from the output of the last time step.
        # NOTE(review): batches are right-padded by pad_collate, so for shorter
        # tweets this step is a padding position -- confirm this is acceptable.
        return self.sigmoid(self.linear(out[:, -1, :]))


class Tweet_BiRNN(nn.Module):
    """
    Single-layer bidirectional RNN tweet classifier.
    Attributes:
    name: model name used when building file names
    emb: the embedding layer
    hidden_size: the size of the hidden state of each direction
    nn: the recurrent layer
    linear: projection from both final hidden states to num_classes values
    fc: the sigmoid activation applied to the projected values
    """

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, embedding: str) -> None:
        """Initialize the BiRNN.

        Args:
            input_size: number of rows of the learned embedding when `embedding` is
                neither "glove" nor "word2vec"; not used for pretrained embeddings
            hidden_size: size of the hidden state of each direction
            num_classes: number of outputs per tweet
            embedding: "glove", "word2vec" or anything else ("none") for a learned
                embedding
        """
        super(Tweet_BiRNN, self).__init__()
        self.name = 'Tweet_BiRNN'
        if embedding == "glove":
            self.emb = nn.Embedding.from_pretrained(GLOVE.vectors)
        elif embedding == "word2vec":
            # NOTE(review): WORD2VEC is not defined in this notebook (its loading
            # line is commented out), so this branch raises NameError as it stands.
            self.emb = nn.Embedding.from_pretrained(WORD2VEC)
        else:
            # NOTE(review): a table of input_size rows with num_classes-wide vectors
            # looks like a placeholder -- confirm the intended vocabulary/width.
            self.emb = nn.Embedding(input_size, num_classes)
        self.hidden_size = hidden_size
        # input width is read from the embedding so the two can never disagree
        self.nn = nn.RNN(self.emb.embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        # Two directions -> 2 * hidden_size features. This projection was missing,
        # so the model emitted hidden-state-sized vectors instead of num_classes.
        self.linear = nn.Linear(2 * hidden_size, num_classes)
        self.fc = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the classifier.

        Args:
            x: (batch, seq_len) tensor of token indices
        Returns:
            (batch, num_classes) tensor of sigmoid outputs
        """
        # Look up the embedding
        x = self.emb(x)
        # No explicit h0: a bidirectional layer needs a (2, batch, hidden) state and
        # the former (1, batch, hidden) tensor was rejected; the default is zeros.
        _, h_n = self.nn(x)
        # h_n[0] / h_n[1]: final hidden state of the forward / backward direction
        features = torch.cat((h_n[0], h_n[1]), dim=1)
        return self.fc(self.linear(features))


class Tweet_LSTM(nn.Module):
    """
    Single-layer unidirectional LSTM tweet classifier.
    Attributes:
    name: model name used when building file names
    emb: the embedding layer
    hidden_size: the size of the LSTM hidden state
    nn: the recurrent layer
    linear: projection from the hidden state to num_classes values
    fc: the sigmoid activation applied to the projected values
    """

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, embedding: str) -> None:
        """Initialize the LSTM.

        Args:
            input_size: number of rows of the learned embedding when `embedding` is
                neither "glove" nor "word2vec"; not used for pretrained embeddings
            hidden_size: size of the recurrent hidden state
            num_classes: number of outputs per tweet
            embedding: "glove", "word2vec" or anything else ("none") for a learned
                embedding
        """
        super(Tweet_LSTM, self).__init__()
        self.name = 'Tweet_LSTM'
        if embedding == "glove":
            self.emb = nn.Embedding.from_pretrained(GLOVE.vectors)
        elif embedding == "word2vec":
            # NOTE(review): WORD2VEC is not defined in this notebook (its loading
            # line is commented out), so this branch raises NameError as it stands.
            self.emb = nn.Embedding.from_pretrained(WORD2VEC)
        else:
            # NOTE(review): a table of input_size rows with num_classes-wide vectors
            # looks like a placeholder -- confirm the intended vocabulary/width.
            self.emb = nn.Embedding(input_size, num_classes)
        self.hidden_size = hidden_size
        # input width is read from the embedding so the two can never disagree
        self.nn = nn.LSTM(self.emb.embedding_dim, hidden_size, batch_first=True)
        # This projection was missing, so the model emitted hidden_size values per
        # tweet instead of num_classes.
        self.linear = nn.Linear(hidden_size, num_classes)
        self.fc = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the classifier.

        Args:
            x: (batch, seq_len) tensor of token indices
        Returns:
            (batch, num_classes) tensor of sigmoid outputs
        """
        # Look up the embedding
        x = self.emb(x)
        # No explicit (h0, c0): the layer then starts from zeros on the input's
        # device (the hand-built zeros tensors always lived on the CPU).
        out, _ = self.nn(x)
        # Classify from the output of the last time step.
        # NOTE(review): batches are right-padded by pad_collate, so for shorter
        # tweets this step is a padding position -- confirm this is acceptable.
        return self.fc(self.linear(out[:, -1, :]))


class Tweet_BiLSTM(nn.Module):
    """
    Single-layer bidirectional LSTM tweet classifier.
    Attributes:
    name: model name used when building file names
    emb: the embedding layer
    hidden_size: the size of the hidden state of each direction
    nn: the recurrent layer
    linear: projection from both final hidden states to num_classes values
    fc: the sigmoid activation applied to the projected values
    """

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, embedding: str) -> None:
        """Initialize the BiLSTM.

        Args:
            input_size: number of rows of the learned embedding when `embedding` is
                neither "glove" nor "word2vec"; not used for pretrained embeddings
            hidden_size: size of the hidden state of each direction
            num_classes: number of outputs per tweet
            embedding: "glove", "word2vec" or anything else ("none") for a learned
                embedding
        """
        super(Tweet_BiLSTM, self).__init__()
        self.name = 'Tweet_BiLSTM'
        if embedding == "glove":
            self.emb = nn.Embedding.from_pretrained(GLOVE.vectors)
        elif embedding == "word2vec":
            # NOTE(review): WORD2VEC is not defined in this notebook (its loading
            # line is commented out), so this branch raises NameError as it stands.
            self.emb = nn.Embedding.from_pretrained(WORD2VEC)
        else:
            # NOTE(review): a table of input_size rows with num_classes-wide vectors
            # looks like a placeholder -- confirm the intended vocabulary/width.
            self.emb = nn.Embedding(input_size, num_classes)
        self.hidden_size = hidden_size
        # input width is read from the embedding so the two can never disagree
        self.nn = nn.LSTM(self.emb.embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        # Two directions -> 2 * hidden_size features. This projection was missing,
        # so the model emitted hidden-state-sized vectors instead of num_classes.
        self.linear = nn.Linear(2 * hidden_size, num_classes)
        self.fc = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the classifier.

        Args:
            x: (batch, seq_len) tensor of token indices
        Returns:
            (batch, num_classes) tensor of sigmoid outputs
        """
        # Look up the embedding
        x = self.emb(x)
        # No explicit (h0, c0): a bidirectional layer needs (2, batch, hidden) states
        # and the former (1, batch, hidden) tensors were rejected; default is zeros.
        _, (h_n, _) = self.nn(x)
        # h_n[0] / h_n[1]: final hidden state of the forward / backward direction
        features = torch.cat((h_n[0], h_n[1]), dim=1)
        return self.fc(self.linear(features))

class Tweet_GRU(nn.Module):
    """
    Single-layer unidirectional GRU tweet classifier.
    Attributes:
    name: model name used when building file names
    emb: the embedding layer
    hidden_size: the size of the GRU hidden state
    nn: the recurrent layer
    linear: projection from the hidden state to num_classes values
    fc: the sigmoid activation applied to the projected values
    """

    def __init__(self, input_size: int, hidden_size: int, num_classes: int, embedding: str) -> None:
        """Initialize the GRU.

        Args:
            input_size: number of rows of the learned embedding when `embedding` is
                neither "glove" nor "word2vec"; not used for pretrained embeddings
            hidden_size: size of the recurrent hidden state
            num_classes: number of outputs per tweet
            embedding: "glove", "word2vec" or anything else ("none") for a learned
                embedding
        """
        super(Tweet_GRU, self).__init__()
        self.name = 'Tweet_GRU'
        if embedding == "glove":
            self.emb = nn.Embedding.from_pretrained(GLOVE.vectors)
        elif embedding == "word2vec":
            # NOTE(review): WORD2VEC is not defined in this notebook (its loading
            # line is commented out), so this branch raises NameError as it stands.
            self.emb = nn.Embedding.from_pretrained(WORD2VEC)
        else:
            # NOTE(review): a table of input_size rows with num_classes-wide vectors
            # looks like a placeholder -- confirm the intended vocabulary/width.
            self.emb = nn.Embedding(input_size, num_classes)
        self.hidden_size = hidden_size
        # input width is read from the embedding so the two can never disagree
        self.nn = nn.GRU(self.emb.embedding_dim, hidden_size, batch_first=True)
        # This projection was missing, so the model emitted hidden_size values per
        # tweet instead of num_classes.
        self.linear = nn.Linear(hidden_size, num_classes)
        self.fc = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the classifier.

        Args:
            x: (batch, seq_len) tensor of token indices
        Returns:
            (batch, num_classes) tensor of sigmoid outputs
        """
        # Look up the embedding
        x = self.emb(x)
        # A GRU has a hidden state only (no cell state); the former (h0, c0) tuple
        # was rejected by the layer. Omitting it starts from zeros.
        out, _ = self.nn(x)
        # Classify from the output of the last time step.
        # NOTE(review): batches are right-padded by pad_collate, so for shorter
        # tweets this step is a padding position -- confirm this is acceptable.
        return self.fc(self.linear(out[:, -1, :]))

# Training

In [None]:
def evaluate(net, loader, criterion):
    """ Compute the average loss of the network over a data loader.

    The network is not modified: no gradients are computed or accumulated, and
    its train/eval mode is restored before returning.

     Args:
         net: PyTorch neural network object
         loader: iterable yielding (inputs, labels) batches
         criterion: The loss function, called as criterion(outputs, labels.float())
     Returns:
         loss: A float, the mean of the per-batch loss values
               (nan when the loader yields no batches)
     """
    on_gpu = use_cuda and torch.cuda.is_available()
    if on_gpu:
        net = net.cuda()  # moved once, not once per batch

    was_training = net.training
    net.eval()
    batch_losses = []
    try:
        # evaluation must not build a graph or call backward(): the previous
        # version accumulated gradients into the model on every batch
        with torch.no_grad():
            for inputs, labels in loader:
                if on_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()
                outputs = net(inputs)
                batch_losses.append(float(criterion(outputs, labels.float())))
    finally:
        net.train(was_training)

    if not batch_losses:
        return float("nan")
    return sum(batch_losses) / len(batch_losses)
   

   
def get_accuracy(model, training_mode, train_loader, valid_loader, threshold=0.6):
    """Label-wise accuracy of `model` over one of the two loaders.

    Predictions and labels are both binarised with `threshold`; the result is the
    fraction of individual label entries on which they agree, i.e. one minus the
    Hamming loss (the previous version returned the Hamming loss itself, an
    error rate, under the name "accuracy").

    Args:
        model: PyTorch module called as model(text)
        training_mode: if truthy evaluate on train_loader, otherwise on valid_loader
        train_loader: iterable of (text, label) batches
        valid_loader: iterable of (text, label) batches
        threshold: values >= threshold count as 1, everything else as 0
    Returns:
        A float in [0, 1]; nan when the selected loader yields no batches.
    """
    loader = train_loader if training_mode else valid_loader

    was_training = model.training
    model.eval()
    mismatches, total = 0, 0
    try:
        with torch.no_grad():
            for text, label in loader:
                # .cpu() keeps the comparison valid for models living on the GPU
                predictions = model(text).detach().cpu()
                # reshape instead of squeeze(): squeeze() also removed the batch
                # dimension of a one-element batch and broke the comparison
                label = label.detach().cpu().reshape(predictions.shape)
                wrong = (predictions >= threshold) != (label >= threshold)
                mismatches += int(wrong.sum())
                total += wrong.numel()
    finally:
        model.train(was_training)

    if total == 0:
        return float("nan")
    return 1.0 - mismatches / total



def train_net(net, batch_size=150, learning_rate=0.005, epochs=6, criterion=None):
    """Train `net` with Adam on the first 10,000 training examples and plot the curves.

    Args:
        net: PyTorch module mapping a padded batch of token indices to per-class outputs
        batch_size: number of tweets per batch for both loaders
        learning_rate: learning rate passed to Adam
        epochs: number of passes over the training subset
        criterion: loss module. Defaults to nn.BCELoss(), which expects
            probabilities: the Tweet_* models already end in a sigmoid, so the
            former nn.BCEWithLogitsLoss applied the sigmoid twice. Pass
            nn.BCEWithLogitsLoss() explicitly for a model that returns raw logits.
    Returns:
        None. Prints one summary line per epoch and the elapsed time, then shows a
        per-iteration loss plot and a per-epoch plot of the get_accuracy metric.
    """
    # loaders over at most 10,000 examples of the global train/valid lists
    train_loader = torch.utils.data.DataLoader(train[:10000], batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
    valid_loader = torch.utils.data.DataLoader(valid[:10000], batch_size=batch_size, shuffle=False, collate_fn=pad_collate)

    if criterion is None:
        criterion = nn.BCELoss()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    iters, train_loss = [], []                # one entry per optimisation step
    epoch_ids, train_acc, val_acc = [], [], []  # one entry per epoch
    n = 0  # the number of iterations

    start_time = time.time()
    for epoch in range(epochs):
        print(f'Epoch {epoch+1} of {epochs}')
        net.train()
        epoch_losses = []
        for text, labels in train_loader:
            optimizer.zero_grad()
            output = net(text)
            # reshape instead of squeeze(): squeeze() also removed the batch
            # dimension when batch_size == 1 and made the shapes disagree
            loss = criterion(output, labels.float().reshape(output.shape))
            loss.backward()
            optimizer.step()

            iters.append(n)
            train_loss.append(float(loss))
            epoch_losses.append(float(loss))
            n += 1

        # Metrics are computed once per epoch. They used to be recomputed over both
        # full loaders after every batch, and were passed a `size=` keyword that
        # get_accuracy does not accept.
        epoch_ids.append(epoch + 1)
        train_acc.append(get_accuracy(net, True, train_loader, valid_loader))
        val_acc.append(get_accuracy(net, False, train_loader, valid_loader))
        mean_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float("nan")
        # report this epoch's values (the lists were previously indexed by epoch
        # number although they held one entry per iteration)
        print(("Epoch {}: loss: {}, Training Accuracy: {}, Validation Accuracy: {}").format(
            epoch + 1, mean_loss, train_acc[-1], val_acc[-1]
        ))

    # time the training only, not the time the plot windows stay open
    elapsed_time = time.time() - start_time
    print("Total time elapsed: {:.2f} seconds".format(elapsed_time))

    plt.title("Loss Curve")
    plt.plot(iters, train_loss, label="Train")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.legend(loc='best')
    plt.show()

    plt.title("Training Curve")
    plt.plot(epoch_ids, train_acc, label="Train")
    plt.plot(epoch_ids, val_acc, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

In [None]:
# Arguments: (input_size, hidden_size, num_classes, embedding).
# GLOVE is loaded with dim=50, so with the 'glove' embedding the recurrent layer
# receives 50-wide vectors; input_size=256 did not match that width.
model = Tweet_RNN(50, 64, 8, 'glove')
model1 = Tweet_BiLSTM(50, 64, 8, 'glove')

In [None]:
# Small debugging loaders built from the training list: the first 20 examples for
# training and the next 20 (train[20:40]) for validation.
# pad_collate pads the tweets of each batch to a common length.
# NOTE(review): train_net as defined above constructs its own loaders, and this
# reassigns the global train_loader -- confirm these two loaders are still needed.
train_loader = torch.utils.data.DataLoader(train[:20], batch_size=150, shuffle=True, collate_fn=pad_collate)
val_loader = torch.utils.data.DataLoader(train[20:40], batch_size=150, shuffle=True, collate_fn=pad_collate)

In [None]:
# train_net(net, batch_size, learning_rate, epochs) builds its own data loaders;
# passing loaders positionally collided with batch_size, and `num_epochs` is not
# one of its parameters.
# NOTE(review): batch_size=1 for 500 epochs is a very long run on up to 10,000
# examples -- confirm these settings are intended.
train_net(model, batch_size=1, learning_rate=0.001, epochs=500)

In [None]:
# plot_training_curve reads four "<path>_*.csv" files, but the np.savetxt calls
# that would write them are commented out in train_net. Plot only when the files
# exist instead of failing the cell.
curve_path = get_model_name(model.name, 512, 0.01, 10)
curve_files = ["{}_{}.csv".format(curve_path, suffix)
               for suffix in ("train_err", "val_err", "train_loss", "val_loss")]
if all(os.path.exists(csv_file) for csv_file in curve_files):
    plot_training_curve(curve_path)
else:
    print("No saved training curves found for '{}'; nothing to plot.".format(curve_path))

# TODO: tune the hyperparameters and add NLTK preprocessing (probably including stemming).
# TODO: investigate why the training loss is not decreasing either.
# TODO: implement a confusion-matrix-based way of calculating accuracy.