In [1]:
import datasets 
import numpy as np
import os
import time
import torch
import tensorflow as tf
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
import torch.optim as optim
import torchvision
import torchtext
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
from langdetect import detect_langs
from torch.nn.utils.rnn import pad_sequence
from pandarallel import pandarallel 
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

if torch.cuda.is_available():  
  use_cuda = True
else:  
  use_cuda = False



In [3]:
# loading GLOVE embeddings
GLOVE = torchtext.vocab.GloVe(name="6B", dim=50, max_vectors=10000)  # use 10k most common words

In [4]:
# loading dataset
dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')   
df = dataset['train'].to_pandas()
df.describe()

Using custom data configuration ucberkeley-dlab--measuring-hate-speech-c32713cabe528196
Found cached dataset parquet (/Users/angelinazhai/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,hatespeech,hate_speech_score,infitms,outfitms,annotator_severity,std_err,annotator_infitms,annotator_outfitms,hypothesis,annotator_age
count,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,...,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135451.0
mean,23530.416138,5567.097812,1.281352,2.954307,2.828875,2.56331,2.278638,2.698575,1.846211,1.052045,...,0.744733,-0.567428,1.034322,1.001052,-0.018817,0.300588,1.007158,1.011841,0.014589,37.910772
std,12387.194125,3230.508937,1.023542,1.231552,1.309548,1.38983,1.370876,0.8985,1.402372,1.345706,...,0.93226,2.380003,0.496867,0.791943,0.487261,0.23638,0.269876,0.675863,0.613006,11.641276
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-8.34,0.1,0.07,-1.82,0.02,0.39,0.28,-1.578693,18.0
25%,18148.0,2719.0,0.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,...,0.0,-2.33,0.71,0.56,-0.38,0.03,0.81,0.67,-0.341008,29.0
50%,20052.0,5602.5,1.0,3.0,3.0,3.0,3.0,3.0,2.0,0.0,...,0.0,-0.34,0.96,0.83,-0.02,0.34,0.97,0.85,0.110405,35.0
75%,32038.25,8363.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,...,2.0,1.41,1.3,1.22,0.35,0.42,1.17,1.13,0.449555,45.0
max,50070.0,11142.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,2.0,6.3,5.9,9.0,1.36,1.9,2.01,9.0,0.987511,81.0


In [5]:
#load numpy array from file
tmp_np_arr = np.load('hate_speech.npy', allow_pickle=True)

#convert to pandas dataframe
df.drop(df.iloc[:, 15:131], inplace=True, axis=1)
df_tmp = df.drop(["annotator_id"], axis=1)
df_norm = pd.DataFrame(tmp_np_arr, columns=df_tmp.columns)

In [6]:
df_spliced = df_norm.drop('comment_id', axis=1)
df_spliced = df_spliced.drop('platform', axis=1)
df_spliced = df_spliced.drop('sentiment', axis=1)
df_spliced = df_spliced.drop('hatespeech', axis=1)
df_spliced = df_spliced.drop('hate_speech_score', axis=1)

In [7]:
tweets = df_spliced.iloc[:,-1:]
labels = df_spliced.iloc[:,:-1]
labels = labels.to_numpy()
label_names = list(df_spliced.iloc[:,:-1].columns)
train_size = int(0.7*len(df_spliced))
val_size = int((len(df_spliced) - train_size)/2)
test_size = len(df_spliced) - train_size - val_size

In [9]:
def split_tweet(tweet):
    # separate punctuations
    tweet = tweet.replace(".", " . ") \
                 .replace(",", " , ") \
                 .replace(";", " ; ") \
                 .replace("?", " ? ")
    return tweet.lower().split()

In [50]:
def get_tweet_words(glove_vector):
    train, valid, test = [], [], []
    for index, row in df_spliced.iterrows():
        try:
            tweet = row[-1]
            idxs = [glove_vector.stoi[w]        # lookup the index of word
                    for w in split_tweet(tweet)
                    if w in glove_vector.stoi] # keep words that has an embedding
            if not idxs: # ignore tweets without any word with an embedding
                continue
            idxs = torch.tensor(idxs) # convert list to pytorch tensor
            label = np.array(row[:-1].values).astype(np.float32) 
            label = torch.tensor(label) #storing label information to tensor
            #adding tweet to corresponding train/val/test set
            if index < train_size:
                train.append((idxs, label))
            elif index < train_size+val_size:
                valid.append((idxs, label))
            else:
                test.append((idxs, label))
        except:
            print("Error at index: ", index)
            continue
    return train, valid, test

train, valid, test = get_tweet_words(GLOVE)

In [51]:
from torch import nn
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
  (xx, yy) = zip(*batch)
  x_lens = [len(x) for x in xx]
  y_lens = [len(y) for y in yy]

  xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
  yy_pad = pad_sequence(yy, batch_first=True, padding_value=0)

  return xx_pad, yy_pad
  
train_loader = torch.utils.data.DataLoader(train, batch_size=128, shuffle=True, collate_fn=pad_collate)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=128, shuffle=True, collate_fn=pad_collate)
test_loader = torch.utils.data.DataLoader(test, batch_size=128, shuffle=True, collate_fn=pad_collate)

In [52]:
class RNNMultiLabelClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_labels):
        super(RNNMultiLabelClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, num_labels)

    def forward(self, inputs):
        _, hidden = self.rnn(inputs)
        out = self.fc1(hidden.squeeze(0))
        out = F.sigmoid(out)
        return out
    
class Tweet_RNN(nn.Module):
    """
    The class object for the RNN.
    Attributes:
    emb: the type of embedding
    hidden_size: the number of layers
    nn: the actual neural network
    fc: the activation layer
    """

    # Tweet_RNN.__init__(self, input_size, hidden_size, num_classes)
    # param: self:Tweet_RNN
    # param: input_size:int
    # param: hidden_size:int
    # param: num_classes:int
    # param: embedding:str
    #    the string should be either: "glove", "word2vec" or "none"
    #    and correspond to the desired embedding
    # return: void
    # initializes the RNN
    def __init__(self, input_size: int, hidden_size: int, num_classes: int, embedding: str) -> None:
        super(Tweet_RNN, self).__init__()
        self.name = 'Tweet_RNN'
        if embedding == "glove":
            self.emb = nn.Embedding.from_pretrained(GLOVE.vectors)
        elif embedding == "word2vec":
            self.emb = nn.Embedding.from_pretrained(WORD2VEC)
        else:
            self.emb = nn.Embedding(input_size, num_classes)
        self.hidden_size = hidden_size
        self.nn = nn.RNN(input_size, hidden_size, batch_first=True)
        # self.linear = nn.Linear(input_size, 1);
        self.linear = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    # forward(self, x)
    # param: self:Tweet_RNN
    # param: x:torch.FloatTensor
    # initializes the RNN
    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        # Look up the embedding
        x = self.emb(x)
        # Set an initial hidden state
        h0 = torch.zeros(1, x.size(0), self.hidden_size)
        # Forward propagate the RNN
        out, _ = self.nn(x, h0)

        # Pass the output of hidden layer from the last time step to the classifier
        out = self.sigmoid(self.linear(out[:, -1, :])) #sigmoid activiation is applyed for all 8 classes

        return out #outputs a tensor of dimension 8 

In [53]:
def train(model, optimizer, criterion, train_loader, epoch):
    model.train()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

In [54]:
def evaluate(model, data_loader):
    model.eval()
    y_true, y_pred = [], []
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in data_loader:
            outputs = model(inputs)
            y_true += targets.cpu().tolist()
            y_pred += (outputs >= 0.5).cpu().tolist()
            total_loss += criterion(outputs, targets).item()

    f1 = f1_score(y_true, y_pred, average='micro')
    avg_loss = total_loss / len(data_loader)
    return f1, avg_loss

In [66]:
# Example usage
input_size = 50  # Embedding dimension
hidden_size = 64
num_labels = 8  # Number of classes
batch_size = 32
num_epochs = 1

# train_loader = # Your PyTorch train data loader here
# valid_loader = # Your PyTorch validation data loader here

#get train and validation data loaders
#get first batch of data

model = RNNMultiLabelClassifier(119, hidden_size, num_labels)
optimizer = optim.Adam(model.parameters())
# optimizer = optim.Adam(model.parameters(), lr=learning_)
criterion = nn.BCELoss()  # Binary cross-entropy loss for multi-label classification

train_losses, train_accs = [], []
valid_losses, valid_accs = [], []

for epoch in range(num_epochs):
    train_iter = iter(train_loader)
    valid_iter = iter(valid_loader)
    train(model, optimizer, criterion, train_iter, epoch)
    train_f1, train_loss = evaluate(model, train_loader)
    valid_f1, valid_loss = evaluate(model, valid_loader)
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    train_accs.append(train_f1)
    valid_accs.append(valid_f1)
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train F1={train_f1:.4f}, Valid Loss={valid_loss:.4f}, Valid F1={valid_f1:.4f}")

plt.plot(train_losses, label='Train Loss')
plt.plot(valid_losses, label='Valid Loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(train_accs, label='Train F1')
plt.plot(valid_accs, label='Valid F1')
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.ylabel('F1 Score')
plt.legend()
plt.show()

RuntimeError: input.size(-1) must be equal to input_size. Expected 119, got 88