In [77]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import re
import pandas as pd
import numpy as np
import time
# Tokenization function
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance.

    A token is a run of word characters bounded by word boundaries, so
    whitespace and punctuation never end up in the result.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Build vocabulary function
def build_vocab(datasets, min_freq=1):
    """Build a token -> integer-id vocabulary from one or more text collections.

    Args:
        datasets: iterable of text collections; every collection is an
            iterable of strings, each of which is split with ``tokenize``.
        min_freq: minimum number of occurrences (counted over all
            collections) a token needs in order to be kept.

    Returns:
        dict mapping ``'<PAD>'`` to 0, ``'<UNK>'`` to 1 and every kept token
        to a consecutive id starting at 2, in order of first appearance.
    """
    token_counter = Counter()
    for texts in datasets:
        for text in texts:
            token_counter.update(tokenize(text))

    vocab = {'<PAD>': 0, '<UNK>': 1}
    # Ids are assigned *after* the frequency filter so they stay contiguous.
    # Numbering before filtering leaves gaps whenever min_freq > 1, and the
    # largest id can then exceed len(vocab) - 1, which breaks an embedding
    # table sized with len(vocab).
    for token, freq in token_counter.items():
        if freq >= min_freq and token not in vocab:
            vocab[token] = len(vocab)
    return vocab


# Load the four tab-separated test files, in this order, into one frame each.
df_olidtest, df_solidtest, df_hso, df_troff = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../datasets/cleaned_OLID_test.tsv',
        '../datasets/cleaned_SOLIDtest6K_trainer.tsv',
        '../datasets/cleaned_hatespeech_offensive_test.tsv',
        '../datasets/cleaned_tr_offenseval_test.tsv',
    )
)



In [78]:
# Load the tab-separated training files.
df_train_olid = pd.read_csv('../datasets/cleaned_OLID.tsv', sep='\t')
df_train_solid = pd.read_csv('../datasets/cleaned_SOLID9M_learner.tsv', sep='\t')
df_train_hso = pd.read_csv('../datasets/cleaned_hatespeech_offensive_train.tsv', sep='\t')
df_train_troff = pd.read_csv('../datasets/cleaned_tr_offenseval.tsv', sep='\t')

# Threshold the 'average' values: label 1 when average >= 0.8, otherwise 0.
df_train_solid['label'] = df_train_solid['average'].ge(0.8).astype('int64')

sample_size = 80000
positive_ratio = 0.75
n_positive_samples = int(np.floor(sample_size * positive_ratio))
n_negative_samples = int(np.floor(sample_size * (1 - positive_ratio)))

# Select the most confident positive values
semi_tweets_pos_df = df_train_solid[df_train_solid['average'] > 0.8].sample(
    n=n_positive_samples, random_state=1
)

# Select the most confident negative values
semi_tweets_neg_df = df_train_solid[df_train_solid['average'] < 0.2].sample(
    n=n_negative_samples, random_state=1
)

# Combine, shuffle with a fixed seed, align the text column name with the
# other corpora ('text' -> 'tweet') and renumber the rows 0..n-1.
semi_tweets_df = (
    pd.concat([semi_tweets_pos_df, semi_tweets_neg_df])
    .sample(frac=1, random_state=42)
    .rename(columns={'text': 'tweet'})
    .reset_index(drop=True)
)

# The shuffle is seeded like every other sampling step in this cell so the
# cell gives the same frame on every run. The index is reset because
# sample() keeps the original row labels, so code that looks rows up by
# 0..n-1 would otherwise still see the unshuffled order.
df_trainolidsolid = (
    pd.concat([df_train_olid, semi_tweets_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# From here on df_train_solid refers to the sampled subset, not the full file.
df_train_solid = semi_tweets_df


datasets = [df_train_olid, df_trainolidsolid, df_train_solid, df_train_hso, df_train_troff]

In [79]:
# Custom Dataset class
class TextClassificationDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Each text is split with ``tokenize`` and mapped to ids through ``vocab``
    (tokens missing from it map to ``vocab['<UNK>']``). The id sequence is
    then cut or right-padded with ``vocab['<PAD>']`` to exactly ``max_len``.
    """

    def __init__(self, texts, labels, vocab, max_len):
        """Store the samples and the encoding parameters.

        Args:
            texts: iterable of strings (list, pandas Series, ...).
            labels: iterable of integer class labels, parallel to ``texts``.
            vocab: dict mapping token -> id; must contain '<PAD>' and '<UNK>'.
            max_len: fixed length of every returned id sequence.

        Raises:
            ValueError: if ``texts`` and ``labels`` differ in length.
        """
        # Copy into lists so that __getitem__ indexes by position. Indexing
        # a pandas Series with an int is label-based, so a filtered or
        # shuffled frame (index no longer 0..n-1) would raise KeyError or
        # return a different row than the DataLoader asked for.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f'texts and labels differ in length: {len(self.texts)} != {len(self.labels)}'
            )
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as ``(ids, label)`` int64 tensors.

        ``ids`` has shape ``(max_len,)``; ``label`` is a scalar tensor.
        """
        unk_id = self.vocab['<UNK>']
        pad_id = self.vocab['<PAD>']
        token_ids = [self.vocab.get(token, unk_id) for token in tokenize(self.texts[idx])]
        token_ids = token_ids[:self.max_len]
        # When the text was truncated the multiplier is 0 and nothing is added.
        token_ids += [pad_id] * (self.max_len - len(token_ids))
        # Explicit dtypes: embedding lookups and CrossEntropyLoss class
        # targets both need int64, whatever the source column's dtype is.
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )
    

# Bi-directional LSTM Model class
class BiLSTMModel(nn.Module):
    """Sequence classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    The final hidden state of the last LSTM layer (both directions
    concatenated when bidirectional) is fed to the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """
        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            output_dim: number of output logits.
            n_layers: number of stacked LSTM layers.
            bidirectional: whether the LSTM runs in both directions.
            dropout: dropout probability applied to the embeddings, between
                stacked LSTM layers, and to the final hidden state.
        """
        super().__init__()
        # Index 0 is the padding id; its embedding is kept at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout between stacked layers and emits a
        # warning when a non-zero value is passed for a single layer, so
        # pass 0 in that case. Results for n_layers >= 2 are unaffected.
        lstm_dropout = dropout if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            x: integer token ids, batch-first (``batch_first=True``), i.e.
                shape ``(batch, seq_len)``.
        """
        embedded = self.dropout(self.embedding(x))
        _, (hidden, _) = self.lstm(embedded)
        if self.lstm.bidirectional:
            # The last two entries of h_n are the final forward and backward
            # states of the top layer; join them along the feature axis.
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_state = hidden[-1, :, :]
        return self.fc(self.dropout(final_state))


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [80]:
from sklearn.metrics import classification_report

# Parameters (the same for every training corpus, so they are set once
# instead of on every loop iteration)
max_len = 512  # Maximum length of text sequences
embedding_dim = 200
hidden_dim = 256
output_dim = 2  # Number of classes
n_layers = 2
bidirectional = True
dropout = 0.2
batch_size = 120
epochs = 5

dataset_names = ['olid', 'olid + solid', 'solid', 'hso', 'tr off']
assert len(dataset_names) == len(datasets), 'every training frame needs a display name'


def make_dataloader(frame, vocab, shuffle):
    """Wrap the 'tweet' / 'label' columns of ``frame`` in a batched DataLoader."""
    dataset = TextClassificationDataset(frame['tweet'], frame['label'], vocab, max_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Test function
def test(model, test_dataloader, device):
    """Evaluate ``model`` on every batch of ``test_dataloader``.

    The model is switched to eval mode and gradients are disabled. The
    predicted class is the argmax over the model's output logits.

    Returns:
        (accuracy, report): the fraction of correct predictions and the
        text produced by sklearn's ``classification_report``.
    """
    model.eval()
    all_targets = []
    all_predictions = []
    with torch.no_grad():
        for inputs, targets in test_dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            all_targets.extend(targets.cpu().tolist())
            all_predictions.extend(predicted.cpu().tolist())

    output = classification_report(all_targets, all_predictions)
    accuracy = (np.array(all_targets) == np.array(all_predictions)).sum() / len(all_targets)
    return accuracy, output


for dataset_name, train_frame in zip(dataset_names, datasets):
    print(f'current training dataset: {dataset_name}')

    # Build vocabulary from the current training corpus only; the test sets
    # are encoded with this vocabulary, so unseen tokens become <UNK>.
    vocab = build_vocab([train_frame['tweet']], min_freq=1)
    vocab_size = len(vocab)

    # Create dataloaders. Only the training data is shuffled: accuracy and
    # the classification report do not depend on evaluation order.
    train_dataloader = make_dataloader(train_frame, vocab, shuffle=True)
    olidtest_dataloader = make_dataloader(df_olidtest, vocab, shuffle=False)
    solidtest_dataloader = make_dataloader(df_solidtest, vocab, shuffle=False)
    hso_dataloader = make_dataloader(df_hso, vocab, shuffle=False)
    troff_dataloader = make_dataloader(df_troff, vocab, shuffle=False)
    test_dataloaders = [
        ('olid', olidtest_dataloader),
        ('solid', solidtest_dataloader),
        ('hso', hso_dataloader),
        ('tr_offenseval', troff_dataloader),
    ]

    # Instantiate the model, loss function, and optimizer
    model = BiLSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Training loop
    train_time_start = time.time()
    model.train()
    for epoch in range(epochs):
        for inputs, targets in train_dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # NOTE: this is the loss of the last mini-batch of the epoch only,
        # not an average over the epoch.
        print(f'Epoch: {epoch+1}, Loss: {loss.item()}')
    train_time_end = time.time()
    print(f'Training took: {train_time_end - train_time_start}')

    # Evaluate the freshly trained model on every test set
    for test_name, test_dataloader in test_dataloaders:
        test_start_time = time.time()
        test_accuracy, test_classification_report = test(model, test_dataloader, device)
        test_end_time = time.time()
        print(f'[{test_name}] test accuracy: {test_accuracy}')
        print(f'test time: {test_end_time - test_start_time}')
        print(test_classification_report)

    # Move the weights back to the CPU and drop the references before the
    # next corpus builds a new model.
    model.to('cpu')
    del model, optimizer, criterion


current training dataset: olid
Epoch: 1, Loss: 0.6452415585517883
Epoch: 2, Loss: 0.54401695728302
Epoch: 3, Loss: 0.4863860607147217
Epoch: 4, Loss: 0.5503567457199097
Epoch: 5, Loss: 0.34313398599624634
Training took: 79.15260124206543
[olid] test accuracy: 0.7837209302325582
test time: 0.40545082092285156
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       620
           1       0.66      0.47      0.55       240

    accuracy                           0.78       860
   macro avg       0.74      0.69      0.70       860
weighted avg       0.77      0.78      0.77       860

[solid] test accuracy: 0.8858668446520941
test time: 2.640475034713745
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      2991
           1       0.89      0.88      0.89      3002

    accuracy                           0.89      5993
   macro avg       0.89      0.89      0.89      5993
weighted avg       0