# Environment preparation

In [1]:
# Install the required library
!pip install transformers



In [2]:
# Mount the Google Drive folder
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Handle all the required imports
import json
import random
import time
import re
import numpy as np
import pandas as pd
import torch
import nltk
import datetime

from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, \
    get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, TensorDataset
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### **Methods used for BERT training and testing**

In [9]:
# Functions required for training and testing

def get_torch_device():
    # Check for GPU...
    if torch.cuda.is_available():
        print('GPU:', torch.cuda.get_device_name(0))
        return torch.device("cuda")

    else:
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def convert_to_input(contents, tokenizer, max_length=128, pad_token=0, pad_token_segment_id=0):
    input_ids, attention_masks, token_type_ids = [], [], []

    for sentence in tqdm(contents, position=0, leave=True):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=max_length)

        i, t = inputs["input_ids"], inputs["token_type_ids"]
        m = [1] * len(i)

        padding_length = max_length - len(i)

        i = i + ([pad_token] * padding_length)
        m = m + ([0] * padding_length)
        t = t + ([pad_token_segment_id] * padding_length)

        input_ids.append(torch.Tensor([i]))
        attention_masks.append(torch.Tensor([m]))
        token_type_ids.append(torch.Tensor([t]))

    return torch.cat(input_ids, dim=0).to(torch.int64), torch.cat(attention_masks, dim=0).to(torch.int64)

def get_dataloaders(train_dataset, val_dataset, batch_size):
    train_dataloader = DataLoader(
        train_dataset,  # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly
        batch_size=batch_size  # Trains with this batch size.
    )

    val_dataloader = DataLoader(
        val_dataset,  # The validation samples.
        sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
        batch_size=batch_size  # Evaluate with this batch size.
    )

    return train_dataloader, val_dataloader


def train(model, optimizer, scheduler, train_dataloader, validation_dataloader, epochs, device, multiclass=False):
    # Set the seed value all over the place to make this reproducible.
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):
        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # ========================================
        #               Train
        # ========================================

        # Reset the total loss for this epoch.
        total_train_loss = 0

        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different
            model_output = model(b_input_mask
                                 )
            model_output = model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)

            loss = model_output.loss

            total_train_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        val_predictions = []
        val_labels = []
        for batch in validation_dataloader:
            # Unpack this training batch from our dataloader
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():

                eval_outputs = model(b_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_input_mask,
                                       labels=b_labels)
                loss = eval_outputs.loss
                logits = eval_outputs.logits

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()

            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)

            pred_flat = np.argmax(logits, axis=1).flatten()
            labels_flat = label_ids.flatten()

            val_predictions.append(pred_flat)
            val_labels.append(labels_flat)

        val_predictions = np.hstack(np.array(val_predictions))
        val_labels = np.hstack(np.array(val_labels))

        precision = precision_score(val_labels, val_predictions, average=("micro" if multiclass else "binary"))
        recall = recall_score(val_labels, val_predictions, average=("micro" if multiclass else "binary"))
        f1score = f1_score(val_labels, val_predictions, average=("micro" if multiclass else "binary"))

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        print(f"f1-score: {round(f1score, 2)}, precision: {round(precision,2)}, recall: {round(recall, 2)}")
        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
    return model.state_dict()

def test(dataloader, model, score_average="binary"):

    predictions = []
    true_labels = []

    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            eval_outputs = model(b_input_ids,
                                  token_type_ids=None,
                                  attention_mask=b_input_mask,
                                  labels=b_labels)
        logits = eval_outputs.logits

        # Move logits and labels to CPU
        logits = np.argmax(logits.detach().cpu().numpy(), axis=1)
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    predictions = np.hstack(predictions)
    true_labels = np.hstack(true_labels)

    precision = precision_score(true_labels, predictions, average=score_average)
    recall = recall_score(true_labels, predictions, average=score_average)
    f1score = f1_score(true_labels, predictions, average=score_average)
    accuracy = accuracy_score(true_labels, predictions)

    print(f"accuracy: {accuracy}, f1-score: {f1score}, precision: {precision}, recall: {recall}")

    return accuracy, f1score, precision, recall

def prepare_training_dataloaders(X, y, tokenizer, batch_size, max_length, device):
    x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    train_input_ids, train_attention_masks = convert_to_input(x_train, tokenizer, max_length=max_length)
    val_input_ids, val_attention_masks = convert_to_input(x_val, tokenizer, max_length=max_length)

    y_train = torch.tensor(y_train).to(torch.int64)
    y_val = torch.tensor(y_val).to(torch.int64)

    train_dataset = TensorDataset(train_input_ids, train_attention_masks, y_train)
    val_dataset = TensorDataset(val_input_ids, val_attention_masks, y_val)

    train_dataloader, val_dataloader = get_dataloaders(train_dataset, val_dataset, batch_size)
    return train_dataloader, val_dataloader

def prepare_test_dataloader(X, y, tokenizer, batch_size, max_length, device):
    test_input_ids, test_attention_masks = convert_to_input(X, tokenizer, max_length=max_length)

    y_test = torch.tensor(y).to(torch.int64)

    test_dataset = TensorDataset(test_input_ids, test_attention_masks, y_test)

    test_dataloader = DataLoader(
        test_dataset,  # The validation samples.
        sampler=SequentialSampler(test_dataset),  # Pull out batches sequentially.
        batch_size=32  # Evaluate with this batch size.
    )

    return test_dataloader


def get_bert_eng(num_labels=2, pretrained_weights_path=None):
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )

    if pretrained_weights_path != None:
      model.load_state_dict(torch.load(pretrained_weights_path))

    return model

def get_bert_crosloen(model_folder_path, num_labels=2, pretrained_weights_path=None):
    model = BertForSequenceClassification.from_pretrained(model_folder_path, num_labels=num_labels)
    if pretrained_weights_path != None:
      model.load_state_dict(torch.load(pretrained_weights_path))

    return model



# **Multilingual BERT training on ENG datasets and testing on ENG & SLO**

## Binary datasets

### Gab training and testing on ENG

In [None]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

# Load the data
df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/gab/train.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/")
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_gab_trained.pth")

GPU: Tesla T4


  0%|          | 0/20783 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 20783/20783 [00:15<00:00, 1325.09it/s]
100%|██████████| 5196/5196 [00:03<00:00, 1337.73it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS 


Training...
  Batch    40  of    650.    Elapsed: 0:00:33.
  Batch    80  of    650.    Elapsed: 0:01:07.
  Batch   120  of    650.    Elapsed: 0:01:41.
  Batch   160  of    650.    Elapsed: 0:02:17.
  Batch   200  of    650.    Elapsed: 0:02:53.
  Batch   240  of    650.    Elapsed: 0:03:29.
  Batch   280  of    650.    Elapsed: 0:04:05.
  Batch   320  of    650.    Elapsed: 0:04:41.
  Batch   360  of    650.    Elapsed: 0:05:18.
  Batch   400  of    650.    Elapsed: 0:05:55.
  Batch   440  of    650.    Elapsed: 0:06:32.
  Batch   480  of    650.    Elapsed: 0:07:09.
  Batch   520  of    650.    Elapsed: 0:07:46.
  Batch   560  of    650.    Elapsed: 0:08:23.
  Batch   600  of    650.    Elapsed: 0:09:00.
  Batch   640  of    650.    Elapsed: 0:09:37.

  Average training loss: 0.35
  Training epoch took: 0:09:46

Running Validation...




  Accuracy: 0.91
f1-score: 0.9, precision: 0.95, recall: 0.86
  Validation Loss: 0.27
  Validation took: 0:00:39

Training...
  Batch    40  of    650.    Elapsed: 0:00:37.
  Batch    80  of    650.    Elapsed: 0:01:14.
  Batch   120  of    650.    Elapsed: 0:01:52.
  Batch   160  of    650.    Elapsed: 0:02:29.
  Batch   200  of    650.    Elapsed: 0:03:06.
  Batch   240  of    650.    Elapsed: 0:03:43.
  Batch   280  of    650.    Elapsed: 0:04:20.
  Batch   320  of    650.    Elapsed: 0:04:58.
  Batch   360  of    650.    Elapsed: 0:05:35.
  Batch   400  of    650.    Elapsed: 0:06:12.
  Batch   440  of    650.    Elapsed: 0:06:49.
  Batch   480  of    650.    Elapsed: 0:07:26.
  Batch   520  of    650.    Elapsed: 0:08:04.
  Batch   560  of    650.    Elapsed: 0:08:41.
  Batch   600  of    650.    Elapsed: 0:09:18.
  Batch   640  of    650.    Elapsed: 0:09:56.

  Average training loss: 0.23
  Training epoch took: 0:10:05

Running Validation...
  Accuracy: 0.92
f1-score: 0.91, prec

In [None]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/gab/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", 
                          pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_gab_trained.pth")
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla T4


  0%|          | 0/6495 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 6495/6495 [00:04<00:00, 1344.71it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassific

accuracy: 0.9173210161662817, f1-score: 0.9157962759555511, precision: 0.9178922997840422, recall: 0.9142042403381438


(0.9173210161662817,
 0.9157962759555511,
 0.9178922997840422,
 0.9142042403381438)

### Reddit training and testing on ENG

In [None]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/reddit/train.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/")
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_reddit_trained.pth")

GPU: Tesla T4


  0%|          | 0/14214 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 14214/14214 [00:16<00:00, 865.68it/s]
100%|██████████| 3554/3554 [00:04<00:00, 834.94it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS ex


Training...
  Batch    40  of    445.    Elapsed: 0:00:38.
  Batch    80  of    445.    Elapsed: 0:01:17.
  Batch   120  of    445.    Elapsed: 0:01:55.
  Batch   160  of    445.    Elapsed: 0:02:33.
  Batch   200  of    445.    Elapsed: 0:03:11.
  Batch   240  of    445.    Elapsed: 0:03:49.
  Batch   280  of    445.    Elapsed: 0:04:27.
  Batch   320  of    445.    Elapsed: 0:05:05.
  Batch   360  of    445.    Elapsed: 0:05:43.
  Batch   400  of    445.    Elapsed: 0:06:21.
  Batch   440  of    445.    Elapsed: 0:06:59.

  Average training loss: 0.35
  Training epoch took: 0:07:03

Running Validation...




  Accuracy: 0.91
f1-score: 0.79, precision: 0.82, recall: 0.76
  Validation Loss: 0.28
  Validation took: 0:00:28

Training...
  Batch    40  of    445.    Elapsed: 0:00:38.
  Batch    80  of    445.    Elapsed: 0:01:16.
  Batch   120  of    445.    Elapsed: 0:01:54.
  Batch   160  of    445.    Elapsed: 0:02:32.
  Batch   200  of    445.    Elapsed: 0:03:10.
  Batch   240  of    445.    Elapsed: 0:03:48.
  Batch   280  of    445.    Elapsed: 0:04:26.
  Batch   320  of    445.    Elapsed: 0:05:04.
  Batch   360  of    445.    Elapsed: 0:05:42.
  Batch   400  of    445.    Elapsed: 0:06:20.
  Batch   440  of    445.    Elapsed: 0:06:58.

  Average training loss: 0.27
  Training epoch took: 0:07:02

Running Validation...
  Accuracy: 0.91
f1-score: 0.8, precision: 0.83, recall: 0.77
  Validation Loss: 0.29
  Validation took: 0:00:28

Training...
  Batch    40  of    445.    Elapsed: 0:00:38.
  Batch    80  of    445.    Elapsed: 0:01:16.
  Batch   120  of    445.    Elapsed: 0:01:54.
  Ba

In [None]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/reddit/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", 
                          pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_reddit_trained.pth")
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla T4


  0%|          | 0/4442 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 4442/4442 [00:05<00:00, 783.31it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassifica

accuracy: 0.9027465105808195, f1-score: 0.8684532141814179, precision: 0.8657578191579269, recall: 0.8712582452117336


(0.9027465105808195,
 0.8684532141814179,
 0.8657578191579269,
 0.8712582452117336)

## Multi-class

### Kaggle toxic comment training and testing

In [None]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/toxic_comment_relabeled_multiclass/train.csv")
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", num_labels=4)
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device, multiclass=True)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_toxic_comment_trained.pth")

GPU: Tesla T4


  0%|          | 0/15121 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 15121/15121 [00:23<00:00, 655.58it/s]
100%|██████████| 3781/3781 [00:05<00:00, 650.39it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS ex


Training...
  Batch    40  of    473.    Elapsed: 0:00:39.
  Batch    80  of    473.    Elapsed: 0:01:17.
  Batch   120  of    473.    Elapsed: 0:01:56.
  Batch   160  of    473.    Elapsed: 0:02:34.
  Batch   200  of    473.    Elapsed: 0:03:12.
  Batch   240  of    473.    Elapsed: 0:03:50.
  Batch   280  of    473.    Elapsed: 0:04:29.
  Batch   320  of    473.    Elapsed: 0:05:07.
  Batch   360  of    473.    Elapsed: 0:05:45.
  Batch   400  of    473.    Elapsed: 0:06:23.
  Batch   440  of    473.    Elapsed: 0:07:01.

  Average training loss: 0.94
  Training epoch took: 0:07:32

Running Validation...




  Accuracy: 0.64
f1-score: 0.64, precision: 0.64, recall: 0.64
  Validation Loss: 0.81
  Validation took: 0:00:29

Training...
  Batch    40  of    473.    Elapsed: 0:00:38.
  Batch    80  of    473.    Elapsed: 0:01:16.
  Batch   120  of    473.    Elapsed: 0:01:55.
  Batch   160  of    473.    Elapsed: 0:02:33.
  Batch   200  of    473.    Elapsed: 0:03:11.
  Batch   240  of    473.    Elapsed: 0:03:49.
  Batch   280  of    473.    Elapsed: 0:04:27.
  Batch   320  of    473.    Elapsed: 0:05:06.
  Batch   360  of    473.    Elapsed: 0:05:44.
  Batch   400  of    473.    Elapsed: 0:06:22.
  Batch   440  of    473.    Elapsed: 0:07:00.

  Average training loss: 0.70
  Training epoch took: 0:07:31

Running Validation...
  Accuracy: 0.67
f1-score: 0.67, precision: 0.67, recall: 0.67
  Validation Loss: 0.74
  Validation took: 0:00:29

Training...
  Batch    40  of    473.    Elapsed: 0:00:38.
  Batch    80  of    473.    Elapsed: 0:01:16.
  Batch   120  of    473.    Elapsed: 0:01:55.
  B

In [None]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/toxic_comment_relabeled_multiclass/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", 
                          pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_toxic_comment_trained.pth",
                          num_labels=4)
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla T4


  0%|          | 0/4823 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 4823/4823 [00:07<00:00, 682.69it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassifica

accuracy: 0.7028820236367406, f1-score: 0.6060090681186833, precision: 0.6567773376674054, recall: 0.5860866947754766


(0.7028820236367406,
 0.6060090681186833,
 0.6567773376674054,
 0.5860866947754766)

### Trac2 Subtask A training and testing

In [None]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/trac2/train.csv")
comments = df["content"].fillna("").values
labels = df["type"].fillna(0).values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", num_labels=3)
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device, multiclass=True)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_trac2_trained.pth")

GPU: Tesla T4


  0%|          | 0/3410 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 3410/3410 [00:01<00:00, 1878.60it/s]
100%|██████████| 853/853 [00:00<00:00, 1660.99it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expec


Training...
  Batch    40  of    107.    Elapsed: 0:00:35.
  Batch    80  of    107.    Elapsed: 0:01:12.

  Average training loss: 0.63
  Training epoch took: 0:01:38

Running Validation...




  Accuracy: 0.77
f1-score: 0.77, precision: 0.77, recall: 0.77
  Validation Loss: 0.61
  Validation took: 0:00:07

Training...
  Batch    40  of    107.    Elapsed: 0:00:37.
  Batch    80  of    107.    Elapsed: 0:01:15.

  Average training loss: 0.57
  Training epoch took: 0:01:39

Running Validation...
  Accuracy: 0.80
f1-score: 0.8, precision: 0.8, recall: 0.8
  Validation Loss: 0.56
  Validation took: 0:00:07

Training...
  Batch    40  of    107.    Elapsed: 0:00:37.
  Batch    80  of    107.    Elapsed: 0:01:15.

  Average training loss: 0.50
  Training epoch took: 0:01:40

Running Validation...
  Accuracy: 0.80
f1-score: 0.8, precision: 0.8, recall: 0.8
  Validation Loss: 0.56
  Validation took: 0:00:07

Training complete!
Total training took 0:05:16 (h:mm:ss)


In [None]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/trac2/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", 
                          pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_trac2_trained.pth",
                          num_labels=3)
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla T4


  0%|          | 0/1066 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 1066/1066 [00:00<00:00, 1953.58it/s]
Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassific

accuracy: 0.7870544090056285, f1-score: 0.3821143913941086, precision: 0.5395185716563048, recall: 0.38487472123835764


(0.7870544090056285,
 0.3821143913941086,
 0.5395185716563048,
 0.38487472123835764)

## Testing models on SLO data

In [7]:
# Preparation for testing
max_length = 128
batch_size = 32
df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/slo-twitter-test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

labels_binary = np.minimum(labels, 1)
test_dataloader_binary = prepare_test_dataloader(comments, labels_binary, tokenizer, batch_size, max_length, device)


GPU: Tesla K80


  0%|          | 0/18459 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 18459/18459 [00:16<00:00, 1144.67it/s]
100%|██████████| 18459/18459 [00:16<00:00, 1128.27it/s]


In [9]:
# Test the Toxic comment model on multiclass Slovenian Twitter dataset

# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", 
                          pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_toxic_comment_trained.pth",
                          num_labels=4)
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

accuracy: 0.24047889918197085, f1-score: 0.17136136084428483, precision: 0.24734685719052849, recall: 0.24948688666552163


  _warn_prf(average, modifier, msg_start, len(result))


(0.24047889918197085,
 0.17136136084428483,
 0.24734685719052849,
 0.24948688666552163)

In [13]:
# Test the Gab binary model on binarized Slovenian Twitter dataset
# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", 
                          pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_gab_trained.pth")
model.cuda()

# Run test
test(test_dataloader_binary, model, score_average="binary")

Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

accuracy: 0.49618072484966685, f1-score: 0.045174537987679675, precision: 0.6077348066298343, recall: 0.02345915973555129


(0.49618072484966685,
 0.045174537987679675,
 0.6077348066298343,
 0.02345915973555129)

In [14]:
# Test the Reddit binary model on binarized Slovenian Twitter dataset
# Get the model and move it to GPU
model = get_bert_crosloen("/content/drive/MyDrive/offensive_language_classification/crosloen_bert/", 
                          pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/crosloen_reddit_trained.pth",
                          num_labels=2)
model.cuda()

# Run test
test(test_dataloader_binary, model, score_average="binary")

Some weights of the model checkpoint at /content/drive/MyDrive/offensive_language_classification/crosloen_bert/ were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

accuracy: 0.4973725553930332, f1-score: 0.05519348268839104, precision: 0.6131221719457014, recall: 0.028897419492429088


(0.4973725553930332,
 0.05519348268839104,
 0.6131221719457014,
 0.028897419492429088)

# **Monolingual BERT training on ENG datasets and testing on ENG and translated SLO datasets**

## Binary datasets

### Gab training and testing on ENG

In [5]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

# Load the data
df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/gab/train.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng()
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_gab_trained.pth")

GPU: Tesla K80


  0%|          | 0/20783 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 20783/20783 [00:22<00:00, 942.10it/s] 
100%|██████████| 5196/5196 [00:05<00:00, 949.84it/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassi


Training...
  Batch    40  of    650.    Elapsed: 0:01:11.
  Batch    80  of    650.    Elapsed: 0:02:20.
  Batch   120  of    650.    Elapsed: 0:03:30.
  Batch   160  of    650.    Elapsed: 0:04:40.
  Batch   200  of    650.    Elapsed: 0:05:50.
  Batch   240  of    650.    Elapsed: 0:07:00.
  Batch   280  of    650.    Elapsed: 0:08:09.
  Batch   320  of    650.    Elapsed: 0:09:19.
  Batch   360  of    650.    Elapsed: 0:10:29.
  Batch   400  of    650.    Elapsed: 0:11:39.
  Batch   440  of    650.    Elapsed: 0:12:48.
  Batch   480  of    650.    Elapsed: 0:13:58.
  Batch   520  of    650.    Elapsed: 0:15:07.
  Batch   560  of    650.    Elapsed: 0:16:17.
  Batch   600  of    650.    Elapsed: 0:17:27.
  Batch   640  of    650.    Elapsed: 0:18:37.

  Average training loss: 0.27
  Training epoch took: 0:18:54

Running Validation...




  Accuracy: 0.93
f1-score: 0.92, precision: 0.94, recall: 0.9
  Validation Loss: 0.21
  Validation took: 0:01:15

Training...
  Batch    40  of    650.    Elapsed: 0:01:10.
  Batch    80  of    650.    Elapsed: 0:02:20.
  Batch   120  of    650.    Elapsed: 0:03:30.
  Batch   160  of    650.    Elapsed: 0:04:40.
  Batch   200  of    650.    Elapsed: 0:05:50.
  Batch   240  of    650.    Elapsed: 0:07:00.
  Batch   280  of    650.    Elapsed: 0:08:10.
  Batch   320  of    650.    Elapsed: 0:09:19.
  Batch   360  of    650.    Elapsed: 0:10:29.
  Batch   400  of    650.    Elapsed: 0:11:39.
  Batch   440  of    650.    Elapsed: 0:12:49.
  Batch   480  of    650.    Elapsed: 0:13:59.
  Batch   520  of    650.    Elapsed: 0:15:09.
  Batch   560  of    650.    Elapsed: 0:16:19.
  Batch   600  of    650.    Elapsed: 0:17:29.
  Batch   640  of    650.    Elapsed: 0:18:39.

  Average training loss: 0.19
  Training epoch took: 0:18:55

Running Validation...
  Accuracy: 0.93
f1-score: 0.93, prec

In [15]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/gab/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng(pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_gab_trained.pth")
model.cuda()

# Run test
test(test_dataloader, model, score_average="binary")

GPU: Tesla K80


  0%|          | 0/6495 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 6495/6495 [00:06<00:00, 1014.68it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on anothe

accuracy: 0.9217859892224788, f1-score: 0.9110955547777388, precision: 0.9130129779024904, recall: 0.9091861683548725


(0.9217859892224788,
 0.9110955547777388,
 0.9130129779024904,
 0.9091861683548725)

### Reddit training and testing on ENG

In [8]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/reddit/train.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng()
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_reddit_trained.pth")

GPU: Tesla K80


  0%|          | 0/14214 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 14214/14214 [00:20<00:00, 703.28it/s]
100%|██████████| 3554/3554 [00:05<00:00, 665.71it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassifi


Training...
  Batch    40  of    445.    Elapsed: 0:01:10.
  Batch    80  of    445.    Elapsed: 0:02:19.
  Batch   120  of    445.    Elapsed: 0:03:29.
  Batch   160  of    445.    Elapsed: 0:04:38.
  Batch   200  of    445.    Elapsed: 0:05:47.
  Batch   240  of    445.    Elapsed: 0:06:57.
  Batch   280  of    445.    Elapsed: 0:08:06.
  Batch   320  of    445.    Elapsed: 0:09:16.
  Batch   360  of    445.    Elapsed: 0:10:25.
  Batch   400  of    445.    Elapsed: 0:11:34.
  Batch   440  of    445.    Elapsed: 0:12:44.

  Average training loss: 0.29
  Training epoch took: 0:12:51

Running Validation...




  Accuracy: 0.91
f1-score: 0.81, precision: 0.81, recall: 0.81
  Validation Loss: 0.25
  Validation took: 0:00:50

Training...
  Batch    40  of    445.    Elapsed: 0:01:09.
  Batch    80  of    445.    Elapsed: 0:02:19.
  Batch   120  of    445.    Elapsed: 0:03:28.
  Batch   160  of    445.    Elapsed: 0:04:38.
  Batch   200  of    445.    Elapsed: 0:05:47.
  Batch   240  of    445.    Elapsed: 0:06:56.
  Batch   280  of    445.    Elapsed: 0:08:06.
  Batch   320  of    445.    Elapsed: 0:09:15.
  Batch   360  of    445.    Elapsed: 0:10:24.
  Batch   400  of    445.    Elapsed: 0:11:34.
  Batch   440  of    445.    Elapsed: 0:12:43.

  Average training loss: 0.21
  Training epoch took: 0:12:51

Running Validation...
  Accuracy: 0.91
f1-score: 0.81, precision: 0.8, recall: 0.82
  Validation Loss: 0.27
  Validation took: 0:00:50

Training...
  Batch    40  of    445.    Elapsed: 0:01:09.
  Batch    80  of    445.    Elapsed: 0:02:19.
  Batch   120  of    445.    Elapsed: 0:03:28.
  Ba

In [10]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/reddit/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng(pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_reddit_trained.pth")
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla K80


  0%|          | 0/4442 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 4442/4442 [00:06<00:00, 667.45it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another

accuracy: 0.9124268347591176, f1-score: 0.8816573145423792, precision: 0.8785303259180368, recall: 0.8849259226003412


(0.9124268347591176,
 0.8816573145423792,
 0.8785303259180368,
 0.8849259226003412)

## Multi-class

### Kaggle toxic comment training and testing

In [5]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/toxic_comment_relabeled_multiclass/train.csv")
comments = df["content"].values
labels = df["type"].values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng(num_labels=4)
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device, multiclass=True)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_toxic_comment_trained.pth")

GPU: Tesla K80


  0%|          | 0/15121 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 15121/15121 [00:30<00:00, 492.90it/s]
100%|██████████| 3781/3781 [00:07<00:00, 487.89it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassifi


Training...
  Batch    40  of    473.    Elapsed: 0:01:11.
  Batch    80  of    473.    Elapsed: 0:02:21.
  Batch   120  of    473.    Elapsed: 0:03:31.
  Batch   160  of    473.    Elapsed: 0:04:41.
  Batch   200  of    473.    Elapsed: 0:05:52.
  Batch   240  of    473.    Elapsed: 0:07:02.
  Batch   280  of    473.    Elapsed: 0:08:12.
  Batch   320  of    473.    Elapsed: 0:09:22.
  Batch   360  of    473.    Elapsed: 0:10:31.
  Batch   400  of    473.    Elapsed: 0:11:41.
  Batch   440  of    473.    Elapsed: 0:12:51.

  Average training loss: 0.67
  Training epoch took: 0:13:48

Running Validation...




  Accuracy: 0.75
f1-score: 0.75, precision: 0.75, recall: 0.75
  Validation Loss: 0.58
  Validation took: 0:00:54

Training...
  Batch    40  of    473.    Elapsed: 0:01:10.
  Batch    80  of    473.    Elapsed: 0:02:20.
  Batch   120  of    473.    Elapsed: 0:03:30.
  Batch   160  of    473.    Elapsed: 0:04:40.
  Batch   200  of    473.    Elapsed: 0:05:50.
  Batch   240  of    473.    Elapsed: 0:07:01.
  Batch   280  of    473.    Elapsed: 0:08:11.
  Batch   320  of    473.    Elapsed: 0:09:21.
  Batch   360  of    473.    Elapsed: 0:10:31.
  Batch   400  of    473.    Elapsed: 0:11:41.
  Batch   440  of    473.    Elapsed: 0:12:51.

  Average training loss: 0.46
  Training epoch took: 0:13:48

Running Validation...
  Accuracy: 0.77
f1-score: 0.77, precision: 0.77, recall: 0.77
  Validation Loss: 0.54
  Validation took: 0:00:54

Training...
  Batch    40  of    473.    Elapsed: 0:01:10.
  Batch    80  of    473.    Elapsed: 0:02:20.
  Batch   120  of    473.    Elapsed: 0:03:30.
  B

In [6]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/toxic_comment_relabeled_multiclass/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng(pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_toxic_comment_trained.pth",
                          num_labels=4)
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla K80


  0%|          | 0/4823 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 4823/4823 [00:09<00:00, 513.44it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another

accuracy: 0.7851959361393324, f1-score: 0.7244239088784382, precision: 0.7198556894524715, recall: 0.730058302959308


(0.7851959361393324, 0.7244239088784382, 0.7198556894524715, 0.730058302959308)

### Trac2 Subtask A training and testing

In [12]:
# Training

batch_size = 32
learning_rate = 2e-5
epochs = 3
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/trac2/train.csv")
comments = df["content"].fillna("").values
labels = df["type"].fillna(0).values

# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataloader, val_dataloader = prepare_training_dataloaders(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng(num_labels=3)
model.cuda()

# Create the optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8  # args.adam_epsilon  - default is 1e-8.
                  )

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_dataloader) * epochs)

state_dict = train(model, optimizer, scheduler, train_dataloader, val_dataloader, epochs, device, multiclass=True)
torch.save(state_dict, "/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_trac2_trained.pth")

GPU: Tesla K80


  0%|          | 0/3410 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 3410/3410 [00:02<00:00, 1427.27it/s]
100%|██████████| 853/853 [00:00<00:00, 1234.79it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassificat


Training...
  Batch    40  of    107.    Elapsed: 0:01:09.
  Batch    80  of    107.    Elapsed: 0:02:18.

  Average training loss: 0.60
  Training epoch took: 0:03:04

Running Validation...




  Accuracy: 0.77
f1-score: 0.77, precision: 0.77, recall: 0.77
  Validation Loss: 0.54
  Validation took: 0:00:12

Training...
  Batch    40  of    107.    Elapsed: 0:01:09.
  Batch    80  of    107.    Elapsed: 0:02:17.

  Average training loss: 0.45
  Training epoch took: 0:03:03

Running Validation...
  Accuracy: 0.81
f1-score: 0.81, precision: 0.81, recall: 0.81
  Validation Loss: 0.46
  Validation took: 0:00:12

Training...
  Batch    40  of    107.    Elapsed: 0:01:09.
  Batch    80  of    107.    Elapsed: 0:02:18.

  Average training loss: 0.35
  Training epoch took: 0:03:03

Running Validation...
  Accuracy: 0.80
f1-score: 0.8, precision: 0.8, recall: 0.8
  Validation Loss: 0.48
  Validation took: 0:00:12

Training complete!
Total training took 0:09:46 (h:mm:ss)


In [13]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/trac2/test.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng(pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_trac2_trained.pth",
                          num_labels=3)
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla K80


  0%|          | 0/1066 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 1066/1066 [00:00<00:00, 1345.13it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on anothe

accuracy: 0.8142589118198874, f1-score: 0.5586655082660529, precision: 0.6197413498029612, recall: 0.5388265774629412


(0.8142589118198874,
 0.5586655082660529,
 0.6197413498029612,
 0.5388265774629412)

## Testing on Slovenian translated data


In [15]:
# Testing
max_length = 128

df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/slo-twitter-dataset-translated.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

# Get the model and move it to GPU
model = get_bert_eng(pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_toxic_comment_trained.pth",
                          num_labels=4)
model.cuda()

# Run test
test(test_dataloader, model, score_average="macro")

GPU: Tesla K80


  0%|          | 0/18459 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 18459/18459 [00:14<00:00, 1294.83it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on ano

accuracy: 0.3891326724091229, f1-score: 0.23459841879788543, precision: 0.31249486479113586, recall: 0.32971637562382733


(0.3891326724091229,
 0.23459841879788543,
 0.31249486479113586,
 0.32971637562382733)

In [7]:
# Preparation for testing
max_length = 128
batch_size = 32
df = pd.read_csv("/content/drive/MyDrive/offensive_language_classification/final_datasets/slo-twitter-dataset-translated.csv")# (17870 false, 146001 true)
comments = df["content"].values
labels = df["type"].values


# Get Pytorch device
device = get_torch_device()

# Prepare the training and validation data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# test_dataloader = prepare_test_dataloader(comments, labels, tokenizer, batch_size, max_length, device)

labels_binary = np.minimum(labels, 1)
test_dataloader_binary = prepare_test_dataloader(comments, labels_binary, tokenizer, batch_size, max_length, device)

GPU: Tesla K80


  0%|          | 0/18459 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 18459/18459 [00:14<00:00, 1284.27it/s]


In [10]:


# Get the model and move it to GPU
model = get_bert_eng(pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_gab_trained.pth",
                          num_labels=2)
model.cuda()

# Run test
test(test_dataloader_binary, model, score_average="binary")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

accuracy: 0.4991603012080828, f1-score: 0.039480519480519484, precision: 0.7692307692307693, recall: 0.020260183407976116


(0.4991603012080828,
 0.039480519480519484,
 0.7692307692307693,
 0.020260183407976116)

In [12]:


# Get the model and move it to GPU
model = get_bert_eng(pretrained_weights_path="/content/drive/MyDrive/offensive_language_classification/final_models/bert_eng_reddit_trained.pth",
                          num_labels=2)
model.cuda()

# Run test
test(test_dataloader_binary, model, score_average="binary")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

accuracy: 0.49715585893060293, f1-score: 0.026431718061674006, precision: 0.8076923076923077, recall: 0.013435700575815739


(0.49715585893060293,
 0.026431718061674006,
 0.8076923076923077,
 0.013435700575815739)