In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda', index=0)

In [None]:
# Load the train and test splits from CSV files in the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train_en_dataset.csv', 'test_en_dataset.csv')
)

In [None]:
class TweetDataset(Dataset):
  """Map-style dataset serving (tweet, value) pairs from a tabular object.

  The wrapped object must support ``len()`` and ``.iloc[...]`` indexing, and
  each selection must expose a 'tweet' entry and a 'value' entry.
  """

  def __init__(self, data):
    # Only a reference is stored; rows are looked up lazily on access.
    self.data = data

  def __len__(self):
    """Return the number of rows in the wrapped data."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return ``(tweet, value)`` for the position(s) given by ``idx``.

    ``idx`` is forwarded unchanged to ``.iloc``, so whatever ``.iloc``
    accepts (an integer position, a slice, ...) is accepted here as well.
    """
    selected = self.data.iloc[idx]
    return selected['tweet'], selected['value']

In [None]:
# Wrap each split so it can be consumed by a DataLoader.
train_dataset, test_dataset = TweetDataset(train_df), TweetDataset(test_df)

In [None]:
# Peek at the first five training rows; slicing yields a (tweets, values) pair.
train_dataset[:5]

(0    “mansplaining” is literally just how intellige...
 1    if you don’t want me but your friend do, dont ...
 2    @username @username @username @username isn't ...
 3    @username's account is temporarily unavailable...
 4    @username if it wasn't for the gender biases o...
 Name: tweet, dtype: object,
 0    1.0
 1    1.0
 2    1.0
 3    0.0
 4    1.0
 Name: value, dtype: float64)

In [None]:
# Training batches are reshuffled; test batches keep their original order.
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Semantic Detector model class building
class SemanticDetector(nn.Module):
    """Text classifier: a frozen ``bert-base-uncased`` encoder plus a trainable MLP head.

    ``forward`` takes raw strings, tokenizes them internally and returns
    sigmoid-activated scores in [0, 1]. These are probabilities rather than
    raw logits, which is the kind of input ``nn.BCELoss`` works on.

    Args:
        padding: Padding strategy forwarded to the tokenizer. The default
            ``'max_length'`` pads every text to ``MAX_LENGTH`` tokens.
        num_classes: Number of output units of the final linear layer.
    """

    # Maximum number of tokens kept per text; longer texts are truncated.
    MAX_LENGTH = 256

    def __init__(self, padding='max_length', num_classes=1):
        super().__init__()
        self.berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.padding = padding

        # fully connected layers for the pooler output
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(256, num_classes),
            nn.Sigmoid()
        )

        # set the bert parameters as non-trainable
        # NOTE(review): freezing the weights does not disable the encoder's
        # dropout layers; they still follow this module's train/eval mode.
        # Consider keeping self.bert in eval mode if deterministic features
        # are wanted during training.
        for param in self.bert.parameters():
            param.requires_grad = False

    def tokenize(self, texts):
        """Tokenize ``texts`` and return ``(input_ids, attention_mask)``.

        Both tensors are placed on the device that currently holds the
        classifier weights, so the model keeps working after ``.to(...)``
        without relying on a module-level ``device`` variable.
        """
        encoding = self.berttokenizer(
            texts,
            add_special_tokens=True,
            padding=self.padding,
            truncation=True,
            max_length=self.MAX_LENGTH,
            return_tensors="pt"
        )

        # Follow the model's own placement instead of a notebook global.
        target_device = next(self.classifier.parameters()).device
        input_ids = encoding['input_ids'].to(target_device)
        attention_mask = encoding['attention_mask'].to(target_device)

        return input_ids, attention_mask

    def forward(self, texts):
        """Return sigmoid scores of shape (batch, num_classes) for ``texts``."""
        input_ids, attention_mask = self.tokenize(texts)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # only the pooler output (derived from the special [CLS] token) is used,
        # so that only sentence-level semantic information is considered
        cls_token = outputs.pooler_output
        probabilities = self.classifier(cls_token)

        return probabilities

In [None]:
# train function
def train(model, train_loader, test_loader, optimizer,
          scheduler,
          epochs, device, criterion=nn.BCELoss()):
    """Train ``model`` for ``epochs`` epochs, checkpointing the most accurate one.

    After every epoch the model is scored with ``evaluate`` on
    ``test_loader``; whenever the accuracy beats the best seen so far, the
    whole model object is pickled to ``best_model.pth``.

    Args:
        model: Module called as ``model(texts)``; its output is squeezed on
            dim 1 before being passed to ``criterion``.
        train_loader: Iterable of ``(texts, labels)`` training batches.
        test_loader: Iterable of ``(texts, labels)`` batches used for the
            per-epoch evaluation and for model selection.
            NOTE(review): selecting the checkpoint on this loader means its
            reported accuracy is optimistically biased; a separate
            validation split would avoid that.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        epochs: Number of passes over ``train_loader``.
        device: Device the labels are moved to before the loss is computed.
        criterion: Loss applied to ``(outputs, labels)``.
    """
    best_acc = 0

    for epoch in range(epochs):
        # Enter training mode at the start of every epoch rather than once up
        # front, so dropout stays active even if the per-epoch evaluation
        # below changed the module's mode.
        model.train()
        total_loss = 0

        # training loop
        for (texts, labels) in tqdm(train_loader):
            labels = labels.to(torch.float32).to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            outputs = outputs.squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        # evaluate the model on the evaluation set after each epoch
        acc, f1 = evaluate(model, test_loader, device)
        print(f"Test Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")

        # if current acc is greater than previous best acc, save a new best model
        if acc > best_acc:
            best_acc = acc
            print(f"New best model found with accuracy: {best_acc:.4f}, saving the model...")
            torch.save(model, "best_model.pth")

        # advance the scheduler once per epoch; when and by how much the
        # learning rate changes is determined by the scheduler's own settings
        scheduler.step()

    print("Training complete!")

In [None]:
# evaluate model
def evaluate(model, dataloader, device, threshold=0.5):
    """Compute accuracy and F1 of ``model`` over ``dataloader``.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this in the
    middle of training does not silently disable dropout for later epochs.

    Args:
        model: Module called as ``model(texts)``; its output is squeezed on
            dim 1 and compared against ``threshold``.
        dataloader: Iterable of ``(texts, labels)`` batches.
        device: Not used by the computation; kept so existing callers that
            pass it keep working.
        threshold: Scores strictly greater than this are predicted as 1.

    Returns:
        Tuple ``(accuracy, f1)``. Both values are also printed.
    """
    was_training = model.training
    # turn to evaluation mode
    model.eval()
    all_preds = []
    all_labels = []

    try:
        with torch.no_grad():
            for (texts, labels) in tqdm(dataloader):
                scores = model(texts)
                scores = scores.squeeze(1)
                preds = (scores > threshold).int()

                all_preds.extend(preds.cpu().numpy())
                # labels are only needed on the host for the metrics below
                all_labels.extend(labels.cpu().numpy())
    finally:
        # hand the model back in the mode the caller had it in
        model.train(was_training)

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, f1

In [None]:
# Build the detector and place it on the selected device; show its layout.
model = SemanticDetector().to(device)
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

SemanticDetector(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [None]:
# Adam over every model parameter, with a fixed initial learning rate.
LEARNING_RATE = 0.001
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Multiply the learning rate by 0.1 after every 20 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=20,
    gamma=0.1,
)

In [None]:
# Number of full passes over the training loader.
epochs = 50

In [None]:
# Run the full training loop; the best checkpoint is written to disk as it goes.
train(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=epochs,
    device=device,
)

100%|██████████| 166/166 [00:36<00:00,  4.59it/s]


Epoch 1/50, Loss: 0.6863


100%|██████████| 42/42 [00:08<00:00,  4.85it/s]


Accuracy: 0.5732
F1 Score: 0.0070
Test Accuracy: 0.5732, F1 Score: 0.0070
New best model found with accuracy: 0.5732, saving the model...


100%|██████████| 166/166 [00:35<00:00,  4.70it/s]


Epoch 2/50, Loss: 0.6750


100%|██████████| 42/42 [00:08<00:00,  4.68it/s]


Accuracy: 0.5747
F1 Score: 0.0070
Test Accuracy: 0.5747, F1 Score: 0.0070
New best model found with accuracy: 0.5747, saving the model...


100%|██████████| 166/166 [00:36<00:00,  4.59it/s]


Epoch 3/50, Loss: 0.6751


100%|██████████| 42/42 [00:09<00:00,  4.58it/s]


Accuracy: 0.5852
F1 Score: 0.2931
Test Accuracy: 0.5852, F1 Score: 0.2931
New best model found with accuracy: 0.5852, saving the model...


100%|██████████| 166/166 [00:38<00:00,  4.32it/s]


Epoch 4/50, Loss: 0.6692


100%|██████████| 42/42 [00:10<00:00,  4.02it/s]


Accuracy: 0.5777
F1 Score: 0.0604
Test Accuracy: 0.5777, F1 Score: 0.0604


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 5/50, Loss: 0.6544


100%|██████████| 42/42 [00:09<00:00,  4.32it/s]


Accuracy: 0.6410
F1 Score: 0.5882
Test Accuracy: 0.6410, F1 Score: 0.5882
New best model found with accuracy: 0.6410, saving the model...


100%|██████████| 166/166 [00:39<00:00,  4.18it/s]


Epoch 6/50, Loss: 0.6369


100%|██████████| 42/42 [00:09<00:00,  4.24it/s]


Accuracy: 0.5807
F1 Score: 0.0795
Test Accuracy: 0.5807, F1 Score: 0.0795


100%|██████████| 166/166 [00:39<00:00,  4.23it/s]


Epoch 7/50, Loss: 0.5953


100%|██████████| 42/42 [00:09<00:00,  4.22it/s]


Accuracy: 0.6350
F1 Score: 0.3632
Test Accuracy: 0.6350, F1 Score: 0.3632


100%|██████████| 166/166 [00:39<00:00,  4.18it/s]


Epoch 8/50, Loss: 0.5886


100%|██████████| 42/42 [00:09<00:00,  4.27it/s]


Accuracy: 0.5973
F1 Score: 0.1577
Test Accuracy: 0.5973, F1 Score: 0.1577


100%|██████████| 166/166 [00:39<00:00,  4.19it/s]


Epoch 9/50, Loss: 0.5697


100%|██████████| 42/42 [00:09<00:00,  4.23it/s]


Accuracy: 0.6953
F1 Score: 0.5409
Test Accuracy: 0.6953, F1 Score: 0.5409
New best model found with accuracy: 0.6953, saving the model...


100%|██████████| 166/166 [00:39<00:00,  4.26it/s]


Epoch 10/50, Loss: 0.5619


100%|██████████| 42/42 [00:09<00:00,  4.26it/s]


Accuracy: 0.6938
F1 Score: 0.5634
Test Accuracy: 0.6938, F1 Score: 0.5634


100%|██████████| 166/166 [00:40<00:00,  4.13it/s]


Epoch 11/50, Loss: 0.5499


100%|██████████| 42/42 [00:10<00:00,  4.18it/s]


Accuracy: 0.7164
F1 Score: 0.6856
Test Accuracy: 0.7164, F1 Score: 0.6856
New best model found with accuracy: 0.7164, saving the model...


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 12/50, Loss: 0.5392


100%|██████████| 42/42 [00:10<00:00,  4.15it/s]


Accuracy: 0.7044
F1 Score: 0.5644
Test Accuracy: 0.7044, F1 Score: 0.5644


100%|██████████| 166/166 [00:39<00:00,  4.18it/s]


Epoch 13/50, Loss: 0.5347


100%|██████████| 42/42 [00:09<00:00,  4.25it/s]


Accuracy: 0.5928
F1 Score: 0.1118
Test Accuracy: 0.5928, F1 Score: 0.1118


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 14/50, Loss: 0.5358


100%|██████████| 42/42 [00:09<00:00,  4.23it/s]


Accuracy: 0.7255
F1 Score: 0.6553
Test Accuracy: 0.7255, F1 Score: 0.6553
New best model found with accuracy: 0.7255, saving the model...


100%|██████████| 166/166 [00:39<00:00,  4.23it/s]


Epoch 15/50, Loss: 0.5125


100%|██████████| 42/42 [00:09<00:00,  4.27it/s]


Accuracy: 0.7059
F1 Score: 0.6012
Test Accuracy: 0.7059, F1 Score: 0.6012


100%|██████████| 166/166 [00:39<00:00,  4.21it/s]


Epoch 16/50, Loss: 0.5298


100%|██████████| 42/42 [00:09<00:00,  4.33it/s]


Accuracy: 0.7210
F1 Score: 0.6691
Test Accuracy: 0.7210, F1 Score: 0.6691


100%|██████████| 166/166 [00:39<00:00,  4.21it/s]


Epoch 17/50, Loss: 0.5127


100%|██████████| 42/42 [00:09<00:00,  4.21it/s]


Accuracy: 0.7225
F1 Score: 0.7187
Test Accuracy: 0.7225, F1 Score: 0.7187


100%|██████████| 166/166 [00:40<00:00,  4.13it/s]


Epoch 18/50, Loss: 0.5091


100%|██████████| 42/42 [00:10<00:00,  4.18it/s]


Accuracy: 0.6456
F1 Score: 0.3631
Test Accuracy: 0.6456, F1 Score: 0.3631


100%|██████████| 166/166 [00:39<00:00,  4.15it/s]


Epoch 19/50, Loss: 0.5004


100%|██████████| 42/42 [00:09<00:00,  4.22it/s]


Accuracy: 0.7119
F1 Score: 0.5945
Test Accuracy: 0.7119, F1 Score: 0.5945


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 20/50, Loss: 0.5074


100%|██████████| 42/42 [00:10<00:00,  4.18it/s]


Accuracy: 0.7285
F1 Score: 0.7273
Test Accuracy: 0.7285, F1 Score: 0.7273
New best model found with accuracy: 0.7285, saving the model...


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 21/50, Loss: 0.4836


100%|██████████| 42/42 [00:10<00:00,  4.18it/s]


Accuracy: 0.7330
F1 Score: 0.6834
Test Accuracy: 0.7330, F1 Score: 0.6834
New best model found with accuracy: 0.7330, saving the model...


100%|██████████| 166/166 [00:39<00:00,  4.21it/s]


Epoch 22/50, Loss: 0.4747


100%|██████████| 42/42 [00:09<00:00,  4.31it/s]


Accuracy: 0.7270
F1 Score: 0.6578
Test Accuracy: 0.7270, F1 Score: 0.6578


100%|██████████| 166/166 [00:39<00:00,  4.16it/s]


Epoch 23/50, Loss: 0.4728


100%|██████████| 42/42 [00:09<00:00,  4.24it/s]


Accuracy: 0.7391
F1 Score: 0.6849
Test Accuracy: 0.7391, F1 Score: 0.6849
New best model found with accuracy: 0.7391, saving the model...


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 24/50, Loss: 0.4712


100%|██████████| 42/42 [00:10<00:00,  4.15it/s]


Accuracy: 0.7360
F1 Score: 0.6789
Test Accuracy: 0.7360, F1 Score: 0.6789


100%|██████████| 166/166 [00:39<00:00,  4.16it/s]


Epoch 25/50, Loss: 0.4709


100%|██████████| 42/42 [00:09<00:00,  4.22it/s]


Accuracy: 0.7240
F1 Score: 0.6376
Test Accuracy: 0.7240, F1 Score: 0.6376


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 26/50, Loss: 0.4687


100%|██████████| 42/42 [00:10<00:00,  4.20it/s]


Accuracy: 0.7255
F1 Score: 0.6540
Test Accuracy: 0.7255, F1 Score: 0.6540


100%|██████████| 166/166 [00:40<00:00,  4.15it/s]


Epoch 27/50, Loss: 0.4711


100%|██████████| 42/42 [00:09<00:00,  4.25it/s]


Accuracy: 0.7195
F1 Score: 0.6464
Test Accuracy: 0.7195, F1 Score: 0.6464


100%|██████████| 166/166 [00:40<00:00,  4.13it/s]


Epoch 28/50, Loss: 0.4687


100%|██████████| 42/42 [00:09<00:00,  4.21it/s]


Accuracy: 0.7240
F1 Score: 0.6391
Test Accuracy: 0.7240, F1 Score: 0.6391


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 29/50, Loss: 0.4681


100%|██████████| 42/42 [00:10<00:00,  4.18it/s]


Accuracy: 0.7255
F1 Score: 0.6486
Test Accuracy: 0.7255, F1 Score: 0.6486


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 30/50, Loss: 0.4652


100%|██████████| 42/42 [00:10<00:00,  4.19it/s]


Accuracy: 0.7285
F1 Score: 0.6471
Test Accuracy: 0.7285, F1 Score: 0.6471


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 31/50, Loss: 0.4666


100%|██████████| 42/42 [00:09<00:00,  4.21it/s]


Accuracy: 0.7179
F1 Score: 0.6191
Test Accuracy: 0.7179, F1 Score: 0.6191


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 32/50, Loss: 0.4652


100%|██████████| 42/42 [00:09<00:00,  4.20it/s]


Accuracy: 0.7481
F1 Score: 0.7065
Test Accuracy: 0.7481, F1 Score: 0.7065
New best model found with accuracy: 0.7481, saving the model...


100%|██████████| 166/166 [00:40<00:00,  4.09it/s]


Epoch 33/50, Loss: 0.4645


100%|██████████| 42/42 [00:10<00:00,  4.19it/s]


Accuracy: 0.7496
F1 Score: 0.7067
Test Accuracy: 0.7496, F1 Score: 0.7067
New best model found with accuracy: 0.7496, saving the model...


100%|██████████| 166/166 [00:40<00:00,  4.15it/s]


Epoch 34/50, Loss: 0.4643


100%|██████████| 42/42 [00:10<00:00,  4.15it/s]


Accuracy: 0.7270
F1 Score: 0.6654
Test Accuracy: 0.7270, F1 Score: 0.6654


100%|██████████| 166/166 [00:39<00:00,  4.17it/s]


Epoch 35/50, Loss: 0.4635


100%|██████████| 42/42 [00:09<00:00,  4.27it/s]


Accuracy: 0.7195
F1 Score: 0.6464
Test Accuracy: 0.7195, F1 Score: 0.6464


100%|██████████| 166/166 [00:40<00:00,  4.11it/s]


Epoch 36/50, Loss: 0.4626


100%|██████████| 42/42 [00:09<00:00,  4.25it/s]


Accuracy: 0.7270
F1 Score: 0.6604
Test Accuracy: 0.7270, F1 Score: 0.6604


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 37/50, Loss: 0.4633


100%|██████████| 42/42 [00:09<00:00,  4.22it/s]


Accuracy: 0.7300
F1 Score: 0.6455
Test Accuracy: 0.7300, F1 Score: 0.6455


100%|██████████| 166/166 [00:40<00:00,  4.15it/s]


Epoch 38/50, Loss: 0.4609


100%|██████████| 42/42 [00:10<00:00,  4.19it/s]


Accuracy: 0.7300
F1 Score: 0.6483
Test Accuracy: 0.7300, F1 Score: 0.6483


100%|██████████| 166/166 [00:40<00:00,  4.13it/s]


Epoch 39/50, Loss: 0.4626


100%|██████████| 42/42 [00:10<00:00,  4.18it/s]


Accuracy: 0.7345
F1 Score: 0.6741
Test Accuracy: 0.7345, F1 Score: 0.6741


100%|██████████| 166/166 [00:40<00:00,  4.13it/s]


Epoch 40/50, Loss: 0.4616


100%|██████████| 42/42 [00:10<00:00,  4.19it/s]


Accuracy: 0.7436
F1 Score: 0.7007
Test Accuracy: 0.7436, F1 Score: 0.7007


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 41/50, Loss: 0.4582


100%|██████████| 42/42 [00:10<00:00,  4.19it/s]


Accuracy: 0.7300
F1 Score: 0.6616
Test Accuracy: 0.7300, F1 Score: 0.6616


100%|██████████| 166/166 [00:40<00:00,  4.13it/s]


Epoch 42/50, Loss: 0.4566


100%|██████████| 42/42 [00:10<00:00,  4.19it/s]


Accuracy: 0.7391
F1 Score: 0.6860
Test Accuracy: 0.7391, F1 Score: 0.6860


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 43/50, Loss: 0.4561


100%|██████████| 42/42 [00:09<00:00,  4.25it/s]


Accuracy: 0.7315
F1 Score: 0.6654
Test Accuracy: 0.7315, F1 Score: 0.6654


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 44/50, Loss: 0.4564


100%|██████████| 42/42 [00:09<00:00,  4.22it/s]


Accuracy: 0.7345
F1 Score: 0.6729
Test Accuracy: 0.7345, F1 Score: 0.6729


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 45/50, Loss: 0.4561


100%|██████████| 42/42 [00:10<00:00,  4.17it/s]


Accuracy: 0.7300
F1 Score: 0.6590
Test Accuracy: 0.7300, F1 Score: 0.6590


100%|██████████| 166/166 [00:40<00:00,  4.12it/s]


Epoch 46/50, Loss: 0.4568


100%|██████████| 42/42 [00:09<00:00,  4.21it/s]


Accuracy: 0.7315
F1 Score: 0.6629
Test Accuracy: 0.7315, F1 Score: 0.6629


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 47/50, Loss: 0.4563


100%|██████████| 42/42 [00:10<00:00,  4.19it/s]


Accuracy: 0.7345
F1 Score: 0.6729
Test Accuracy: 0.7345, F1 Score: 0.6729


100%|██████████| 166/166 [00:40<00:00,  4.11it/s]


Epoch 48/50, Loss: 0.4558


100%|██████████| 42/42 [00:10<00:00,  4.17it/s]


Accuracy: 0.7315
F1 Score: 0.6642
Test Accuracy: 0.7315, F1 Score: 0.6642


100%|██████████| 166/166 [00:39<00:00,  4.15it/s]


Epoch 49/50, Loss: 0.4567


100%|██████████| 42/42 [00:09<00:00,  4.21it/s]


Accuracy: 0.7345
F1 Score: 0.6729
Test Accuracy: 0.7345, F1 Score: 0.6729


100%|██████████| 166/166 [00:40<00:00,  4.14it/s]


Epoch 50/50, Loss: 0.4564


100%|██████████| 42/42 [00:10<00:00,  4.20it/s]

Accuracy: 0.7345
F1 Score: 0.6704
Test Accuracy: 0.7345, F1 Score: 0.6704
Training complete!





In [None]:
# load the best model
# The checkpoint is a fully pickled nn.Module (written with torch.save(model, ...)),
# so it must be loaded with weights_only=False; recent PyTorch versions reject it
# under their weights_only=True default. map_location lets a checkpoint saved on
# a GPU be restored on whatever device is in use now.
# SECURITY: unpickling executes arbitrary code - only load checkpoints you trust.
sem = torch.load('best_model.pth', map_location=device, weights_only=False).to(device)

  sem = torch.load('best_model.pth').to(device)


In [None]:
# Score the restored best checkpoint on the test loader.
evaluate(model=sem, dataloader=test_loader, device=device)

100%|██████████| 42/42 [00:09<00:00,  4.25it/s]

Accuracy: 0.7496
F1 Score: 0.7067





(0.7496229260935143, 0.7067137809187279)