In [None]:
import os
import pandas as pd
import numpy as np
import torch
import json
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

In [None]:
# Load the Chinese train/test splits from their CSV files (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_zh_dataset.csv', 'test_zh_dataset.csv')
)

In [None]:
class TweetDataset(Dataset):
  """Map-style dataset exposing (text, label) pairs from a DataFrame.

  Items are read from the 'comment_text' and 'label' columns of the wrapped
  frame using positional (``iloc``) indexing.
  """

  def __init__(self, data):
    # Keep a reference to the frame; nothing is copied or tokenized here.
    self.data = data

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    n_rows = len(self.data)
    return n_rows

  def __getitem__(self, idx):
    """Return ``(comment_text, label)`` for the positional index ``idx``.

    ``idx`` may also be a slice, in which case both elements are Series.
    """
    selected = self.data.iloc[idx]
    return selected['comment_text'], selected['label']

In [None]:
# Wrap each split's DataFrame in a TweetDataset.
train_dataset, test_dataset = TweetDataset(train_df), TweetDataset(test_df)

In [None]:
# preview the first five (text, label) samples of the training split
train_dataset[:5]

(0         其实我觉得也不能太偏激了吧。我们男性不说不代表我们不知道对错，只是不喜欢去评论这些事情。
 1                       不完全统计，十三个伏地魔相关博主被炸号，其中包括一位维权素人
 2    只是从图二里表达出来的是那些发达国家，我也没有不尊重其他国家，只是觉得一味地崇洋媚外，甚至说...
 3             其他的不说 对待舆论的态度真的圈粉 不卑不亢 掷地有声:green_heart:
 4    男人也吃男人，也有男吃女女吃男，怎么就毫无存在感了？单独拿出来说女吃女，仿佛是为了证明女性的...
 Name: comment_text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    1
 Name: label, dtype: int64)

In [None]:
# Mini-batches of 16: the training split is shuffled, the test split keeps its order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

In [None]:
# Semantic detector: frozen Chinese BERT encoder + trainable MLP head
class SemanticDetector(nn.Module):
    """Binary text classifier built on a frozen ``bert-base-chinese`` encoder.

    Raw strings are tokenized inside ``forward``, encoded by BERT, and the
    pooled [CLS] representation is passed through a fully connected head that
    ends in a sigmoid. The model therefore outputs probabilities in [0, 1]
    (not raw logits), which is the input ``nn.BCELoss`` expects.

    Args:
        padding: Padding strategy forwarded to the tokenizer.
        num_classes: Width of the final linear layer.
    """

    # Maximum number of tokens kept per text; longer inputs are truncated.
    MAX_LENGTH = 256

    def __init__(self, padding='max_length', num_classes=1):
        super().__init__()
        self.berttokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.padding = padding

        # fully connected layers applied to the pooled [CLS] representation
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(256, num_classes),
            nn.Sigmoid()
        )

        # set bert parameters as non-trainable
        for param in self.bert.parameters():
            param.requires_grad = False

    def train(self, mode=True):
        """Switch the head's training mode while keeping the encoder in eval mode.

        The encoder's weights are frozen, so leaving its dropout layers active
        would only add noise to the features the head is trained on and make
        training-time features differ from evaluation-time features.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            The module itself, matching ``nn.Module.train``.
        """
        super().train(mode)
        self.bert.eval()
        return self

    def tokenize(self, texts):
        """Tokenize ``texts`` and place the tensors on the model's device.

        Args:
            texts: A string or a batch of strings accepted by the tokenizer.

        Returns:
            Tuple ``(input_ids, attention_mask)`` of PyTorch tensors.
        """
        encoding = self.berttokenizer(
            texts,
            add_special_tokens=True,
            padding=self.padding,
            truncation=True,
            max_length=self.MAX_LENGTH,
            return_tensors="pt"
        )
        # Follow the module's own parameters so inputs always land on the same
        # device as the weights, wherever the model has been moved with .to().
        target_device = next(self.parameters()).device
        input_ids = encoding['input_ids'].to(target_device)
        attention_mask = encoding['attention_mask'].to(target_device)

        return input_ids, attention_mask

    def forward(self, texts):
        """Return the sigmoid output of the classifier head for raw ``texts``.

        Args:
            texts: A string or a batch of strings.

        Returns:
            Tensor of probabilities with ``num_classes`` values per input text.
        """
        input_ids, attention_mask = self.tokenize(texts)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # only the special [CLS] token (pooler output) is used, so that only
        # sentence-level semantic information is considered
        cls_token = outputs.pooler_output
        probs = self.classifier(cls_token)

        return probs

In [None]:
# train function
def train(model, train_loader, test_loader, optimizer,
          scheduler,
          epochs, device, criterion=nn.BCELoss(), save_path="best_model.pth"):
    """Train ``model`` and checkpoint it whenever evaluation accuracy improves.

    Args:
        model: Module called as ``model(texts)``; its output is squeezed on
            dim 1 and passed to ``criterion`` together with float labels.
        train_loader: Iterable of ``(texts, labels)`` batches used for training.
        test_loader: Iterable passed to ``evaluate`` after every epoch.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        epochs: Number of passes over ``train_loader``.
        device: Device the labels are moved to before computing the loss.
        criterion: Loss function; defaults to binary cross-entropy on
            probabilities.
        save_path: File the best model is written to.

    Returns:
        None. Progress is printed and the best model is saved to ``save_path``.
    """
    best_acc = 0

    for epoch in range(epochs):
        # evaluate() puts the model in eval mode, so training mode must be
        # re-enabled at the start of every epoch; otherwise dropout is only
        # active during the first epoch.
        model.train()
        total_loss = 0

        # Training loop
        for (texts, labels) in tqdm(train_loader):
            labels = labels.to(device=device, dtype=torch.float32)
            optimizer.zero_grad()
            probs = model(texts).squeeze(1)
            loss = criterion(probs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError when the loader is empty
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        # evaluate the model on the evaluation set after each epoch
        # NOTE(review): the checkpoint is selected on test_loader, so the best
        # accuracy reported here is optimistic; a held-out validation split
        # would avoid that.
        acc, f1 = evaluate(model, test_loader, device)
        print(f"Test Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")

        # if current acc is greater than previous best acc, save a new best model
        if acc > best_acc:
            best_acc = acc
            print(f"New best model found with accuracy: {best_acc:.4f}, saving the model...")
            # The whole module is pickled (not just its state_dict), so the
            # model's class definition must be available when loading it.
            torch.save(model, save_path)

        # apply scheduler to adjust the learning rate
        scheduler.step()

    print("Training complete!")

In [None]:
# evaluate model
def evaluate(model, dataloader, device, threshold=0.5):
    """Compute accuracy and F1 of ``model`` over ``dataloader``.

    The model is switched to eval mode for the duration of the call and its
    previous training/eval mode is restored afterwards, so calling this in
    the middle of training does not silently disable dropout for later epochs.

    Args:
        model: Module called as ``model(texts)``; its output is squeezed on
            dim 1 and compared against ``threshold``.
        dataloader: Iterable of ``(texts, labels)`` batches.
        device: Unused; kept so existing callers keep working. Predictions and
            labels are gathered on the CPU for scoring.
        threshold: Outputs strictly greater than this value are predicted as 1.

    Returns:
        Tuple ``(accuracy, f1)``.
    """
    was_training = model.training
    model.eval()
    all_preds = []
    all_labels = []

    try:
        with torch.no_grad():
            for (texts, labels) in tqdm(dataloader):
                probs = model(texts).squeeze(1)
                preds = (probs > threshold).int()

                all_preds.extend(preds.cpu().numpy())
                # labels are only needed for scoring, so they go straight to
                # the CPU instead of being moved to the model's device first
                all_labels.extend(labels.cpu().numpy())
    finally:
        # restore whichever mode the caller had set
        model.train(was_training)

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, f1

In [None]:
# Build the detector, move it to the selected device, and display its layout.
model = SemanticDetector().to(device)
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

SemanticDetector(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [None]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Multiply the learning rate by 0.1 after every 20 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, gamma=0.1, step_size=20)

In [None]:
# number of training epochs passed to train()
epochs = 50

In [None]:
# Run the full training loop with the objects configured in the cells above.
train(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=epochs,
    device=device,
)

100%|██████████| 449/449 [00:50<00:00,  8.89it/s]


Epoch 1/50, Loss: 0.6169


100%|██████████| 113/113 [00:12<00:00,  9.09it/s]


Accuracy: 0.7007
F1 Score: 0.3124
Test Accuracy: 0.7007, F1 Score: 0.3124
New best model found with accuracy: 0.7007, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.28it/s]


Epoch 2/50, Loss: 0.5231


100%|██████████| 113/113 [00:12<00:00,  9.38it/s]


Accuracy: 0.7441
F1 Score: 0.6383
Test Accuracy: 0.7441, F1 Score: 0.6383
New best model found with accuracy: 0.7441, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.20it/s]


Epoch 3/50, Loss: 0.5007


100%|██████████| 113/113 [00:12<00:00,  9.31it/s]


Accuracy: 0.7631
F1 Score: 0.6339
Test Accuracy: 0.7631, F1 Score: 0.6339
New best model found with accuracy: 0.7631, saving the model...


100%|██████████| 449/449 [00:47<00:00,  9.37it/s]


Epoch 4/50, Loss: 0.4972


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7464
F1 Score: 0.5343
Test Accuracy: 0.7464, F1 Score: 0.5343


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 5/50, Loss: 0.4816


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7051
F1 Score: 0.6572
Test Accuracy: 0.7051, F1 Score: 0.6572


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 6/50, Loss: 0.4770


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7625
F1 Score: 0.6553
Test Accuracy: 0.7625, F1 Score: 0.6553


100%|██████████| 449/449 [00:48<00:00,  9.23it/s]


Epoch 7/50, Loss: 0.4666


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]


Accuracy: 0.7676
F1 Score: 0.6551
Test Accuracy: 0.7676, F1 Score: 0.6551
New best model found with accuracy: 0.7676, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 8/50, Loss: 0.4744


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7664
F1 Score: 0.6428
Test Accuracy: 0.7664, F1 Score: 0.6428


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 9/50, Loss: 0.4672


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7564
F1 Score: 0.5695
Test Accuracy: 0.7564, F1 Score: 0.5695


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 10/50, Loss: 0.4649


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7430
F1 Score: 0.4928
Test Accuracy: 0.7430, F1 Score: 0.4928


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 11/50, Loss: 0.4574


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]


Accuracy: 0.7620
F1 Score: 0.6772
Test Accuracy: 0.7620, F1 Score: 0.6772


100%|██████████| 449/449 [00:48<00:00,  9.23it/s]


Epoch 12/50, Loss: 0.4647


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.6767
F1 Score: 0.6523
Test Accuracy: 0.6767, F1 Score: 0.6523


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 13/50, Loss: 0.4552


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7664
F1 Score: 0.6596
Test Accuracy: 0.7664, F1 Score: 0.6596


100%|██████████| 449/449 [00:48<00:00,  9.23it/s]


Epoch 14/50, Loss: 0.4535


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7614
F1 Score: 0.6682
Test Accuracy: 0.7614, F1 Score: 0.6682


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 15/50, Loss: 0.4500


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7709
F1 Score: 0.6566
Test Accuracy: 0.7709, F1 Score: 0.6566
New best model found with accuracy: 0.7709, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 16/50, Loss: 0.4570


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7648
F1 Score: 0.5958
Test Accuracy: 0.7648, F1 Score: 0.5958


100%|██████████| 449/449 [00:48<00:00,  9.27it/s]


Epoch 17/50, Loss: 0.4558


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7709
F1 Score: 0.6155
Test Accuracy: 0.7709, F1 Score: 0.6155


100%|██████████| 449/449 [00:48<00:00,  9.22it/s]


Epoch 18/50, Loss: 0.4465


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7642
F1 Score: 0.6661
Test Accuracy: 0.7642, F1 Score: 0.6661


100%|██████████| 449/449 [00:48<00:00,  9.27it/s]


Epoch 19/50, Loss: 0.4504


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]


Accuracy: 0.7698
F1 Score: 0.6584
Test Accuracy: 0.7698, F1 Score: 0.6584


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 20/50, Loss: 0.4517


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7715
F1 Score: 0.6279
Test Accuracy: 0.7715, F1 Score: 0.6279
New best model found with accuracy: 0.7715, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 21/50, Loss: 0.4303


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7726
F1 Score: 0.6501
Test Accuracy: 0.7726, F1 Score: 0.6501
New best model found with accuracy: 0.7726, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 22/50, Loss: 0.4289


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7720
F1 Score: 0.6628
Test Accuracy: 0.7720, F1 Score: 0.6628


100%|██████████| 449/449 [00:48<00:00,  9.23it/s]


Epoch 23/50, Loss: 0.4283


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7648
F1 Score: 0.6558
Test Accuracy: 0.7648, F1 Score: 0.6558


100%|██████████| 449/449 [00:48<00:00,  9.23it/s]


Epoch 24/50, Loss: 0.4281


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7648
F1 Score: 0.6575
Test Accuracy: 0.7648, F1 Score: 0.6575


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 25/50, Loss: 0.4274


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7737
F1 Score: 0.6617
Test Accuracy: 0.7737, F1 Score: 0.6617
New best model found with accuracy: 0.7737, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 26/50, Loss: 0.4273


100%|██████████| 113/113 [00:12<00:00,  9.31it/s]


Accuracy: 0.7737
F1 Score: 0.6633
Test Accuracy: 0.7737, F1 Score: 0.6633


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 27/50, Loss: 0.4257


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7670
F1 Score: 0.6618
Test Accuracy: 0.7670, F1 Score: 0.6618


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 28/50, Loss: 0.4268


100%|██████████| 113/113 [00:12<00:00,  9.31it/s]


Accuracy: 0.7742
F1 Score: 0.6577
Test Accuracy: 0.7742, F1 Score: 0.6577
New best model found with accuracy: 0.7742, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 29/50, Loss: 0.4255


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7670
F1 Score: 0.6602
Test Accuracy: 0.7670, F1 Score: 0.6602


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 30/50, Loss: 0.4248


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7754
F1 Score: 0.6593
Test Accuracy: 0.7754, F1 Score: 0.6593
New best model found with accuracy: 0.7754, saving the model...


100%|██████████| 449/449 [00:48<00:00,  9.28it/s]


Epoch 31/50, Loss: 0.4247


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7726
F1 Score: 0.6525
Test Accuracy: 0.7726, F1 Score: 0.6525


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 32/50, Loss: 0.4239


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7748
F1 Score: 0.6576
Test Accuracy: 0.7748, F1 Score: 0.6576


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 33/50, Loss: 0.4238


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]


Accuracy: 0.7709
F1 Score: 0.6327
Test Accuracy: 0.7709, F1 Score: 0.6327


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 34/50, Loss: 0.4248


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7687
F1 Score: 0.6634
Test Accuracy: 0.7687, F1 Score: 0.6634


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 35/50, Loss: 0.4236


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7709
F1 Score: 0.6301
Test Accuracy: 0.7709, F1 Score: 0.6301


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 36/50, Loss: 0.4234


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7742
F1 Score: 0.6622
Test Accuracy: 0.7742, F1 Score: 0.6622


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 37/50, Loss: 0.4217


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7698
F1 Score: 0.6100
Test Accuracy: 0.7698, F1 Score: 0.6100


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 38/50, Loss: 0.4235


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7715
F1 Score: 0.6645
Test Accuracy: 0.7715, F1 Score: 0.6645


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 39/50, Loss: 0.4215


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7720
F1 Score: 0.6459
Test Accuracy: 0.7720, F1 Score: 0.6459


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 40/50, Loss: 0.4202


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7720
F1 Score: 0.6595
Test Accuracy: 0.7720, F1 Score: 0.6595


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 41/50, Loss: 0.4177


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7720
F1 Score: 0.6554
Test Accuracy: 0.7720, F1 Score: 0.6554


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 42/50, Loss: 0.4173


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]


Accuracy: 0.7709
F1 Score: 0.6508
Test Accuracy: 0.7709, F1 Score: 0.6508


100%|██████████| 449/449 [00:48<00:00,  9.24it/s]


Epoch 43/50, Loss: 0.4176


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7715
F1 Score: 0.6572
Test Accuracy: 0.7715, F1 Score: 0.6572


100%|██████████| 449/449 [00:48<00:00,  9.26it/s]


Epoch 44/50, Loss: 0.4175


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]


Accuracy: 0.7726
F1 Score: 0.6606
Test Accuracy: 0.7726, F1 Score: 0.6606


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 45/50, Loss: 0.4171


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7720
F1 Score: 0.6600
Test Accuracy: 0.7720, F1 Score: 0.6600


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 46/50, Loss: 0.4175


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]


Accuracy: 0.7726
F1 Score: 0.6606
Test Accuracy: 0.7726, F1 Score: 0.6606


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 47/50, Loss: 0.4178


100%|██████████| 113/113 [00:12<00:00,  9.32it/s]


Accuracy: 0.7726
F1 Score: 0.6606
Test Accuracy: 0.7726, F1 Score: 0.6606


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 48/50, Loss: 0.4170


100%|██████████| 113/113 [00:12<00:00,  9.34it/s]


Accuracy: 0.7720
F1 Score: 0.6595
Test Accuracy: 0.7720, F1 Score: 0.6595


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 49/50, Loss: 0.4169


100%|██████████| 113/113 [00:12<00:00,  9.33it/s]


Accuracy: 0.7720
F1 Score: 0.6560
Test Accuracy: 0.7720, F1 Score: 0.6560


100%|██████████| 449/449 [00:48<00:00,  9.25it/s]


Epoch 50/50, Loss: 0.4164


100%|██████████| 113/113 [00:12<00:00,  9.35it/s]

Accuracy: 0.7715
F1 Score: 0.6555
Test Accuracy: 0.7715, F1 Score: 0.6555
Training complete!





In [None]:
# Reload the best checkpoint written during training.
# - The file holds a fully pickled module, so weights_only must be False.
#   Unpickling can execute arbitrary code: only load checkpoints you created
#   yourself.
# - map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
sem = torch.load('best_model.pth', map_location=device, weights_only=False).to(device)

  sem = torch.load('best_model.pth').to(device)


In [None]:
# Score the reloaded checkpoint on the test split; the cell displays (accuracy, f1).
evaluate(model=sem, dataloader=test_loader, device=device)

100%|██████████| 113/113 [00:12<00:00,  9.34it/s]

Accuracy: 0.7754
F1 Score: 0.6593





(0.7753623188405797, 0.6593406593406593)