In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

In [None]:
# read the csv datasets
# (TweetDataset below reads the 'comment_text' and 'label' columns from these frames)
train_df = pd.read_csv('train_zh_dataset.csv')
test_df = pd.read_csv('test_zh_dataset.csv')

In [None]:
class TweetDataset(Dataset):
  """Expose a DataFrame with 'comment_text' and 'label' columns as (text, label) pairs."""

  def __init__(self, data):
    # Keep a reference to the frame; nothing is copied or pre-processed here.
    self.data = data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    # Positional lookup; a slice yields a sub-frame, so both fields come back as Series.
    row = self.data.iloc[idx]
    return row['comment_text'], row['label']


In [None]:
# Wrap both splits in TweetDataset so they can be handed to a DataLoader.
train_dataset = TweetDataset(train_df)
test_dataset = TweetDataset(test_df)

In [None]:
# Peek at the first five samples; the slice is forwarded to DataFrame.iloc,
# so the result is a (texts, labels) pair of Series rather than a list of tuples.
train_dataset[0:5]

(0         其实我觉得也不能太偏激了吧。我们男性不说不代表我们不知道对错，只是不喜欢去评论这些事情。
 1                       不完全统计，十三个伏地魔相关博主被炸号，其中包括一位维权素人
 2    只是从图二里表达出来的是那些发达国家，我也没有不尊重其他国家，只是觉得一味地崇洋媚外，甚至说...
 3             其他的不说 对待舆论的态度真的圈粉 不卑不亢 掷地有声:green_heart:
 4    男人也吃男人，也有男吃女女吃男，怎么就毫无存在感了？单独拿出来说女吃女，仿佛是为了证明女性的...
 Name: comment_text, dtype: object,
 0    0
 1    0
 2    0
 3    0
 4    1
 Name: label, dtype: int64)

In [None]:
# Class counts for the training split, computed with vectorised comparisons
# instead of iterating over the column element by element in Python.
positive_samples = (train_df['label'] == 1).sum()
negative_samples = (train_df['label'] == 0).sum()

In [None]:
# Show the class balance as (positives, negatives).
positive_samples,negative_samples

(2465, 4710)

In [None]:
# Negatives-per-positive ratio; passed as pos_weight to BCEWithLogitsLoss in train().
pos_weight = torch.tensor([negative_samples / positive_samples], device=device)
pos_weight

tensor([1.9108], device='cuda:0')

In [None]:
# Batches of 16 samples; only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Semantic Detector model class building
class SemanticDetector(nn.Module):
    """Text classifier: a frozen ``bert-base-chinese`` encoder plus a trainable MLP head.

    The module takes raw strings, tokenizes them itself and returns unnormalised
    scores (no sigmoid is applied here).

    Args:
        padding: Padding strategy forwarded to the tokenizer. With the default
            ``'max_length'`` the tokenizer is asked to pad up to ``max_length=256``.
        num_classes: Output width of the final linear layer (1 gives one score per text).
    """

    def __init__(self, padding='max_length', num_classes=1):
        super().__init__()
        self.berttokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.padding = padding

        # fully connected head applied to BERT's pooled output
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(256, num_classes)
        )

        # freeze the encoder: only the classifier head receives gradient updates
        for param in self.bert.parameters():
            param.requires_grad = False

    def train(self, mode=True):
        """Set the training mode, keeping a fully frozen encoder in eval mode.

        While none of the encoder's parameters require gradients, its dropout would
        only add noise to features that are never trained, so it stays in eval mode.
        If the encoder is unfrozen later, it follows ``mode`` like every other submodule.
        """
        super().train(mode)
        if not any(p.requires_grad for p in self.bert.parameters()):
            self.bert.eval()
        return self

    def tokenize(self, texts):
        """Tokenize ``texts`` and return ``(input_ids, attention_mask)`` on this module's device.

        Args:
            texts: A single string or any sequence of strings.
        """
        # A lone string must be wrapped (not split into characters); other sequences
        # are materialised as a list before being handed to the tokenizer.
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        encoding = self.berttokenizer(
            texts,
            add_special_tokens=True,
            padding=self.padding,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )

        # Follow the module's own parameters instead of a notebook-level global, so the
        # inputs always land on the same device as the weights.
        target_device = next(self.parameters()).device
        input_ids = encoding['input_ids'].to(target_device)
        attention_mask = encoding['attention_mask'].to(target_device)

        return input_ids, attention_mask

    def forward(self, texts):
        """Return the classifier's raw scores for a batch of strings."""
        input_ids, attention_mask = self.tokenize(texts)
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Only BERT's pooled sentence representation (pooler_output, derived from the
        # [CLS] position) is used, so that only sentence-level semantic information is considered.
        pooled = outputs.pooler_output
        features = self.classifier(pooled)

        return features

In [None]:
# train function
def train(model, train_loader, test_loader, optimizer,
          scheduler,
          epochs, device, criterion=nn.BCEWithLogitsLoss(pos_weight=pos_weight)):
    """Train ``model`` for ``epochs`` epochs, evaluating and checkpointing after each one.

    Args:
        model: Module called as ``model(texts)``; its output is squeezed on dim 1
            before being passed to the loss.
        train_loader: Iterable of ``(texts, labels)`` training batches.
        test_loader: Loader handed to ``evaluate`` after every epoch.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        epochs: Number of epochs to run.
        device: Device the labels are moved to; also forwarded to ``evaluate``.
        criterion: Loss applied to ``(logits, labels)``. The default instance is
            created once, when this function is defined, from the module-level
            ``pos_weight``.

    Side effects:
        Prints progress, and pickles the whole model to ``best_model.pth`` every
        time the evaluation accuracy exceeds the best value seen so far.
    """
    best_acc = 0

    for epoch in range(epochs):
        # evaluate() puts the model in eval mode at the end of every epoch, so training
        # mode (dropout) must be re-enabled here rather than once before the loop;
        # otherwise every epoch after the first trains with dropout disabled.
        model.train()
        total_loss = 0

        # Training loop
        for (texts, labels) in tqdm(train_loader):
            # BCEWithLogitsLoss expects float targets
            labels = labels.to(torch.float32).to(device)
            optimizer.zero_grad()
            logits = model(texts)
            logits = logits.squeeze(1)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        # evaluate the model on the held-out loader after each epoch
        # NOTE(review): the checkpoint is selected on the same loader that is later used
        # for the final score, which makes that score optimistic; a separate validation
        # split would be needed to avoid it.
        acc, f1 = evaluate(model, test_loader, device)
        print(f"Test Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")

        # if current acc is greater than previous best acc, save a new best model
        if acc > best_acc:
            best_acc = acc
            print(f"New best model found with accuracy: {best_acc:.4f}, saving the model...")
            # The whole module is pickled (not just its state_dict); loading it back
            # requires the model class to be importable under the same name.
            torch.save(model, "best_model.pth")

        # apply scheduler to adjust the learning rate
        scheduler.step()

    print("Training Complete!")

In [None]:
# evaluate model
sigmoid = nn.Sigmoid()

def evaluate(model, dataloader, device, threshold=0.5):
    """Score ``model`` on ``dataloader`` and return ``(accuracy, f1)``.

    The model is switched to eval mode and left in it. Its outputs are passed
    through a sigmoid and counted as positive when they exceed ``threshold``.
    Both metrics are also printed.
    """
    model.eval()
    predictions, targets = [], []

    with torch.no_grad():
        for texts, labels in tqdm(dataloader):
            labels = labels.to(device)
            # the classifier head emits raw scores, so the sigmoid is applied here
            probabilities = sigmoid(model(texts)).squeeze(1)
            batch_preds = (probabilities > threshold).int()

            predictions.extend(batch_preds.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, f1

In [None]:
# Build the detector with its default arguments and move its weights to the selected device.
model = SemanticDetector()
model.to(device)

SemanticDetector(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [None]:
# The BERT encoder is frozen inside SemanticDetector, so only parameters that still
# require gradients (the classifier head) are handed to the optimizer.
optimizer = torch.optim.Adam(
    [p for p in model.parameters() if p.requires_grad], lr=0.001
)

In [None]:
# Multiply the learning rate by 0.1 every 20 scheduler steps (train() steps it once per epoch).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

In [None]:
# Number of epochs passed to train().
epochs = 50

In [None]:
# Run the training loop; train() writes the best-accuracy checkpoint to best_model.pth.
train(model, train_loader, test_loader, optimizer, scheduler, epochs, device)

100%|██████████| 449/449 [00:51<00:00,  8.69it/s]


Epoch 1/50, Loss: 0.8773


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7135
F1 Score: 0.5901
Test Accuracy: 0.7135, F1 Score: 0.5901
New best model found with accuracy: 0.7135, saving the model...


100%|██████████| 449/449 [00:50<00:00,  8.97it/s]


Epoch 2/50, Loss: 0.7308


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.5424
F1 Score: 0.5897
Test Accuracy: 0.5424, F1 Score: 0.5897


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 3/50, Loss: 0.7036


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.6901
F1 Score: 0.6472
Test Accuracy: 0.6901, F1 Score: 0.6472


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 4/50, Loss: 0.6870


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7118
F1 Score: 0.6583
Test Accuracy: 0.7118, F1 Score: 0.6583


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 5/50, Loss: 0.6831


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7408
F1 Score: 0.5027
Test Accuracy: 0.7408, F1 Score: 0.5027
New best model found with accuracy: 0.7408, saving the model...


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 6/50, Loss: 0.6777


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7152
F1 Score: 0.6636
Test Accuracy: 0.7152, F1 Score: 0.6636


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 7/50, Loss: 0.6611


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7664
F1 Score: 0.6602
Test Accuracy: 0.7664, F1 Score: 0.6602
New best model found with accuracy: 0.7664, saving the model...


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 8/50, Loss: 0.6535


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.6912
F1 Score: 0.6614
Test Accuracy: 0.6912, F1 Score: 0.6614


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 9/50, Loss: 0.6509


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7475
F1 Score: 0.6705
Test Accuracy: 0.7475, F1 Score: 0.6705


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 10/50, Loss: 0.6534


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7174
F1 Score: 0.6667
Test Accuracy: 0.7174, F1 Score: 0.6667


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 11/50, Loss: 0.6514


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7135
F1 Score: 0.6627
Test Accuracy: 0.7135, F1 Score: 0.6627


100%|██████████| 449/449 [00:50<00:00,  8.95it/s]


Epoch 12/50, Loss: 0.6422


100%|██████████| 113/113 [00:12<00:00,  9.05it/s]


Accuracy: 0.7035
F1 Score: 0.6692
Test Accuracy: 0.7035, F1 Score: 0.6692


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 13/50, Loss: 0.6487


100%|██████████| 113/113 [00:12<00:00,  9.05it/s]


Accuracy: 0.7547
F1 Score: 0.6760
Test Accuracy: 0.7547, F1 Score: 0.6760


100%|██████████| 449/449 [00:50<00:00,  8.95it/s]


Epoch 14/50, Loss: 0.6428


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7586
F1 Score: 0.6781
Test Accuracy: 0.7586, F1 Score: 0.6781


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 15/50, Loss: 0.6403


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7642
F1 Score: 0.6564
Test Accuracy: 0.7642, F1 Score: 0.6564


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 16/50, Loss: 0.6398


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7458
F1 Score: 0.6729
Test Accuracy: 0.7458, F1 Score: 0.6729


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 17/50, Loss: 0.6344


100%|██████████| 113/113 [00:12<00:00,  9.06it/s]


Accuracy: 0.7436
F1 Score: 0.6719
Test Accuracy: 0.7436, F1 Score: 0.6719


100%|██████████| 449/449 [00:50<00:00,  8.95it/s]


Epoch 18/50, Loss: 0.6391


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7637
F1 Score: 0.6677
Test Accuracy: 0.7637, F1 Score: 0.6677


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 19/50, Loss: 0.6377


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.6906
F1 Score: 0.6610
Test Accuracy: 0.6906, F1 Score: 0.6610


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 20/50, Loss: 0.6341


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7531
F1 Score: 0.6735
Test Accuracy: 0.7531, F1 Score: 0.6735


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 21/50, Loss: 0.6060


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7547
F1 Score: 0.6769
Test Accuracy: 0.7547, F1 Score: 0.6769


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 22/50, Loss: 0.6021


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7542
F1 Score: 0.6797
Test Accuracy: 0.7542, F1 Score: 0.6797


100%|██████████| 449/449 [00:50<00:00,  8.91it/s]


Epoch 23/50, Loss: 0.6018


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7536
F1 Score: 0.6764
Test Accuracy: 0.7536, F1 Score: 0.6764


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 24/50, Loss: 0.6001


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7581
F1 Score: 0.6780
Test Accuracy: 0.7581, F1 Score: 0.6780


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 25/50, Loss: 0.5998


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7508
F1 Score: 0.6773
Test Accuracy: 0.7508, F1 Score: 0.6773


100%|██████████| 449/449 [00:50<00:00,  8.92it/s]


Epoch 26/50, Loss: 0.5982


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7514
F1 Score: 0.6754
Test Accuracy: 0.7514, F1 Score: 0.6754


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 27/50, Loss: 0.5985


100%|██████████| 113/113 [00:12<00:00,  9.01it/s]


Accuracy: 0.7419
F1 Score: 0.6737
Test Accuracy: 0.7419, F1 Score: 0.6737


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 28/50, Loss: 0.5978


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7586
F1 Score: 0.6747
Test Accuracy: 0.7586, F1 Score: 0.6747


100%|██████████| 449/449 [00:50<00:00,  8.92it/s]


Epoch 29/50, Loss: 0.5962


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7525
F1 Score: 0.6773
Test Accuracy: 0.7525, F1 Score: 0.6773


100%|██████████| 449/449 [00:50<00:00,  8.92it/s]


Epoch 30/50, Loss: 0.5962


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7447
F1 Score: 0.6756
Test Accuracy: 0.7447, F1 Score: 0.6756


100%|██████████| 449/449 [00:50<00:00,  8.92it/s]


Epoch 31/50, Loss: 0.5946


100%|██████████| 113/113 [00:12<00:00,  9.02it/s]


Accuracy: 0.7397
F1 Score: 0.6737
Test Accuracy: 0.7397, F1 Score: 0.6737


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 32/50, Loss: 0.5942


100%|██████████| 113/113 [00:12<00:00,  9.05it/s]


Accuracy: 0.7458
F1 Score: 0.6775
Test Accuracy: 0.7458, F1 Score: 0.6775


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 33/50, Loss: 0.5943


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7520
F1 Score: 0.6778
Test Accuracy: 0.7520, F1 Score: 0.6778


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 34/50, Loss: 0.5933


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7559
F1 Score: 0.6794
Test Accuracy: 0.7559, F1 Score: 0.6794


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 35/50, Loss: 0.5930


100%|██████████| 113/113 [00:12<00:00,  8.99it/s]


Accuracy: 0.7419
F1 Score: 0.6751
Test Accuracy: 0.7419, F1 Score: 0.6751


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 36/50, Loss: 0.5921


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7336
F1 Score: 0.6739
Test Accuracy: 0.7336, F1 Score: 0.6739


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 37/50, Loss: 0.5902


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7564
F1 Score: 0.6780
Test Accuracy: 0.7564, F1 Score: 0.6780


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 38/50, Loss: 0.5916


100%|██████████| 113/113 [00:12<00:00,  9.00it/s]


Accuracy: 0.7536
F1 Score: 0.6792
Test Accuracy: 0.7536, F1 Score: 0.6792


100%|██████████| 449/449 [00:50<00:00,  8.95it/s]


Epoch 39/50, Loss: 0.5902


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7375
F1 Score: 0.6798
Test Accuracy: 0.7375, F1 Score: 0.6798


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 40/50, Loss: 0.5906


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7520
F1 Score: 0.6796
Test Accuracy: 0.7520, F1 Score: 0.6796


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 41/50, Loss: 0.5858


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7503
F1 Score: 0.6795
Test Accuracy: 0.7503, F1 Score: 0.6795


100%|██████████| 449/449 [00:50<00:00,  8.92it/s]


Epoch 42/50, Loss: 0.5852


100%|██████████| 113/113 [00:12<00:00,  8.99it/s]


Accuracy: 0.7525
F1 Score: 0.6797
Test Accuracy: 0.7525, F1 Score: 0.6797


100%|██████████| 449/449 [00:50<00:00,  8.95it/s]


Epoch 43/50, Loss: 0.5857


100%|██████████| 113/113 [00:12<00:00,  9.06it/s]


Accuracy: 0.7553
F1 Score: 0.6812
Test Accuracy: 0.7553, F1 Score: 0.6812


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 44/50, Loss: 0.5844


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7508
F1 Score: 0.6791
Test Accuracy: 0.7508, F1 Score: 0.6791


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 45/50, Loss: 0.5847


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7547
F1 Score: 0.6812
Test Accuracy: 0.7547, F1 Score: 0.6812


100%|██████████| 449/449 [00:50<00:00,  8.92it/s]


Epoch 46/50, Loss: 0.5847


100%|██████████| 113/113 [00:12<00:00,  9.05it/s]


Accuracy: 0.7458
F1 Score: 0.6766
Test Accuracy: 0.7458, F1 Score: 0.6766


100%|██████████| 449/449 [00:50<00:00,  8.96it/s]


Epoch 47/50, Loss: 0.5848


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7480
F1 Score: 0.6785
Test Accuracy: 0.7480, F1 Score: 0.6785


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 48/50, Loss: 0.5843


100%|██████████| 113/113 [00:12<00:00,  9.04it/s]


Accuracy: 0.7514
F1 Score: 0.6805
Test Accuracy: 0.7514, F1 Score: 0.6805


100%|██████████| 449/449 [00:50<00:00,  8.94it/s]


Epoch 49/50, Loss: 0.5841


100%|██████████| 113/113 [00:12<00:00,  9.03it/s]


Accuracy: 0.7464
F1 Score: 0.6771
Test Accuracy: 0.7464, F1 Score: 0.6771


100%|██████████| 449/449 [00:50<00:00,  8.93it/s]


Epoch 50/50, Loss: 0.5841


100%|██████████| 113/113 [00:12<00:00,  9.05it/s]

Accuracy: 0.7469
F1 Score: 0.6771
Test Accuracy: 0.7469, F1 Score: 0.6771
Training Complete!





In [None]:
# best_model.pth contains the whole pickled module (train() calls torch.save(model, ...)),
# so it has to be loaded with weights_only=False; stating it explicitly keeps the load
# working when the library default is weights_only=True.
# SECURITY: unpickling can execute arbitrary code - only load checkpoints you created yourself.
sem = torch.load('best_model.pth', map_location=device, weights_only=False).to(device)

  sem = torch.load('best_model.pth').to(device)


In [None]:
# Re-score the reloaded best checkpoint on the test loader.
evaluate(sem, test_loader, device)

100%|██████████| 113/113 [00:12<00:00,  9.02it/s]

Accuracy: 0.7664
F1 Score: 0.6602





(0.7664437012263099, 0.6601784266017843)