In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 定义数据集
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# 定义模型
class NewsClassifier(nn.Module):
    def __init__(self, num_classes):
        super(NewsClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('../../bert-base-multilingual-cased')
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        logits = self.fc(dropout_output)
        return logits

# 定义训练函数
def train_model(model, dataloader, optimizer, scheduler, device, epoch):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc=f'Epoch {epoch}', leave=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(logits, labels)
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(dataloader)
    print(f'Epoch: {epoch}, Training Loss: {avg_loss:.4f}')


# 定义评估函数
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            batch_predictions = torch.argmax(logits, dim=1)

            predictions.extend(batch_predictions.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return classification_report(true_labels, predictions, digits=4)


In [3]:

# 加载数据
file_path = '../../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'
# file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'

data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")

texts = data['body'].tolist()
labels = data['category1'].tolist()

# 对标签进行编码
unique_labels = list(set(labels))
label_to_id = {label: i for i, label in enumerate(unique_labels)}
labels = [label_to_id[label] for label in labels]
num_classes = len(unique_labels)

# 加载BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('../../bert-base-multilingual-cased')



In [4]:
# 设置超参数
max_length = 256
batch_size = 16
epochs = 2
learning_rate = 2e-5

# 使用KFold进行交叉验证
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for fold, (train_index, val_index) in enumerate(kfold.split(texts, labels)):
    print(f'Fold {fold + 1}')
    print('-' * 30)

    train_texts, val_texts = [texts[i] for i in train_index], [texts[i] for i in val_index]
    train_labels, val_labels = [labels[i] for i in train_index], [labels[i] for i in val_index]

    train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_length)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_length)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = NewsClassifier(num_classes)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_f1 = 0
    for epoch in range(epochs):
        train_model(model, train_dataloader, optimizer, scheduler, device, epoch)

        print('Validation Results:')
        report = evaluate_model(model, val_dataloader, device)
        print(report)

        # 保存最优模型
        f1 = float(report.split('\n')[-2].split()[-2])
        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), f'best_MultiBert_BiLSTM_SelfAttention_model_fold_{fold + 1}.pth')
            print(f'Saved best model for fold {fold + 1} with F1 score: {best_f1:.4f}')

    print()
    # 在每个fold结束后,评估最佳模型在验证集上的性能
    best_model = NewsClassifier(num_classes)
    best_model.load_state_dict(torch.load(f'best_MultiBert_BiLSTM_SelfAttention_model_fold_{fold + 1}.pth'))
    best_model.to(device)
    val_report = evaluate_model(best_model, val_dataloader, device)
    # all_reports.append(val_report)

    print(f'Fold {fold + 1} Best Validation Report:')
    print(val_report)
    print()

Fold 1
------------------------------


                                                                   

Epoch: 0, Training Loss: 1.9395
Validation Results:
              precision    recall  f1-score   support

           0     0.8500    0.8095    0.8293        21
           1     0.8571    0.2222    0.3529        27
           2     0.6000    0.5294    0.5625        17
           3     0.3889    0.2414    0.2979        29
           4     1.0000    0.1250    0.2222        24
           5     0.9167    0.6111    0.7333        18
           6     0.8519    1.0000    0.9200        23
           7     0.3871    0.9600    0.5517        25
           8     0.3889    0.8750    0.5385        16

    accuracy                         0.5700       200
   macro avg     0.6934    0.5971    0.5565       200
weighted avg     0.6923    0.5700    0.5362       200

Saved best model for fold 1 with F1 score: 0.5362


                                                                    

Epoch: 1, Training Loss: 1.1877
Validation Results:
              precision    recall  f1-score   support

           0     0.8000    0.9524    0.8696        21
           1     0.7097    0.8148    0.7586        27
           2     0.6250    0.5882    0.6061        17
           3     0.7059    0.4138    0.5217        29
           4     0.9375    0.6250    0.7500        24
           5     0.9167    0.6111    0.7333        18
           6     0.8846    1.0000    0.9388        23
           7     0.8276    0.9600    0.8889        25
           8     0.5000    0.8750    0.6364        16

    accuracy                         0.7550       200
   macro avg     0.7674    0.7600    0.7448       200
weighted avg     0.7755    0.7550    0.7469       200

Saved best model for fold 1 with F1 score: 0.7469

Fold 1 Best Validation Report:
              precision    recall  f1-score   support

           0     0.8000    0.9524    0.8696        21
           1     0.7097    0.8148    0.7586        2

                                                                   

Epoch: 0, Training Loss: 1.9455
Validation Results:
              precision    recall  f1-score   support

           0     0.9412    0.7273    0.8205        22
           1     0.7200    0.8182    0.7660        22
           2     0.7143    0.7143    0.7143        28
           3     0.5882    0.5000    0.5405        20
           4     0.8182    0.6429    0.7200        14
           5     0.7273    0.6957    0.7111        23
           6     0.9259    1.0000    0.9615        25
           7     0.5946    0.9565    0.7333        23
           8     0.8125    0.5652    0.6667        23

    accuracy                         0.7450       200
   macro avg     0.7602    0.7356    0.7371       200
weighted avg     0.7600    0.7450    0.7419       200

Saved best model for fold 2 with F1 score: 0.7419


                                                                    

Epoch: 1, Training Loss: 1.2122
Validation Results:
              precision    recall  f1-score   support

           0     1.0000    0.9091    0.9524        22
           1     0.6471    1.0000    0.7857        22
           2     0.7200    0.6429    0.6792        28
           3     0.7333    0.5500    0.6286        20
           4     0.7500    0.8571    0.8000        14
           5     0.7083    0.7391    0.7234        23
           6     1.0000    0.9600    0.9796        25
           7     0.8095    0.7391    0.7727        23
           8     0.7619    0.6957    0.7273        23

    accuracy                         0.7850       200
   macro avg     0.7922    0.7881    0.7832       200
weighted avg     0.7950    0.7850    0.7833       200

Saved best model for fold 2 with F1 score: 0.7833

Fold 2 Best Validation Report:
              precision    recall  f1-score   support

           0     1.0000    0.9091    0.9524        22
           1     0.6471    1.0000    0.7857        2

                                                                   

Epoch: 0, Training Loss: 2.0293
Validation Results:
              precision    recall  f1-score   support

           0     0.8182    0.9000    0.8571        20
           1     0.7308    0.8636    0.7917        22
           2     0.7727    0.8095    0.7907        21
           3     0.2462    0.8000    0.3765        20
           4     1.0000    0.0500    0.0952        20
           5     1.0000    0.0417    0.0800        24
           6     0.9500    1.0000    0.9744        19
           7     0.4634    0.8636    0.6032        22
           8     1.0000    0.0625    0.1176        32

    accuracy                         0.5600       200
   macro avg     0.7757    0.5990    0.5207       200
weighted avg     0.7892    0.5600    0.4903       200

Saved best model for fold 3 with F1 score: 0.4903


                                                                    

Epoch: 1, Training Loss: 1.2958
Validation Results:
              precision    recall  f1-score   support

           0     0.9500    0.9500    0.9500        20
           1     0.7857    1.0000    0.8800        22
           2     0.7826    0.8571    0.8182        21
           3     0.5806    0.9000    0.7059        20
           4     0.8182    0.4500    0.5806        20
           5     0.9286    0.5417    0.6842        24
           6     1.0000    1.0000    1.0000        19
           7     0.6786    0.8636    0.7600        22
           8     0.8462    0.6875    0.7586        32

    accuracy                         0.7950       200
   macro avg     0.8189    0.8055    0.7931       200
weighted avg     0.8199    0.7950    0.7884       200

Saved best model for fold 3 with F1 score: 0.7884

Fold 3 Best Validation Report:
              precision    recall  f1-score   support

           0     0.9500    0.9500    0.9500        20
           1     0.7857    1.0000    0.8800        2

                                                                   

Epoch: 0, Training Loss: 2.0685
Validation Results:
              precision    recall  f1-score   support

           0     0.8750    1.0000    0.9333         7
           1     0.6098    0.9259    0.7353        27
           2     0.7000    0.9545    0.8077        22
           3     0.7407    0.8000    0.7692        25
           4     0.8889    0.3810    0.5333        21
           5     0.7500    0.3913    0.5143        23
           6     0.9091    0.9375    0.9231        32
           7     0.9048    0.6786    0.7755        28
           8     0.6316    0.8000    0.7059        15

    accuracy                         0.7550       200
   macro avg     0.7789    0.7632    0.7442       200
weighted avg     0.7816    0.7550    0.7413       200

Saved best model for fold 4 with F1 score: 0.7413


                                                                   

Epoch: 1, Training Loss: 1.3840
Validation Results:
              precision    recall  f1-score   support

           0     0.7000    1.0000    0.8235         7
           1     0.7333    0.8148    0.7719        27
           2     0.6364    0.9545    0.7636        22
           3     0.7000    0.8400    0.7636        25
           4     1.0000    0.4286    0.6000        21
           5     0.8889    0.3478    0.5000        23
           6     0.9394    0.9688    0.9538        32
           7     0.8889    0.8571    0.8727        28
           8     0.6842    0.8667    0.7647        15

    accuracy                         0.7800       200
   macro avg     0.7968    0.7865    0.7571       200
weighted avg     0.8143    0.7800    0.7651       200

Saved best model for fold 4 with F1 score: 0.7651

Fold 4 Best Validation Report:
              precision    recall  f1-score   support

           0     0.7000    1.0000    0.8235         7
           1     0.7333    0.8148    0.7719        2

                                                                   

Epoch: 0, Training Loss: 1.8662
Validation Results:
              precision    recall  f1-score   support

           0     0.8846    0.9200    0.9020        25
           1     0.8000    0.5714    0.6667        21
           2     0.8462    0.8462    0.8462        26
           3     0.7222    0.5417    0.6190        24
           4     0.7826    0.7826    0.7826        23
           5     0.8462    0.6471    0.7333        17
           6     1.0000    0.9545    0.9767        22
           7     0.6250    1.0000    0.7692        20
           8     0.6923    0.8182    0.7500        22

    accuracy                         0.7900       200
   macro avg     0.7999    0.7868    0.7829       200
weighted avg     0.8018    0.7900    0.7862       200

Saved best model for fold 5 with F1 score: 0.7862


                                                                    

Epoch: 1, Training Loss: 1.1195
Validation Results:
              precision    recall  f1-score   support

           0     0.8929    1.0000    0.9434        25
           1     0.7308    0.9048    0.8085        21
           2     0.8750    0.8077    0.8400        26
           3     0.7500    0.5000    0.6000        24
           4     0.8000    0.8696    0.8333        23
           5     0.8462    0.6471    0.7333        17
           6     1.0000    0.9545    0.9767        22
           7     0.7917    0.9500    0.8636        20
           8     0.8261    0.8636    0.8444        22

    accuracy                         0.8350       200
   macro avg     0.8347    0.8330    0.8270       200
weighted avg     0.8360    0.8350    0.8289       200

Saved best model for fold 5 with F1 score: 0.8289

Fold 5 Best Validation Report:
              precision    recall  f1-score   support

           0     0.8929    1.0000    0.9434        25
           1     0.7308    0.9048    0.8085        2