In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# 设置随机种子
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

# 加载数据
file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'
# file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'

data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")

# 加载BERT tokenizer和模型
model_name = '../bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

# 将模型移动到GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [3]:
# 定义数据集类
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# 将孟加拉语类别转换为数字标签
label_map = {label: i for i, label in enumerate(data['category1'].unique())}
labels = data['category1'].map(label_map).tolist()

# 定义LSTM分类器
class LSTMClassifier(nn.Module):
    def __init__(self, bert_model, hidden_size, num_classes, num_layers=1, bidirectional=True, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.bert = bert_model
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(bert_model.config.hidden_size, hidden_size, num_layers, bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_size * (2 if bidirectional else 1), num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        lstm_output, _ = self.lstm(last_hidden_state)
        lstm_output = self.dropout(lstm_output[:, -1, :])
        logits = self.fc(lstm_output)
        return logits

# 定义训练和评估函数
def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(data_loader, desc='Training', leave=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        print(logits)
        print(labels)
        
        loss = nn.CrossEntropyLoss()(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})
    return total_loss / len(data_loader)

def evaluate(model, data_loader, device, label_map):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.tolist())
            true_labels.extend(labels.tolist())

    label_map_inv = {v: k for k, v in label_map.items()}
    predictions = [label_map_inv[i] for i in predictions]
    true_labels = [label_map_inv[i] for i in true_labels]

    report = classification_report(true_labels, predictions, digits=4)
    return report



In [8]:
# 设置超参数
num_epochs = 2
batch_size =16
learning_rate = 2e-5
hidden_size = 128
num_classes = len(label_map)
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# 存储所有fold的性能指标
all_reports = []

# K-Fold交叉验证
for fold, (train_idx, val_idx) in enumerate(kfold.split(data)):
    print(f'Fold {fold + 1}')
    
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    train_texts = train_data['body'].tolist()
    train_labels = train_data['category1'].map(label_map).tolist()
    val_texts = val_data['body'].tolist()
    val_labels = val_data['category1'].map(label_map).tolist()

    train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    model = LSTMClassifier(bert_model, hidden_size, num_classes)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        val_report = evaluate(model, val_loader, device, label_map)

        print(f'Epoch {epoch + 1}/{num_epochs}')
        print(f'Train Loss: {train_loss:.4f}')
        print('Validation Report:')
        print(val_report)

        val_loss = 1 - float(val_report.split('\n')[-2].split()[-2])  # 提取验证集损失
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f'best_model_fold_{fold + 1}.pth')

    # 在每个fold结束后,评估最佳模型在验证集上的性能
    best_model = LSTMClassifier(bert_model, hidden_size, num_classes)
    best_model.load_state_dict(torch.load(f'best_model_fold_{fold + 1}.pth'))
    best_model.to(device)
    val_report = evaluate(best_model, val_loader, device, label_map)
    all_reports.append(val_report)

    print(f'Fold {fold + 1} Best Validation Report:')
    print(val_report)
    print()
    





Fold 1


                                                                     

Epoch 1/2
Train Loss: 1.2073
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     0.9286    0.7222    0.8125        18
    অর্থনীতি     0.9600    0.8276    0.8889        29
         আইন     0.8182    1.0000    0.9000        27
    খেলাধুলা     0.9583    1.0000    0.9787        23
     বিজ্ঞান     0.9583    0.9583    0.9583        24
      বিনোদন     0.9545    1.0000    0.9767        21
     রাজনীতি     1.0000    0.9200    0.9583        25
  লাইফস্টাইল     0.9375    0.8824    0.9091        17
      শিক্ষা     0.7368    0.8750    0.8000        16

    accuracy                         0.9150       200
   macro avg     0.9169    0.9095    0.9092       200
weighted avg     0.9223    0.9150    0.9147       200



                                                                     

Epoch 2/2
Train Loss: 0.6466
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.7222    0.8387        18
    অর্থনীতি     0.9630    0.8966    0.9286        29
         আইন     0.8667    0.9630    0.9123        27
    খেলাধুলা     0.9583    1.0000    0.9787        23
     বিজ্ঞান     0.9200    0.9583    0.9388        24
      বিনোদন     0.9545    1.0000    0.9767        21
     রাজনীতি     0.9615    1.0000    0.9804        25
  লাইফস্টাইল     0.9412    0.9412    0.9412        17
      শিক্ষা     0.8750    0.8750    0.8750        16

    accuracy                         0.9350       200
   macro avg     0.9378    0.9285    0.9300       200
weighted avg     0.9377    0.9350    0.9336       200





Fold 1 Best Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.7222    0.8387        18
    অর্থনীতি     0.9630    0.8966    0.9286        29
         আইন     0.8667    0.9630    0.9123        27
    খেলাধুলা     0.9583    1.0000    0.9787        23
     বিজ্ঞান     0.9200    0.9583    0.9388        24
      বিনোদন     0.9545    1.0000    0.9767        21
     রাজনীতি     0.9615    1.0000    0.9804        25
  লাইফস্টাইল     0.9412    0.9412    0.9412        17
      শিক্ষা     0.8750    0.8750    0.8750        16

    accuracy                         0.9350       200
   macro avg     0.9378    0.9285    0.9300       200
weighted avg     0.9377    0.9350    0.9336       200


Fold 2


                                                                     

Epoch 1/2
Train Loss: 1.2319
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.8696    0.9302        23
    অর্থনীতি     1.0000    0.9000    0.9474        20
         আইন     1.0000    1.0000    1.0000        22
    খেলাধুলা     1.0000    1.0000    1.0000        25
     বিজ্ঞান     0.9286    0.9286    0.9286        14
      বিনোদন     1.0000    1.0000    1.0000        22
     রাজনীতি     0.9200    1.0000    0.9583        23
  লাইফস্টাইল     0.8438    0.9643    0.9000        28
      শিক্ষা     1.0000    0.9565    0.9778        23

    accuracy                         0.9600       200
   macro avg     0.9658    0.9577    0.9603       200
weighted avg     0.9639    0.9600    0.9604       200



                                                                     

Epoch 2/2
Train Loss: 0.6458
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.9130    0.9545        23
    অর্থনীতি     1.0000    0.8500    0.9189        20
         আইন     0.9130    0.9545    0.9333        22
    খেলাধুলা     1.0000    1.0000    1.0000        25
     বিজ্ঞান     0.8235    1.0000    0.9032        14
      বিনোদন     0.9565    1.0000    0.9778        22
     রাজনীতি     0.9565    0.9565    0.9565        23
  লাইফস্টাইল     0.9259    0.8929    0.9091        28
      শিক্ষা     0.9167    0.9565    0.9362        23

    accuracy                         0.9450       200
   macro avg     0.9436    0.9471    0.9433       200
weighted avg     0.9483    0.9450    0.9450       200





Fold 2 Best Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.8696    0.9302        23
    অর্থনীতি     1.0000    0.9000    0.9474        20
         আইন     1.0000    1.0000    1.0000        22
    খেলাধুলা     1.0000    1.0000    1.0000        25
     বিজ্ঞান     0.9286    0.9286    0.9286        14
      বিনোদন     1.0000    1.0000    1.0000        22
     রাজনীতি     0.9200    1.0000    0.9583        23
  লাইফস্টাইল     0.8438    0.9643    0.9000        28
      শিক্ষা     1.0000    0.9565    0.9778        23

    accuracy                         0.9600       200
   macro avg     0.9658    0.9577    0.9603       200
weighted avg     0.9639    0.9600    0.9604       200


Fold 3


                                                                     

Epoch 1/2
Train Loss: 1.1606
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     0.9500    0.7917    0.8636        24
    অর্থনীতি     0.9444    0.8500    0.8947        20
         আইন     0.9565    1.0000    0.9778        22
    খেলাধুলা     1.0000    1.0000    1.0000        19
     বিজ্ঞান     0.8571    0.9000    0.8780        20
      বিনোদন     1.0000    0.9500    0.9744        20
     রাজনীতি     0.8800    1.0000    0.9362        22
  লাইফস্টাইল     0.9130    1.0000    0.9545        21
      শিক্ষা     0.9375    0.9375    0.9375        32

    accuracy                         0.9350       200
   macro avg     0.9376    0.9366    0.9352       200
weighted avg     0.9370    0.9350    0.9341       200



                                                                     

Epoch 2/2
Train Loss: 0.6137
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     0.9500    0.7917    0.8636        24
    অর্থনীতি     0.9048    0.9500    0.9268        20
         আইন     0.9565    1.0000    0.9778        22
    খেলাধুলা     1.0000    1.0000    1.0000        19
     বিজ্ঞান     0.9000    0.9000    0.9000        20
      বিনোদন     1.0000    0.9500    0.9744        20
     রাজনীতি     0.8800    1.0000    0.9362        22
  লাইফস্টাইল     0.9524    0.9524    0.9524        21
      শিক্ষা     0.9375    0.9375    0.9375        32

    accuracy                         0.9400       200
   macro avg     0.9424    0.9424    0.9410       200
weighted avg     0.9415    0.9400    0.9393       200





Fold 3 Best Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     0.9500    0.7917    0.8636        24
    অর্থনীতি     0.9048    0.9500    0.9268        20
         আইন     0.9565    1.0000    0.9778        22
    খেলাধুলা     1.0000    1.0000    1.0000        19
     বিজ্ঞান     0.9000    0.9000    0.9000        20
      বিনোদন     1.0000    0.9500    0.9744        20
     রাজনীতি     0.8800    1.0000    0.9362        22
  লাইফস্টাইল     0.9524    0.9524    0.9524        21
      শিক্ষা     0.9375    0.9375    0.9375        32

    accuracy                         0.9400       200
   macro avg     0.9424    0.9424    0.9410       200
weighted avg     0.9415    0.9400    0.9393       200


Fold 4


                                                                     

Epoch 1/2
Train Loss: 1.1413
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.7391    0.8500        23
    অর্থনীতি     1.0000    1.0000    1.0000        25
         আইন     0.9310    1.0000    0.9643        27
    খেলাধুলা     1.0000    1.0000    1.0000        32
     বিজ্ঞান     0.9130    1.0000    0.9545        21
      বিনোদন     1.0000    1.0000    1.0000         7
     রাজনীতি     0.9643    0.9643    0.9643        28
  লাইফস্টাইল     0.9565    1.0000    0.9778        22
      শিক্ষা     0.8750    0.9333    0.9032        15

    accuracy                         0.9600       200
   macro avg     0.9600    0.9596    0.9571       200
weighted avg     0.9624    0.9600    0.9585       200



                                                                     

Epoch 2/2
Train Loss: 0.5602
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.6957    0.8205        23
    অর্থনীতি     0.9615    1.0000    0.9804        25
         আইন     0.9286    0.9630    0.9455        27
    খেলাধুলা     0.9697    1.0000    0.9846        32
     বিজ্ঞান     0.9524    0.9524    0.9524        21
      বিনোদন     1.0000    1.0000    1.0000         7
     রাজনীতি     0.9310    0.9643    0.9474        28
  লাইফস্টাইল     0.9167    1.0000    0.9565        22
      শিক্ষা     0.8750    0.9333    0.9032        15

    accuracy                         0.9450       200
   macro avg     0.9483    0.9454    0.9434       200
weighted avg     0.9475    0.9450    0.9427       200





Fold 4 Best Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.7391    0.8500        23
    অর্থনীতি     1.0000    1.0000    1.0000        25
         আইন     0.9310    1.0000    0.9643        27
    খেলাধুলা     1.0000    1.0000    1.0000        32
     বিজ্ঞান     0.9130    1.0000    0.9545        21
      বিনোদন     1.0000    1.0000    1.0000         7
     রাজনীতি     0.9643    0.9643    0.9643        28
  লাইফস্টাইল     0.9565    1.0000    0.9778        22
      শিক্ষা     0.8750    0.9333    0.9032        15

    accuracy                         0.9600       200
   macro avg     0.9600    0.9596    0.9571       200
weighted avg     0.9624    0.9600    0.9585       200


Fold 5


                                                                     

Epoch 1/2
Train Loss: 1.1724
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.9412    0.9697        17
    অর্থনীতি     1.0000    0.9167    0.9565        24
         আইন     1.0000    1.0000    1.0000        21
    খেলাধুলা     1.0000    1.0000    1.0000        22
     বিজ্ঞান     0.9583    1.0000    0.9787        23
      বিনোদন     0.9259    1.0000    0.9615        25
     রাজনীতি     1.0000    1.0000    1.0000        20
  লাইফস্টাইল     1.0000    1.0000    1.0000        26
      শিক্ষা     0.9545    0.9545    0.9545        22

    accuracy                         0.9800       200
   macro avg     0.9821    0.9792    0.9801       200
weighted avg     0.9809    0.9800    0.9800       200



                                                                     

Epoch 2/2
Train Loss: 0.6008
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.9412    0.9697        17
    অর্থনীতি     1.0000    0.8333    0.9091        24
         আইন     1.0000    1.0000    1.0000        21
    খেলাধুলা     1.0000    1.0000    1.0000        22
     বিজ্ঞান     0.8846    1.0000    0.9388        23
      বিনোদন     0.9259    1.0000    0.9615        25
     রাজনীতি     1.0000    1.0000    1.0000        20
  লাইফস্টাইল     1.0000    1.0000    1.0000        26
      শিক্ষা     0.9545    0.9545    0.9545        22

    accuracy                         0.9700       200
   macro avg     0.9739    0.9699    0.9704       200
weighted avg     0.9725    0.9700    0.9697       200

Fold 5 Best Validation Report:
              precision    recall  f1-score   support

    অন্যান্য     1.0000    0.9412    0.9697        17
    অর্থনীতি     1.0000    0.9167    0.9565        24
         আইন     1.0000    1.0000    1.0000        21

In [None]:
# 计算并打印所有fold的平均性能
print('Average Performance Across All Folds:')
avg_report = pd.DataFrame([report.split() for report in all_reports]).mean(axis=0)
avg_report = '\n'.join([('{:<10}'.format(col) + '{:.4f}'.format(val)) for col, val in zip(avg_report.index, avg_report.values)])
print(avg_report)

In [9]:
print(all_reports)

['              precision    recall  f1-score   support\n\n    অন্যান্য     1.0000    0.7222    0.8387        18\n    অর্থনীতি     0.9630    0.8966    0.9286        29\n         আইন     0.8667    0.9630    0.9123        27\n    খেলাধুলা     0.9583    1.0000    0.9787        23\n     বিজ্ঞান     0.9200    0.9583    0.9388        24\n      বিনোদন     0.9545    1.0000    0.9767        21\n     রাজনীতি     0.9615    1.0000    0.9804        25\n  লাইফস্টাইল     0.9412    0.9412    0.9412        17\n      শিক্ষা     0.8750    0.8750    0.8750        16\n\n    accuracy                         0.9350       200\n   macro avg     0.9378    0.9285    0.9300       200\nweighted avg     0.9377    0.9350    0.9336       200\n', '              precision    recall  f1-score   support\n\n    অন্যান্য     1.0000    0.8696    0.9302        23\n    অর্থনীতি     1.0000    0.9000    0.9474        20\n         আইন     1.0000    1.0000    1.0000        22\n    খেলাধুলা     1.0000    1.0000    1.0000        25

In [11]:
# 计算并打印所有fold的平均性能
print('Average Performance Across All Folds:')
all_lines = [report.split('\n') for report in all_reports]
header = all_lines[0][0] + '\t' + '\t'.join([line.strip() for line in all_lines[0][-4:]])

values = []
for report in all_reports:
    lines = report.split('\n')
    cls_lines = lines[1:-5]
    cls_values = []
    for line in cls_lines:
        parts = line.split()
        if len(parts) >= 5:
            cls_values.append([float(val) if val != 'nan' else 0.0 for val in parts[1:-1]])
    values.append(cls_values)

avg_values = np.mean(values, axis=0)
cls_report = '\n'.join([f'{cls}\t{prec:.4f}\t{rec:.4f}\t{f1:.4f}\t{sup:.0f}' for cls, (prec, rec, f1, sup) in zip(label_map.keys(), avg_values)])

avg_acc = np.mean([float(line.split()[-2]) for report in all_reports for line in report.split('\n') if 'accuracy' in line])
avg_macro = np.mean([float(line.split()[-2]) for report in all_reports for line in report.split('\n') if 'macro avg' in line])
avg_weighted = np.mean([float(line.split()[-2]) for report in all_reports for line in report.split('\n') if 'weighted avg' in line])

avg_report = header + '\n' + cls_report + '\n' + f"accuracy\t{avg_acc:.4f}\nmacro avg\t{avg_macro:.4f}\nweighted avg\t{avg_weighted:.4f}"
print(avg_report)

Average Performance Across All Folds:


ValueError: not enough values to unpack (expected 4, got 3)