In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# 设置随机种子
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

# 加载数据
file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled_Example.csv'
# file_path = '../datasets_FIX2/FIX2_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'

data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")


# 加载BERT tokenizer和模型
model_name = '../bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

# 将模型移动到GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

# 定义数据集类
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
            # 'label': nn.functional.one_hot(torch.tensor(label), num_classes=num_classes) ## 
        }

# 将孟加拉语类别转换为数字标签
label_map = {label: i for i, label in enumerate(data['category1'].unique())}
labels = data['category1'].map(label_map).tolist()

# 定义自注意力层
class SelfAttention(nn.Module):
    def __init__(self, hidden_size):
        super(SelfAttention, self).__init__()
        self.hidden_size = hidden_size
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states, attention_mask):
        query = self.query(hidden_states)
        key = self.key(hidden_states)
        value = self.value(hidden_states)

        attention_scores = torch.matmul(query, key.transpose(-2, -1))
        attention_scores = attention_scores / (self.hidden_size ** 0.5)
        attention_scores = attention_scores.masked_fill(attention_mask == 0, -1e9)
        attention_weights = nn.functional.softmax(attention_scores, dim=-1)

        attended_values = torch.matmul(attention_weights, value)
        return attended_values

# 定义Bi-LSTM分类器
class BiLSTMAttentionClassifier(nn.Module):
    def __init__(self, bert_model, hidden_size, num_classes, num_layers=1, dropout=0.5):
        super(BiLSTMAttentionClassifier, self).__init__()
        self.bert = bert_model
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        self.bilstm = nn.LSTM(bert_model.config.hidden_size, hidden_size, num_layers, bidirectional=True, batch_first=True)
        self.attention = SelfAttention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        bilstm_output, _ = self.bilstm(last_hidden_state)
        attention_output = self.attention(bilstm_output, attention_mask.unsqueeze(1).unsqueeze(2))
        pooled_output = torch.mean(attention_output, dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# 定义训练和评估函数
def train_epoch(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(data_loader, desc='Training', leave=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        print(logits)
        print(logits.shape)
        print(labels)
        print(labels.size())
        loss = nn.CrossEntropyLoss()(logits, labels)
        # test
        # outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})
    return total_loss / len(data_loader)

def evaluate(model, data_loader, device, label_map):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.tolist())
            true_labels.extend(labels.tolist())

    label_map_inv = {v: k for k, v in label_map.items()}
    print(label_map_inv)
    print(predictions)
    predictions = [label_map_inv[i] for i in predictions]
    true_labels = [label_map_inv[i] for i in true_labels]

    report = classification_report(true_labels, predictions, digits=4)
    return report

# 设置超参数
num_epochs = 2
batch_size = 4
learning_rate = 2e-5
hidden_size = 128
num_classes = len(label_map)
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)



In [3]:
# K-Fold交叉验证
for fold, (train_idx, val_idx) in enumerate(kfold.split(data)):
    print(f'Fold {fold + 1}')
    
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    train_texts = train_data['body'].tolist()
    train_labels = train_data['category1'].map(label_map).tolist()
    val_texts = val_data['body'].tolist()
    val_labels = val_data['category1'].map(label_map).tolist()

    train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    model = BiLSTMAttentionClassifier(bert_model, hidden_size, num_classes)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        val_report = evaluate(model, val_loader, device, label_map)

        print(f'Epoch {epoch + 1}/{num_epochs}')
        print(f'Train Loss: {train_loss:.4f}')
        print('Validation Report:')
        print(val_report)

        val_loss = 1 - float(val_report.split('\n')[-2].split()[-2])  # 提取验证集损失
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f'best_MultiLSTMAttModel_fold_{fold + 1}.pth')

    print()

    # 在每个fold结束后,评估最佳模型在验证集上的性能
    best_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    best_model.load_state_dict(torch.load(f'best_MultiLSTMAttModel_fold_{fold + 1}.pth'))
    best_model.to(device)
    val_report = evaluate(best_model, val_loader, device, label_map)
    all_reports.append(val_report)

    print(f'Fold {fold + 1} Best Validation Report:')
    print(val_report)
    print()


Fold 1


                                                 

tensor([[[ 1.5524e-01, -1.2517e-02, -3.1121e-02,  ...,  8.0878e-03,
          -3.2454e-03, -5.0106e-02],
         [ 4.0999e-02,  2.1166e-02, -3.2938e-02,  ..., -6.5804e-04,
          -9.5529e-02, -2.4658e-02],
         [ 1.2747e-01,  8.2562e-02, -7.3958e-02,  ...,  1.7507e-03,
          -3.9765e-02,  5.8876e-02],
         ...,
         [-1.1550e-02, -1.4806e-02, -5.3273e-02,  ...,  4.5535e-04,
          -1.1290e-02, -7.5754e-02],
         [ 2.7042e-01, -3.5473e-02, -3.7422e-02,  ..., -8.0497e-02,
          -2.5275e-02,  1.9116e-02],
         [ 1.5963e-01,  9.8522e-03, -1.2428e-01,  ...,  8.0951e-03,
           3.4637e-02,  5.5742e-02]],

        [[ 6.3545e-02, -4.7343e-02, -1.0046e-01,  ...,  1.0465e-01,
          -7.3449e-03,  2.8709e-02],
         [ 1.2167e-01, -9.3932e-02, -1.3838e-01,  ...,  9.7825e-02,
          -7.4547e-02,  6.5082e-02],
         [ 7.3854e-02, -5.7096e-02, -6.1741e-02,  ..., -3.2857e-02,
          -4.3939e-02, -3.2479e-03],
         ...,
         [ 1.5765e-01, -6



RuntimeError: Expected target size [4, 9], got [4]