In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


In [3]:
# 加载数据
data = pd.read_csv('./datasets_FIX3/FIX3_deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv',low_memory=False,lineterminator="\n")
texts = data['body'].tolist()
labels = data['category1'].tolist()

# 将标签转换为数字
label_map = {label: i for i, label in enumerate(set(labels))}
print(label_map)

num_labels = len(label_map)
print(num_labels)
labels = [label_map[label] for label in labels]
# print(labels)

# 加载tokenizer和model
# model_name = 'xlnet-base-cased'
model_name = './XL-NET-base-cased'

tokenizer = XLNetTokenizer.from_pretrained(model_name)
model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 数据预处理
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# 创建数据集和数据加载器
def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    ds = NewsDataset(
        texts=texts,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )

# 设置设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 拆分数据集
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# 创建数据加载器
train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, max_len=512, batch_size=8)
val_data_loader = create_data_loader(val_texts, val_labels, tokenizer, max_len=512, batch_size=8)

# 优化器和学习率调度器
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * 10
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0, total_iters=total_steps)

{'খেলাধুলা': 0, 'বিজ্ঞান': 1, 'অন্যান্য': 2, 'শিক্ষা': 3, 'অর্থনীতি': 4, 'বিনোদন': 5, 'আইন': 6, 'রাজনীতি': 7, 'লাইফস্টাইল': 8}
9
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at ./XL-NET-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 训练循环
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    total_loss = 0
    progress_bar = tqdm(data_loader, desc='Training', leave=False)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(data_loader)

# 评估循环
def evaluate(model, data_loader, device, label_map):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        progress_bar = tqdm(data_loader, desc='Evaluating', leave=False)
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            batch_predictions = torch.argmax(logits, dim=-1)
            predictions.extend(batch_predictions.tolist())
            true_labels.extend(labels.tolist())

    # 将数字标签转换回原始标签
    predictions = [list(label_map.keys())[list(label_map.values()).index(pred)] for pred in predictions]
    true_labels = [list(label_map.keys())[list(label_map.values()).index(label)] for label in true_labels]

    report = classification_report(true_labels, predictions)
    return report

# 训练和评估
num_epochs = 1
best_val_loss = float('inf')

for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')

    train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler)
    val_report = evaluate(model, val_data_loader, device, label_map)

    print(f'Train Loss: {train_loss:.4f}')
    print('Validation Report:')
    print(val_report)
    print('--------------------------------------------------')

    # ... (保存最佳模型部分保持不变) ...
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'XLNET_best_model.pth')

# 保存最终模型
torch.save(model.state_dict(), 'XLNET_test_model.pth')

Epoch 1/1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train Loss: 2.1500
Validation Report:
              precision    recall  f1-score   support

    অন্যান্য       0.18      0.48      0.27      1271
    অর্থনীতি       0.14      0.11      0.12      1259
         আইন       0.25      0.68      0.37      1305
    খেলাধুলা       0.59      0.03      0.07      1318
     বিজ্ঞান       0.00      0.00      0.00      1277
      বিনোদন       0.33      0.68      0.44      1260
     রাজনীতি       0.00      0.00      0.00      1269
  লাইফস্টাইল       0.19      0.14      0.16      1320
      শিক্ষা       0.00      0.00      0.00      1297

    accuracy                           0.24     11576
   macro avg       0.19      0.24      0.16     11576
weighted avg       0.19      0.24      0.16     11576

--------------------------------------------------


NameError: name 'val_loss' is not defined