In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm


In [None]:
# Path of the CSV data file
file_path = 'deduplicated_mangoNews_Nums3000p_CategoryMerge.csv'

# Read the CSV in chunks so the whole file never has to sit in memory at once
chunk_size = 100000  # rows per chunk
# Only the label column is needed for counting, so skip parsing every other
# column (the article text in particular).
chunks = pd.read_csv(file_path, usecols=['category1'], chunksize=chunk_size)

# Category -> number of rows. Keys are inserted in order of first appearance;
# that order is what later fixes the integer label assigned to each category.
category_counts = {}

# Process the file chunk by chunk
for chunk in tqdm(chunks, desc="Reading CSV", leave=False):
    # Rows without a category are ignored
    for category in chunk['category1'].dropna():
        category_counts[category] = category_counts.get(category, 0) + 1

# Print the per-category counts
print("Category Counts:")
for category, count in category_counts.items():
    print(f"{category}: {count}")


In [None]:

# Map every category value to an integer label (its position in `categories`)
categories = list(category_counts.keys())
category_to_label = {category: label for label, category in enumerate(categories)}
print(category_to_label)

# Initialise the BERT tokenizer and the classification model from the same
# checkpoint; the classifier head gets one output per category.
# NOTE(review): this is the uncased English checkpoint - confirm it matches the
# language of the text being classified.
pretrained_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
num_classes = len(categories)
model = BertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=num_classes)



In [None]:
# Dataset class used for both the training and the test split
class CustomDataset(Dataset):
    """Pairs raw texts with integer labels and tokenizes one sample per access.

    Args:
        texts: Sequence of texts (list, pandas Series, array, ...). Items are
            converted with ``str()`` before tokenization.
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used in ``__getitem__``.
        max_len: Length every sample is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise both inputs as plain lists so that __getitem__ can index
        # by position whatever container was passed in. Plain lists have no
        # `.iloc`, and a pandas Series indexed with `[idx]` looks up by label
        # rather than by position.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx``.

        Returns:
            dict with the raw ``'text'``, 1-D ``'input_ids'`` and
            ``'attention_mask'`` tensors, and ``'label'`` as a long tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Explicit padding + truncation replaces the deprecated
            # `pad_to_max_length=True`, so every sample has the same length
            # and can be stacked into a batch.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch dimension; flatten it away
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [None]:

# Fine-tune the model
def train_model(model, train_loader, optimizer, device):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes a ``.loss`` tensor.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer that updates the model parameters.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    batch_losses = []

    with tqdm(train_loader, desc="Training", leave=False) as progress:
        for batch in progress:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()
            step_loss = model(ids, attention_mask=mask, labels=targets).loss
            step_value = step_loss.item()
            batch_losses.append(step_value)

            step_loss.backward()
            optimizer.step()

            # Show the current batch loss on the progress bar
            progress.set_postfix(loss=step_value)

    return sum(batch_losses) / len(train_loader)


In [None]:

# Evaluate the model
def evaluate_model(model, test_loader, device):
    """Predict a class for every sample in ``test_loader``.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``.logits`` tensor with one row per sample.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``(all_labels, all_preds)``: two lists holding the true labels and the
        arg-max predictions, in loader order.
    """
    model.eval()
    gold = []
    guessed = []

    # No gradients are needed while scoring
    with torch.no_grad():
        with tqdm(test_loader, desc="Evaluating", leave=False) as progress:
            for batch in progress:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                targets = batch['label'].to(device)

                scores = model(ids, attention_mask=mask).logits
                # Highest-scoring class per row
                batch_guesses = scores.argmax(dim=1).cpu().numpy()

                guessed += list(batch_guesses)
                gold += list(targets.cpu().numpy())

    return gold, guessed



In [None]:
# Data preprocessing: chunked read, text extraction and label conversion
all_texts = []
all_labels = []

# Read in chunks, parsing only the two columns that are actually used
chunks = pd.read_csv(file_path, usecols=['body', 'category1'], chunksize=chunk_size)

for chunk in tqdm(chunks, desc="Preprocessing", leave=False):
    # Drop rows without a category. Rows without a body are dropped as well:
    # a missing body would otherwise be stringified to the literal text "nan"
    # when the dataset tokenizes it.
    chunk = chunk.dropna(subset=['category1', 'body'])

    # Column-wise extraction instead of a per-row iterrows() loop
    all_texts.extend(chunk['body'].tolist())
    all_labels.extend(category_to_label[category] for category in chunk['category1'])

# Split into training and test sets (80/20, fixed seed for reproducibility)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)


In [None]:

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Wrap the training and test splits in dataset instances
train_dataset = CustomDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = CustomDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Batch both datasets with DataLoader; only the training data is shuffled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

# Move the model onto the selected device
model.to(device)


In [None]:
# Define the optimizer. No separate loss function is defined: the training
# step passes `labels` to the model and reads the loss from its output.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to keep that class's defaults.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)


In [None]:
# Train the model: a fixed number of full passes over the training loader,
# reporting the mean batch loss after each pass
num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train_model(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        device=device,
    )
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}')

In [None]:
# Evaluate the model on the held-out split
true_labels, predicted_labels = evaluate_model(model, test_loader, device)
# Pass the full label set explicitly: without it the report infers the classes
# from the data, and a class missing from both lists would make the number of
# target names disagree with the number of classes. Names are stringified in
# case the category values are not strings.
print(classification_report(
    true_labels,
    predicted_labels,
    labels=list(range(len(categories))),
    target_names=[str(category) for category in categories],
))
