In [1]:
# Train on the undersampled (class-balanced) dataset, epochs=10
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

# Read the CSV file.
# Other dataset variants that have been used with this notebook:
# file_path = 'mangoNews_Example.csv'  # small test dataset (~5 MB)
# file_path = 'mangoNews.csv'          # full dataset (~13 GB); needs chunked reading
# file_path = 'mangoNews_Example_100000.csv'          # 100,000-row dataset (~2.2 GB)
# file_path = 'mangoNews_Example_10000.csv'          # 10,000-row dataset (~190 MB)
# file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge.csv'          # deduplicated, categories with 3000+ documents kept, synonymous categories merged (~9.9 GB); needs chunked reading
# file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge_100000_1.csv'          # same preprocessing (~576 MB)
# file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge_990000.csv'          # same preprocessing (~8.1 GB)
file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'          # same preprocessing (new category merge), then randomly undersampled to balance the classes (12095 rows x 12 classes, ~1.2 GB)


data = pd.read_csv(file_path, low_memory=False, lineterminator="\n")

# Keep only the text column and the target column.
data = data[['body', 'category1']]

# Count the rows of every category (value_counts skips NaN by default).
category_counts = data['category1'].value_counts()

# Lift the row limit only while printing, then restore the previous setting.
with pd.option_context('display.max_rows', None):
    print("Category Counts:")
    print(category_counts)

# Map every category name to an integer id, in order of first appearance.
# NaN is excluded so that a missing category can never become a class of its
# own and inflate num_classes (which must match the classifier head of the
# checkpoint loaded below).
# NOTE: the ids depend on the row order of the CSV; the checkpoint loaded below
# is only meaningful if it was trained with the same mapping.
label_to_id = {label: idx for idx, label in enumerate(data['category1'].dropna().unique())}
print(label_to_id)
data['label'] = data['category1'].map(label_to_id)
num_classes = len(label_to_id)  # single source of truth for the number of classes
print(num_classes)

# Split into training and test sets. The split parameters are intentionally
# left unchanged so the partition stays identical between runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


def _report_category_nans(frame):
    """Print whether ``frame['category1']`` contains NaN and return the NaN count."""
    missing = frame['category1'].isna().sum()
    if missing > 0:
        print(f"There are still {missing} NaN values in the 'category1' column.")
    else:
        print("No NaN values in the 'category1' column.")
    return missing


# Check the 'category1' column of the training split for NaN before cleaning.
print("before")
nan_check = _report_category_nans(train_data)

# Drop the samples whose category is missing.
train_data = train_data.dropna(subset=['category1'])
test_data = test_data.dropna(subset=['category1'])
# TODO: the rows with a missing category still need to be extracted later.

# Check again after cleaning.
print("after")
nan_check = _report_category_nans(train_data)

Bert_path = './uncased_L-12_H-768_A-12'  # bert-base-uncased from github
# Bert_path = './wwm_uncased_L-24_H-1024_A-16'  # bert-large-uncased(whole word masking) from github
# NOTE(review): the category names are Bengali; confirm that an English uncased
# BERT vocabulary is appropriate for the text in 'body'.

# Initialise the BERT tokenizer and model (base or large, wwm or original).
tokenizer = BertTokenizer.from_pretrained(Bert_path)

print(tokenizer.tokenize('I have a good time, thank you.'))  # quick sanity check

# num_labels is the number of distinct classes of the classification task.
model = BertForSequenceClassification.from_pretrained(Bert_path, num_labels=num_classes)

# Resume from the previously saved parameters.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on a
# machine without CUDA; the model is moved to the target device later.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load('bert_model_undersampled_e10.pth', map_location='cpu'))


# Dataset wrapper that tokenizes one text sample at a time.
class CustomDataset(Dataset):
    """Map-style dataset that yields tokenized text together with its label.

    Args:
        texts: Indexable collection of raw texts; every item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels, aligned with ``texts``; every
            item is passed through ``int()``.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used below.
        max_len: Length every encoded sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with the raw text, 1-D ``input_ids`` and
        ``attention_mask`` tensors, and the label as a ``torch.long`` scalar."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Request truncation explicitly; relying on the implicit fallback
            # emits a warning for every process that tokenizes.
            truncation=True,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns tensors with a leading batch axis of size 1;
            # flatten so the DataLoader can stack samples into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }
    
    
# Wrap the training and test splits in CustomDataset instances.
train_dataset, test_dataset = (
    CustomDataset(split['body'].values, split['label'].values, tokenizer)
    for split in (train_data, test_data)
)

# Batch the datasets; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Run on the GPU when one is available, otherwise on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)
model.to(device)


# Optimizer and loss function; both the optimizer type and the learning rate
# are tunable. (torch.optim.AdamW replaces the AdamW formerly imported from
# transformers.)
# optimizer = AdamW(model.parameters(), lr=2e-5)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()  # the loss function is tunable as well

# Training routine for a single epoch.
def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Callable as ``model(input_ids, attention_mask=..., labels=...)``
            whose result exposes a ``loss`` attribute.
        loader: Sized iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device every batch tensor is moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0

    with tqdm(loader, desc="Training", leave=False) as progress:
        for batch in progress:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            # Standard step: clear old gradients, forward, backward, update.
            optimizer.zero_grad()
            loss = model(ids, attention_mask=mask, labels=targets).loss
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for the total and the display.
            batch_loss = loss.item()
            running_loss += batch_loss
            progress.set_postfix(loss=batch_loss)

    return running_loss / len(loader)


# Evaluation routine.
def evaluate(model, loader, device):
    """Collect reference labels and argmax predictions over ``loader``.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes a ``logits`` attribute.
        loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        device: Device every batch tensor is moved to.

    Returns:
        A ``(labels, predictions)`` pair of lists, in iteration order.
    """
    model.eval()
    gold, predicted = [], []

    # Gradients are not needed while scoring.
    with torch.no_grad():
        with tqdm(loader, desc="Evaluating", leave=False) as progress:
            for batch in progress:
                ids = batch['input_ids'].to(device)
                mask = batch['attention_mask'].to(device)
                targets = batch['label'].to(device)

                logits = model(ids, attention_mask=mask).logits

                # The index of the largest logit along dim 1 is the prediction.
                predicted.extend(logits.argmax(dim=1).cpu().numpy())
                gold.extend(targets.cpu().numpy())

    return gold, predicted


# Train the model.
num_epochs = 1
for epoch in range(num_epochs):
    train_loss = train_epoch(model=model, loader=train_loader, optimizer=optimizer, device=device)
    epoch_number = epoch + 1  # human-readable, 1-based epoch counter
    print(f'Epoch {epoch_number}/{num_epochs}, Train Loss: {train_loss}')


# Evaluate the model on the held-out test split.
true_labels, predicted_labels = evaluate(model=model, loader=test_loader, device=device)

# Preview the first 100 reference labels, then the first 100 predictions.
for preview in (true_labels, predicted_labels):
    print(preview[:100])

# zero_division=1 is passed through to classification_report unchanged.
report = classification_report(true_labels, predicted_labels, zero_division=1)
print(report)





  from .autonotebook import tqdm as notebook_tqdm


Category Counts:
category1
অন্যান্য       12095
জাতীয়          12095
আন্তর্জাতিক    12095
খেলাধুলা       12095
রাজনীতি        12095
বিনোদন         12095
অর্থনীতি       12095
আইন            12095
আর্কাইভ        12095
শিক্ষা         12095
বিজ্ঞান        12095
লাইফস্টাইল     12095
Name: count, dtype: int64
{'অন্যান্য': 0, 'জাতীয়': 1, 'আন্তর্জাতিক': 2, 'খেলাধুলা': 3, 'রাজনীতি': 4, 'বিনোদন': 5, 'অর্থনীতি': 6, 'আইন': 7, 'আর্কাইভ': 8, 'শিক্ষা': 9, 'বিজ্ঞান': 10, 'লাইফস্টাইল': 11}
12
before
No NaN values in the 'category1' column.
after
No NaN values in the 'category1' column.
['i', 'have', 'a', 'good', 'time', ',', 'thank', 'you', '.']


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


Training:   0%|          | 0/3629 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
                                                                        

KeyboardInterrupt: 

In [None]:
# Persist the current model parameters (state dict only) to disk.
checkpoint_path = 'bert_model_undersampled_e10.pth'
torch.save(model.state_dict(), checkpoint_path)