In [10]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
import logging

# 配置日志记录器
logging.basicConfig(filename='test_training.log', level=logging.INFO, format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')




In [3]:
# 读取CSV文件
file_path = 'mangoNews_Example.csv'  # 测试版数据集（规模小5m）
# file_path = 'mangoNews.csv'          # 完整版数据集（13g） 需要分块读
# file_path = 'mangoNews_Example_100000.csv'          # 100000行数据集（2.2g）
# file_path = 'mangoNews_Example_10000.csv'          # 10000行数据集（190m)
# file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别的数据集（9.9g) 需要分块读
# file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge_100000_1.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别的数据集（576m) 
# file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge_990000.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别的数据集（8.1g) 
# file_path = './deduplicated_mangoNews_Nums3000p_CategoryMerge_new_undersampled.csv'          # 去重后保留类别对应语料数在3000+并合并同义类别(新）并随机欠采样数据平衡的数据集（12095*12条 1.2g) 


data = pd.read_csv(file_path,low_memory=False,lineterminator="\n")

# Select relevant columns
data = data[['body', 'category1']]

# 统计 'category1' 列中每种类别的个数
category_counts = data['category1'].value_counts()

# 设置显示选项，完整输出结果
pd.set_option('display.max_rows', None)
print("Category Counts:")
print(category_counts)
# 恢复默认显示选项
pd.reset_option('display.max_rows')

# 将类别列转换为整数标签  注意是data['category1']
label_to_id = {label: idx for idx, label in enumerate(data['category1'].unique())}
print(label_to_id)
data['label'] = data['category1'].map(label_to_id)
num_classes = len(label_to_id)
print(num_classes)

Category Counts:
category1
কাজী নজরুল ইসলাম                203
সুকুমার রায়                    145
মাইকেল মধুসূদন দত্ত             133
জসীমউদ্দীন                       96
হেলাল হাফিজ                      59
সুনীল গঙ্গোপাধ্যায়               52
আবিদ আনোয়ার                     41
writer                           41
রবীন্দ্রনাথ ঠাকুর                33
সত্যেন্দ্রনাথ দত্ত               21
সুফিয়া কামাল                     20
Onubad - Copy                    20
Front Page-Detective Fiction     18
কামিনী রায়                       17
Front Page-Short Stories         16
amader-kotha                     14
Front Page-Poetry                14
Front Page-Science Fiction       13
Front Page-Tales of Unease       10
ঈশ্বরচন্দ্র গুপ্ত                 8
নারী ও সমাজ                       4
Front Page-Autobiography          3
বিজ্ঞান ও প্রযুক্তি               3
বই-টই                             3
স্মরণে                            3
ব্যক্তিগত গদ্য                    2
শিল্প- সংস্কৃতি                   2
র

In [16]:
# 划分训练集和测试集
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 检验 'category1' 列是否还有 NaN 值
# nan_check = train_data['category1'].isnull().sum()
nan_check = train_data['category1'].isna().sum()

print("before")
if nan_check > 0:
    print(f"There are still {nan_check} NaN values in the 'category1' column.")
else:
    print("No NaN values in the 'category1' column.")
    
# 去除包含缺失值的样本
train_data = train_data.dropna(subset=['category1'])
test_data = test_data.dropna(subset=['category1'])
# todo：注意后面要提取缺失值

print("after")
# 检验 'category1' 列是否还有 NaN 值
# nan_check = train_data['category1'].isnull().sum()
nan_check = train_data['category1'].isna().sum()

if nan_check > 0:
    print(f"There are still {nan_check} NaN values in the 'category1' column.")
else:
    print("No NaN values in the 'category1' column.")

Bert_path = './uncased_L-12_H-768_A-12'  # bert-base-uncased from github
# Bert_path = './wwm_uncased_L-24_H-1024_A-16'  # bert-large-uncased(whole word masking) from github

# 初始化Bert tokenizer和模型
tokenizer = BertTokenizer.from_pretrained(Bert_path) # bert: base or large（wwm&origin） 

print(tokenizer.tokenize('I have a good time, thank you.')) # 测试

num_classes = len(data['category1'].unique())  # num_labels 表示分类任务中唯一类别的数量
model = BertForSequenceClassification.from_pretrained(Bert_path, num_labels=num_classes)

model_name = "bert-base-uncased"
num_classes


before
No NaN values in the 'category1' column.
after
No NaN values in the 'category1' column.
['i', 'have', 'a', 'good', 'time', ',', 'thank', 'you', '.']


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


33

In [12]:
# # 将类别列转换为整数标签  注意是data['category1']
# label_to_id = {label: idx for idx, label in enumerate(data['category1'].unique())}
# print(label_to_id)
# train_data['label'] = train_data['category1'].map(label_to_id)
# test_data['label'] = test_data['category1'].map(label_to_id)
# num_classes = len(label_to_id)
# print(num_classes)
# # 将 'label' 列转换为字符串类型
# train_str = train_data['label'].astype(str)
# test_str = test_data['label'].astype(str)

# # 完整输出 DataFrame
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(train_str)
#     print(test_str)

In [5]:
# 定义数据集类
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        # label = str(self.labels[idx]) # todo: 改str试试

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long) 
        }
    
    
# 创建训练和测试数据集实例
train_dataset = CustomDataset(train_data['body'].values, train_data['label'].values, tokenizer)
test_dataset = CustomDataset(test_data['body'].values, test_data['label'].values, tokenizer)

# 使用DataLoader加载数据
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) 
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

#2.20 修改batchsize
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) 
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 将模型移动到GPU上（如果可用）
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [6]:
# 定义优化器和损失函数
# optimizer = AdamW(model.parameters(), lr=2e-5) # 优化器可调整 学习率可调整
# optimizer = torch.optim.AdamW(model.parameters(),  lr=2e-5) # 修改新用法  优化器可调整 学习率可调整
optimizer = torch.optim.AdamW(model.parameters(),  lr=5e-5) # 修改新用法  优化器可调整 学习率可调整 2.20 5e-5
criterion = torch.nn.CrossEntropyLoss() # 损失函数可调整

# 定义训练函数
def train_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0

    with tqdm(loader, desc="Training", leave=False) as iterator:
        for batch in iterator:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            iterator.set_postfix(loss=loss.item())

    return total_loss / len(loader)


In [7]:
# 定义评估函数
def evaluate(model, loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    
    # 修改返回val_loss
    total_loss = 0.0
    num_samples = 0

    with torch.no_grad(), tqdm(loader, desc="Evaluating", leave=False) as iterator:
        for batch in iterator:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            # 修改返回val_loss
            loss = criterion(logits, labels)
            total_loss += loss.item() * len(labels)
            num_samples += len(labels)

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # 修改返回val_loss
    avg_loss = total_loss / num_samples
    return all_labels, all_preds, avg_loss

In [11]:
# 训练模型
num_epochs = 2
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    t1,t2,val_loss = evaluate(model, test_loader, device)  # 假设evaluate函数用于计算验证集损失

    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')
    # 在每个 epoch 结束时记录训练损失和验证损失
    logging.info(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}, Val Loss: {val_loss}')



                                                                    

Epoch 1/2, Train Loss: 2.658506906949557, Val Loss: 2.634661388397217


                                                                    

Epoch 2/2, Train Loss: 2.505264025468093, Val Loss: 2.4872194004058836




In [14]:
# 评估模型
true_labels, predicted_labels,test_loss = evaluate(model, test_loader, device)
print(true_labels[:100])
print(predicted_labels[:100])
# print(true_labels)
# print(predicted_labels)

# print(classification_report(true_labels, predicted_labels))
print(classification_report(true_labels, predicted_labels,zero_division=1))
logging.info(classification_report(true_labels, predicted_labels,zero_division=1))

                                                         

[28, 17, 16, 28, 26, 29, 28, 28, 17, 17, 17, 10, 28, 32, 30, 31, 16, 31, 17, 15, 27, 17, 17, 28, 16, 15, 17, 17, 31, 31, 17, 17, 28, 27, 17, 16, 29, 28, 31, 4, 15, 17, 15, 32, 17, 15, 30, 28, 24, 17, 28, 31, 19, 28, 0, 1, 27, 1, 15, 30, 17, 1, 27, 16, 28, 16, 28, 17, 11, 27, 17, 28, 28, 16, 17, 16, 28, 1, 28, 15, 29, 9, 14, 9, 15, 27, 17, 15, 17, 15, 21, 22, 28, 15, 15, 24, 16, 15, 8, 17]
[28, 17, 28, 28, 17, 17, 28, 28, 17, 28, 17, 17, 28, 17, 17, 17, 17, 17, 28, 17, 28, 28, 28, 28, 17, 17, 28, 28, 28, 17, 28, 17, 28, 17, 17, 28, 28, 28, 17, 17, 17, 17, 17, 17, 17, 17, 17, 28, 0, 17, 28, 17, 17, 28, 17, 17, 17, 17, 28, 17, 17, 17, 17, 17, 28, 17, 28, 28, 17, 17, 28, 28, 28, 17, 17, 28, 28, 17, 28, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 22, 0, 28, 17, 17, 0, 28, 28, 17, 28]
              precision    recall  f1-score   support

           0       0.00      0.00      1.00         1
           1       1.00      0.00      0.00         6
           2       1.00      0.00      0.00    



In [17]:
model_path = f"./models/{model_name}_model_test_epoch_{num_epochs}.pth"

# 保存模型到文件
torch.save(model.state_dict(), model_path)

# # 保存模型到文件
# torch.save(model.state_dict(), 'bert_model_test_e.pth')