# 导入包

In [1]:

# 导入transformers
import transformers
from transformers import BertModel, BertTokenizer, BertConfig, AdamW, get_linear_schedule_with_warmup

# 导入torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# 常用包
import re
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import os
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
os.environ["TOKENIZERS_PARALLELISM"] = "false"

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 1 定义模型

In [2]:
cache_dir = 'cache'
class PaperClassifier(nn.Module):
    def __init__(self):
        n_classes = 39
        super(PaperClassifier, self).__init__()
        PRE_TRAINED_MODEL_NAME = "bert-base-uncased"
        self.robert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.bilstm = nn.LSTM(input_size=self.robert.config.hidden_size,
                              hidden_size=self.robert.config.hidden_size, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.robert.config.hidden_size * 2, n_classes)

    def forward(self, input_ids, attention_mask):

        last_hidden_out, pooled_output = self.robert(  # 只要了句子级表示？    _:[10, 300, 768]    [16, 768]
            input_ids=input_ids,
            attention_mask=attention_mask  # [16, 300]300是句子长度

        )
        last_hidden_out = self.drop(last_hidden_out)
        output_hidden, _ = self.bilstm(last_hidden_out)  # [10, 300, 768]

        output = self.drop(output_hidden)  # dropout
        output = output.mean(dim=1)

        return self.out(output)


# 2 读取数据

In [3]:
def data_process():
    train = pd.read_csv('data/train_clean_data.csv', sep='\t')
    test = pd.read_csv('data/test_clean_data.csv', sep='\t')
    label_path = "data/label_id2cate.pkl"# 进行label编码映射的字典文件
    with open(label_path, 'rb') as f:
        label_id2cate = pickle.load(f)
    return train, test, label_id2cate


# 3 封装数据集

In [4]:
class PaperDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """
        item 为数据索引，迭代取第item条数据
        """
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(  # 等价于tokenizer.tokenize() + tokenizer.convert_tokens_to_ids()
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'texts': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [5]:
def create_data_loader(df, tokenizer, max_len, batch_size,sampler):
    ds = PaperDataset(  # dataset
        texts=df['text'].values,
        labels=df['label'].values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        sampler = sampler,
        num_workers=4,  # 多线程
        pin_memory=True  # 页锁定内存
    )


def create_test_loader(df, tokenizer, max_len, batch_size):
    ds = PaperDataset(  # dataset
        texts=df['text'].values,
        labels=df['label'].values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(  
        ds,
        batch_size=batch_size,
        num_workers=4,#多线程
        pin_memory=True,  # 页锁定内存
        shuffle=False
    )


# 4 定义一个epoch训练

In [6]:

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    print("start training!")
    model = model.train()
    losses = []
    pred_ls = []
    label_ls = []
    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        label_ls.extend(d["labels"])
        pred_ls.extend(preds.tolist())
    correct_predictions = accuracy_score(label_ls, pred_ls)
    return correct_predictions, np.mean(losses)

# 验证
def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()  # 验证预测模式
    losses = []
    pred_ls = []
    label_ls = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
            pred_ls.extend(preds.tolist())
            label_ls.extend(d["labels"])
           
        correct_predictions = accuracy_score(label_ls, pred_ls)
    return correct_predictions, np.mean(losses)


# 5 预测结果

In [7]:

def model_predictions(model, data_loader, device):
    model = model.eval()
    result = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            y_pred = outputs.data.cpu().numpy()
            result.extend(y_pred)
    
    return result


# 6 模型不同层的差分学习率

In [8]:
# 
def get_parameters(model, model_init_lr, multiplier, classifier_lr):
    parameters = []
    lr = model_init_lr
    for layer in range(12, -1, -1):  # 遍历模型的每一层
        layer_params = {
            'params': [p for n, p in model.named_parameters() if f'encoder.layer.{layer}.' in n],
            'lr': lr
        }
        parameters.append(layer_params)
        lr *= multiplier  # 每一层的学习率*0.95的衰减因子
    classifier_params = {
        'params': [p for n, p in model.named_parameters() if 'layer_norm' in n or 'linear' in n
                   or 'pooling' in n],
        'lr': classifier_lr  # 单独针对全连接层
    }
    parameters.append(classifier_params)
    return parameters


# 7 训练模型

In [9]:
# K折数据划分
from torch.utils import data
from sklearn.utils import resample
def load_data_kfold(dataset,BATCH_SIZE,MAX_LEN, k, n):
    print("第{}折正在划分数据集".format(n+1))

    l = len(dataset)
    print(l)
    shuffle_dataset = True
    random_seed = 42  # fixed random seed
    indices = list(range(l))

    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)  # shuffle
    # Collect indexes of samples for validation set.
    val_indices = indices[int(l / k) * n:int(l / k) * (n + 1)]
    train_indices = list(set(indices).difference(set(val_indices)))
    train_sampler = data.SubsetRandomSampler(train_indices)  # build Sampler
    valid_sampler = data.SubsetRandomSampler(val_indices)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_data_loader = create_data_loader(dataset, tokenizer, MAX_LEN, BATCH_SIZE,train_sampler)
    val_data_loader = create_data_loader(dataset, tokenizer, MAX_LEN, BATCH_SIZE,valid_sampler)

    print("划分完成")
    return train_data_loader, val_data_loader




In [10]:
#训练模型
def train_start(EPOCHS,MAX_LEN,BATCH_SIZE,train, test_data_loader, label_id2cate):
    #模型定义
    model = PaperClassifier()
    model = model.to(device)
    #普通学习率
    k_fold = 5
    predict_all = np.zeros([10000,39])#存储测试集的 预测结果
    for n in range(k_fold):
        train_data_loader, val_data_loader = load_data_kfold(train, BATCH_SIZE,MAX_LEN, k_fold, n)
        #使用差分学习率
        parameters = get_parameters(model, 2e-5, 0.95, 1e-4)
        # 使用AdamW优化器
        optimizer = AdamW(parameters)
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
        loss_fn = nn.CrossEntropyLoss().to(device)
        best_accuracy = 0
        for epoch in range(EPOCHS):
            print(f'Epoch {epoch + 1}/{EPOCHS}')
            print('-' * 10)
            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler
            )

            print(f'Train loss {train_loss} accuracy {train_acc}')
            val_acc, val_loss = eval_model(
                model, val_data_loader, loss_fn, device)
            print(f'Val loss {val_loss} accuracy {val_acc}')

            if val_acc > best_accuracy:
                torch.save(model.state_dict(), 'model/best_model_state_base_aug_15w.bin')
                best_accuracy = val_acc

        #进行预测
        y_pred = model_predictions(model, test_data_loader, device)
        predict_all += np.array(y_pred)
    # 取每折的预测矩阵的平均
    predictions = predict_all/k_fold
    np.save("submit_arr.npy", predictions)
    pred = np.argmax(predictions, axis=1)
    print("计算完成")
    # 生成提交文件
    model_name = "Bert_base_cross"
    # 读取提交格式文件
    sub = pd.read_csv('data/sample_submit.csv')
    sub['categories'] = list(pred)
    sub['categories'] = sub['categories'].map(label_id2cate)
    sub.to_csv('submit/submit_{}.csv'.format(model_name), index=False)


In [12]:
train, test, label_id2cate = data_process()
EPOCHS = 3 
MAX_LEN = 300 # 文本最大长度
BATCH_SIZE = 10
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased' 
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
test_data_loader = create_test_loader(test, tokenizer, MAX_LEN, BATCH_SIZE)


In [13]:

train_start(EPOCHS,MAX_LEN,BATCH_SIZE,train, test_data_loader, label_id2cate)


第1折正在划分数据集
58312
划分完成
Epoch 1/3
----------
start training!


100%|██████████| 4665/4665 [35:41<00:00,  2.18it/s]


Train loss 2.107577014506055 accuracy 0.6956913183279743
Val loss 1.7093988679470726 accuracy 0.7665066026410564
Epoch 2/3
----------
start training!


100%|██████████| 4665/4665 [35:41<00:00,  2.18it/s]


Train loss 1.73272518421676 accuracy 0.8022079314040729
Val loss 1.5623988292619726 accuracy 0.7920596810152633
Epoch 3/3
----------
start training!


100%|██████████| 4665/4665 [35:44<00:00,  2.18it/s]


Train loss 1.5866386070384346 accuracy 0.8429367631296891
Val loss 1.5140424453507353 accuracy 0.8032069970845481
第2折正在划分数据集
58312
划分完成
Epoch 1/3
----------
start training!


100%|██████████| 4665/4665 [35:46<00:00,  2.17it/s]


Train loss 1.570397342426856 accuracy 0.8237084673097534
Val loss 1.3299409259356487 accuracy 0.8376779283141829
Epoch 2/3
----------
start training!


100%|██████████| 4665/4665 [35:47<00:00,  2.17it/s]


Train loss 1.390033440906774 accuracy 0.8621650589496249
Val loss 1.2319598364952598 accuracy 0.8507117132567312
Epoch 3/3
----------
start training!


100%|██████████| 4665/4665 [35:46<00:00,  2.17it/s]


Train loss 1.2723262687715815 accuracy 0.8927974276527331
Val loss 1.2038921132083622 accuracy 0.8537986623220717
第3折正在划分数据集
58312
划分完成
Epoch 1/3
----------
start training!


100%|██████████| 4665/4665 [35:46<00:00,  2.17it/s]


Train loss 1.3012224498157972 accuracy 0.8645016077170418
Val loss 0.9935754507448087 accuracy 0.8957297204596124
Epoch 2/3
----------
start training!


100%|██████████| 4665/4665 [35:46<00:00,  2.17it/s]


Train loss 1.1484215140470482 accuracy 0.8960557341907824
Val loss 0.9299274367865275 accuracy 0.9019893671754416
Epoch 3/3
----------
start training!


100%|██████████| 4665/4665 [35:45<00:00,  2.17it/s]


Train loss 1.0466200405506203 accuracy 0.919935691318328
Val loss 0.8978366374254431 accuracy 0.9067055393586005
第4折正在划分数据集
58312
划分完成
Epoch 1/3
----------
start training!


100%|██████████| 4665/4665 [35:45<00:00,  2.17it/s]


Train loss 1.0780534979487029 accuracy 0.8960128617363344
Val loss 0.7706730795294787 accuracy 0.9277139427199451
Epoch 2/3
----------
start training!


100%|██████████| 4665/4665 [35:39<00:00,  2.18it/s]


Train loss 0.9488447409606312 accuracy 0.9209860664523044
Val loss 0.7167486809928293 accuracy 0.9315726290516206
Epoch 3/3
----------
start training!


100%|██████████| 4665/4665 [35:46<00:00,  2.17it/s]


Train loss 0.8649915472870854 accuracy 0.9382636655948553
Val loss 0.6936552832038768 accuracy 0.9345738295318127
第5折正在划分数据集
58312
划分完成
Epoch 1/3
----------
start training!


100%|██████████| 4665/4665 [35:39<00:00,  2.18it/s]


Train loss 0.8984306016407851 accuracy 0.9174490889603429
Val loss 0.5928635238426407 accuracy 0.948036357400103
Epoch 2/3
----------
start training!


100%|██████████| 4665/4665 [35:30<00:00,  2.19it/s]


Train loss 0.7940708027008645 accuracy 0.9363558413719185
Val loss 0.548992028400851 accuracy 0.9516377979763334
Epoch 3/3
----------
start training!


100%|██████████| 4665/4665 [35:36<00:00,  2.18it/s]


Train loss 0.7230485448691623 accuracy 0.95005359056806
Val loss 0.5266872626548152 accuracy 0.9546389984565254
计算完成
