In [1]:
import numpy as np
import pandas as pd
import random
import torch
import matplotlib.pylab as plt
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW,AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')
#!nvidia-smi
#pip install matplotlib==3.3.3 -i https://pypi.tuna.tsinghua.edu.cn/simple

#超参数
SEED = 123
BATCH_SIZE = 2
learning_rate = 2e-5#0.00002可设置在0.001~0.0001之间 default=2e-5
weight_decay = 1e-2#权重衰减项，防止过拟合的一个参数。在损失函数中，weight decay 是放在正则项前面的一个系数，正则项一般指示模型的复杂度，
#所以weight decay的作用是调节模型复杂度对损失函数的影响，若weight_decay很大，则复杂的模型损失函数的值也就大#1e-2
epsilon = 1e-8#将小浮点数添加到方差中，避免除以0
epochs = 5

data_dir = './data/'
filename = 'sentiment_classification.h5'
new_path = './models/xlnetmid'
model_path = './chinese_xlnet_mid_pytorch'

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)



#读取文件
def print_df_stats(df,label=None):
    if label:
        print(label)
    print("Shape", df.shape)
    print("Columns", df.columns)
    print(df.groupby("Score").count())
    print("")


def readFile(predict=False):
    df = pd.read_hdf(data_dir + filename,index=False)
    df.columns = ["Text","newscode","typecode","classification","Score"]#rename columns
    df1 = df[df['Score']!=1]
    df2 = df[df['Score']==1]
    df3 = df2.sample(frac=0.89673059,replace=False,random_state=666,axis=0)
    df4 = pd.concat([df1,df3])
    df = df4.sample(frac=1,random_state=666,axis=0)#抽样-打乱顺序
    df.reset_index(drop=True,inplace=True)
    if not predict:
        print_df_stats(df,"Training Data")
    return df

# 将每一句转成数字 （大于125做截断，小于125做 Padding，加上首位两个标识，长度总共等于128）
def convert_text_to_token(tokenizer, sentence, limit_size = 125):
    tokens = tokenizer.encode(sentence[:limit_size])       # 直接截断
    if len(tokens) < limit_size + 3:                       # 补齐（pad的索引号就是0）
        tokens.extend([0] * (limit_size + 3 - len(tokens)))
    return tokens

#attention_masks, 在一个文本中，如果是PAD符号则是0，否则就是1
# 建立mask
def attention_masks(input_ids):
    atten_masks = []
    for seq in input_ids:                       # [10000, 128]
        seq_mask = [float(i > 0) for i in seq]  # PAD: 0; 否则: 1
        atten_masks.append(seq_mask)
    return atten_masks


#计算模型运行时间
import time
import datetime
def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds = elapsed_rounded)) # 返回 hh:mm:ss 形式的时间

#训练模型
def train(model, optimizer):
    torch.cuda.empty_cache()#清理内存
    t0 = time.time()
    avg_loss, avg_acc = [],[]
    model.train()
    for step, batch in enumerate(train_dataloader):

        # 每隔40个batch 输出一下所用时间.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = output[0], output[1]      # loss: 损失, logits: predict

        avg_loss.append(loss.item())

        acc = binary_acc1(logits, b_labels)       # (predict, label)
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0) # 大于1的梯度将其设为1.0, 以防梯度爆炸
        optimizer.step()                         # 更新模型参数
        scheduler.step()                         # 更新learning rate

    avg_acc = np.array(avg_acc).mean()
    avg_loss = np.array(avg_loss).mean()
    return avg_loss, avg_acc

#模型准确率
def binary_acc1(preds,labels): # preds.shape = [16, 2] labels.shape = [16, 1]
    # torch.max: [0]为最大值, [1]为最大值索引
    correct = torch.eq(torch.max(preds, dim=1)[1], labels.flatten()).float()
    #对两个张量Tensor进行逐元素的比较，若相同位置的两个元素相同，则返回1；若不同，返回0
    acc = correct.sum().item() / len(correct)
    return acc
#召回率，精确率
def binary_acc(preds,labels,label_task): # preds.shape = [16, 2] labels.shape = [16, 1]
    # torch.max: [0]为最大值, [1]为最大值索引
    correct = torch.eq(torch.max(preds, dim=1)[1], labels.flatten()).float()
    #对两个张量Tensor进行逐元素的比较，若相同位置的两个元素相同，则返回1；若不同，返回0
    #可以简化为整个混淆矩阵，然后提数就ok
    label_task = torch.tensor(label_task)
    b_label_task = label_task.to(device)
    num_predicts =torch.max(preds,dimm=1)[1]
    num_labels = labels.flatten()
    n = len(num_labels)
    positive_count,true_count,tp_count = 0,0,0
    for i in range(n):
        if num_labels[i]==b_label_task:
            true_count+=1
            if correct[i]==1:
                tp_count+=1
        if num_predicts[i]==b_label_task:
            positive_count+=1
    return positive_count,true_count,tp_count
    

from sklearn.metrics import f1_score,recall_score,precision_recall_fscore_support
import sklearn
#评估模型
def evaluate(model):
    torch.cuda.empty_cache()#清理内存
    avg_acc = []
    model.eval()         # 表示进入测试模式
    #统计召回率，精确率
    neg_pcnt,neu_pcnt,pos_pcnt = epsilon,epsilon,epsilon#positive count
    neg_tcnt,neu_tcnt,pos_tcnt = epsilon,epsilon,epsilon#true count
    neg_tpcnt,neu_tpcnt,pos_tpcnt = 0,0,0#tp count
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = batch[0].long().to(device), batch[1].long().to(device), batch[2].long().to(device)
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            acc = binary_acc1(output[0], b_labels)
            avg_acc.append(acc)
            #统计召回率，精确率
            neg_pcnt_step,neg_tcnt_step,neg_tpcnt_step = binary_acc(output[0],b_labels,0)
            neu_pcnt_step,neu_tcnt_step,neu_tpcnt_step = binary_acc(output[0],b_labels,1)
            neg_tpcnt+=neg_tpcnt_step
            neg_pcnt+=neg_pcnt_step
            neg_tcnt+=neg_tcnt_step
            neu_tpcnt+=neu_tpcnt_step
            neu_pcnt+=neu_pcnt_step
            neu_tcnt+=neu_tcnt_step
            
    avg_acc = np.array(avg_acc).mean()#列表变数组并取平均
    #统计召回率recall=真正类/实际正类
    neg_rec = neg_tpcnt/neg_tcnt
    neu_rec = neu_tpcnt/neu_tcnt
    #精确率precision=真正类/预测正类
    neg_pre = neg_tpcnt/neg_pcnt
    neu_pre = neu_tpcnt/neu_pcnt
    return neg_rec,neu_rec,neg_pre,neu_pre,avg_acc

#预测
def predict(sen):

    input_id = convert_text_to_token(tokenizer, sen)
    input_token =  torch.tensor(input_id).long().to(device)            #torch.Size([128])

    atten_mask = [float(i>0) for i in input_id]
    attention_token = torch.tensor(atten_mask).long().to(device)       #torch.Size([128])

    output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_token.view(1, -1))     #torch.Size([128])->torch.Size([1, 128])否则会报错
    print(output[0])#置信度
    return torch.max(output[0], dim=1)[1]

#创建目录并保存新模型
from transformers import WEIGHTS_NAME, CONFIG_NAME
import os
def createpath(path):
    isExists = os.path.exists(path)
    if not isExists:
        os.makerdirs(path)
        print("创建目录：",path)

def savemodel(path):
    output_dir = path
    output_model_file = os.path.join(output_dir,WEIGHTS_NAME)
    output_config_file = os.path.join(output_dir,CONFIG_NAME)
    torch.save(model.state_dict(),output_model_file)
    model.config.to_json_file(output_config_file)
    tokenizer.save_vocabulary(output_dir)
    print('模型保存在：',output_dir)

print('成功加载函数')

  from .autonotebook import tqdm as notebook_tqdm


成功加载函数


In [7]:
# !pip install transformers==4.17.0 sentencepiece==0.1.96
# !pip install tables

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting sentencepiece==0.1.96
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/01/21/b78bb71b7fbab906eb1d10f67d1ba69761892016cf13c4f0c5dde123bb07/sentencepiece-0.1.96-cp37-cp37m-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 1.4 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
# df = pd.read_hdf(data_dir + filename,index=False)
# df.head()

In [2]:
#train and test

#读x和y
sentences = readFile().iloc[:,0].tolist()
targets = readFile().iloc[:,4].tolist()

#读取Tokenizer分词器
model_name = model_path
tokenizer = AutoTokenizer.from_pretrained(model_name)

input_ids = [convert_text_to_token(tokenizer, sen) for sen in sentences]
input_tokens = torch.tensor(input_ids)
print(input_tokens.shape)              # torch.Size([10000, 128])

total_targets = torch.tensor(targets)
atten_masks = attention_masks(input_ids)
attention_tokens = torch.tensor(atten_masks)
print(attention_tokens.shape)                   # torch.Size([10000, 128])

#划分训练集和测试集
from sklearn.model_selection import train_test_split

train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_tokens, total_targets,
                                                                        random_state=666, test_size=0.2)
train_masks, test_masks, _, _ = train_test_split(attention_tokens, input_tokens,
                                                 random_state=666, test_size=0.2)


#创建DataLoader，用来取出一个batch的数据
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE,drop_last=True)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE,drop_last=True)


# #查看一下train_dataloader的内容：
# for i, (train, mask, label) in enumerate(train_dataloader):
#     # torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16, 1])
#     print(train.shape, mask.shape, label.shape)
#     break

# print('len(train_dataloader) = ', len(train_dataloader))    # 500

#创建模型
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2) # num_labels表示2个分类,好评和差评
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

#定义优化器
no_decay = ['bias', 'LayerNorm.weight']
# 判断optimizer_param中所有的参数。如果不在no_decay中，则进行权重衰减;如果在no_decay中，则不进行权重衰减
optimizer_grouped_parameters = [
    {'params' : [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay' : weight_decay
    },
    {'params' : [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay' : 0.0
    }
]

optimizer = AdamW(optimizer_grouped_parameters, lr = learning_rate, eps = epsilon)

#学习率预热，训练时先从小的学习率开始训练
# training steps 的数量: [number of batches] x [number of epochs].
print("training steps 的数量:",len(train_dataloader))
total_steps = len(train_dataloader) * epochs
warmup_proportion = 0.1
#不知道这步合理不合理，还是直接用0.1？需要回顾
warmup_steps = int(total_steps*warmup_proportion)
# 设计 learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)


#运行训练模型和评估模型
print('---开始训练---')
best_acc = 0
for epoch in range(1,epochs+1):
    train_loss, train_acc = train(model, optimizer)
    print('epoch={},训练准确率={}，损失={}'.format(epoch, train_acc, train_loss))

    neg_rec,neu_rec,neg_pre,neu_pre,test_acc = evaluate(model)
    print("epoch={},测试准确率={}".format(epoch, test_acc))
    print("epoch={},负面集测试召回率={}，精确率={}".format(epoch,neg_rec,neg_pre))
    print("epoch={},正面集测试召回率={}，精确率={}".format(epoch,neu_rec,neu_pre))
    
    #保存最好模型
    if test_acc > best_acc:
        best_acc = test_acc
        print("正在保存模型...")
        createpath(new_path)
        savemodel(new_path)

Training Data
Shape (254476, 5)
Columns Index(['Text', 'newscode', 'typecode', 'classification', 'Score'], dtype='object')
         Text  newscode  typecode  classification
Score                                            
0      127238    127238    127238          127238
1      127238    127238    127238          127238

Training Data
Shape (254476, 5)
Columns Index(['Text', 'newscode', 'typecode', 'classification', 'Score'], dtype='object')
         Text  newscode  typecode  classification
Score                                            
0      127238    127238    127238          127238
1      127238    127238    127238          127238

torch.Size([254476, 128])
torch.Size([254476, 128])


Some weights of the model checkpoint at ./chinese_xlnet_mid_pytorch were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at ./chinese_xlnet_mid_pytorch and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to us

training steps 的数量: 101790
---开始训练---


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 3.12 GiB already allocated; 0 bytes free; 3.44 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
#加载经过微调的模型进行推理
#加载模型
model_path = './models/xlnetmid'
num = 50
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 2) # num_labels表示2个分类,好评和差评
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

#读文件
sentences = readFile(predict=True).iloc[:,0].tolist()
targets = readFile(predict=True).iloc[:,4].tolist()
dfp1 = pd.DataFrame(sentences)
dfp2 = pd.DataFrame(targets)
res,count = [],0

for sen in sentences:
    label = predict(sen)
    res.append(int(label[0].item()))
    count+=1
    if count>=num:break

dfp3 = pd.DataFrame(res)
dfp = pd.concat([dfp1,dfp2,dfp3],axis=1)
dfp.columns = ["文本","标签","预测"]#rename columns
dfp = dfp.fillna(999)
dfp["预测"].astype("int64")
dfp.head(num)

In [2]:
pip list

Package                            VersionNote: you may need to restart the kernel to use updated packages.
---------------------------------- --------------------





absl-py                            1.0.0
aiohttp                            3.8.1
aiosignal                          1.2.0
alabaster                          0.7.12
anaconda-client                    1.9.0
anaconda-navigator                 2.1.4
anaconda-project                   0.10.2
anyio                              3.5.0
appdirs                            1.4.4
argcomplete                        1.12.3
argh                               0.26.2
argon2-cffi                        21.3.0
argon2-cffi-bindings               21.2.0
arrow                              1.2.2
asn1crypto                         1.4.0
astor                              0.8.1
astroid                            2.6.6
astropy                            4.3.1
astunparse                         1.6.3
async-generator                    1.10
async-timeout                      4.0.1
asynctest                          0.13.0
atomicwrites                       1.4.0
attrs                              21.4.0
Automat  