In [None]:
!pip install pandas
!pip install numpy
!pip install torch
!pip install scikit-learn
!pip install transformers
!pip install tqdm


In [13]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import BertTokenizer,BertModel
from tqdm import tqdm


# 随机数种子以及使用cpu/cuda

In [14]:
# Fix the torch and NumPy random generators so runs are repeatable,
# then record whether a CUDA device can be used.
torch.manual_seed(42)
np.random.seed(42)
USE_CUDA = torch.cuda.is_available()

# 读取数据

In [15]:
# Read the training split into a DataFrame.
data = pd.read_csv(
    './data/train.news.csv',
    encoding='utf-8',
)
# Bare trailing expression: the notebook renders a preview of the first rows.
data.head()

In [None]:
# Pull the title and label columns out of the DataFrame as plain Python lists.
data_title, data_label = (list(data[column]) for column in ("Title", "label"))

# 文字索引化 

In [17]:
# Encode every title as a row of BERT vocabulary ids.
# Titles are truncated to at most 200 tokens and padded to a common length so
# they stack into one tensor; only the `input_ids` entry of the encoding is kept.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
title_id = tokenizer(
    data_title,
    padding=True,
    truncation=True,
    max_length=200,
    return_tensors='pt',
)['input_ids']
# Labels become an int64 tensor, the dtype CrossEntropyLoss expects for class targets.
label_id = torch.Tensor(data_label).long()

# 划分训练集验证集

In [18]:
# Hold out 18.89% of the samples for validation; the fixed random_state makes
# the split repeatable.
train_title, val_title, train_label, val_label = train_test_split(
    title_id,
    label_id,
    test_size=0.1889,
    random_state=42,
)

# 构建数据迭代器

In [19]:
batch_size = 256  # mini-batch size used for training

# Pair each row of token ids with its label.
train_data = TensorDataset(train_title, train_label)
valid_data = TensorDataset(val_title, val_label)

# Training: reshuffle every epoch and drop the ragged final batch.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
# Evaluation: keep a fixed order and never drop samples, so every validation
# example is scored exactly once. batch_size stays at 1 because the validation
# cell consumes one sample per iteration.
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=1, drop_last=False)

In [20]:
# Tell the reader which device the model will be trained on.
print('Training on GPU.' if USE_CUDA else 'No GPU available, training on CPU.')

# 定义模型

In [21]:
class net(nn.Module):
    """BERT encoder followed by an LSTM and a linear classification head.

    Args:
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        output_size: Number of classes (width of the returned logits).
        finetune_bert: If True (default, the original behaviour) the BERT
            weights receive gradients; if False they are frozen and only the
            LSTM and the linear head can be trained.
    """

    def __init__(self, hidden_dim, n_layers, output_size, finetune_bert=True):
        super(net, self).__init__()

        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Pre-trained Chinese BERT used as the contextual embedding layer.
        self.bert = BertModel.from_pretrained("bert-base-chinese")
        for param in self.bert.parameters():
            param.requires_grad = finetune_bert

        # Id of the padding token; used to rebuild the attention mask when the
        # caller passes only token ids.
        pad_id = self.bert.config.pad_token_id
        self.pad_token_id = 0 if pad_id is None else pad_id

        # Recurrent layer over the BERT token vectors. nn.RNN / nn.GRU accept
        # the same constructor arguments, but return the hidden state directly
        # instead of an (h, c) tuple.
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, attention_mask=None):
        """Return class logits of shape (batch, output_size).

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).
            attention_mask: Optional (batch, seq_len) tensor, non-zero for real
                tokens and zero for padding. When omitted it is derived from
                ``x`` by treating ``self.pad_token_id`` as padding.

        Padding is assumed to sit at the end of each row (the tokenizer's
        default), so the number of real tokens is the row sum of the mask.
        """
        if attention_mask is None:
            attention_mask = x != self.pad_token_id
        attention_mask = attention_mask.long()

        # Without the mask BERT would attend to [PAD] positions.
        x = self.bert(x, attention_mask=attention_mask)[0]

        # Pack the sequences so the LSTM stops at the last real token of every
        # row; otherwise the final hidden state depends on how much padding the
        # batch happens to contain. pack_padded_sequence needs CPU lengths >= 1.
        lengths = attention_mask.sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden_last, _) = self.lstm(packed)

        # Final hidden state of the top LSTM layer summarises the sentence.
        out = self.fc(hidden_last[-1])

        return out


# 初始化模型

In [22]:
hidden_dim = 256  # LSTM hidden state size
output_size = 2   # number of classes
n_layers = 2      # number of stacked recurrent layers
lr = 1e-5         # learning rate
epochs = 50       # number of training epochs

model = net(hidden_dim, n_layers, output_size)  # build the BERT + LSTM classifier
# Move the model to the GPU *before* creating the optimizer so the optimizer is
# built from the parameters that will actually be trained.
if USE_CUDA:
    model.cuda()
criterion = nn.CrossEntropyLoss()  # loss over the class logits
optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # updates every trainable parameter

# 训练

In [23]:
# With drop_last=True a training set smaller than one batch yields no batches;
# fail early with a clear message instead of an undefined `loss` later on.
if len(train_loader) == 0:
    raise ValueError("train_loader produced no batches; reduce batch_size or provide more data")

model.train()
for e in range(epochs):
    print(f"start train {e+1}")
    running_loss = 0.0  # sum of the batch losses seen in this epoch
    n_batches = 0
    for inputs, labels in tqdm(train_loader, position=0):
        if USE_CUDA:
            inputs, labels = inputs.cuda(), labels.cuda()
        optimizer.zero_grad()
        output = model(inputs)
        # The logits are already (batch, classes); squeezing them would break a
        # batch that contains a single sample.
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1

    # Report the mean loss over the epoch rather than only the last batch's loss.
    print("Epoch: {}/{}...".format(e+1, epochs),
          "Train Loss: {:.6f}...".format(running_loss / n_batches))
# Save the trained weights (disabled)
# PATH = f"./bert_lstm_{hidden_dim}_{n_layers}_{output_size}_{epochs}"
# torch.save(model.state_dict(), PATH)


# 验证

In [24]:
# Score the validation set: collect predicted classes, the class-0 probability
# and the true labels, then print the classification report and the ROC AUC.
model.eval()
pre_ls = []           # predicted class per sample
predict_prob_ls = []  # predicted probability of class 0 per sample
true_ls = []          # ground-truth class per sample
with torch.no_grad():  # inference only: do not build the autograd graph
    for inputs, labels in valid_loader:
        if USE_CUDA:
            inputs = inputs.cuda()
        probs = F.softmax(model(inputs), dim=1)
        # Handle the whole batch, not just its first element, so the loop is
        # correct for any batch size.
        pre_ls.extend(probs.argmax(dim=1).cpu().tolist())
        # The ROC score must be the probability of the positive class
        # (pos_label=0 below). Indexing by the true label would leak the answer
        # into the score and make the AUC meaningless.
        predict_prob_ls.extend(probs[:, 0].cpu().tolist())
        true_ls.extend(labels.cpu().tolist())

print(metrics.classification_report(true_ls,pre_ls))
fpr,tpr,thresholds = metrics.roc_curve(true_ls,predict_prob_ls,pos_label=0)
print("auc:",metrics.auc(fpr,tpr))

# 测试

In [25]:
# Load the held-out test split and encode it the same way as the training data.
# Note: this cell rebinds `data`, `data_title`, `data_label`, `title_id` and
# `label_id`, so from here on they refer to the test set.
data = pd.read_csv(
    './data/test.news.csv',
    encoding='utf-8',
)
data_title, data_label = (list(data[column]) for column in ("Title", "label"))
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
title_id = tokenizer(
    data_title,
    padding=True,
    truncation=True,
    max_length=200,
    return_tensors='pt',
)['input_ids']
label_id = torch.Tensor(data_label).long()

In [26]:
# Wrap the encoded test set. Evaluation keeps a fixed order and never drops
# samples; batch_size stays at 1 because the test cell consumes one sample per
# iteration.
test_data = TensorDataset(title_id, label_id)
test_loader = DataLoader(test_data, shuffle=False, batch_size=1, drop_last=False)

In [27]:
# Score the test set: collect predicted classes, the class-0 probability and
# the true labels, then print the classification report and the ROC AUC.
model.eval()
pre_ls = []           # predicted class per sample
predict_prob_ls = []  # predicted probability of class 0 per sample
true_ls = []          # ground-truth class per sample
with torch.no_grad():  # inference only: do not build the autograd graph
    for inputs, labels in test_loader:
        if USE_CUDA:
            inputs = inputs.cuda()
        probs = F.softmax(model(inputs), dim=1)
        # Handle the whole batch, not just its first element, so the loop is
        # correct for any batch size.
        pre_ls.extend(probs.argmax(dim=1).cpu().tolist())
        # The ROC score must be the probability of the positive class
        # (pos_label=0 below). Indexing by the true label would leak the answer
        # into the score and make the AUC meaningless.
        predict_prob_ls.extend(probs[:, 0].cpu().tolist())
        true_ls.extend(labels.cpu().tolist())

print(metrics.classification_report(true_ls,pre_ls))
fpr,tpr,thresholds = metrics.roc_curve(true_ls,predict_prob_ls,pos_label=0)
print("auc:",metrics.auc(fpr,tpr))