Linking: https://blog.csdn.net/weixin_44376341/article/details/119956299

使用torchtext处理数据集

In [None]:
# 导入常用库
import math
import torch
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F
import torchtext
from torchtext.vocab import Vectors
# 比较新版本的需要使用torchtext.legacy.data，旧版本的torchtext使用torchtex.data
from torchtext.data import TabularDataset
import warnings
warnings.filterwarnings("ignore")


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
#导入数据集
train_data = pd.read_csv('train_data_sentiment.csv')
train_data

Unnamed: 0,utterance,label
0,also I was the point person on my company’s tr...,0
1,You must’ve had your hands full.,0
2,That I did. That I did.,0
3,So let’s talk a little bit about your duties.,0
4,My duties? All right.,1
...,...,...
9984,You or me?,0
9985,"I got it. Uh, Joey, women don't have Adam's ap...",0
9986,"You guys are messing with me, right?",1
9987,Yeah.,0


In [None]:
# 定义Field
# 这里使用默认分词器split(),按照空格进行分词
TEXT = torchtext.data.Field(sequential=True, lower=True, fix_length=30)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)


In [None]:
train_x = TabularDataset(path='train_data_sentiment.csv',
                         format='csv', skip_header=True,
                         fields=[('utterance', TEXT), ('label', LABEL)])


In [None]:
#构建词表
TEXT.build_vocab(train_x)  #构建了10440个词，从0-10439
for w,i in TEXT.vocab.stoi.items():
    print(w,i)

In [None]:
#加载glove词向量，第一次使用会自动下载，也可以自己下载好该词向量，我这里用的是400000词，每个词由100维向量表示
TEXT.vocab.load_vectors('glove.6B.100d',unk_init=torch.Tensor.normal_) #将数据中有但glove词向量中不存在的词进行随机初始化分配100维向量

In [None]:
print(TEXT.vocab.vectors.shape) #torch.Size([10440, 100])

In [None]:
# 创建迭代器
batch_size = 64
train_iter = torchtext.data.Iterator(
    dataset=train_x, batch_size=64, shuffle=True, sort_within_batch=False, repeat=False, device=device)
len(train_iter)  # 157


In [None]:
#查看构建的迭代器
list(train_iter)

In [None]:
#查看批数据的大小
for batch in train_iter:
    print(batch.utterance.shape)

In [None]:
#查看第一条数据
batch.utterance[:,0]#我们取的是第1列，因为第1列表示第一条数据，即第64列表示第64条数据。每条数据由30个词组成，下面非1部分表示第一条数据中的词在词表中的索引，剩下的1表示补长的部分。

In [None]:
#查看第一条数据中的词所对应的索引值
list_a=[]
for i in batch.utterance[:,0]:
    if i.item()!=1:
        list_a.append(i.item())
print(list_a)
for i in list_a:
    print(TEXT.vocab.itos[i],end=' ')

In [None]:
#查看迭代器中的数据及其对应的文本
l =[]
for batch in list(train_iter)[:1]:
    for i in batch.utterance:
        l.append(i[0].item())
    print(l)
    print(' '.join([TEXT.vocab.itos[i] for  i in l]))

搭建LSTM+Self-Attention网络模型

- vocab_size: 构建的词表中的词数
- embedding_size: 每个词的词向量维度
- hidden_dim：LSTM中隐藏层的单元个数
- n_layers：LSTM中的隐藏层数量
- num_class：类别数

In [5]:
vocab_size = 10440
embedding_size = 100
hidden_dim = 128
n_layers = 1
num_class = 3

In [6]:
class LSTM_Attention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, num_class):
        super(LSTM_Attention, self).__init__()

        # 从LSTM得到output之后，将output通过下面的linear层，然后就得到了Q,K,V
        # 这里我是用的attention_size是等于hidden_dim的，这里可以自己换成别的attention_size
        self.W_Q = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.W_K = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.W_V = nn.Linear(hidden_dim, hidden_dim, bias=False)

        # embedding层
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM
        self.rnn = nn.LSTM(input_size=embedding_dim,
                           hidden_size=hidden_dim, num_layers=n_layers)
        # Linear层,因为是三分类，所以后面的维度为3
        self.fc = nn.Linear(hidden_dim, num_class)
        # dropout
        self.dropout = nn.Dropout(0.5)

    # 用来计算attention
    def attention(self, Q, K, V):

        d_k = K.size(-1)
        scores = torch.matmul(Q, K.transpose(1, 2)) / math.sqrt(d_k)
        alpha_n = F.softmax(scores, dim=-1)
        context = torch.matmul(alpha_n, V)

        # 这里都是组合之后的矩阵之间的计算，所以.sum之后，得到的output维度就是[batch_size,hidden_dim]，并且每一行向量就表示一句话，所以总共会有batch_size行
        output = context.sum(1)

        return output, alpha_n

    def forward(self, x):
        #x.shape = [seq_len,batch_size] = [30,64]

        # embedding.shape = [seq_len,batch_size,embedding_dim = 100]
        embedding = self.dropout(self.embedding(x))
        # embedding.shape = [batch_size,seq_len,embedding_dim]
        embedding = embedding.transpose(0, 1)
        # 进行LSTM
        # out.shape = [batch_size,seq_len,hidden_dim=128]
        output, (h_n, c) = self.rnn(embedding)

        Q = self.W_Q(output)  # [batch_size,seq_len,hidden_dim]
        K = self.W_K(output)
        V = self.W_V(output)

        # 将得到的Q，K，V送入attention函数进行运算
        attn_output, alpha_n = self.attention(Q, K, V)
        # attn_output.shape = [batch_size,hidden_dim=128]
        #alpha_n.shape = [batch_size,seq_len,seq_len]

        out = self.fc(attn_output)  # out.shape = [batch_size,num_class]
        return out


In [9]:
# 看一下我们搭建的网络模型
net = LSTM_Attention(vocab_size=vocab_size, embedding_dim=embedding_size,
                     hidden_dim=hidden_dim, n_layers=n_layers, num_class=num_class).to(device)
net


LSTM_Attention(
  (W_Q): Linear(in_features=128, out_features=128, bias=False)
  (W_K): Linear(in_features=128, out_features=128, bias=False)
  (W_V): Linear(in_features=128, out_features=128, bias=False)
  (embedding): Embedding(10440, 100)
  (rnn): LSTM(100, 128)
  (fc): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

模型训练及结果

In [None]:
net.embedding.weight.data.copy_(TEXT.vocab.vectors)  # 给模型的Embedding层传入我们的词嵌入矩阵
optimizer = optim.Adam(net.parameters(), lr=1e-3)  # 定义优化器，lr是学习率可以自己调
criterion = nn.CrossEntropyLoss().to(device)  # 定义损失函数
train_x_len = len(train_x)  # 这一步是我为了计算后面的Acc而获取的数据数量，也就是9989


In [None]:
# 定义训练函数
def train(net, iterator, optimizer, criterion, train_x_len):
    epoch_loss = 0  # 初始化loss值
    epoch_acc = 0  # 初始化acc值
    for batch in iterator:
        optimizer.zero_grad()  # 梯度清零
        preds = net(batch.utterance)  # 前向传播，求出预测值
        loss = criterion(preds, batch.label)  # 计算loss
        epoch_loss += loss.item()  # 累加loss，作为下面求平均loss的分子
        loss.backward()  # 反向传播
        optimizer.step()  # 更新网络中的权重参数
        epoch_acc += ((preds.argmax(axis=1)) ==
                      batch.label).sum().item()  # 累加acc，作为下面求平均acc的分子
    return epoch_loss/(len(iterator)), epoch_acc/train_x_len  # 返回的是loss值和acc值


In [None]:
n_epoch = 100
acc_plot = []  # 用于后面画图
loss_plot = []  # 用于后面画图
for epoch in range(n_epoch):
    train_loss, train_acc = train(
        net, train_iter, optimizer, criterion, train_x_len)
    acc_plot.append(train_acc)
    loss_plot.append(train_loss)
    if (epoch+1) % 10 == 0:
        print('epoch: %d \t loss: %.4f \t train_acc: %.4f' %
              (epoch+1, train_loss, train_acc))


In [None]:
# 使用画图函数matplotlib
plt.figure(figsize=(10, 5), dpi=80)
plt.plot(acc_plot, label='train_acc')
plt.plot(loss_plot, color='coral', label='train_loss')
plt.legend(loc=0)
plt.grid(True, linestyle='--', alpha=1)
plt.xlabel('epoch', fontsize=15)
plt.show()
