代码参考：
https://www.pythonf.cn/read/128035

In [2]:
import torch
import torch.nn as nn
from torchtext.legacy import data,datasets
import pandas as pd

from torchtext.vocab import Vectors
from torch.nn import init
from sklearn.model_selection import train_test_split
import jieba    # 用来预处理文本（分词等）

import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import torch.optim as optim
import torch.nn.functional as F

In [3]:
seed=2019
torch.manual_seed(seed)
torch.backends.cudnn.deterministic=True

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")   # 选择Gpu或Cpu

In [4]:
df=pd.read_csv('data/train.tsv', sep='\t')
df_test=pd.read_csv('data/test.tsv', sep='\t')

In [5]:
train, val = train_test_split(df, test_size=0.2)
train.to_csv("./data/train.csv", index=False)
val.to_csv("./data/val.csv", index=False)

In [6]:
def tokenizer(text):    
    return [wd for wd in jieba.cut(text, cut_all=False)]

en_stopwords=stopwords.words('english')

LABEL = data.Field(sequential=False, use_vocab=False)

TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, stop_words=en_stopwords)

In [7]:
train, val = data.TabularDataset.splits(
    path='./data', train='train.csv', validation='val.csv', format='csv', skip_header=True,
    fields=[('PhraseId', None), ('SentenceId', None), ('Phrase', TEXT), ('Sentiment', LABEL)]
)

test = data.TabularDataset('./data/test.tsv', format='tsv', skip_header=True, 
                           fields=[('PhraseId', None), ('SentenceId', None), ('Phrase', TEXT)])
print(len(train),train[2].Phrase, train[2].Sentiment)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zoro\AppData\Local\Temp\jieba.cache
Loading model cost 0.594 seconds.
Prefix dict has been built successfully.


124848 [' ', 'could', ' ', ' ', 'wrapped', ' ', 'things', ' ', ' ', ' ', '80', ' ', 'minutes'] 1


In [8]:
# # 建立vocab（不需要加载预训练的词向量） 
# TEXT.build_vocab(train, val)
# LABEL.build_vocab(train, val)

# 建立vocab（加载预训练的词向量，如果路径没有该词向量，会自动下载）
TEXT.build_vocab(train, vectors='glove.6B.100d')#, max_size=30000)
# 当 corpus 中有的 token 在 vectors 中不存在时 的初始化方式.
TEXT.vocab.vectors.unk_init = init.xavier_uniform

In [9]:
# 构造迭代器
'''
sort_key指在一个batch内根据文本长度进行排序。
'''
train_iter = data.BucketIterator(train, batch_size=128, sort_key=lambda x: len(x.Phrase), 
                                 shuffle=True,device=DEVICE)

val_iter = data.BucketIterator(val, batch_size=128, sort_key=lambda x: len(x.Phrase), 
                                 shuffle=True,device=DEVICE)

# 在 test_iter , sort一定要设置成 False, 要不然会被 torchtext 搞乱样本顺序
test_iter = data.Iterator(dataset=test, batch_size=128, train=False,
                          sort=False, device=DEVICE)

# 查看trainiter一个batch

# batch = next(iter(train_iter))
# a= batch.Phrase
# label = batch.Sentiment
# print(a.shape)
# print(batch.Phrase)

In [11]:
#文本中的唯一标记
print("Size of TEXT vocabulary:",len(TEXT.vocab))

# #标签中唯一令牌的集合
# print("Size of LABEL vocabulary:",len(LABEL.vocab))

#常用单词
# print(TEXT.vocab.freqs.most_common(10))

# 词库向量的大小
print(TEXT.vocab.vectors.shape)
# #单词词典
# print(LABEL.vocab.stoi)

Size of TEXT vocabulary: 15160
torch.Size([15160, 100])


In [16]:
import numpy as np
"""
由于目的是学习torchtext的使用，所以只定义了一个简单模型
"""
len_vocab = len(TEXT.vocab)

class Enet(nn.Module):
    def __init__(self):
        super(Enet, self).__init__()
        self.embedding = nn.Embedding(len_vocab,100)
        self.lstm = nn.LSTM(100,128,3,batch_first=True)#,bidirectional=True)
        self.linear = nn.Linear(128,5)
        
    def forward(self, x):
        batch_size,seq_num = x.shape
        vec = self.embedding(x)
        out, (hn, cn) = self.lstm(vec)
        out = self.linear(out[:,-1,:])
        out = F.softmax(out,-1)
        return out


model = Enet()
"""
将前面生成的词向量矩阵拷贝到模型的embedding层
这样就自动的可以将输入的word index转为词向量
如果没有使用预训练词向量，name就用随机生成的，会跟着模型进行更新
vocab_size是所用词的总数，embedding_dim是预设的词向量维度。
model.embedding = nn.Embedding(vocab_size, embedding_dim)
"""
model.embedding.weight.data.copy_(TEXT.vocab.vectors)   
model.to(DEVICE)




# 训练
optimizer = optim.Adam(model.parameters())#,lr=0.000001)

n_epoch = 20

best_val_acc = 0

for epoch in range(n_epoch):

    for batch_idx, batch in enumerate(train_iter):
        data = batch.Phrase
        target = batch.Sentiment
        target = torch.sparse.torch.eye(5).index_select(dim=0, index=target.cpu().data)
        target = target.to(DEVICE)
        data = data.permute(1,0)
        optimizer.zero_grad()

        out = model(data)
        loss = -target*torch.log(out)-(1-target)*torch.log(1-out)
        loss = loss.sum(-1).mean()

        loss.backward()
        optimizer.step()

        if (batch_idx+1) %200 == 0:
            _,y_pre = torch.max(out,-1)
            acc = torch.mean((torch.tensor(y_pre == batch.Sentiment,dtype=torch.float)))
            print('epoch: %d \t batch_idx : %d \t loss: %.4f \t train acc: %.4f'
                  %(epoch,batch_idx,loss,acc))
    
    val_accs = []
    for batch_idx, batch in enumerate(val_iter):
        data = batch.Phrase
        target = batch.Sentiment
        target = torch.sparse.torch.eye(5).index_select(dim=0, index=target.cpu().data)
        target = target.to(DEVICE)
        data = data.permute(1,0)
        out = model(data)
        
        _,y_pre = torch.max(out,-1)
        acc = torch.mean((torch.tensor(y_pre == batch.Sentiment,dtype=torch.float))).cpu()
        val_accs.append(acc)
    
    acc = np.array(val_accs).mean()
    if acc > best_val_acc:
        print('val acc : %.4f > %.4f saving model'%(acc,best_val_acc))
        torch.save(model.state_dict(), 'params.pkl')
        best_val_acc = acc
    print('val acc: %.4f'%(acc))

epoch: 0 	 batch_idx : 199 	 loss: 2.1322 	 train acc: 0.4688
epoch: 0 	 batch_idx : 399 	 loss: 1.9495 	 train acc: 0.5156
epoch: 0 	 batch_idx : 599 	 loss: 1.9382 	 train acc: 0.5391
epoch: 0 	 batch_idx : 799 	 loss: 2.0284 	 train acc: 0.5156
val acc : 0.5144 > 0.0000 saving model
val acc: 0.5144
epoch: 1 	 batch_idx : 199 	 loss: 2.0110 	 train acc: 0.4922
epoch: 1 	 batch_idx : 399 	 loss: 1.9866 	 train acc: 0.4922
epoch: 1 	 batch_idx : 599 	 loss: 2.0713 	 train acc: 0.5000
epoch: 1 	 batch_idx : 799 	 loss: 2.0416 	 train acc: 0.4922
val acc : 0.5144 > 0.5144 saving model
val acc: 0.5144
epoch: 2 	 batch_idx : 199 	 loss: 2.0745 	 train acc: 0.4297
epoch: 2 	 batch_idx : 399 	 loss: 1.8750 	 train acc: 0.5312
epoch: 2 	 batch_idx : 599 	 loss: 1.8860 	 train acc: 0.5312
epoch: 2 	 batch_idx : 799 	 loss: 1.9514 	 train acc: 0.5234
val acc: 0.5144
epoch: 3 	 batch_idx : 199 	 loss: 2.0162 	 train acc: 0.5312
epoch: 3 	 batch_idx : 399 	 loss: 2.0408 	 train acc: 0.5312
epoch: