代码参考：
https://discourse.qingxzd.com/t/pytorch/58

辅助参考:
https://www.pythonf.cn/read/128035

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from torchtext.legacy import data,datasets

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.vocab import Vectors
from torch.nn import init
from sklearn.model_selection import train_test_split

import jieba    # 用来预处理文本（分词等）


import nltk
from nltk.corpus import stopwords
from tqdm import tqdm


In [None]:
# Seed PyTorch's random number generator and request deterministic cuDNN kernels.
seed = 2019
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [None]:
# Load the tab-separated training and test tables from the data/ directory.
df=pd.read_csv('data/train.tsv', sep='\t')
df_test=pd.read_csv('data/test.tsv', sep='\t')

In [None]:
# Hold out 20% of the labelled data for validation.  random_state reuses the
# notebook-wide seed so the split (and therefore every validation score) is
# the same on each run; without it the split changes every time.
train, val = train_test_split(df, test_size=0.2, random_state=seed)
# Persist both parts as CSV so torchtext can read them back as datasets.
train.to_csv("./data/train.csv", index=False)
val.to_csv("./data/val.csv", index=False)

In [None]:
def tokenizer(text):
    """Split *text* into a list of tokens using jieba's accurate (non-full) mode."""
    segments = jieba.cut(text, cut_all=False)
    return list(segments)

# English stop-word list from NLTK, handed to the text field below.
en_stopwords=stopwords.words('english')
# Label field: non-sequential and used without a vocabulary (use_vocab=False).
LABEL = data.Field(sequential=False, use_vocab=False)
# Text field: tokenised with tokenizer(), lower-cased, stop words passed via stop_words.
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, stop_words=en_stopwords)

In [None]:
# Re-load the train/validation CSVs written earlier as torchtext datasets.
# Only Phrase and Sentiment are bound to fields; PhraseId and SentenceId are
# paired with None (presumably so torchtext skips them - confirm in its docs).
train, val = data.TabularDataset.splits(
    path='./data', train='train.csv', validation='val.csv', format='csv', skip_header=True,
    fields=[('PhraseId', None), ('SentenceId', None), ('Phrase', TEXT), ('Sentiment', LABEL)]
)

# The test TSV is given no Sentiment field, only the Phrase text.
test = data.TabularDataset('./data/test.tsv', format='tsv', skip_header=True, 
                           fields=[('PhraseId', None), ('SentenceId', None), ('Phrase', TEXT)])
# Sanity check: dataset size plus the tokens and label of one example.
print(len(train),train[2].Phrase, train[2].Sentiment)

In [None]:
# Alternative: build the vocabulary without loading pretrained vectors.
# TEXT.build_vocab(train, val)
# LABEL.build_vocab(train, val)

# Build the vocabulary from the training split and attach pretrained GloVe
# vectors (torchtext downloads them when they are not cached locally).
#
# unk_init decides how tokens that occur in the corpus but not in GloVe are
# initialised.  It has to be passed while the vocabulary is built: assigning
# it on TEXT.vocab.vectors afterwards only sets an attribute on a tensor that
# is already filled, so it had no effect.  torchtext initialises each missing
# token as a 1-D vector, which Xavier initialisation cannot handle (it needs
# at least two dimensions), so a normal distribution is used instead.
TEXT.build_vocab(train, vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)  # , max_size=30000)

In [None]:
# Build the batch iterators.
# sort_key tells BucketIterator how to measure the length of an example so
# that examples of similar length can be grouped into the same batch.
train_iter = data.BucketIterator(train, batch_size=128, sort_key=lambda x: len(x.Phrase),
                                 shuffle=True, device=DEVICE)

# The validation iterator uses the same length key as training.  (It used to
# call len() with no argument, which raises TypeError as soon as the key is
# evaluated.)
val_iter = data.BucketIterator(val, batch_size=128, sort_key=lambda x: len(x.Phrase),
                               shuffle=True, device=DEVICE)

# For the test set sort must stay False, otherwise torchtext reorders the
# samples and the predictions no longer line up with the input rows.
test_iter = data.Iterator(dataset=test, batch_size=128, train=False,
                          sort=False, device=DEVICE)

# 查看trainiter一个batch



In [None]:
# Notebook cell output: the token list of the training example at index 2.
train[2].Phrase

In [None]:
# Pull a single batch from the training iterator to inspect its contents.
batch = next(iter(train_iter))
a= batch.Phrase
label = batch.Sentiment
# Shape of the token-index tensor, then the tensor itself.
# (presumably [sent_length, batch size], since the training code permutes it
# to batch-first before calling the model)
print(a.shape)
print(batch.Phrase)

In [None]:
# Number of unique tokens in the text vocabulary.
print("Size of TEXT vocabulary:",len(TEXT.vocab))

# Number of unique tokens in the label vocabulary.
# print("Size of LABEL vocabulary:",len(LABEL.vocab))

# Most frequent words.
# print(TEXT.vocab.freqs.most_common(10))

# Shape of the vocabulary's vector table.
print(TEXT.vocab.vectors.shape)
# Token-to-index dictionary of the labels.
# print(LABEL.vocab.stoi)

In [None]:


class LSTM_base(nn.Module):
    """LSTM classifier: embedding -> (stacked) LSTM -> linear layer on the final state.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of classes, i.e. the size of the returned score vector.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: passed to nn.LSTM as its ``dropout`` argument.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super().__init__()
        self.bidirectional = bidirectional
        # Embedding layer.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer(s), batch-first.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout,
                            batch_first=True)
        # A bidirectional LSTM produces one final state per direction, so the
        # classifier input is twice as wide in that case.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, text):
        """Return unnormalised class scores of shape [batch size, output_dim].

        ``text`` is a LongTensor of token indices, [batch size, sent_length].
        """
        embedded = self.embedding(text)
        # hidden = [n_layers * num_directions, batch size, hidden_dim]
        _, (hidden, _) = self.lstm(embedded)
        if self.bidirectional:
            # Last layer: final forward state and final backward state.
            last_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            # Identical to the output at the last time step.
            last_state = hidden[-1]
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
        # so a softmax here would be applied twice and flatten the gradients.
        # argmax over the logits gives the same class as over probabilities.
        return self.fc(last_state)


In [None]:
class LSTM_MAX(nn.Module):
    """Two-layer LSTM whose outputs are max- and mean-pooled over time.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size.
        output_dim: number of classes, i.e. the size of the returned score vector.
        n_layers: accepted for signature compatibility with LSTM_base; the
            depth is fixed at two layers, matching the two LSTM passes this
            model has always made.
        bidirectional: accepted for signature compatibility; not used.
        dropout: dropout applied between the two LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super().__init__()
        # Embedding layer.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # One stacked two-layer LSTM instead of running the same single-layer
        # LSTM twice: re-feeding the output only worked when hidden_dim equalled
        # embedding_dim, and dropout is a no-op on a one-layer nn.LSTM.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=2,
                            dropout=dropout,
                            batch_first=True)
        # Max-pooled and mean-pooled features are concatenated.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Return unnormalised class scores of shape [batch size, output_dim].

        ``text`` is a LongTensor of token indices, [batch size, sent_length].
        """
        h_embedding = self.embedding(text)
        # h_lstm = [batch size, sent_length, hidden_dim]
        h_lstm, _ = self.lstm(h_embedding)

        # Global average pooling over the time axis.
        avg_pool = torch.mean(h_lstm, 1)
        # Global max pooling over the time axis.
        max_pool, _ = torch.max(h_lstm, 1)
        h_conc = torch.cat((max_pool, avg_pool), 1)
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself.
        return self.fc(h_conc)

# Hyper-parameters.
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 100
num_output_nodes = 5
num_layers = 2
bidirection = False
dropout = 0.4

# Instantiate the model (the commented-out lines select LSTM_MAX instead).
model = LSTM_base(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = bidirection, dropout = dropout)
# model = LSTM_MAX(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
#                    bidirectional = bidirection, dropout = dropout)   

# Print the model architecture.
print(model)
# Number of trainable parameters.
def count_parameters(model):
    """Return the total element count of the parameters of *model* that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
print(f'The model has {count_parameters(model):,} trainable parameters')
# Initialise the embedding layer with the pretrained word vectors.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
print(pretrained_embeddings.shape)  

# Optimizer (Adam with its default settings) and loss function.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
# Evaluation metric.
def binary_accuracy(preds, y):
    """Return the share of entries in *preds* that equal *y*, as a 0-dim tensor.

    Despite the name, nothing here is specific to two classes: the inputs are
    compared element-wise, and the count of matches is divided by the size of
    the first dimension.
    """
    # Rounding step kept from the original for reference:
    # rounded_preds = torch.round(preds)
    hits = preds.eq(y).float()
    n_items = len(hits)
    return hits.sum() / n_items
# Move the model and the loss to DEVICE (CUDA when available).
model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [None]:
def Train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: network called with a batch-first tensor of token indices.
        iterator: yields batches exposing ``Phrase`` and ``Sentiment``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` over the epoch.
    """
    # Running totals for this epoch.
    epoch_loss = 0
    epoch_acc = 0
    # Put the model in training mode.
    model.train()
    for batch in tqdm(iterator):
        # Reset the gradients left over from the previous step.
        optimizer.zero_grad()
        # The batch tensor is permuted so that the batch dimension comes first,
        # which is what the model expects.
        text = batch.Phrase.permute(1, 0)
        # Keep the [batch size, n_classes] shape as it is.  A .squeeze() here
        # collapsed a final batch of size 1 into a 1-D tensor, which broke both
        # the loss call and argmax(dim=1).
        predictions = model(text)
        loss = criterion(predictions, batch.Sentiment)
        # Predicted class = index of the highest score.
        predicted_classes = predictions.argmax(dim=1)
        acc = binary_accuracy(predicted_classes, batch.Sentiment)
        # Back-propagate and update the weights.
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def Evaluate(model, iterator, criterion):
    """Score the model on a labelled dataset without updating it.

    Args:
        model: network called with a batch-first tensor of token indices.
        iterator: yields batches exposing ``Phrase`` and ``Sentiment``.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)``.
    """
    # Running totals for this pass.
    epoch_loss = 0
    epoch_acc = 0
    # Evaluation mode (disables dropout).
    model.eval()
    # No gradients are needed for scoring.
    with torch.no_grad():
        for batch in tqdm(iterator):
            # Permute so that the batch dimension comes first.
            text = batch.Phrase.permute(1, 0)
            # Keep the [batch size, n_classes] shape: a .squeeze() here broke
            # the loss and argmax(dim=1) for a final batch of size 1.
            predictions = model(text)
            loss = criterion(predictions, batch.Sentiment)
            predicted_classes = predictions.argmax(dim=1)
            acc = binary_accuracy(predicted_classes, batch.Sentiment)
            # Track loss and accuracy.
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def Test(model, iterator, criterion):
    """Predict a class index for every example of an unlabelled dataset.

    Args:
        model: network called with a batch-first tensor of token indices.
        iterator: yields batches exposing ``Phrase``.
        criterion: unused; kept so the signature matches Evaluate.

    Returns:
        A flat list of predicted class indices, in iteration order.
    """
    # Evaluation mode (disables dropout).
    model.eval()
    predict_list = []
    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in tqdm(iterator):
            # Permute so that the batch dimension comes first.
            text = batch.Phrase.permute(1, 0)
            # Keep the [batch size, n_classes] shape: a .squeeze() here made
            # argmax(dim=1) fail for a final batch of size 1.
            predictions = model(text)
            predicted_classes = predictions.argmax(dim=1).cpu()
            predict_list.extend(predicted_classes.tolist())

    return predict_list

In [None]:
N_EPOCHS = 50
best_valid_loss = float('inf')

# Per-epoch history of losses and accuracies.
train_loss_list, valid_loss_list = [], []
train_acc_list, valid_acc_list = [], []

for epoch in range(N_EPOCHS):
    print('epoch:', epoch)
    # One pass over the training set, then score the validation set.
    train_loss, train_acc = Train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = Evaluate(model, val_iter, criterion)

    # Save a checkpoint whenever the validation loss reaches a new minimum.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # Record this epoch's numbers.
    train_loss_list.append(train_loss)
    valid_loss_list.append(valid_loss)
    train_acc_list.append(train_acc)
    valid_acc_list.append(valid_acc)

    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:


def drawpic(train_loss_list=None, test_loss_list=None, epoch_number=10, title='1', root_path='./'):
    """Plot two per-epoch curves on one axes, save the figure as a JPEG and show it.

    Args:
        train_loss_list: first series to plot, one value per epoch.
        test_loss_list: second series to plot, one value per epoch.
        epoch_number: number of epochs; the x axis is 0 .. epoch_number - 1.
        title: plot title, also used as the file name (without extension).
        root_path: prefix put in front of the file name, e.g. './pic/'.
    """
    import os

    # None instead of [] as default avoids sharing one list between calls.
    train_loss_list = [] if train_loss_list is None else train_loss_list
    test_loss_list = [] if test_loss_list is None else test_loss_list

    # x axis: the epoch indices.
    x = list(range(epoch_number))

    _, ax = plt.subplots()
    plt.title(title)
    ax.plot(x, train_loss_list, linewidth=2.0)
    ax.plot(x, test_loss_list, linewidth=2.0)

    path = root_path + title + '.jpg'
    # savefig does not create missing directories, so make sure the target
    # directory exists before writing.
    target_dir = os.path.dirname(path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # Print a markdown image link to the saved file.
    print('![]({})'.format(path))
    plt.savefig(path)
    plt.show()
root_path = './pic/'

# Loss curves (training vs. validation).
title = 'lstm-glove-loss-{}'.format(N_EPOCHS)
drawpic(
    train_loss_list=train_loss_list,
    test_loss_list=valid_loss_list,
    epoch_number=N_EPOCHS,
    title=title,
    root_path=root_path,
)

# Accuracy curves (training vs. validation).
title = 'lstm-glove-acc-{}'.format(N_EPOCHS)
drawpic(
    train_loss_list=train_acc_list,
    test_loss_list=valid_acc_list,
    epoch_number=N_EPOCHS,
    title=title,
    root_path=root_path,
)

In [None]:
# Predict a class for every test example (Test accepts criterion but does not use it).
# NOTE(review): this uses the weights of the last epoch, not the best
# checkpoint written to saved_weights.pt - confirm that is intended.
predict=Test(model, test_iter, criterion)
# Notebook cell output: number of predictions.
len(predict)


In [None]:

# Fill the sample submission with the predicted labels and write it out.
# Note: this rebinds df_test, which previously held the raw test table.
df_test=pd.read_csv('./data/sampleSubmission.csv')
df_test['Sentiment']=predict
df_test.to_csv('./submission.csv',index=None)

In [None]:
# Load the checkpoint written during training.
path = './saved_weights.pt'
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on
# a GPU can also be restored on a CPU-only machine.
model.load_state_dict(torch.load(path, map_location=DEVICE))
model.eval()
# Inference

def predict(model, text):
    """Return the predicted class index (an int) for one raw text string.

    Args:
        model: network called with a [1, n_tokens] LongTensor of token indices.
        text: the raw sentence to classify.
    """
    # Use the TEXT field's own preprocessing (tokenise, lower-case, drop stop
    # words) so the input is treated exactly like the training data.
    tokenized = TEXT.preprocess(text)
    # Map the tokens to vocabulary indices.
    indexed = [TEXT.vocab.stoi[tok] for tok in tokenized]
    if not indexed:
        # Nothing left after preprocessing: feed a single unknown token rather
        # than a zero-length sequence, which the LSTM cannot process.
        indexed = [TEXT.vocab.stoi[TEXT.unk_token]]
    # Shape [batch=1, n_tokens].
    tensor = torch.LongTensor(indexed).to(DEVICE).unsqueeze(0)
    with torch.no_grad():
        prediction = model(tensor)
    # The model returns one score per class, so .item() on the raw output
    # fails; return the index of the highest score instead.
    return prediction.argmax(dim=-1).item()

# Run two sample predictions.
print(predict(model, "fuck fuck shit bitch?"))
# An insincere question.
print(predict(model, "Why Indian girls go crazy about marrying Shri. Rahul Gandhi ji?"))

In [None]:
import torch as t
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Stand-alone demo: running an LSTM over a padded batch with packed sequences.
# Toy batch of token indices (0 is the padding index) and the true length of
# every row.
a = t.tensor([[1, 2, 3], [6, 0, 0], [4, 5, 0]])  # (batch_size, max_length)
lengths = t.tensor([3, 1, 2])

# Sort the batch by decreasing length, as pack_padded_sequence expects here.
# idx reorders the batch into sorted order; un_idx is the inverse permutation
# that restores the original order afterwards.  (Indexing with un_idx to sort
# only works when the permutation happens to be its own inverse.)
a_lengths, idx = lengths.sort(0, descending=True)
_, un_idx = t.sort(idx, dim=0)
a = a[idx]

# Layers.
emb = t.nn.Embedding(20, 2, padding_idx=0)
lstm = t.nn.LSTM(input_size=2, hidden_size=4, batch_first=True)
fc = t.nn.Linear(4, 1)
# Activation function.
act = t.nn.Sigmoid()

a_input = emb(a)
a_packed_input = pack_padded_sequence(input=a_input, lengths=a_lengths, batch_first=True)
# Keep the final hidden/cell states under their own names so that they are not
# overwritten by the lengths returned from pad_packed_sequence below.
packed_out, (h_n, c_n) = lstm(a_packed_input)
out, out_lengths = pad_packed_sequence(packed_out, batch_first=True)
# Use un_idx to put the outputs back into the original input order.
out = t.index_select(out, 0, un_idx)
linear = fc(out)
final = act(linear)
print('a_input.shape:', a_input.shape)
print('a_packed_input.data.shape', a_packed_input.data.shape)
print('packed_out.data.shape:', packed_out.data.shape)
print('out.shape:', out.shape)
print(h_n.shape, c_n.shape)
print(linear.shape)

print(final.shape)