将 data.xlsx 随机划分为训练集和测试集。data 目录中已经提供了 train.tsv 和 test.tsv，因此这一步可以跳过，不必运行。

In [2]:
import pandas as pd
# Shuffle the whole spreadsheet and split it into train/test TSV files.
# `read_excel` has no `encoding` parameter in current pandas (passing one is
# rejected); .xlsx files carry their own encoding, so it is simply dropped.
data = pd.read_excel('data/data.xlsx')
# frac=1 draws every row in random order; reset_index discards the old order.
data = data.sample(frac=1).reset_index(drop=True)
# Number of shuffled rows written to the training file; the rest is the test set.
TRAIN_SIZE = 8000
data.iloc[:TRAIN_SIZE].to_csv('data/train.tsv', sep='\t', index=False, encoding='utf-8')
data.iloc[TRAIN_SIZE:].to_csv('data/test.tsv', sep='\t', index=False, encoding='utf-8')

#### 导入包并定义 Field（include_lengths 参数表示是否同时返回每个 sequence 的长度）

In [1]:
import torch
from torchtext import data

# Fixed seed so that runs are repeatable.
SEED = 1234
# Seed PyTorch's random number generator with the value above.
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels (only relevant when running on a GPU).
torch.backends.cudnn.deterministic = True
def tokenizer(x):
    """Split text into character-level tokens.

    Args:
        x: The string (or any iterable) to tokenize.

    Returns:
        A new list holding each element of ``x`` in order; for a string this
        is one single-character string per character.
    """
    return list(x)
# Text field tokenized with the character-level `tokenizer` defined above.
TEXT = data.Field(tokenize = tokenizer)  ## include_lengths = True is left disabled here
# Label field whose numericalized values use dtype torch.long.
LABEL = data.LabelField(dtype = torch.long)

#### 将tsv格式转换成TabularDataset类型

In [2]:
# Load both TSV files as TabularDataset objects. The header row is skipped and
# the two columns are bound, in order, to the TEXT field (as `sentence`) and the
# LABEL field (as `label`).
train_data  = data.TabularDataset('data/train.tsv',format='tsv',skip_header=True,
        fields=[('sentence', TEXT),('label', LABEL)])
test_data = data.TabularDataset('data/test.tsv', format='tsv',skip_header=True,
        fields=[('sentence', TEXT),('label', LABEL)])

查看数据

In [3]:
# Inspect one training example: the Example object itself, the attribute names
# it carries, its tokenized sentence, and its raw label.
print(train_data[6])
print(train_data[6].__dict__.keys())
print(train_data[6].sentence)
print(train_data[6].label)

<torchtext.data.example.Example object at 0x000001EC5F070BE0>
dict_keys(['sentence', 'label'])
['就', '是', '一', '个', '商', '业', '区', '，', '不', '能', '算', '旅', '游', '景', '点', '吧', '。']
0


#### 构建词典

In [4]:
# Build the token and label vocabularies from the training split only.
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

使用 BucketIterator 构造可迭代的数据：每次迭代返回一个大小为 batch_size 的 batch，遍历完所有 batch 即为一个 epoch。

In [5]:
# Number of examples per batch.
BATCH_SIZE = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One iterator per split. Examples are compared by sentence length (sort_key)
# and each batch is sorted by that key (sort_within_batch); batches are created
# on DEVICE.
train_iterator,test_iterator = data.BucketIterator.splits(
    (train_data,test_data), batch_size = BATCH_SIZE,device = DEVICE,sort_key=lambda x: len(x.sentence),sort_within_batch = True)

定义模型参数和 LSTM 模型：取 LSTM 最后一层前向的最终 hidden state 和后向的最终 hidden state，拼接后送入全连接层。

In [6]:
import torch.nn as nn
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)      # vocabulary size (number of embedding rows)
EMBEDDING_DIM = 200              # size of each token embedding
HIDDEN_DIM = 256                 # LSTM hidden size
OUTPUT_DIM = 2                   # number of output classes
N_LAYERS = 2                     # number of stacked LSTM layers
BIDIRECTIONAL = True             # run the LSTM in both directions
DROPOUT = 0.5                    # dropout probability
# Vocabulary index of the padding token.
# NOTE(review): this value is not passed to the model constructor below — confirm
# that it matches the padding index the model uses.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

class RNN(nn.Module):
    """Multi-layer, optionally bidirectional LSTM sentence classifier.

    Token ids are embedded, passed through an LSTM, and the final hidden
    state of the top LSTM layer is projected to class logits.

    Args:
        input_dim: Vocabulary size (number of embedding rows).
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output logits.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        dropout: Dropout probability applied to the embeddings, between
            LSTM layers, and to the final hidden state.
        pad_idx: Vocabulary index of the padding token, used as the
            embedding's ``padding_idx``. Defaults to 1, the value that was
            previously hard-coded.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx=1):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout only exists with more than one layer;
            # passing a non-zero value for a single layer just emits a warning.
            dropout=dropout if n_layers > 1 else 0.0,
        )
        # The classifier sees one final hidden vector per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences.

        Args:
            text: Integer tensor of shape [sent_len, batch_size].

        Returns:
            Tensor of shape [batch_size, output_dim].
        """
        embedded = self.dropout(self.embedding(text))  # [sent_len, batch_size, emb_dim]
        # The padded batch is fed to the LSTM directly (no pack_padded_sequence),
        # because sequence lengths are not passed to this method.
        _, (hidden, _) = self.rnn(embedded)
        # hidden: [n_layers * num_directions, batch_size, hidden_dim], ordered
        # layer by layer, forward direction before backward direction.
        if self.rnn.bidirectional:
            # hidden[-2] is the top layer's final forward state and hidden[-1]
            # is the top layer's final backward state.
            final_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            # Only one direction: the top layer's final state is hidden[-1].
            final_hidden = hidden[-1]
        return self.fc(self.dropout(final_hidden))

In [41]:
class RNN(nn.Module):
    """Single-layer, unidirectional LSTM baseline classifier.

    NOTE(review): this class reuses the name ``RNN`` of the bidirectional
    model defined earlier in this file; executing this definition afterwards
    rebinds ``RNN`` to this simpler model.

    Args:
        input_dim: Vocabulary size (number of embedding rows).
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size.
        output_dim: Number of output logits.
        dropout: Dropout probability applied to the embeddings. Defaults to
            0.0 (no dropout).
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout=0.0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        # A one-layer, one-direction LSTM yields a single hidden_dim-sized state.
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences.

        Args:
            text: Integer tensor of shape [sent_len, batch_size].

        Returns:
            Tensor of shape [batch_size, output_dim].
        """
        embedded = self.dropout(self.embedding(text))  # [sent_len, batch_size, emb_dim]
        # nn.LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        _, (hidden, _) = self.rnn(embedded)  # hidden: [1, batch_size, hidden_dim]
        return self.fc(hidden.squeeze(0))

定义优化器和损失函数以及评估指标函数

In [7]:
import torch.optim as optim
# Instantiate the classifier from the hyperparameters defined above.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,N_LAYERS, BIDIRECTIONAL,DROPOUT)
# Adam optimizer over all model parameters with learning rate 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Cross-entropy loss over the class logits.
criterion = nn.CrossEntropyLoss()
# Move the model and the loss module to the selected device.
model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

### Classification accuracy
def accuracy(preds, y):
    """Return the fraction of predictions whose top-scoring class equals the label.

    Args:
        preds: Tensor of class scores; the predicted class is the index of
            the largest score along the last dimension.
        y: Tensor of target class indices.

    Returns:
        A scalar tensor: number of matches divided by the length of the
        first dimension of the match tensor.
    """
    predicted_classes = preds.argmax(dim=-1)
    hits = predicted_classes.eq(y).float()
    return hits.sum() / len(hits)


定义训练过程和评估过程

In [8]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss and accuracy.

    Args:
        model: Module called on ``batch.sentence`` to produce predictions.
        iterator: Sized iterable of batches exposing ``sentence`` and ``label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable taking ``(predictions, labels)``.

    Returns:
        Tuple ``(loss, acc)``: the summed per-batch loss and accuracy, each
        divided by ``len(iterator)``.
    """
    model.train()  # switch the model to training mode
    running_loss = 0
    running_acc = 0

    for batch in iterator:
        labels = batch.label
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        logits = model(batch.sentence).squeeze(1)
        batch_loss = criterion(logits, labels)
        batch_acc = accuracy(logits, labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Evaluate the model on every batch and return the mean loss and accuracy.

    Args:
        model: Module called on ``batch.sentence`` to produce predictions.
        iterator: Sized iterable of batches exposing ``sentence`` and ``label``.
        criterion: Loss callable taking ``(predictions, labels)``.

    Returns:
        Tuple ``(loss, acc)``: the summed per-batch loss and accuracy, each
        divided by ``len(iterator)``.
    """
    model.eval()  # switch the model to evaluation mode
    running_loss = 0
    running_acc = 0

    # No parameter updates happen here, so gradient tracking is turned off.
    with torch.no_grad():
        for batch in iterator:
            labels = batch.label
            logits = model(batch.sentence).squeeze(1)
            running_loss += criterion(logits, labels).item()
            running_acc += accuracy(logits, labels).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

开始训练

In [9]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

# NOTE(review): initialised but never updated or read in the loop below —
# presumably intended for keeping the best checkpoint; confirm or remove.
best_valid_loss = float('inf')

# Train for N_EPOCHS epochs, reporting loss and accuracy after each one.
# NOTE(review): the "Val." numbers are computed on test_iterator, i.e. the test
# split doubles as the validation set here.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01
	Train Loss: 0.421 | Train Acc: 80.14%
	 Val. Loss: 0.471 |  Val. Acc: 83.44%
Epoch: 02
	Train Loss: 0.300 | Train Acc: 87.31%
	 Val. Loss: 0.310 |  Val. Acc: 88.28%
Epoch: 03
	Train Loss: 0.259 | Train Acc: 89.64%
	 Val. Loss: 0.218 |  Val. Acc: 91.89%
Epoch: 04
	Train Loss: 0.222 | Train Acc: 90.74%
	 Val. Loss: 0.231 |  Val. Acc: 90.50%
Epoch: 05
	Train Loss: 0.199 | Train Acc: 91.92%
	 Val. Loss: 0.242 |  Val. Acc: 88.54%


可以看出，双向 LSTM 的效果和 fastText 的效果差不多。