In [1]:
import torch
import torchtext
import torch.nn as nn
from torchtext.legacy import data
from torchtext.legacy import datasets
import torch.optim as optim
import warnings
warnings.filterwarnings('ignore')

# Text field; include_lengths=True makes every batch's text a (token_ids, lengths) pair.
TEXT = data.Field(include_lengths=True)

# Optional: tokenize with spaCy's English model instead of the default tokenizer.
# Requires:
#   pip install spacy
#   python -m spacy download en_core_web_sm
# TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', include_lengths=True)

LABEL = data.LabelField(dtype=torch.long)


def _is_not_neutral(example):
    """Return True for examples whose label is anything other than 'neutral'."""
    return example.label != 'neutral'


# SST with sub-tree phrases in the training split; 'neutral' examples are filtered out.
train_data, valid_data, test_data = datasets.SST.splits(
    TEXT,
    LABEL,
    train_subtrees=True,
    filter_pred=_is_not_neutral,
)

# Vocabularies are built from the training split only.
TEXT.build_vocab(train_data)
# Optional: initialise the vocabulary with pre-trained embeddings.
# TEXT.build_vocab(train_data,
#                  vectors="glove.6B.100d",
#                  unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

batch_size = 64
all_splits = (train_data, valid_data, test_data)
# Bucket examples by token count when forming batches.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    all_splits,
    sort_key=lambda example: len(example.text),
    batch_size=batch_size,
    device=device,
)
print("here")

here


## Data preprocessing

In [21]:
# Number of entries in the text vocabulary (index -> token table).
vocab_itos = TEXT.vocab.itos
print(len(vocab_itos))

18003


In [13]:
# Peek at the raw (not yet numericalised) examples in the first training batch.
#
# `train_iterator.batches` is only populated once an epoch has been set up, so
# initialise the epoch explicitly; otherwise this cell only works if some other
# cell has already iterated over `train_iterator` (breaks Restart & Run All).
train_iterator.init_epoch()
for batch in train_iterator.batches:
    print(len(batch))
    print('LABEL\tLENGTH\tTEXT')
    for example in batch:
        # Columns are tab-separated: label, token count, token list.
        print('%s\t%d\t%s' % (example.label, len(example.text), example.text))
    break  # only the first batch is shown

64
LABEL	LENGTH	TEXT
positive	3	['extremely', 'talented', 'musicians']  
positive	5	['will', 'definitely', 'win', 'some', 'hearts']  
positive	1	['Divine']  
negative	10	["'re", 'not', 'big', 'fans', 'of', 'teen', 'pop', 'kitten', 'Britney', 'Spears']  
positive	10	['delighted', 'simply', 'to', 'spend', 'more', 'time', 'with', 'familiar', 'cartoon', 'characters']  
positive	10	['Hollywood', 'has', 'crafted', 'a', 'solid', 'formula', 'for', 'successful', 'animated', 'movies']  
negative	1	['destruction']  
negative	12	['it', 'was', 'co-written', 'by', 'Mattel', 'executives', 'and', 'lobbyists', 'for', 'the', 'tinsel', 'industry']  
negative	1	['-LRB-']  
negative	3	['a', 'bad', 'plot']  
negative	18	['Mark', 'Wahlberg', 'and', 'Thandie', 'Newton', 'are', 'not', 'Hepburn', 'and', 'Grant', ',', 'two', 'cinematic', 'icons', 'with', 'chemistry', 'galore', '.']  
positive	1	['succeeds']  
negative	1	['cannibal']  
negative	12	['have', 'benefited', 'from', 'a', 'little', 'more', 'dramatic', '

In [10]:
# Dump the first numericalised training batch: the transposed ids, how many rows
# the transposed tensor has, the ids as yielded, the per-example lengths and the labels.
for step, ((encode, length), labels) in enumerate(train_iterator):
    tencode = encode.transpose(0, 1)
    for shown in (tencode, len(tencode), encode, length, labels):
        print(shown)
    print("-----------------------------------------------------")
    break  # one batch is enough for inspection

tensor([[  11,    3,  327,  ...,    1,    1,    1],
        [ 238,    1,    1,  ...,    1,    1,    1],
        [   9,   28,    4,  ...,    1,    1,    1],
        ...,
        [  56,    1,    1,  ...,    1,    1,    1],
        [  48,    1,    1,  ...,    1,    1,    1],
        [  14, 5033,   14,  ...,    1,    1,    1]])
64
tensor([[  11,  238,    9,  ...,   56,   48,   14],
        [   3,    1,   28,  ...,    1,    1, 5033],
        [ 327,    1,    4,  ...,    1,    1,   14],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([23,  1,  6,  1, 17,  1,  9,  5,  4,  5,  3,  4,  2,  2,  1,  1,  4,  9,
         7,  7, 23,  1,  2,  5,  1,  1,  1, 20,  1, 17,  9, 26,  1,  3,  9,  4,
         3,  5,  1,  9,  3,  5,  1, 40, 31, 25,  9,  5,  4,  1,  3,  1, 16,  1,
        16,  4, 23,  3,  7,  3, 11,  1,  1,  7])
tensor([0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 

## Model

In [40]:
# Hyperparameters read by the BiLSTM model and the training loop.
vocab_size = len(TEXT.vocab.itos)   # size of the embedding table
embedding_size = hidden_size = 128  # embedding width and per-direction LSTM width
num_classes = 1                     # width of the final linear layer's output
num_epoch = 20                      # number of passes over the training iterator

class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier over token-id sequences.

    Args:
        vocab_sz: number of rows in the embedding table. Defaults to the
            notebook-level ``vocab_size``.
        embed_dim: embedding width. Defaults to the notebook-level ``embedding_size``.
        hidden_dim: hidden width per LSTM direction. Defaults to the
            notebook-level ``hidden_size``.
        n_classes: output width of the final linear layer. Defaults to the
            notebook-level ``num_classes``.
    """

    def __init__(self, vocab_sz=None, embed_dim=None, hidden_dim=None, n_classes=None):
        super(BiLSTM, self).__init__()
        # Fall back to the notebook-level hyperparameters so `BiLSTM()` keeps working.
        vocab_sz = vocab_size if vocab_sz is None else vocab_sz
        embed_dim = embedding_size if embed_dim is None else embed_dim
        hidden_dim = hidden_size if hidden_dim is None else hidden_dim
        n_classes = num_classes if n_classes is None else n_classes

        self.word_vec = nn.Embedding(vocab_sz, embed_dim)
        # Bidirectional LSTM, one layer, sequence-first input layout.
        self.bilstm = nn.LSTM(embed_dim, hidden_dim, 1, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, n_classes)

    def forward(self, input, lengths=None):
        """Score a batch of token-id sequences.

        Args:
            input: integer tensor laid out as (batch, seq_len).
            lengths: optional true (unpadded) length of each sequence. When
                given, the sequences are packed so the final hidden states are
                taken at each sequence's real end instead of after its padding.

        Returns:
            Tensor of shape (batch,) when the output width is 1, otherwise
            (batch, n_classes).
        """
        # Swap the first two axes: (batch, seq, emb) -> (seq, batch, emb).
        embedded = self.word_vec(input).permute(1, 0, 2)
        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU.
            cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, cpu_lengths, enforce_sorted=False)
        _, (h_n, _) = self.bilstm(embedded)
        # Concatenate the final hidden states of the forward and backward directions.
        encoding = torch.cat([h_n[0], h_n[1]], dim=1)
        # Squeeze only the class axis so a batch of one keeps its batch dimension.
        return self.fc(encoding).squeeze(-1)
 
# Build the network and move it to `device`, the same device the iterators place
# their batches on; otherwise the forward pass fails with a device mismatch on CUDA.
model = BiLSTM().to(device)
print(model)

BiLSTM(
  (word_vec): Embedding(18003, 128)
  (bilstm): LSTM(128, 128, bidirectional=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


In [41]:
class LogisticLoss(nn.Module):
    """Mean logistic loss, ``mean(log(1 + exp(-target * inputs)))``.

    ``target`` is multiplied element-wise with ``inputs`` (the raw scores), so it
    is meant to hold signed labels such as -1/+1.
    """

    def __init__(self):
        super(LogisticLoss, self).__init__()

    def forward(self, inputs, target):
        """Return the scalar mean loss for raw scores ``inputs`` and signed ``target``."""
        # log(1 / sigmoid(z)) == softplus(-z). The direct form evaluates 1/0 -> inf
        # once sigmoid underflows for strongly negative margins; softplus stays finite.
        margin = target * inputs
        return nn.functional.softplus(-margin).mean()

In [42]:
# Loss function and optimiser (SGD with momentum) over all model parameters.
criterion = LogisticLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [43]:
# Batches per epoch for the training and test iterators, one count per line.
print(len(train_iterator), len(test_iterator), sep='\n')

1544
29


In [44]:
import os
import sys
# Log file: <sys.path[0]>/log/LSTM_loss.txt.
# os.path.join keeps the path relative when sys.path[0] is '' (plain string
# concatenation would produce a root-level "/log/..." path in that case), and the
# directory is created up front so the later open(fname, 'w') cannot fail on it.
log_dir = os.path.join(sys.path[0], "log")
os.makedirs(log_dir, exist_ok=True)
fname = os.path.join(log_dir, "LSTM_loss.txt")
print(fname)

/home/jovyan/LSTM/log/LSTM_loss.txt


In [None]:
from torch.autograd import Variable
# Train for `num_epoch` epochs. After every epoch, measure accuracy on the test
# iterator and write a one-line summary to the log file at `fname` (also printed).
with open(fname, 'w') as f:
    for epoch in range(num_epoch):
        # ---- training pass ----
        model.train()
        total_loss = 0.0
        for i, ((encode, length), labels) in enumerate(train_iterator):
            # Rescale the integer class indices {0, 1} to signed targets {-1, +1},
            # which the criterion multiplies with the raw scores.
            labels = 2 * (labels.float() - 0.5)
            # The iterator yields ids with the batch on the second axis; the model
            # takes the batch on the first axis.
            pred = model(encode.transpose(0, 1))
            loss = criterion(pred, labels)
            total_loss += loss.item()
            if (i + 1) % 100 == 0:
                print('Epoch: [% d/% d], Step: [% d/% d], Loss: %.4f'
                    % (epoch + 1, num_epoch, i + 1,
                        len(train_iterator), loss.item()))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Loss of the last mini-batch of this epoch.
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
        # Mean mini-batch loss over the epoch.
        epoch_loss = total_loss / len(train_iterator)

        # ---- evaluation pass ----
        model.eval()
        correct = 0
        total = 0
        # No gradients are needed for evaluation; skip building autograd graphs.
        with torch.no_grad():
            for (encode, length), labels in test_iterator:
                labels = 2 * (labels.float() - 0.5)
                outputs = model(encode.transpose(0, 1))
                # sigmoid(x) > 0.5 holds exactly when x > 0, so threshold the raw scores.
                predicted_positive = outputs > 0
                total += labels.size(0)
                correct += (predicted_positive == (labels > 0)).sum().item()

        # Build the summary once; it goes to both the log file and stdout.
        summary = 'Epoch: [% d/% d]: Loss: %.4f , Accuracy of the currenent model: % .4f %%\n' % (
            epoch + 1, num_epoch,
            epoch_loss,
            100.0 * correct / total)
        f.write(summary)
        f.flush()  # keep the log usable if the run is interrupted
        print(summary)
        print("**********************************")

In [4]:
# Inspect one training batch: transposed ids, row count of the transposed tensor,
# ids as yielded by the iterator, per-example lengths, and labels.
for batch_no, ((encode, length), labels) in enumerate(train_iterator):
    tencode = torch.transpose(encode, 0, 1)
    print(tencode, len(tencode), encode, length, labels, sep="\n")
    print("-----------------------------------------------------")
    break  # stop after the first batch

tensor([[1852,    1,    1,  ...,    1,    1,    1],
        [   2,   26, 2961,  ...,    1,    1,    1],
        [1775, 6106,    1,  ...,    1,    1,    1],
        ...,
        [1639,    1,    1,  ...,    1,    1,    1],
        [   6, 3690,  322,  ...,    1,    1,    1],
        [  30,  655,    1,  ...,    1,    1,    1]])
64
tensor([[1852,    2, 1775,  ..., 1639,    6,   30],
        [   1,   26, 6106,  ...,    1, 3690,  655],
        [   1, 2961,    1,  ...,    1,  322,    1],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([ 1, 14,  2,  9, 14,  6,  1,  1, 31,  9,  7, 21,  1,  3, 12,  3,  9,  1,
         6, 35,  1,  2,  4, 22,  6, 26,  2,  1,  1,  5, 18,  1,  2, 15,  3,  4,
         1,  8,  9,  1,  8,  1, 29,  8,  8,  2,  8,  4,  5,  5,  7, 23,  1,  9,
         2,  3,  1, 17,  4,  3, 26,  1, 19,  2])
tensor([1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 