In [None]:
pip install torchtext==0.6.0

In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

SEED = 1234

# Seed PyTorch (CPU and CUDA) and ask cuDNN for deterministic kernels so that
# weight initialisation and dropout are repeatable across runs.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT uses the Field defaults (no custom tokenizer is passed); LABEL yields
# float targets, which is what BCEWithLogitsLoss expects later on.
TEXT = data.Field()
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# `random.seed()` returns None, so passing its return value as `random_state`
# hands None to `split`. Seed first, then pass the RNG state explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

# Build the vocabularies from the training split only, so that nothing from the
# validation/test data leaks into the model. Later cells read `TEXT.vocab` and
# `TEXT.vocab.vectors`, so this step must run before them.
# The settings reproduce the recorded 25002 x 100 embedding matrix
# (25000 tokens + <unk> + <pad>, 100-dimensional vectors).
# NOTE(review): confirm "glove.6B.100d" is the vector set originally used.
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.2MB/s]


In [3]:
# Number of examples per mini-batch, shared by all three iterators.
BATCH_SIZE = 64

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One BucketIterator per split, all placing their batches on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device
)

In [4]:
import torch.nn as nn

class LSTM(nn.Module):
    """Embedding -> (stacked, optionally bidirectional) LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_dim: hidden size of the LSTM, per direction.
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``) and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is given for a single layer, so disable it there.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0)
        # The head sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map a batch of token indices to one output row per sequence.

        Args:
            text: integer tensor laid out as ``[seq_len, batch]`` (the LSTM is
                created without ``batch_first``).

        Returns:
            Tensor of shape ``[batch, output_dim]``. The batch dimension is
            kept even when ``batch == 1``.
        """
        embedded = self.dropout(self.embedding(text))

        _, (hidden, _) = self.lstm(embedded)

        if self.lstm.bidirectional:
            # The last two entries of h_n are the forward and backward states
            # of the top layer.
            final_hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            final_hidden = hidden[-1]

        # No squeeze here: squeezing dim 0 would drop the batch dimension for
        # single-example batches.
        return self.fc(self.dropout(final_hidden))

In [24]:
# Model hyper-parameters.
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # must equal the width of the pre-trained vectors copied in later
HIDDEN_DIM = 500              # LSTM hidden size per direction
OUTPUT_DIM = 1                # single logit
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = LSTM(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)

In [25]:
# Word vectors attached to the vocabulary of TEXT.
pretrained_embeddings = TEXT.vocab.vectors

# Show the matrix dimensions, then row 1000 as a spot check.
print(pretrained_embeddings.size())
print(pretrained_embeddings[1000])

torch.Size([25002, 100])
tensor([ 3.7147e-01,  5.0121e-01,  4.8555e-01, -5.5063e-01,  2.5080e-01,
         2.6959e-01,  2.0017e-01,  3.3553e-01,  8.6169e-02, -4.1090e-01,
         6.1886e-01,  5.9603e-01,  5.4824e-01,  3.2363e-01, -2.5534e-01,
        -4.9561e-01, -3.6369e-01,  5.7786e-01, -4.2725e-01,  5.2505e-01,
         5.5768e-01,  1.1907e+00,  4.3717e-02, -2.4367e-01,  7.8373e-02,
         2.5709e-01, -2.8348e-01, -5.0639e-01,  1.8961e-01, -3.1080e-01,
        -4.4251e-02,  2.5159e-02,  7.8117e-02, -1.0888e-01, -2.2321e-01,
         3.5466e-01, -3.0925e-01, -2.3112e-01, -1.4595e-01, -4.1172e-01,
        -2.6876e-01,  5.3870e-02, -8.4707e-02, -1.0966e-01, -1.7103e-02,
        -2.5846e-01,  7.6151e-02,  5.5325e-01,  4.6639e-01, -9.7016e-01,
         1.4094e-01, -4.9948e-01,  2.1720e-01,  8.3326e-01,  2.3583e-01,
        -1.8191e+00, -2.7994e-01,  3.8383e-01,  1.2150e+00,  4.7570e-01,
        -8.8251e-02,  7.9652e-01, -3.1654e-01, -2.3924e-01,  7.1291e-01,
         2.7181e-01,  1.84

In [26]:
# Print the module summary followed by the embedding weights as they are
# before the pre-trained vectors are copied in.
print(model, model.embedding.weight, sep='\n')

LSTM(
  (embedding): Embedding(25002, 100)
  (lstm): LSTM(100, 500, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=1000, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
Parameter containing:
tensor([[-0.0176, -0.8107, -0.1327,  ...,  0.9424, -0.6172,  1.3785],
        [-1.5438, -1.0465,  0.5786,  ..., -0.3126, -1.1279, -0.9778],
        [-0.3459,  0.6506, -0.2974,  ..., -1.2373, -1.2125,  0.1990],
        ...,
        [-0.5406, -1.1519, -0.4007,  ...,  0.2827, -0.2266, -0.5412],
        [-0.1612, -0.0119,  0.6595,  ...,  0.3948,  0.4155, -0.6196],
        [-0.4476,  0.5694,  0.2976,  ...,  1.9488, -1.0970, -0.0042]],
       requires_grad=True)


In [27]:
# `copy_` broadcasts its source, so a mismatched matrix could be copied in
# silently; check the shapes first.
assert model.embedding.weight.shape == pretrained_embeddings.shape, (
    f"embedding table is {tuple(model.embedding.weight.shape)} but the "
    f"pre-trained vectors are {tuple(pretrained_embeddings.shape)}"
)

# Overwrite the randomly initialised table in place. `torch.no_grad()` keeps
# the copy out of autograd without going through the discouraged `.data`.
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)

print(model.embedding.weight)

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       requires_grad=True)


In [28]:
import torch.optim as optim

# Put the model on its target device before the optimizer captures its
# parameters, as the torch.optim documentation recommends. A later
# `model.to(device)` is then a no-op.
model = model.to(device)

# Adam with its default hyper-parameters over every model parameter.
optimizer = optim.Adam(model.parameters())

In [29]:
# Binary cross-entropy that takes raw logits (the sigmoid is applied inside
# the loss), placed on the same device as the batches.
criterion = nn.BCEWithLogitsLoss().to(device)

# Move the model's parameters to that device as well.
model = model.to(device)

In [30]:
def binary_accuracy(preds, y):
    """Fraction of entries whose rounded sigmoid matches the target.

    Args:
        preds: tensor of raw logits.
        y: tensor of targets compared element-wise against the rounded
            sigmoid of ``preds``.

    Returns:
        A scalar tensor: number of matches divided by ``len`` of the
        comparison result.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [31]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    For every batch the gradients are reset, the model is applied to
    ``batch.text``, the loss against ``batch.label`` is back-propagated and
    the optimizer takes one step.

    Returns:
        ``(mean_loss, mean_accuracy)``, each the sum of the per-batch values
        divided by the number of batches (``len(iterator)``).
    """
    model.train()

    batch_losses = []
    batch_accs = []

    for batch in iterator:
        optimizer.zero_grad()

        # The model emits one column per example; drop it to match the labels.
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        acc = binary_accuracy(logits, batch.label)

        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        batch_accs.append(acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [32]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    The model is switched to evaluation mode and all forward passes run with
    gradient tracking disabled.

    Returns:
        ``(mean_loss, mean_accuracy)``, each the sum of the per-batch values
        divided by the number of batches (``len(iterator)``).
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            # The model emits one column per example; drop it to match the labels.
            logits = model(batch.text).squeeze(1)
            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            loss_total += batch_loss.item()
            acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [33]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # One optimisation pass, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model=model, iterator=train_iterator,
                                  optimizer=optimizer, criterion=criterion)
    valid_loss, valid_acc = evaluate(model=model, iterator=valid_iterator,
                                     criterion=criterion)

    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.687 | Train Acc: 55.34% | Val. Loss: 0.637 | Val. Acc: 64.20% |
| Epoch: 02 | Train Loss: 0.656 | Train Acc: 61.24% | Val. Loss: 0.628 | Val. Acc: 66.51% |
| Epoch: 03 | Train Loss: 0.461 | Train Acc: 78.59% | Val. Loss: 0.461 | Val. Acc: 81.50% |
| Epoch: 04 | Train Loss: 0.298 | Train Acc: 88.31% | Val. Loss: 0.476 | Val. Acc: 77.08% |
| Epoch: 05 | Train Loss: 0.200 | Train Acc: 92.56% | Val. Loss: 0.322 | Val. Acc: 87.67% |


In [34]:
# Final check on the held-out test iterator, using the same metrics as validation.
test_loss, test_acc = evaluate(model=model, iterator=test_iterator,
                               criterion=criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.361 | Test Acc: 85.74% |


In [35]:
import spacy

nlp = spacy.load("en_core_web_sm")

def predict_sentiment(sentence):
    """Score a raw sentence with the trained model.

    Args:
        sentence: the text to classify, as a plain string.

    Returns:
        The sigmoid of the model's output as a Python float in ``[0, 1]``.
        In the recorded runs a negative review scores near 0 and a positive
        one near 1.

    Raises:
        ValueError: if ``sentence`` contains no tokens, because the LSTM
            cannot process a zero-length sequence.
    """
    model.eval()

    # Tokenise with TEXT's own preprocessing so inference sees the same tokens
    # the vocabulary was built from. TEXT is created with the Field defaults,
    # not spaCy, so the module-level `nlp` tokenizer would split text
    # differently (e.g. around punctuation) from what was seen in training.
    tokens = TEXT.preprocess(sentence)
    if not tokens:
        raise ValueError("sentence must contain at least one token")

    indexed = [TEXT.vocab.stoi[t] for t in tokens]

    # Shape [seq_len, 1]: a batch holding a single sequence.
    tensor = torch.tensor(indexed, dtype=torch.long, device=device).unsqueeze(1)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()



In [36]:
# Spot check with a clearly negative review.
predict_sentiment(sentence="This film is terrible")

0.005391507875174284

In [38]:
# Spot check with a clearly positive review.
predict_sentiment(sentence="This film is great")

0.9832133054733276

In [39]:
# Optional: uncomment the next line to write the trained weights to 'model.pth'.
# torch.save(model.state_dict(), 'model.pth')