In [0]:
!pip3 install torch torchtext

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/7e/60/66415660aa46b23b5e1b72bc762e816736ce8d7260213e22365af51e8f9c/torch-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (591.8MB)
[K    100% |████████████████████████████████| 591.8MB 24kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x614f6000 @  0x7f4156b432a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641
[?25hCollecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/c6/bc/b28b9efb4653c03e597ed207264eea45862b5260f48e9f010b5068d64db1/torchtext-0.3.1-py3-none-any.whl (62kB)
[K    100% |████████████████████████████████| 71kB 24.1MB/s 
Installing collected packages: torch, torchtext
Successfully installed torch-1.0.0 torchtext-0.3.1


In [0]:
import gensim
import torch
from torchtext import data,datasets
from torchtext.vocab import GloVe
from torch import nn, optim
import torch.nn.functional as F
from torch.autograd import Variable

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
# lowercase all the text, tokenize the text, and trim it to a maximum length of 20
TEXT = data.Field(lower=True, batch_first=True, fix_length=40)
LABEL = data.Field(sequential=False)

In [0]:
train, test = datasets.IMDB.splits(TEXT, LABEL)

In [0]:
print(train.fields)

{'text': <torchtext.data.field.Field object at 0x7f326cb9f828>, 'label': <torchtext.data.field.Field object at 0x7f326cb9f860>}


In [0]:
TEXT.build_vocab(train, max_size=10000, min_freq=10,
                vectors = GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

In [0]:
devopt = None if torch.cuda.is_available() else -1
train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=32, device=devopt)

In [0]:
train_iter.repeat, test_iter.repeat = False, False

In [0]:
# In this network, we lose the sequential nature of the text and we just use them as a bag of words

class EmbNet(nn.Module):
  
  def __init__(self, emb_size, hidden_size1, hidden_size2 = 400):
    super().__init__()
    
    # size of the vocabulary and the dimensions that we wish to create for each word
    self.embedding = nn.Embedding(emb_size, hidden_size1)
    # word embeddings to the category
    self.fc = nn.Linear(hidden_size2, 3)
  
  def forward(self, x):
    embeds = self.embedding(x).view(x.size(0), -1)
    out = self.fc(embeds)
    return F.log_softmax(out, dim=-1)

In [0]:
model = EmbNet(len(TEXT.vocab.stoi), 10)
model.to(device)

EmbNet(
  (embedding): Embedding(120159, 10)
  (fc): Linear(in_features=400, out_features=3, bias=True)
)

In [0]:
optimizer = optim.Adam(model.parameters(),lr=0.001)

In [0]:
def fit(epoch, model, data_loader, phase='training', volatile=False):
  
  running_loss = 0.0
  running_correct = int(0)
  
  if phase == 'training':
    model.train()
  else:
    volatile = True
    model.eval()
  
  for batch in data_loader:
    text, label = batch.text.to(device), batch.label.to(device)
    
    output = model(text)
    loss = F.nll_loss(output, label)
    
    running_loss += loss.item()
    preds = output.data.max(dim=1, keepdim=True)[1]
    running_correct += preds.eq(label.data.view_as(preds)).cpu().sum()
    
    if phase == 'training':
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
  
  loss = running_loss/len(data_loader.dataset)
  accuracy = 100. * float(running_correct)/float(len(data_loader.dataset))
  print(f'Epoch {epoch}'+'='*10, end='>')
  print(f'{phase} loss is {loss:{3}.{2}} and {phase} accuracy is {running_correct}/{len(data_loader.dataset)}{accuracy:{10}.{4}}')
  
  return loss, accuracy

In [0]:
train_losses , train_accuracy = [],[]
val_losses , val_accuracy = [],[]

for epoch in range(1,10):
  epoch_loss, epoch_accuracy = fit(epoch,model,train_iter,phase='training')
  val_epoch_loss , val_epoch_accuracy = fit(epoch,model,test_iter,phase='validation')
  train_losses.append(epoch_loss)
  train_accuracy.append(epoch_accuracy)
  val_losses.append(val_epoch_loss)
  val_accuracy.append(val_epoch_accuracy)



## Using pretrained Glove word embeddings

In [0]:
model = EmbNet(len(TEXT.vocab.stoi), 300, 12000)

model.embedding.weight.data = TEXT.vocab.vectors

model.to(device)

EmbNet(
  (embedding): Embedding(10148, 300)
  (fc): Linear(in_features=12000, out_features=3, bias=True)
)

In [0]:
model.embedding.weight.requires_grad = False
optimizer = optim.SGD([ param for param in model.parameters() if param.requires_grad == True], lr=0.001)

In [0]:
train_losses , train_accuracy = [],[]
val_losses , val_accuracy = [],[]

for epoch in range(1,10):
  epoch_loss, epoch_accuracy = fit(epoch,model,train_iter,phase='training')
  val_epoch_loss , val_epoch_accuracy = fit(epoch,model,test_iter,phase='validation')
  train_losses.append(epoch_loss)
  train_accuracy.append(epoch_accuracy)
  val_losses.append(val_epoch_loss)
  val_accuracy.append(val_epoch_accuracy)

