In [3]:
import torch
from torchtext import data,datasets
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import sys

### torchtext.data

    - data.Dataset
    - data.Iterator.splits
    
There are two crucial high-level concepts in torchtext: `Dataset` and `Iterator`.

A `Dataset` object holds fields (explained below), examples, and several other settings.  
An `Iterator` object iterates over the whole dataset in batches. It manages batching efficiently through options such as `batch_size`, `shuffle`, and `sort_key`.  
A `Dataset` can be constructed directly from examples and fields; when the data is stored as TSV, CSV, or JSON, the `TabularDataset` class can build it from the file.

### Field object

    - data.Field
    
A `Field` object holds the vocabulary and word embeddings of a given source. You can easily manage, and compute statistics on, vocabularies and labels with a `Field` object. You can also define your own preprocessing pipeline.

In [None]:
# Example of a custom preprocessing hook: an identity function returns its input unchanged.
# NOTE(review): in this notebook `text_field` is only created in a later cell, so this cell
# raises NameError on a fresh kernel - run it after the Field has been constructed.
text_field.preprocessing = lambda x:x ## Doing nothing is default preprocessing.

In [4]:
### Two fields describe the columns of the dataset: one for the text, one for its label.
###   lower=True       : lowercase the text.
###   tokenize='spacy' : use the spaCy English tokenizer; when omitted, str.split() is used.
###   tensor_type      : type of the batch tensors produced for this field.
###   sequential=False : the label is a single value, not a sequence of tokens.
text_field = data.Field(
    lower=True,
    tokenize='spacy',
    tensor_type=torch.LongTensor,
)

label_field = data.Field(sequential=False)

In [5]:
### Load the tab-separated file into a dataset object.
### `fields` names the columns in file order: first column -> 'text', second column -> 'label'.
pr_data = data.TabularDataset(
    path='polarity.tsv',
    format='tsv',
    fields=[
        ('text', text_field),
        ('label', label_field),
    ],
)

In [6]:
### Split pr_data into train and test sets.
### The split is done by hand: shuffle the examples, cut at 80%, and wrap each part in its
### own Dataset (a Dataset only needs `examples` and `fields`).

import numpy as np

SPLIT_SEED = 0        # fixed seed so the train/test split is identical on every run
TRAIN_FRACTION = 0.8  # train : test = 8 : 2

# Shuffle a copy: shuffling pr_data.examples in place would reorder the source dataset
# and give a different split every time this cell is re-run.
examples = list(pr_data.examples)
np.random.RandomState(SPLIT_SEED).shuffle(examples)

# Compute the cut point once so both slices are guaranteed to be complementary.
split_at = int(len(examples) * TRAIN_FRACTION)
train_ex = examples[:split_at]
test_ex = examples[split_at:]

train_data = data.Dataset(examples=train_ex, fields={'text': text_field, 'label': label_field})
test_data = data.Dataset(examples=test_ex, fields={'text': text_field, 'label': label_field})

In [7]:
### Build the vocabulary of every field: a vocab object that counts and indexes what the data contains.
### For label_field the "vocabulary" is the set of label strings ('True'/'False', '1'/'0', ...).
### Pre-trained embeddings can be requested from build_vocab, e.g. vectors='glove.6B.100d'
### (NOTE(review): the keyword spelling depends on the installed torchtext version - confirm).
### NOTE(review): both splits are passed in, so tokens that occur only in the test set
### also enter the vocabulary.
for field in (text_field, label_field):
    field.build_vocab(train_data, test_data)

The `Iterator.splits` method receives `Dataset` objects and returns the corresponding `Iterator` objects, which is helpful when several `Dataset` objects need converting. Be aware that even when a single `Dataset` is converted, the returned object is still wrapped in a tuple.

In [17]:
# One iterator per dataset, each yielding batches of 20 examples.
train_iter, test_iter = data.Iterator.splits(
    datasets=(train_data, test_data),
    batch_sizes=(20, 20),
    # If you don't specify repeat=False, the iterator will not stop yielding batches.
    repeat=False,
    # torchtext pads dynamically; the text length is the sort key so that
    # texts are not padded more than necessary.
    sort_key=lambda ex: len(ex.text),
)

### Train a simple LSTM model

We build a simple LSTM model for training and testing.

In [9]:
class LSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear classifier on the last time step.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (default 100).
        hidden_size: size of the LSTM hidden state (default 200).
        num_layers: number of stacked LSTM layers (default 2).
        num_classes: number of output logits (default 2).
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_size=200,
                 num_layers=2, num_classes=2):
        super(LSTM, self).__init__()
        # Sub-modules are created in the same order as before (lstm, fc, embedding)
        # so parameter ordering and random initialisation are unchanged.
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x, h_0=None, c_0=None):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: token indices, batch first: (batch, seq_len).
            h_0: optional initial hidden state, (num_layers, batch, hidden_size).
            c_0: optional initial cell state, same shape as ``h_0``.
                When both are omitted the LSTM starts from zero states sized for
                the current batch, so callers need not know the batch size.

        Raises:
            ValueError: if only one of ``h_0`` / ``c_0`` is given.

        NOTE(review): the logits are computed from the output at the last time
        step; for padded batches that position may be padding - confirm whether
        sequence lengths should be taken into account.
        """
        if (h_0 is None) != (c_0 is None):
            raise ValueError('h_0 and c_0 must be given together or both omitted')

        x = self.embedding(x)
        if h_0 is None:
            x, _ = self.lstm(x)
        else:
            x, _ = self.lstm(x, (h_0, c_0))

        # select() always drops the time axis, so the result is (batch, hidden_size)
        # even when hidden_size == 1 (where squeeze(1) would remove the wrong axis).
        x = x.select(1, x.size(1) - 1)
        return self.fc(x)

In [20]:
### model train

num_epoches = 50
vocab_size = len(text_field.vocab.itos)

model = LSTM(vocab_size)
model.cuda()  # NOTE: this notebook assumes a CUDA device is available
model.train()

# hidden state, cell state initiation
# Shape is (num_layers=2, batch, hidden=200), matching the LSTM class in this notebook.
# These two tensors are fixed at batch size 20 and are kept only in case later cells
# still read h_0 / c_0; the loop below builds correctly sized states for every batch.
h_0,c_0 = Variable(torch.zeros(2,20,200)).cuda(), Variable(torch.zeros(2,20,200)).cuda()

# setting parameter optimizer
optimizer= torch.optim.Adam(model.parameters(),lr=0.01)

for epoch in range(1,num_epoches+1):
    for idx, batch in enumerate(train_iter):
        feature,target = batch.text, batch.label
        feature.data.t_(), target.data.sub_(1)  # batch first and index align : label_field contains <unk>, '0', '1' 

        # The final batch of an epoch can hold fewer than 20 examples, so the initial
        # states are sized from the batch itself instead of a hard-coded 20.
        batch_size = target.data.size(0)
        batch_h0 = Variable(torch.zeros(2,batch_size,200)).cuda()
        batch_c0 = Variable(torch.zeros(2,batch_size,200)).cuda()

        optimizer.zero_grad()
        out = model(feature,batch_h0,batch_c0)
        loss = F.cross_entropy(out,target)
        loss.backward()
        optimizer.step()

        correct = (torch.max(out.data,1)[1] == target.data).sum()
        total = target.data.size(0)
        # Multiply by the float first so the ratio is never truncated by integer division.
        accuracy = 100.0 * correct / total

        sys.stdout.write('\repoch %d | batch %d | acc : %.2f | loss : %.4f'%(epoch,idx+1, accuracy, loss.data[0]))

epoch 1 | batch 73 | acc : 95.00 | loss : 0.10128

KeyboardInterrupt: 

### Evaluate the model

We evaluate the model on the test set.

In [24]:
# Evaluate on the test iterator: accumulate the number of correct predictions and the
# summed loss over all examples, then report accuracy (%) and the mean loss per example.
correct = 0
total = 0
avg_loss = .0

model.eval()
for batch in test_iter:
    feature,target = batch.text, batch.label
    feature.data.t_(), target.data.sub_(1)  # batch first; shift labels down by one
    feature.volatile, target.volatile = True, True  # no gradients needed for the forward pass

    # Build the initial states for this batch's actual size: the last batch can be
    # smaller than 20, and this cell should not depend on tensors left over from training.
    batch_size = target.data.size(0)
    batch_h0 = Variable(torch.zeros(2,batch_size,200)).cuda()
    batch_c0 = Variable(torch.zeros(2,batch_size,200)).cuda()

    out = model(feature,batch_h0,batch_c0)
    feature.volatile, target.volatile = False, False

    # cross_entropy returns the mean over the batch; weight it by the batch size so a
    # short final batch does not count as much as a full one.
    avg_loss += F.cross_entropy(out,target).data[0] * batch_size
    correct += (torch.max(out.data,1)[1] == target.data).sum()
    total += batch_size

avg_loss /= total
# Multiply by the float first so the ratio is never truncated by integer division.
accuracy = 100.0 * correct / total

sys.stdout.write('\rvalidation | acc : %.2f | loss : %.4f'%(accuracy, avg_loss))

  


validation | acc : 49.75 | loss : 1.5857