In [87]:
'''
First, we import all required packages.
'''
import random
from random import shuffle
# Seed Python's global RNG so that the shuffle() call in load_data produces
# the same train/dev split on every run.
random.seed(11)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Seed PyTorch's global RNG so that model parameter initialisation is repeatable.
# NOTE: as the last expression of the cell, this call also echoes the Generator object.
torch.manual_seed(11)

<torch._C.Generator at 0x1053abcb0>

In [140]:
'''
Next, we read in the labeled data from a tab-separated (TSV) file. Each line contains one example.
'''
def load_data(fname = 'amazon.tsv', train_size = 10000, dev_size = 1000):
    """Read labeled examples from a tab-separated file and split them into train/dev sets.

    The first line of the file is treated as a header and skipped, as are blank
    lines. For every other line, the field at index 8 is mapped to the label
    (1 if it equals the string "5", otherwise 0) and the second-to-last field
    is used as the text.

    Args:
        fname: Path to the tab-separated input file.
        train_size: Maximum number of examples (taken from the front of the
            shuffled data) placed in the training set.
        dev_size: Maximum number of examples (taken from the end of the
            shuffled data) placed in the dev set.

    Returns:
        A ``(train_set, dev_set)`` tuple of lists of ``(text, label)`` pairs.
        The two lists never share a position of the shuffled data, so the dev
        set shrinks (possibly to empty) when the file is too small.

    Raises:
        IndexError: If a non-blank data line has fewer than nine fields.

    Side effects:
        Shuffles using the global ``random`` state and prints the number of
        positive (label 1) examples in the training set, then in the dev set.
    """
    data = []
    # The context manager guarantees the file handle is closed, even if a row
    # fails to parse.
    with open(fname, encoding="utf8", errors="ignore") as handle:
        for count, line in enumerate(handle):
            if count == 0:
                continue
            # Remove only the line terminator. A full strip() would also eat
            # leading/trailing tabs, dropping empty first/last fields and
            # shifting the positional lookups below.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            y = 1 if fields[8] == "5" else 0
            x = fields[-2]
            data.append((x, y))

    shuffle(data)

    train_set = data[:train_size]
    # Start the dev slice no earlier than the end of the training slice so that
    # small files cannot leak training examples into the dev set.
    dev_start = max(len(train_set), len(data) - dev_size)
    dev_set = data[dev_start:]

    # Report the class balance (count of positive examples) of each split.
    print(sum(1 for element in train_set if element[1] == 1))
    print(sum(1 for element in dev_set if element[1] == 1))

    return train_set, dev_set

In [125]:
'''
Methods needed to convert the data into input that can be fed into the neural network.
'''
def make_feature_vectors(data, w2i):
    """Convert (text, label) pairs into (index tensor, label tensor) pairs.

    Each text is split on single spaces and every token is replaced by its
    index in ``w2i``; tokens that are not in ``w2i`` use the ``'<UNK>'`` index.

    Args:
        data: Iterable of ``(text, label)`` pairs.
        w2i: Mapping from token to integer index; must contain ``'<UNK>'``
            whenever an unknown token is encountered.

    Returns:
        A list of ``(token_ids, label)`` tuples where ``token_ids`` is a 1-D
        ``torch.long`` tensor and ``label`` is a one-element tensor.
    """
    def encode(text):
        # Look up '<UNK>' lazily, only for tokens missing from the vocabulary.
        ids = [
            int(w2i[token]) if token in w2i else int(w2i['<UNK>'])
            for token in text.split(' ')
        ]
        return torch.tensor(ids, dtype=torch.long)

    return [(encode(text), torch.tensor([int(label)])) for (text, label) in data]

def get_vocab(data):
    """Build a token-to-index vocabulary from (text, label) pairs.

    Index 0 is reserved for ``'<UNK>'``; every other token receives the next
    free index in order of first appearance. Texts are split on single spaces.

    Args:
        data: Iterable of ``(text, label)`` pairs; the label is ignored.

    Returns:
        A dict mapping each token to its integer index.
    """
    vocabulary = {'<UNK>': 0}

    for text, _label in data:
        for token in text.split(' '):
            # setdefault leaves known tokens untouched; len() is evaluated
            # before the insertion, so new tokens get the next free index.
            vocabulary.setdefault(token, len(vocabulary))

    return vocabulary

In [90]:
'''
Now, we define our neural network model.
'''
class MyNNModel(nn.Module):
    """Sequence classifier: embedding layer, then an LSTM, then a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Size of each token embedding (and the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of scores produced per input sequence.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, input):
        """Score one sequence of token indices.

        The embedded sequence gets a singleton axis inserted at position 1
        before it is passed to the LSTM; the LSTM's final hidden state has its
        leading singleton axes squeezed away and is fed to the linear layer.
        """
        embedded = self.embedding(input)
        # nn.LSTM returns (output, (h_n, c_n)); only h_n is needed here.
        _, (final_hidden, _) = self.lstm(embedded.unsqueeze(1))
        sentence_vector = final_hidden.squeeze(0).squeeze(0)
        return self.linear(sentence_vector)

In [91]:
'''
An evaluation method so we can see how well the model is doing.
'''
def eval(model, data):
    """Print and return the classification accuracy of ``model`` on ``data``.

    The model is switched to evaluation mode and is left in that mode, so
    callers that continue training must call ``model.train()`` again.

    Args:
        model: Callable module that maps an input ``x`` to a tensor of class scores.
        data: Iterable of ``(x, y)`` pairs, where ``y`` holds a single class index.

    Returns:
        The fraction of examples whose highest-scoring class equals ``y``,
        or ``nan`` when ``data`` is empty.
    """
    model.eval()

    total = right = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a computation graph for every forward pass.
    with torch.no_grad():
        for (x, y) in data:
            # argmax over the raw scores equals argmax over their softmax,
            # because softmax is monotonic.
            y_hat = torch.argmax(model(x))
            if int(y_hat) == int(y):
                right += 1
            total += 1

    # Accuracy is undefined for an empty dataset; report nan instead of
    # raising ZeroDivisionError.
    accuracy = (right * 1.0)/total if total else float('nan')
    print("Accuracy: " + str(accuracy))
    return accuracy

In [141]:
'''
Finally, let's put everything together and train the model!
'''
# Hyperparameters, gathered in one place so they are easy to find and tune.
EMB_DIM = 64
HIDDEN_DIM = 64
NUM_CLASSES = 2        # load_data produces binary labels (0 or 1)
LEARNING_RATE = .1
NUM_EPOCHS = 10

# Keep the raw (text, label) pairs under their own names so this cell does not
# overwrite its own inputs.
train_raw, dev_raw = load_data()
# The vocabulary is built from the training split only; dev tokens that are
# missing from it are mapped to '<UNK>' by make_feature_vectors.
vocab = get_vocab(train_raw)
train = make_feature_vectors(train_raw, vocab)
dev = make_feature_vectors(dev_raw, vocab)

model = MyNNModel(len(vocab), EMB_DIM, HIDDEN_DIM, NUM_CLASSES)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Accuracy of the untrained model, as a baseline.
eval(model, train)
eval(model, dev)
print()

for i in range(NUM_EPOCHS):
    # eval() leaves the model in evaluation mode, so switch back every epoch.
    model.train()
    for (x, y) in train:
        optimizer.zero_grad()
        raw_scores = model(x)
        # The model scores a single example; add a leading axis of size 1 so
        # the scores line up with the one-element target tensor.
        loss = loss_function(raw_scores.unsqueeze(0), y)
        loss.backward()
        optimizer.step()

    eval(model, train)
    eval(model, dev)
    print()

6030
622
Accuracy: 0.5105
Accuracy: 0.531

Accuracy: 0.665
Accuracy: 0.637

Accuracy: 0.7215
Accuracy: 0.676

Accuracy: 0.7713
Accuracy: 0.671

Accuracy: 0.7876
Accuracy: 0.682

Accuracy: 0.8141
Accuracy: 0.68

Accuracy: 0.8226
Accuracy: 0.698

Accuracy: 0.8442
Accuracy: 0.704

Accuracy: 0.8576
Accuracy: 0.703



KeyboardInterrupt: 