In [16]:
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
import numpy
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt

torch.manual_seed(1)

<torch._C.Generator at 0x10b066fd0>

In [17]:
def load_data(path="../train.tsv", length=500):
    df = pd.read_csv(path, sep="\t")[:length]
    df.drop(['PhraseId','SentenceId'], axis=1, inplace=True)
    x, y = df["Phrase"].values, df["Sentiment"].values
    return (x, y)

x, y = load_data("../train.tsv", 500)

def clean_text(x):
  # Removes special symbols and just keep
  # words in lower or upper form
  
  x = [i.lower() for i in x]
  x = [re.sub(r'[^A-Za-z]+', ' ', i) for i in x]
  
  return x

x = clean_text(x)
print(x[:5], y[:5])

['a series of escapades demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amounts to much of a story ', 'a series of escapades demonstrating the adage that what is good for the goose', 'a series', 'a', 'series'] [1 2 2 2 2]


In [29]:
def build_dict(x):
    ret = []
    for i in x:
        ret += i.split()
    
    return list(set(ret))

d = build_dict(x)

word2inx = {d[i]:i for i in range(len(d))}

vocab_size = len(word2inx)

n_classes = 5

print(word2inx)

{'dizzily': 0, 'joy': 1, 'should': 2, 'entertaining': 3, 'independent': 4, 'considers': 5, 'shakespearean': 6, 'pow': 7, 'despite': 8, 'unless': 9, 'i': 10, 'nearly': 11, 'still': 12, 'goose': 13, 'demonstrating': 14, 'proportions': 15, 'midlife': 16, 'serious': 17, 'remain': 18, 'sweet': 19, 'wong': 20, 'mainland': 21, 'mythic': 22, 'a': 23, 'dialogue': 24, 'sometimes': 25, 'quiet': 26, 'ways': 27, 'performance': 28, 'oedekerk': 29, 'or': 30, 'character': 31, 'juicy': 32, 'would': 33, 'mood': 34, 'work': 35, 'realization': 36, 'hong': 37, 'love': 38, 'ultimately': 39, 'thick': 40, 'is': 41, 'setting': 42, 'plodding': 43, 'through': 44, 'path': 45, 'reading': 46, 'performances': 47, 'mess': 48, 'all': 49, 'leave': 50, 'dreams': 51, 't': 52, 'movies': 53, 'hilarity': 54, 'are': 55, 'epic': 56, 'day': 57, 'to': 58, 'companion': 59, 'wit': 60, 'good': 61, 'some': 62, 'betrayal': 63, 'whitewash': 64, 'hate': 65, 'little': 66, 'can': 67, 'inspired': 68, 'by': 69, 'sincere': 70, 'who': 71, '

In [47]:
def sentence2vector(sentence):
    v = []
    for word in sentence.split():
        v.append(word2inx[word])
    return torch.tensor(v, dtype=torch.long)

"""
def sentiment2target(sentiment):
    tmp = torch.zeros(n_classes)
    tmp[sentiment] += 1
    return tmp
"""

test_input = sentence2vector(x[0])
#test_target = sentiment2target(y[0])
test_target = torch.tensor(y[0], dtype=torch.long)


print(test_input, test_target)

def get_batch(x, batch_size=2, point=0):
    return  [sentence2vector(i) for i in x[point:point+batch_size]]

test_batch = get_batch(x)

print(test_batch)


tensor([ 23, 108, 127, 111,  14, 122, 192, 142, 166,  41,  61, 190, 122,  13,
         41, 118,  61, 190, 122, 105,  62, 127, 119, 156, 187, 158, 101, 127,
        119, 194,  58, 163, 127,  23, 167]) tensor(1)
[tensor([ 23, 108, 127, 111,  14, 122, 192, 142, 166,  41,  61, 190, 122,  13,
         41, 118,  61, 190, 122, 105,  62, 127, 119, 156, 187, 158, 101, 127,
        119, 194,  58, 163, 127,  23, 167]), tensor([ 23, 108, 127, 111,  14, 122, 192, 142, 166,  41,  61, 190, 122,  13])]


In [49]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        y_ = self.hidden2tag(lstm_out.view(len(sentence), -1))
        predict = F.log_softmax(y_, dim=1)
        return predict

loss_function = nn.NLLLoss()

optimizer = optim.SGD(net.parameters(), lr=0.1)

net = LSTM(100, 100, vocab_size, 5)

prob = net(test_input)

print(prob[-1].unsqueeze(0), test_target)

loss = loss_function(prob[-1].unsqueeze(0), test_target.unsqueeze(0))

loss.backward()

optimizer.step()


tensor([[-1.6168, -1.5143, -1.4933, -1.6957, -1.7520]],
       grad_fn=<UnsqueezeBackward0>) tensor(1)
