In [2]:
import sys
import os
import torch.nn as nn
import torch.nn.functional as F
import torch

In [3]:
from nltk.tokenize import WordPunctTokenizer
import random

class TextLoader(object):
    
    def __init__(self, filename):
        #load file 
        thefile = open(filename,'r')
        
        tokenizer = WordPunctTokenizer()
        self.sentences = []
        self.predicted = []
        
        #tokenizer
        for line in thefile:
            sentence = tokenizer.tokenize(line)
            if sentence:
                self.sentences.append(["<s>"] + sentence)
                self.predicted.append(sentence + ["<e>"])
        
        #index 
        uniquevocab = list(set(sum(self.sentences, []))) + ["<e>"]
        self.vocabindex = {}
        for i in range(len(uniquevocab)):
            self.vocabindex[uniquevocab[i]] = i
            
        #replace words with index
        self.int_sentences = [[self.vocabindex[x] for x in y] 
                               for y in self.sentences]
        
        self.int_predicted = [[self.vocabindex[x] for x in y] for
                              y in self.predicted]
        
        self.pairs = list(zip(self.int_sentences, self.int_predicted))
    
    
    def shuffle(self):
        random.shuffle(self.pairs)
      
    #get_item turns everything/makes it behave like an array
    def __getitem__(self, n):
        return self.pairs[n]


In [4]:
t1 = TextLoader('data/hunt_snark.txt')

In [5]:
reverseindex = {t1.vocabindex[x]:x for x in t1.vocabindex}

In [6]:
class LSTMFun(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.6):
        self.emb = nn.Embedding(vocab_size, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, dropout=dropout)
        self.linear = nn.Linear(vocab_size)
        self.softmax = F.log_softmax()
        #we want log softmax becasue of terrible underflow (when the numbers are suuuper small)
        #we need a linnear layer to represent the prediction to the category
        
        self.hidden_size = hidden_size
        
    def forward(self, sentence):
        init_hidden = self.init_hidden(len(sentence))
        #should be able to feed 
        output = self.emb(sentence)
        #magic behind the scenes to turn objects into functions (very smart)
        hidden, output = self.lstm(init_hidden, output)
        output = self.linear(output)
        return self.softmax(output)
    
    def init_hidden(self, sen_len):
        return torch.zeros(self.hidden_size, sen_len)
        #we need a special dimension

The predicted will not be corrected as it will be a list of integers, we will need to turn them into one-hot vectors becasue we have softmax

In [7]:
sentence = [t1[300][0]]

In [8]:
sentence

[[2010, 507, 1785, 1739, 1636, 403, 1785, 1739, 2070, 2147, 1588, 971]]

In [9]:
hidden = torch.zeros(200,12)

In [10]:
hidden

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])