In [27]:
import pandas as pd 
import torch
import torch.nn as nn
from nltk.tokenize import sent_tokenize
import gensim.parsing.preprocessing as gen_preproc

import spacy
nlp = spacy.load('en_core_web_sm')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

# Data Loading and Cleanup
Identical to what occured in the Initial Models file

In [7]:
df = pd.read_csv( 'parsed_train.txt', sep='\t', header=None,
                   names=['label', 'text'] )
df['label'] = df['label'].apply(lambda x: int( x.replace("__label__", "") ) )

basicPreproc = [lambda x: x.lower(), gen_preproc.strip_tags, \
    gen_preproc.strip_punctuation, gen_preproc.strip_non_alphanum, \
    gen_preproc.strip_multiple_whitespaces] #, gen_preproc.strip_short, gen_preproc.remove_stopwords

df['basicProc'] = df['text'].apply(lambda x: ' '.join( gen_preproc.preprocess_string(x, basicPreproc)))
df['stemmed'] = df['basicProc'].apply(lambda x: gen_preproc.stem_text(x))
df['lemmatized'] = df['basicProc'].apply(lambda x: ' '.join( [token.lemma_ for token in nlp(x)] ) )
df.head()

Unnamed: 0,label,text,basicProc,stemmed,lemmatized
0,4,The Rock is destined to be the 21st Century 's...,the rock is destined to be the 21st century s ...,the rock is destin to be the 21st centuri s ne...,the rock be destine to be the 21st century s n...
1,5,The gorgeously elaborate continuation of `` Th...,the gorgeously elaborate continuation of the l...,the gorgeous elabor continu of the lord of the...,the gorgeously elaborate continuation of the l...
2,4,Singer/composer Bryan Adams contributes a slew...,singer composer bryan adams contributes a slew...,singer compos bryan adam contribut a slew of s...,singer composer bryan adams contribute a slew ...
3,3,You 'd think by now America would have had eno...,you d think by now america would have had enou...,you d think by now america would have had enou...,-PRON- d think by now america would have have ...
4,4,Yet the act is still charming here .,yet the act is still charming here,yet the act is still charm here,yet the act be still charm here


Do the same but for the test set.

In [9]:
test_df = pd.read_csv( 'parsed_test.txt', sep='\t', header=None,
                   names=['label', 'text'] )
test_df['label'] = test_df['label'].apply(lambda x: int( x.replace("__label__", "") ) )
test_df['basicProc'] = test_df['text'].apply(lambda x: ' '.join( gen_preproc.preprocess_string(x, basicPreproc)))
test_df['stemmed'] = test_df['basicProc'].apply(lambda x: gen_preproc.stem_text(x))
test_df['lemmatized'] = test_df['basicProc'].apply(lambda x: ' '.join( [token.lemma_ for token in nlp(x)] ) )
test_df.head()

Unnamed: 0,label,text,basicProc,stemmed,lemmatized
0,3,Effective but too-tepid biopic,effective but too tepid biopic,effect but too tepid biopic,effective but too tepid biopic
1,4,If you sometimes like to go to the movies to h...,if you sometimes like to go to the movies to h...,if you sometim like to go to the movi to have ...,if -PRON- sometimes like to go to the movie to...
2,5,"Emerges as something rare , an issue movie tha...",emerges as something rare an issue movie that ...,emerg as someth rare an issu movi that s so ho...,emerge as something rare an issue movie that s...
3,3,The film provides some great insight into the ...,the film provides some great insight into the ...,the film provid some great insight into the ne...,the film provide some great insight into the n...
4,5,Offers that rare combination of entertainment ...,offers that rare combination of entertainment ...,offer that rare combin of entertain and educ,offer that rare combination of entertainment a...


## Setup Tokenization for input into LSTM

Like other networks, an LSTM takes inputs of vectors. Therefore we do need to turn our input text back into vectors. However, we will be feeding in each word at a time, so its not necessary to do anything too complicated. 

#### Questions:
- How are ngrams used in an LSTM?
- Is there any benefit to feeding in something like a w2v vector instead of a one-hot encoded vector? One hot obviously relies on a consistant vocab, which fasttext could get around. 
- Why would you want to stack multiple levels of LSTMs on top of eachother?


In [17]:
maxNumWords = 5000 #Most frequent
maxSeqLeng = 45 #See Initial Models file for reviews length distro justifying this. 
embedDim = 100

tokenizer = Tokenizer( num_words=maxNumWords ) #already did other preproc, but could include more here
tokenizer.fit_on_texts(df['lemmatized'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 12724 unique tokens.


In [35]:
X = tokenizer.texts_to_sequences(df['lemmatized'].values)
X = pad_sequences(X, maxlen=maxSeqLeng)  #What does padding do?
print('Shape of Input data tensor:', X.shape)
X[:,0]

Shape of Input data tensor: (8544, 45)


array([0, 0, 0, ..., 0, 0, 0])

## LSTM Model Setup

We want to have multiple layers that do different things. 
- First we embed our text into vectors 
- Then we setup our LSTM network 
- The LSTM outputs are naturally the same size as the historical data it keeps - eg the same size as the input; so we add a dense layer that transforms our historical information into the info we care about - 1 predicted label. 
- Of course, we need to decide how we're evaluating our model

In [26]:
model = Sequential()
model.add(Embedding(maxNumWords, embedDim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(embedDim, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

NotImplementedError: Cannot convert a symbolic Tensor (lstm_3/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

In [28]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.layer_dim = layer_dim #Setup class things
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim,hidden_dim,layer_dim,batch_first=True) #Majority of important stuff
        self.fc = nn.Linear(hidden_dim, output_dim)
    def forward(self,x):
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0),self.hidden_dim)) #Setup initial memory states
        c0 = Variable(torch.zeros(self.layer_dim, x.size(0),self.hidden_dim))
        out,(hn,cn)= self.lstm(x,(h0,c0)) # hn shape layer_dim, batch_size, hidden_dim out, shape batch_size, seq_dim, hidden_dim
        out = self.fc(out[:,-1,:])
        return out


In [37]:
model = LSTMModel(input_dim=maxNumWords, hidden_dim=embedDim, layer_dim=1, output_dim=1).float()
criterion = nn.MSELoss()
learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
Y = list( df['label'] )

4

In [40]:
#Actual Training - taken from online, not totally adapted yet. Does not work.

num_epochs = 15
iter_counter = 0
for epoch in range(num_epochs):
    for index, row in df.iterrows():
        #inputData = row['lemmatize'] 
        inputVector = X[:,index] #Recall we already transformed our input.
        label = Y[index]
        
        #Uneditted past here - Why is the model not being fed in an x & y? Is there any ~fit equivalent method? 
        #Images implies plural = feeding in all X at once? Why?
        images= Variable(images)
        labels = Variable(labels)
        optimizer.zero_grad()
        outputs = model(images.float())
        loss = criterion(outputs,labels.float())
        loss.backward()
        optimizer.step()
        iter_counter +=1
        if iter_counter% 500 ==0:
            error = 0
            for images, labels in test_loader:
                images = Variable(images.float())
                outputs = model(images.float())
                error += ((outputs.data - labels.data)**2).mean()
            print("Iteration: {} Loss: {} Error: {}".format(iter_counter, loss, error))

NameError: name 'Variable' is not defined