#### Load Data


In [1]:

import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import string

In [2]:
# Column names stored next to the feature matrices.
df_cols = np.load('data/df_cols.npy')

# Rebuild the three feature frames; files are read in the order test, train, val.
X_test, X_train, X_val = (
    pd.DataFrame(np.load(path), columns=df_cols)
    for path in ('data/X_test.npy', 'data/X_train.npy', 'data/X_val.npy')
)

# Label arrays for the same three splits, read in the same order.
y_test, y_train, y_val = (
    np.load(path)
    for path in ('data/y_test.npy', 'data/y_train.npy', 'data/y_val.npy')
)

# Cell output: (rows, columns) of each split.
X_train.shape, X_test.shape, X_val.shape

((53480, 50), (18183, 50), (17471, 50))

##### Random Forest Classifier - Tfidf

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF features (min_df=5) feeding a default random forest.
# No custom tokenizer is passed: the vectorizer uses its built-in analyzer.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=5)
classifier = RandomForestClassifier()

# Two-stage pipeline: vectorize, then classify.
pipe = Pipeline(steps=[('vectorizer', vectorizer),
                       ('classifier', classifier)])

# Turn every split into a list of (token list, [label string]) pairs.
# Tokens come from whitespace-splitting the 'Text_clean' column; each label is
# cast to int, then to str, and wrapped in a one-element list.
train = [(text.split(), [str(int(label))])
         for text, label in zip(X_train['Text_clean'], y_train)]
val = [(text.split(), [str(int(label))])
       for text, label in zip(X_val['Text_clean'], y_val)]
test = [(text.split(), [str(int(label))])
        for text, label in zip(X_test['Text_clean'], y_test)]

# Disabled TF-IDF + random-forest baseline, kept for reference.
# NOTE(review): after the conversion above each sample is a token list rather
# than a raw string, so this presumably needs the untokenized text (and flat
# labels) before it can be re-enabled -- confirm before uncommenting.
# # Create model and measure accuracy
# pipe.fit([x[0] for x in train], [x[1] for x in train]) 
# pred_data_val = pipe.predict([x[0] for x in val]) 
# pred_data_train = pipe.predict([x[0] for x in train]) 

# print("Validation accuracy score: ", accuracy_score(y_val, pred_data_val))
# print("Training accuracy score: ", accuracy_score(y_train, pred_data_train))

In [42]:
# Inspect the first training example: a (token list, [label string]) pair.
train[0]

(['get',
  'cookie',
  'week',
  'ago',
  'bahlsen',
  'chocolate',
  'filling',
  'wonderful',
  'cookie',
  '$',
  '1',
  'pack',
  'not',
  'wrong',
  'thing',
  'quick',
  'coffee',
  'milk',
  'dunk',
  'not',
  'let',
  'soak',
  'light',
  'cookie',
  'fall',
  'apart',
  'quick',
  'dip',
  'set',
  'fine',
  '!',
  'come',
  'packed',
  'wrap',
  'recommend',
  'highly',
  '!'],
 ['10'])

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global random number generator with a fixed value (1).
torch.manual_seed(1)

<torch._C.Generator at 0x7f678c7331d0>

### Preprocess the dataset
Create a list of the reviews and their labels in the format the LSTM model expects, then build the word-to-index and label-to-index lookups.

In [90]:
def prepare_sequence(seq, to_ix, unk_ix=None):
    """Convert a sequence of symbols into a 1-D ``torch.long`` index tensor.

    Args:
        seq: Iterable of hashable symbols (word tokens or label strings).
        to_ix: Mapping from symbol to integer index.
        unk_ix: Optional index substituted for symbols that are missing from
            ``to_ix``. With the default ``None`` a missing symbol raises
            ``KeyError``, exactly like a plain dictionary lookup, so existing
            callers are unaffected.

    Returns:
        Tensor of dtype ``torch.long`` holding one index per element of
        ``seq`` (an empty tensor for an empty sequence).

    Raises:
        KeyError: If ``unk_ix`` is ``None`` and a symbol is not in ``to_ix``.
    """
    if unk_ix is None:
        # Strict mode: unknown symbols are an error.
        idxs = [to_ix[w] for w in seq]
    else:
        # Lenient mode: lets sequences with out-of-vocabulary symbols be
        # encoded, e.g. reviews that were not used to build the vocabulary.
        idxs = [to_ix.get(w, unk_ix) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# Work on the first 20 reviews only while the model is being wired up.
training_data = train[:20]  # Small subset for testing

# Give every distinct token the next free integer id, in first-seen order.
word_to_ix = {}
for tokens, _tags in training_data:
    for token in tokens:
        # setdefault only inserts when the token has no id yet.
        word_to_ix.setdefault(token, len(word_to_ix))

# Label string -> class index.
tag_to_ix = {
    "0": 0,
    "1": 1,
    "2": 2,
    "3": 3,
    "4": 4,
    "5": 5,
    "6": 6,
    "7": 7,
    "8": 8,
    "9": 9,
    "10": 10,
}

# Deliberately tiny sizes so the weights are easy to inspect during training;
# the notes in this notebook suggest roughly 300 / 200 for a real run.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

### Create the model

In [77]:
class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear model that scores one label per sequence.

    The recurrent state is stored on the instance as ``self.hidden`` and is
    fed into (and overwritten by) every ``forward`` call; callers that want
    each sequence scored from a zero state must assign
    ``model.hidden = model.init_hidden()`` before calling the model.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the layers.

        Args:
            embedding_dim: Size of each token embedding vector.
            hidden_dim: Size of the LSTM hidden/cell state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output classes.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Token id -> dense vector.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Recurrent layer: consumes embeddings, emits hidden_dim-sized states.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects an LSTM output vector onto the label space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Start from an all-zero recurrent state.
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh pair of all-zero state tensors.

        Each tensor has shape ``(1, 1, hidden_dim)``, i.e.
        (num_layers, minibatch_size, hidden_dim).
        """
        state_shape = (1, 1, self.hidden_dim)
        return torch.zeros(state_shape), torch.zeros(state_shape)

    def forward(self, sentence):
        """Score a single sequence of token indices.

        Args:
            sentence: 1-D tensor of token indices.

        Returns:
            Tensor of shape ``(1, tagset_size)`` holding the log-softmax
            scores computed at the final time step (one prediction for the
            whole sequence rather than one per token).
        """
        n_tokens = len(sentence)

        # Reshape to (sequence length, batch of 1, embedding size) for the LSTM.
        embedded = self.word_embeddings(sentence).view(n_tokens, 1, -1)
        lstm_out, self.hidden = self.lstm(embedded, self.hidden)

        per_token_logits = self.hidden2tag(lstm_out.view(n_tokens, -1))
        per_token_log_probs = F.log_softmax(per_token_logits, dim=1)

        # Keep only the last time step, preserving a leading dimension of 1.
        return per_token_log_probs[-1:]


### Train the Model

In [81]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()  # the model outputs log-probabilities
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Number of passes over training_data. 1 is a smoke-test value; a real run
# should use on the order of tens of epochs.
N_EPOCHS = 1

for epoch in range(N_EPOCHS):
    for sentence, tags in training_data:
        # Step 1. PyTorch accumulates gradients, so clear them before each
        # instance.
        model.zero_grad()

        # Also reset the LSTM state so this instance starts from zeros and is
        # detached from the history of the previous instance.
        model.hidden = model.init_hidden()

        # Step 2. Turn the tokens and the label into tensors of indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Forward pass: one row of label scores for the whole review.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss and gradients, then update the parameters.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training.
with torch.no_grad():
    # Reset the recurrent state first; otherwise the scores would be
    # conditioned on the state left behind by the last training sentence.
    model.hidden = model.init_hidden()
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

# Show the log-probabilities for the first training review as the cell output.
tag_scores

