## LSTM model

In [58]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Preparing Data


In [2]:
train_data = pd.read_csv('./archive/Train.csv')
test_data = pd.read_csv('./archive/Test.csv')
validation_data = pd.read_csv('./archive/Valid.csv')

In [50]:
x_tr, y_tr = train_data['text'].values.astype(str), train_data['label'].values
x_val, y_val = validation_data['text'].values.astype(str), validation_data['label'].values
x_test,y_test = test_data['text'].values.astype(str) , test_data['label'].values
print(len(x_tr), len(y_tr))
print(len(x_val), len(y_val))
print(len(x_test), len(y_test))


40000 40000
5000 5000
5000 5000


In [61]:
y_tr.dtype

dtype('int64')

In [51]:
#Tokenize the sentences
from string import punctuation
#convert all revies to lowercase
x_tr = np.char.lower(x_tr)
x_val = np.char.lower(x_val)
x_test = np.char.lower(x_test)

#remove punctuation
x_tr= [c for c in x_tr if c not in punctuation]
x_val = [c for c in x_val if c not in punctuation]
x_test = [c for c in x_test if c not in punctuation]


40000 40000
5000 5000
5000 5000
40000 40000
5000 5000
5000 5000


In [52]:
tokenizer = Tokenizer()

#preparing vocabulary
tokenizer.fit_on_texts(list(x_tr))

#converting text into integer sequences
x_tr_seq  = tokenizer.texts_to_sequences(x_tr) 
x_val_seq = tokenizer.texts_to_sequences(x_val)
x_test_seq = tokenizer.texts_to_sequences(x_test)


#padding to prepare sequences of same length
x_tr_pad  = pad_sequences(x_tr_seq, maxlen=1000)
x_val_pad = pad_sequences(x_val_seq, maxlen=1000)
x_test_pad = pad_sequences(x_test_seq,maxlen=1000)


40000 40000
5000 5000
5000 5000
40000
5000
5000


In [53]:
#length of the reviews
seqlen_train = [len(x) for x in x_tr_seq]
seqlen_val = [len(x) for x in x_val_seq] 
seqlen_test = [len(x) for x in x_test_seq] 

In [56]:
# this might be an issue 
vocab=len(tokenizer.word_index) + 1 #+1 for padding
print(vocab)

112204


In [11]:
#import gensim


In [12]:
#embedding_file = "glove.6B.100d.txt"
#embedding = gensim.models.KeyedVectors.load_word2vec_format(embedding_file, binary=False)

## Creating Tensors

In [55]:
X_train = torch.from_numpy(np.array(list(x_tr_pad)))
y_train = torch.LongTensor(y_tr)
seqlen_train = torch.LongTensor(seqlen_train)
print(f"{X_train.size() = },\n{y_train.size() = }\n{seqlen_train.size() = }\n")

X_val = torch.from_numpy(np.array(list(x_val_pad)))
y_val = torch.LongTensor(y_val)
seqlen_val = torch.LongTensor(seqlen_val)
print(f"{X_val.size() = },\n{y_val.size() = }\n{seqlen_val.size() = }\n")


X_test = torch.from_numpy(np.array(list(x_test_pad)))
y_test = torch.LongTensor(y_test)
seqlen_test = torch.LongTensor(seqlen_test)
print(f"{X_test.size() = },\n{y_test.size() = }\n{seqlen_test.size() = }")




X_train.size() = torch.Size([40000, 1000]),
y_train.size() = torch.Size([40000])
seqlen_train.size() = torch.Size([40000])

X_val.size() = torch.Size([5000, 1000]),
y_val.size() = torch.Size([5000])
seqlen_val.size() = torch.Size([5000])

X_test.size() = torch.Size([5000, 1000]),
y_test.size() = torch.Size([5000])
seqlen_test.size() = torch.Size([5000])


## Model

In [59]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.dense = nn.Linear(hidden_size * 2, output_size)
        
    # the input signature of forward changes
    def forward(self, sequences, sequence_lens):
        embedded = self.embedding(sequences)
        
        # THIS IS THE MODIFIED PART
        # returns a PackedSequence object
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            sequence_lens,
            enforce_sorted=False,
            batch_first=True)
        packed_outputs, (h, c) = self.lstm(packed)
        # extract LSTM outputs (not used here)
        lstm_outputs, lens = nn.utils.rnn.pad_packed_sequence(packed_outputs)
        
        h = torch.cat((h[0], h[1]), dim=-1)
        output = self.dense(h)
        return output

## Instantiating the model

In [63]:
input_size = vocab
embedding_size = 30
hidden_size = 64
# number of labels i.e 2
output_size = train_data.label.nunique()

model = LSTMClassifier(input_size, embedding_size, hidden_size, output_size)
model

LSTMClassifier(
  (embedding): Embedding(112204, 30)
  (lstm): LSTM(30, 64, batch_first=True, bidirectional=True)
  (dense): Linear(in_features=128, out_features=2, bias=True)
)

## Batching

In [64]:
class BatchedIterator:
    def __init__(self, *tensors, batch_size):
        # all tensors must have the same first dimension
        assert len(set(len(tensor) for tensor in tensors)) == 1
        self.tensors = tensors
        self.batch_size = batch_size
    
    def iterate_once(self):
        num_data = len(self.tensors[0])
        for start in range(0, num_data, self.batch_size):
            end = start + self.batch_size
            yield tuple(tensor[start:end] for tensor in self.tensors)
            


In [71]:
train_iter = BatchedIterator(X_train, seqlen_train, y_train, batch_size=501)
#prints number of batches
for X, seqlens, y in train_iter.iterate_once():
    print(f"{X.size() = }, {seqlens.size() = }, {y.size() = }")

X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Size([501])
X.size() = torch.Size([501, 1000]), seqlens.size() = torch.Size([501]), y.size() = torch.Si

## Loss function

In [72]:
from torch import optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

## Sanity check on Untrained models

In [73]:
# requires a large memory as we have a large vocab
logits = model(X_train, seqlen_train)
y = logits.argmax(axis=1)
accuracy = torch.sum(torch.eq(y, y_train)) / y.size(0)
print(f"Train accuracy: {accuracy:.1%}")

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:75] data. DefaultCPUAllocator: not enough memory: you tried to allocate 12978176 bytes. Buy new RAM!

In [74]:
logits = model(X_val, seqlen_val)
y = logits.argmax(axis=1)
accuracy = torch.sum(torch.eq(y, y_val)) / y.size(0)
print(f"Dev accuracy: {accuracy:.1%}")

RuntimeError: start (1155502) + length (14) exceeds dimension size (1155502).