# LSTM for text classification

Assume that we have already divided practical.csv into separate train and test sets.

### LOADING PACKAGES AND DATASETS

In [1]:
import torch
import sklearn
from torchtext import data, datasets
import csv
import numpy as np

train_csv_path = "./train.csv"
test_csv_path = "./test.csv"    # Held-out split; must not point at the training file.

# TEXT tokenizes each document with spaCy; include_lengths=True makes every batch
# carry a (token tensor, lengths) pair so the sequences can be packed for the LSTM.
TEXT = data.Field(tokenize='spacy', include_lengths=True)
# LABEL holds the class of each row as a long tensor.
LABEL = data.LabelField(dtype=torch.long)

# Column layout of the CSV files:
#   1st column -> 'label' (class of the row)
#   2nd column -> ignored (not relevant for classification)
#   3rd column -> 'text'  (list of tokens for the row)
fields = [('label', LABEL), (None, None), ('text', TEXT)]

# Load both CSV files as torchtext datasets that follow the `fields` layout.
# skip_header=False: the first row of each file is treated as data.
train_data, test_data = data.TabularDataset.splits(
    path='./',
    train=train_csv_path,
    test=test_csv_path,
    format='csv',
    fields=fields,
    skip_header=False)

print(vars(train_data.examples[0]))                     # Dictionary of the 1st sentence

# Hold out part of the training set for validation. 0.7 is torchtext's default
# ratio (70% train / 30% validation); it is spelled out here so the intent is visible.
train_data, valid_data = train_data.split(split_ratio=0.7)
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

MAX_VOCAB_SIZE = 25000

# Build the TEXT vocabulary from the training split only, capped at MAX_VOCAB_SIZE
# tokens, and attach the pretrained 100-dimensional "glove.6B.100d" vectors to it.
# torch.Tensor.normal_ is passed as the initializer for tokens that have no
# pretrained vector.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

# The label vocabulary maps each class string to an integer index.
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")


{'label': '3', 'text': ['Reuters', '-', 'Short', '-', 'sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra', '-', 'cynics', ',', 'are', 'seeing', 'green', 'again', '.']}
Number of training examples: 2525
Number of validation examples: 1082
Unique tokens in TEXT vocabulary: 12386
Unique tokens in LABEL vocabulary: 4


### CREATING BATCHES TO TRAIN OUR MODEL

In [2]:
BATCH_SIZE = 64                                         # Examples per batch

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Train and validation batches of BATCH_SIZE examples. Examples are bucketed by
# token count, and each batch is sorted by that same key (sort_within_batch=True),
# which is what packing padded sequences expects.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
)
print(train_iterator)

<torchtext.data.iterator.BucketIterator object at 0x0000028943B71E88>


### DEFINING OUR LSTM ARCHITECTURE

In [3]:
import torch.nn as nn
class RNN(nn.Module):
    """LSTM text classifier: embedding -> (multi-layer, optionally bidirectional) LSTM -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits (one per class).
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers, and to the final hidden state.
        pad_idx: index of the padding token; its embedding receives no gradient.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # Inter-layer dropout only exists with 2+ layers; passing a
                           # non-zero value for a single layer just triggers a warning.
                           dropout=dropout if n_layers > 1 else 0.0)

        # The classifier sees one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return class logits of shape [batch size, output dim].

        Args:
            text: token indices, shape [sent len, batch size].
            text_lengths: unpadded length of each sequence in the batch, sorted in
                decreasing order (as required by ``pack_padded_sequence``).
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence requires the lengths on the CPU, even when the
        # batch itself lives on the GPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Last layer: forward state is hidden[-2], backward state is hidden[-1].
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the last layer's final state is relevant.
            hidden = hidden[-1, :, :]

        return self.fc(self.dropout(hidden))


### LOSS FUNCTION, OPTIMIZER AND DEFINING OF OTHER NECESSARY VARIABLES

In [4]:
# Hyper-parameters; the input/output sizes follow from the two vocabularies.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model = RNN(
    INPUT_DIM,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT,
    PAD_IDX,
).to(device)

# Start the embedding layer from the pretrained vectors attached to the vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)
model.embedding.weight.data.copy_(pretrained_embeddings)

# The <unk> and <pad> rows are reset to all zeros after the copy.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

torch.Size([12386, 100])
The model has 3,550,796 trainable parameters


### Training our LSTM model

In [5]:
import torch.optim as optim

# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy on logits; the train/evaluate code feeds it one-hot targets,
# so every class is scored as an independent yes/no decision.
# NOTE(review): for single-label multi-class data CrossEntropyLoss is the more
# usual choice - confirm this is intentional.
criterion = torch.nn.BCEWithLogitsLoss().to(device)

print(OUTPUT_DIM)

losses_eval = []    # Per-batch validation losses (Python floats), appended by evaluate()


def evaluate(model, iterator, criterion):
    """Run one pass over ``iterator`` without updating the model.

    Args:
        model: callable as ``model(text, text_lengths)`` returning logits of shape
            [batch size, number of classes].
        iterator: yields batches exposing ``batch.text`` (a ``(text, lengths)``
            pair) and ``batch.label`` (integer class indices).
        criterion: loss taking ``(logits, one-hot float targets)``.

    Returns:
        ``(mean loss per batch, mean accuracy per batch)``.

    Side effects:
        Appends each batch loss to the module-level ``losses_eval`` list.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths)

            # The one-hot width comes from the logits themselves, so the function
            # does not depend on a global class count.
            targets = nn.functional.one_hot(batch.label, predictions.size(-1)).float()

            # Store a plain float: keeping the tensor would pin (possibly GPU)
            # memory and cannot be plotted directly.
            batch_loss = criterion(predictions, targets).item()
            losses_eval.append(batch_loss)
            epoch_loss += batch_loss

            correct = (predictions.argmax(1) == batch.label).sum().item()
            epoch_acc += correct / len(batch.label)

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

losses_train = []   # Per-batch training losses (Python floats), appended by train()


def train(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: callable as ``model(text, text_lengths)`` returning logits of shape
            [batch size, number of classes].
        iterator: yields batches exposing ``batch.text`` (a ``(text, lengths)``
            pair) and ``batch.label`` (integer class indices).
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ``(logits, one-hot float targets)``.

    Returns:
        ``(mean loss per batch, mean accuracy per batch)``.

    Side effects:
        Updates the model parameters and appends each batch loss to the
        module-level ``losses_train`` list.
    """
    train_loss = 0
    train_acc = 0

    model.train()
    for batch in iterator:
        optimizer.zero_grad()

        text, text_lengths = batch.text
        output = model(text, text_lengths)

        # The one-hot width comes from the logits themselves, so the function
        # does not depend on a global class count.
        targets = nn.functional.one_hot(batch.label, output.size(-1)).float()
        loss = criterion(output, targets)

        # Store a plain float: keeping the tensor would pin (possibly GPU)
        # memory and cannot be plotted directly.
        batch_loss = loss.item()
        losses_train.append(batch_loss)
        train_loss += batch_loss

        loss.backward()
        optimizer.step()

        correct = (output.argmax(1) == batch.label).sum().item()
        train_acc += correct / len(batch.label)

    return train_loss / len(iterator), train_acc / len(iterator)
import time

N_EPO = 20      # Number of passes over the training set

for epoch in range(N_EPO):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    # Whole minutes and leftover seconds; divmod keeps both as integers instead
    # of relying on %d to truncate a float number of minutes.
    mins, secs = divmod(int(time.time() - start_time), 60)

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

4
Epoch: 1  | time in 0 minutes, 3 seconds
	Loss: 0.5538(train)	|	Acc: 38.3%(train)
	Loss: 0.4969(valid)	|	Acc: 50.0%(valid)
Epoch: 2  | time in 0 minutes, 2 seconds
	Loss: 0.4232(train)	|	Acc: 60.3%(train)
	Loss: 0.3593(valid)	|	Acc: 66.7%(valid)
Epoch: 3  | time in 0 minutes, 1 seconds
	Loss: 0.3405(train)	|	Acc: 70.6%(train)
	Loss: 0.2711(valid)	|	Acc: 78.1%(valid)
Epoch: 4  | time in 0 minutes, 1 seconds
	Loss: 0.2874(train)	|	Acc: 75.7%(train)
	Loss: 0.2614(valid)	|	Acc: 77.7%(valid)
Epoch: 5  | time in 0 minutes, 1 seconds
	Loss: 0.2660(train)	|	Acc: 77.2%(train)
	Loss: 0.2593(valid)	|	Acc: 78.5%(valid)
Epoch: 6  | time in 0 minutes, 1 seconds
	Loss: 0.2430(train)	|	Acc: 79.3%(train)
	Loss: 0.2331(valid)	|	Acc: 80.6%(valid)
Epoch: 7  | time in 0 minutes, 2 seconds
	Loss: 0.2255(train)	|	Acc: 82.0%(train)
	Loss: 0.2812(valid)	|	Acc: 76.3%(valid)
Epoch: 8  | time in 0 minutes, 1 seconds
	Loss: 0.2417(train)	|	Acc: 80.9%(train)
	Loss: 0.3265(valid)	|	Acc: 70.0%(valid)
Epoch: 9  | ti

In [6]:
import matplotlib.pyplot as plt
# Per-batch training losses as a float array. float() accepts both plain numbers
# and 0-d tensors (on any device), so this works however the losses were recorded.
losses = np.array([float(batch_loss) for batch_loss in losses_train], dtype=float)
plt.plot(losses)
plt.show()
print('Training accuracy', train_acc*100)

ModuleNotFoundError: No module named 'matplotlib.colorbar'

In [None]:
# Per-batch validation losses. float() accepts both plain numbers and 0-d tensors
# (on any device), so plotting works however the losses were recorded.
plt.plot([float(batch_loss) for batch_loss in losses_eval])
plt.show()

print('prediction accuracy', valid_acc*100)