`!pip install torchtext`

#### In this demo we will build a machine learning model to classify sms texts as ham or spam

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### SMS Spam Collection Dataset
Source: https://www.kaggle.com/uciml/sms-spam-collection-dataset


The files contain one message per line. Each line is composed by two columns: v1 contains the label (ham or spam) and v2 contains the raw text.

In [2]:
data = pd.read_csv('datasets/ham-spam/spam.csv', encoding='latin-1')

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### Cleaning Data

In [3]:
data = data.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

In [4]:
data = data.rename(index = str, columns = {'v1': 'labels', 'v2': 'text'})

data.head()

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [6]:
train.reset_index(drop=True), test.reset_index(drop=True)

(     labels                                               text
 0       ham  No I'm in the same boat. Still here at my moms...
 1      spam  (Bank of Granite issues Strong-Buy) EXPLOSIVE ...
 2       ham     They r giving a second chance to rahul dengra.
 3       ham     O i played smash bros  &lt;#&gt;  religiously.
 4      spam  PRIVATE! Your 2003 Account Statement for 07973...
 ...     ...                                                ...
 4452    ham  I came hostel. I m going to sleep. Plz call me...
 4453    ham                             Sorry, I'll call later
 4454    ham      Prabha..i'm soryda..realy..frm heart i'm sory
 4455    ham                         Nt joking seriously i told
 4456    ham                In work now. Going have in few min.
 
 [4457 rows x 2 columns],
      labels                                               text
 0       ham  Funny fact Nobody teaches volcanoes 2 erupt, t...
 1       ham  I sent my scores to sophas and i had to do sec...
 2      spam

In [7]:
train.head()

Unnamed: 0,labels,text
1978,ham,No I'm in the same boat. Still here at my moms...
3989,spam,(Bank of Granite issues Strong-Buy) EXPLOSIVE ...
3935,ham,They r giving a second chance to rahul dengra.
4078,ham,O i played smash bros &lt;#&gt; religiously.
4086,spam,PRIVATE! Your 2003 Account Statement for 07973...


In [8]:
test.head()

Unnamed: 0,labels,text
3245,ham,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
944,ham,I sent my scores to sophas and i had to do sec...
1044,spam,We know someone who you know that fancies you....
2484,ham,Only if you promise your getting out as SOON a...
812,spam,Congratulations ur awarded either å£500 of CD ...


In [9]:
train.shape, test.shape

((4457, 2), (1115, 2))

Saving Train and test data in csv files

In [10]:
train.to_csv('datasets/ham-spam/train.csv', index=False)
test.to_csv('datasets/ham-spam/test.csv', index=False)

In [11]:
!dir datasets\ham-spam

 Volume in drive D is SSD
 Volume Serial Number is F6B3-93A4

 Directory of D:\Google Drive\Jupyter Notebooks\Pluralsight - NLP with PyTorch\datasets\ham-spam

07/31/2020  10:53 PM    <DIR>          .
07/31/2020  10:53 PM    <DIR>          ..
06/14/2019  03:18 AM           503,663 spam.csv
08/13/2020  08:11 PM            98,560 test.csv
08/13/2020  08:11 PM           386,286 train.csv
               3 File(s)        988,509 bytes
               2 Dir(s)  920,706,551,808 bytes free


In [12]:
import numpy as np

import torch
import torchtext

from torchtext.data import Field, BucketIterator, TabularDataset

#### NLTK provides a function called word_tokenize() for splitting strings into tokens (nominally words). It splits tokens based on white space and punctuation.

In [13]:
import nltk
nltk.download('punkt')

from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Behnam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### The parameters of a Field specify how the data should be processed.We use the TEXT field to define how the text should be processed, and the LABEL field to process the labels.

In [14]:
# operates on text and tokenize it using the tokenizer from nltk
TEXT = torchtext.data.Field(tokenize = word_tokenize)

In [15]:
# operates on labels and convert them to numeric values
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [16]:
datafields = [("labels", LABEL), ("text", TEXT)]
# list of 2 tuples

#### the following code splits data into the canonical train/test splits as torchtext.datasets objects. It process the data using the Fields we have previously defined.
- If working with csv files, use `TabularDataset`

In [18]:
trn, tst = torchtext.data.TabularDataset.splits(path = './datasets/ham-spam', 
                                                train = 'train.csv',
                                                test = 'test.csv' ,    
                                                format = 'csv',
                                                skip_header = True,
                                                fields = datafields)

### Each element in `trn` and `tst` is an `example` object

In [20]:
trn[:5]

[<torchtext.data.example.Example at 0x1b6e50dffc8>,
 <torchtext.data.example.Example at 0x1b6e50e00c8>,
 <torchtext.data.example.Example at 0x1b6e50e0048>,
 <torchtext.data.example.Example at 0x1b6e50e0408>,
 <torchtext.data.example.Example at 0x1b6e522e808>]

#### We can see how many examples are in each split by checking their length.

In [21]:
print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

Number of training examples: 4457
Number of testing examples: 1115


### Let's check train number 5:

In [22]:
train.iloc[5]

labels                                                 ham
text      G says you never answer your texts, confirm/deny
Name: 4919, dtype: object

In [23]:
trn[5].__dict__.keys()

dict_keys(['labels', 'text'])

### Always perfom this check to make sure that label and text are correct¶

In [24]:
trn[5].text

['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']

In [25]:
trn[5].labels

'ham'

### Always perfom this check to make sure that label and text are correct 
- In the dataset

> 1st is `label`

> 2nd is `text`

The `vars()` function returns the `__dict__` attribute of the given object.

In [26]:
# a dict shape object which key is the label and value is a list of tokenized words
print(vars(trn.examples[5]))

{'labels': 'ham', 'text': ['G', 'says', 'you', 'never', 'answer', 'your', 'texts', ',', 'confirm/deny']}



#### Next, we have to build a vocabulary. This is a effectively a look up table where every unique word in your data set has a corresponding index (an integer). Each index is used to construct a one-hot vector for each word.
There are two ways effectively cut down our vocabulary, we can either only take the top $n$ most common words or ignore words that appear less than $m$ times. We'll do the former, only keeping the top 10,500 words.
The words that appear in examples but we have cut from the are replaced  with a special unknown  token.

In [27]:
# build_vocab creates a vocab of the input data in this case, training data
TEXT.build_vocab(trn, max_size = 10500)

In [28]:
LABEL.build_vocab(trn)

The vocab size is 10502 because, one of the addition tokens is the `unk` token and the other is a `pad` token.

In [29]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10502
Unique tokens in LABEL vocabulary: 2


### The additional 2 words in `TEXT.vocab` are:
- `pad` : to fill empty spaces for each example
- `unk`: to show words which are not present in train vocab but are in test vocab

#### We can also view the most common words in the vocabulary and their frequencies.

In [30]:
# most common 50 words in vocab with their indices
print(TEXT.vocab.freqs.most_common(50))

[('.', 3890), ('to', 1750), ('I', 1571), (',', 1468), ('you', 1460), ('?', 1256), ('!', 1134), ('a', 1067), ('...', 1007), ('the', 946), ('&', 772), ('i', 743), ('and', 669), ('in', 663), ('is', 646), (';', 641), ('u', 628), ('me', 586), (':', 570), ('for', 527), ('my', 494), ('of', 471), ('your', 461), ('it', 456), ('have', 395), ('on', 393), (')', 393), ('2', 390), ('that', 384), ("'s", 383), ("'m", 320), ('now', 317), ('are', 316), ('do', 311), ('call', 307), ('at', 301), ('or', 298), ('U', 295), ('not', 294), ("n't", 281), ('be', 275), ('lt', 267), ('gt', 267), ('with', 267), ('get', 265), ('will', 263), ('so', 252), ('#', 245), ('can', 243), ('ur', 237)]


### The integers you see beside each word, is the index position of that word in the `TEXT.vocab` dictionary

#### We can also see the vocabulary directly using either the `stoi` (string to int) or `itos` (int to string) method.

In [31]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '.', 'to', 'I', ',', 'you', '?', '!', 'a']


In [32]:
print(TEXT.vocab.stoi['really'])

206


In [33]:
print(LABEL.vocab.stoi)

defaultdict(None, {'ham': 0, 'spam': 1})


In [34]:
TEXT.vocab

<torchtext.vocab.Vocab at 0x1b6e59f6e88>

Now, we will create iterators that will iterate over these in the training/evaluation loop, and they return a batch of examples (indexed and converted into tensors) at each iteration.
#### We'll use a BucketIterator which is a special type of iterator that will return a batch of examples where each example is of a similar length, minimizing the amount of padding per example.

In [35]:
batch_size = 64

train_iterator, test_iterator = torchtext.data.BucketIterator.splits(
   (trn, tst),
    batch_size = batch_size,
    sort_key = lambda x: len(x.text), # sort based on the length of email text 
    sort_within_batch = False)

#### Build The Model

- <b>The embedding layer</b> is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (each word is a vector of length 15002)
- The RNN layer is our RNN which takes in our dense vector and the previous hidden state $h_{t-1}$, which it uses to calculate the next hidden state, $h_t$
- Finally, the linear layer takes the final hidden state and feeds it through a fully connected layer, $f(h_T)$, transforming it to the correct output dimension.


The RNN returns 2 tensors, `output` of size [sentence length, batch size, hidden dim] and `hidden` of size [1, batch size, hidden dim]. output is the concatenation of the hidden state from every time step, whereas hidden is simply the final hidden state. We verify this using the assert statement. Note the squeeze method, which is used to remove a dimension of size 1. Finally, we feed the last hidden state, hidden, through the linear layer, fc, to produce a prediction.

In [36]:
import torch.nn as nn

In [37]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
    
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
        
        hidden_1D = hidden.squeeze(0)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

In [38]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        #self.model = model.cuda()
        
        # Creates dense embeding for each word instead of one-hot repesentation
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        # hidden_dim is the output of the previous state of LSTM or RNN
        # we feed one word at a time to the LSTM layer
        # (1st word at time-step t, 2nd word at t+1, ...)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        # input is the last hidden state of the LSTM and output is the prediction
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, text):
        
        # text: (sentence_length, batch_size)
        # Every input sentence is a list of indices of one-hot encoded words
        embedded = self.embedding(text)
        # after applying embedded layer, every word in each sentence represented with its embedding
        # embedded: (sentence_length, batch_size, embedding_dim)
        
        # to prevent overfitting
        embedded_dropout = self.dropout(embedded)
        
        # the ouput batch of the embedding layer is the input to the RNN layer
        # the preprocessing ensures that all sentences in each batch has the same length
        output, (hidden, cell) = self.rnn(embedded_dropout)
        # output: (sentence_length, batch_size, hiden_dim)
        # output has all the hidden states for all words in a sentence
        # hidden: (1, batch_size, hiden_dim), the last hidden state for each sentence
        # cell is the output of last cell state for LSTM (Long Term Memory)
        
        # get rid of unnecessary dimension 1
        hidden_1D = hidden.squeeze(0)
        # hidden_1D: (batch_size, hiden_dim) only the last hidden state
        
        # to make sure that the output of the last hidden state is equal to hidden_1D
        assert torch.equal(output[-1, :, :], hidden_1D)
        
        return self.fc(hidden_1D)

In [109]:

class RNN(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
  
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        # we give text as an input to the model when we want to train it
        # print("text", text.shape) = (seq_length, 64)
        
        embedded = self.embedding(text)
        # print("embedded", embedded.shape) = (seq_length, 64, 100)
        
        output, (hidden, _) = self.rnn(embedded)
        #print("output", output.shape) = (seq_length, 64, 256) 
        # print("hidden", hidden.shape) = (1, 64, 256)
        
        hidden_1D = hidden.squeeze(0)
        # print("hidden_1D", hidden_1D.shape) = (64, 128)
        
        assert torch.equal(output[-1, :, :], hidden_1D)
                
        #print('last', self.fc(hidden_1D).shape) = (64, 1)
        
        return self.fc(hidden_1D)

#### We now create an instance of our RNN class.

- The input dimension is the dimension of the one-hot vectors, which is equal to the vocabulary size.
- The embedding dimension is the size of the dense word vectors.
- The hidden dimension is the size of the hidden states
- The output dimension is usually the number of classes, however in the case of only 2 classes the output value is between 0 and 1 and thus can be 1-dimensional, i.e. a single scalar real number.

In [110]:
input_dim = len(TEXT.vocab) # this network accepts one-hot encoded words 15002

embedding_dim = 100

hidden_dim = 256

output_dim = 1 # binary classification has only 1 neuron in the last layer

In [111]:
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)

In [112]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = 1e-6)

#### we will use `BCEWithLogitsLoss`  as our loss function - Creates a criterion that measures the Binary Cross Entropy between the target and the output
This loss combines a Sigmoid layer and the BCELoss in one single class.

In [113]:
criterion = nn.BCEWithLogitsLoss()

#### Training
- For each batch, we first zero the gradients. Each parameter in a model has a grad attribute which stores the gradient calculated by the criterion.
- We then feed the batch of sentences, batch.text, into the model
- The loss and accuracy are then calculated using our predictions and the labels, batch.labels, with the loss being averaged over all examples in the batch.
- We calculate the gradient of each parameter and then update the parameters using the gradients and optimizer algorithm
- Finally, we return the loss and accuracy

##### Calculating Accuracy 
We first feeds the predictions through a sigmoid layer, squashing the values between 0 and 1, we then round them to the nearest integer. This rounds any value greater than 0.5 to 1 (spam) and the rest to 0 (ham).

We then calculate how many rounded predictions equal the actual labels and average it across the batch.

In [114]:
# each batch has 64 sentences of different length
counter = 0
for batch in train_iterator:
    counter += 1
    print(batch.text.shape)
    
counter
#1st element is the length of sentences in each batch, the 2nd element is the batch_size

torch.Size([95, 64])
torch.Size([78, 64])
torch.Size([88, 64])
torch.Size([161, 64])
torch.Size([56, 64])
torch.Size([41, 64])
torch.Size([100, 64])
torch.Size([48, 64])
torch.Size([41, 64])
torch.Size([75, 64])
torch.Size([40, 64])
torch.Size([112, 64])
torch.Size([42, 64])
torch.Size([69, 64])
torch.Size([45, 64])
torch.Size([42, 64])
torch.Size([44, 64])
torch.Size([61, 64])
torch.Size([89, 64])
torch.Size([60, 64])
torch.Size([42, 64])
torch.Size([76, 64])
torch.Size([80, 64])
torch.Size([44, 64])
torch.Size([67, 64])
torch.Size([44, 64])
torch.Size([98, 64])
torch.Size([78, 64])
torch.Size([75, 64])
torch.Size([53, 64])
torch.Size([59, 64])
torch.Size([40, 64])
torch.Size([91, 64])
torch.Size([112, 64])
torch.Size([67, 64])
torch.Size([61, 64])
torch.Size([219, 64])
torch.Size([113, 64])
torch.Size([49, 64])
torch.Size([42, 64])
torch.Size([112, 64])
torch.Size([44, 64])
torch.Size([53, 64])
torch.Size([56, 64])
torch.Size([44, 64])
torch.Size([79, 64])
torch.Size([45, 64])
torch.

70

### Run the model on GPU

In [115]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [116]:
def train(model, iterator, optimizer, criterion):
    

    model.to(device)

    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        batch.text = batch.text.to(device)
        batch.labels = batch.labels.to(device)
        
        # output of the model is: (batch_size, 1) and we get rid of 1 with sueeze(dim=1)
        predictions = model(batch.text).squeeze(1)
                
        loss = criterion(predictions, batch.labels)
        
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.labels).float()
        
        acc = correct.sum() / len(correct)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#### the loss is decreasing with each epoch and we get a final accuracy of ~85%

In [117]:
num_epochs = 5

for epoch in range(num_epochs):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% ')

| Epoch: 01 | Train Loss: 0.643 | Train Acc: 85.22% 
| Epoch: 02 | Train Loss: 0.631 | Train Acc: 85.26% 
| Epoch: 03 | Train Loss: 0.619 | Train Acc: 85.32% 
| Epoch: 04 | Train Loss: 0.607 | Train Acc: 85.47% 
| Epoch: 05 | Train Loss: 0.596 | Train Acc: 85.40% 


evaluate is similar to train, with a few modifications as you don't want to update the parameters when evaluating.

In [None]:
epoch_loss = 0
epoch_acc = 0

In [None]:
model.eval()

In [None]:
with torch.no_grad(): # turn off gradinet calculations

    for batch in test_iterator:
        
        batch.text = batch.text.to(device)
        batch.labels = batch.labels.to(device)
        
        predictions = model(batch.text).squeeze(1)

        loss = criterion(predictions, batch.labels)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        
        correct = (rounded_preds == batch.labels).float() 
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()

test_loss = epoch_loss / len(test_iterator)
test_acc  = epoch_acc / len(test_iterator)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

# Another example
- [here](https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/)

Quora wants to keep track of insincere questions on their platform so as to make users feel safe while sharing their knowledge. An insincere question in this context is defined as a question intended to make a statement rather than looking for helpful answers. To break this down further, here are some characteristics that can signify that a particular question is insincere:

Has a non-neutral tone
Is disparaging or inflammatory
Isn’t grounded in reality
Uses sexual content (incest, bestiality, pedophilia) for shock value, and not to seek genuine answers
The training data includes the question that was asked, and a flag denoting whether it was identified as insincere (target = 1). The ground-truth labels contain some amount of noise, i.e. they are not guaranteed to be perfect. Our task will be to identify if a given question is ‘insincere’.

In [None]:
#deal with tensors
import torch   

#handling text data
from torchtext import data    

In [None]:
#Reproducing same results
SEED = 2019

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

In [None]:
from nltk import word_tokenize

TEXT = data.Field(tokenize=word_tokenize,batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [None]:
fields = [(None, None), ('text',TEXT),('label', LABEL)]

In [None]:
#loading custom dataset
training_data=data.TabularDataset(path = 'datasets/quora.csv',format = 'csv',fields = fields,skip_header = True)

#print preprocessed text
print(vars(training_data.examples[0]))

In [None]:
import random
train_data, valid_data = training_data.split(split_ratio=0.7, random_state = random.seed(SEED))

In [None]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   

In [None]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [None]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [None]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [None]:
#architecture
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

In [None]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
#load weights
path='/content/saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();

#inference 
import spacy
nlp = spacy.load('en')

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return prediction.item()                                   
view rawinference.py hosted with ❤ by GitHub
Amazing! Let us use this model to make predictions for few questions:

#make predictions
predict(model, "Are there any sports that you don't like?")

#insincere question
predict(model, "Why Indian girls go crazy about marrying Shri. Rahul Gandhi ji?")