# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint



### Not for Grading

## NLP with CNNs

In [None]:
#@title Case Study Walkthrough
#@markdown  NLP with CNNs
from IPython.display import HTML

# Embed the walkthrough video. The HTML object is the last expression of the
# cell, so the notebook renders it as the cell's output.
HTML("""<video width="320" height="240" controls>
  <source src="https://cdn.talentsprint.com/talentsprint/archives/sc/aiml/aiml_2018_b7_hyd/preview_videos/nlp_with_cnns.mp4">
</video>
""")

The objective of this experiment is to see the application of Convolutional Neural Networks in NLP.

#### Note that this case study is based on this [paper](http://www.aclweb.org/anthology/D14-1181).

In [None]:
! wget https://cdn.talentsprint.com/aiml/Experiment_related_data/week9/Exp2/AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin&sa=D&source=hangouts&ust=1550651743825000&usg=AFQjCNHh2LSwNi9czsqAAuBLvx_vDeUE_Q


In [None]:
# List the current working directory (e.g. to check that the vectors file is there).
!ls

## Importing required packages

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
import re
from copy import deepcopy
def flatten(l):
    """Return one flat list holding the items of every sub-list of ``l``, in order."""
    return [item for sublist in l for item in sublist]


# Seed every random number generator this notebook draws from. Seeding only
# the `random` module left NumPy (random embedding rows) and PyTorch (layer
# initialisation, dropout) unseeded, so results changed from run to run.
SEED = 1024
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Code for accessing CUDA

In [None]:
# Detect CUDA once and derive the tensor constructors the rest of the notebook
# uses, so the same code runs on GPU and CPU.
USE_CUDA = torch.cuda.is_available()
gpus = [0]
# Select the GPU only when CUDA is actually available: calling
# torch.cuda.set_device on a CPU-only machine fails.
if USE_CUDA:
    torch.cuda.set_device(gpus[0])
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

## Function to split the data into batches

In [None]:
def getBatch(batch_size, train_data):
    """Shuffle ``train_data`` in place and yield it as consecutive mini-batches.

    Args:
        batch_size: Number of items per batch; must be a positive integer.
        train_data: Mutable list of samples. It is shuffled in place with the
            ``random`` module's global RNG before batching.

    Yields:
        Slices of ``train_data`` holding ``batch_size`` items each; the last
        slice holds the remainder and may be shorter. Nothing is yielded for
        an empty ``train_data``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        # A zero step would never advance through the data.
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex: sindex + batch_size]

## Function to add the padding to batches if required

In [None]:
def pad_to_batch(batch):
    """Right-pad every input of a batch to the longest length and stack the batch.

    Args:
        batch: Sequence of ``(x, y)`` pairs. Each ``x`` is indexed with
            ``size(1)`` and concatenated along dimension 1, so it is expected
            to be a ``(1, seq_len)`` tensor of word ids.

    Returns:
        A pair ``(inputs, targets)``: the padded inputs concatenated along
        dimension 0, and the targets concatenated and flattened to 1-D.
        Padding uses the id stored under ``'<PAD>'`` in ``word2index``.
    """
    sequences, labels = zip(*batch)
    longest = max(seq.size(1) for seq in sequences)

    padded = []
    for seq in sequences:
        missing = longest - seq.size(1)
        if missing > 0:
            # Build a (1, missing) row of <PAD> ids and append it on the right.
            filler = Variable(LongTensor([word2index['<PAD>']] * missing)).view(1, -1)
            seq = torch.cat([seq, filler], 1)
        padded.append(seq)

    return torch.cat(padded), torch.cat(labels).view(-1)

## Function to prepare the sequence

In [None]:
def prepare_sequence(seq, to_index):
    """Convert a sequence of tokens into a ``LongTensor`` Variable of ids.

    Args:
        seq: Iterable of tokens.
        to_index: Mapping from token to integer id. Tokens for which
            ``to_index.get`` returns ``None`` use the ``"<UNK>"`` entry.

    Returns:
        ``Variable(LongTensor(ids))`` with one id per token, in order.
    """
    idxs = []
    for token in seq:
        token_id = to_index.get(token)
        if token_id is None:
            token_id = to_index["<UNK>"]
        idxs.append(token_id)
    return Variable(LongTensor(idxs))

## Data load & Preprocessing

### TREC question dataset (http://cogcomp.org/Data/QA/QC/)

The following command gets the required TREC question dataset.

In [None]:
# Fetch the TREC training file; the load cell below reads it as 'train_5500.label'.
!wget http://cogcomp.org/Data/QA/QC/train_5500.label

The task involves
classifying a question into 6 question
types (whether the question is about a person,
a location, numeric information, etc.).

## Load the data

In [None]:
# Read every line of the TREC file. The context manager closes the file handle
# as soon as the lines are read instead of leaving it open.
with open('train_5500.label', 'r', encoding='latin-1') as label_file:
    data = label_file.readlines()

In [None]:
# Preview the first five entries of `data`.
data[:5]

## Split the data by separating the labels

In [None]:
# Turn every raw line into a [question, label] pair.
# - Split at the FIRST ':' only, so a ':' inside the question text no longer
#   cuts the question short.
# - Strip the line ending explicitly instead of dropping the last character,
#   which would eat a real character on a final line without a newline.
# NOTE(review): everything after the first ':' is kept as the question text;
# if the file stores a fine-grained label there, it stays as the first token —
# confirm this is intended.
data = [
    [question, label]
    for label, question in (d.rstrip('\r\n').split(':', 1) for d in data)
]

In [None]:
# Preview the first five entries again; after the split above each entry is a
# [question, label] pair.
data[:5]

In [None]:
# Transpose the [question, label] pairs into two parallel sequences.
# X is turned into a list because later cells assign into it; y stays a tuple.
X, y = zip(*data)
X = [*X]

In [None]:
# Show the first five questions, then the first five labels, one per line.
print(X[:5], y[:5], sep='\n')

## Print the labels in the data

In [None]:
# Display the distinct labels that occur in y.
set(y)

## Number masking 

In [None]:
# Replace every digit with '#' and split each question into whitespace tokens.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
digit_pattern = re.compile(r'\d')
for i, x in enumerate(X):
    # Entries that are already token lists are left alone, so running this
    # cell a second time no longer crashes.
    if isinstance(x, str):
        X[i] = digit_pattern.sub('#', x).split()

Replacing the numbers with # (hash)

It reduces the search space. 

For example, 

my birthday is 12.22 ==> my birthday is ##.##

In [None]:
# Display the first two entries of X after number masking and tokenisation.
X[:2]

## Building the Vocabulary

In [None]:
# Collect the distinct tokens in sorted order. Iterating a set of strings gives
# a different order on each interpreter run (string hashing is randomised), and
# word ids are assigned in vocabulary order, so an unsorted vocabulary made
# the word-to-id mapping irreproducible.
vocab = sorted(set(flatten(X)))
print(len(vocab))
# Print a sample only; printing the whole vocabulary floods the cell output.
print(vocab[:20])

## Check for number of classes

In [None]:
# The number of distinct labels in y is the number of classes to predict.
len(set(y))  # number of classes

## Create the index to words in the vocabulary

In [None]:
# Reserve the first two ids for the special padding and unknown-word tokens.
word2index = dict([('<PAD>', 0), ('<UNK>', 1)])
print(len(word2index))

In [None]:
# Look up the two special tokens, then one ordinary vocabulary word.
# dict.get returns None for any key that is not (yet) in word2index.
print(word2index.get('<PAD>'))
print(word2index.get('<UNK>'))
print(word2index.get(vocab[1]))

In [None]:
# Give every vocabulary word the next free id; words that already have an id
# (such as the special tokens) keep it.
for token in vocab:
    if word2index.get(token) is not None:
        continue
    word2index[token] = len(word2index)

# Reverse mapping: id -> word.
index2word = dict((idx, token) for token, idx in word2index.items())

## Create the index to target

In [None]:
# Assign class ids in sorted label order. Iterating a bare set of strings gives
# a different order on each interpreter run (string hashing is randomised),
# so the label-to-id mapping was not reproducible between runs.
target2index = {label: idx for idx, label in enumerate(sorted(set(y)))}

# Reverse mapping: class id -> label.
index2target = {idx: label for label, idx in target2index.items()}

## Preparing the data in tensor format

In [None]:
X_p, y_p = [], []
for words, label in zip(X, y):
    # Token list -> word ids, reshaped to a single row (1, seq_len).
    token_ids = prepare_sequence(words, word2index)
    X_p.append(token_ids.view(1, -1))
    # Label -> class id, wrapped as a (1, 1) tensor.
    label_id = target2index[label]
    y_p.append(Variable(LongTensor([label_id])).view(1, -1))

## Zipping both the data and labels and shuffle randomly

In [None]:
# Pair every input tensor with its label tensor, then shuffle the pairs in
# place using the `random` module's global generator.
data_p = [*zip(X_p, y_p)]
random.shuffle(data_p)

## Split the data into train and test

In [None]:
# The first 90% of the shuffled pairs are used for training; the rest is held
# out for testing.
split_at = int(len(data_p) * 0.9)
train_data, test_data = data_p[:split_at], data_p[split_at:]

## Load Pretrained word vector

In [None]:
import gensim

# Load the pretrained word2vec vectors from the binary file in the working
# directory.
# NOTE(review): the name `model` is reused further down for the CNN
# classifier; cells that index the word vectors must run before that cell.
model = gensim.models.KeyedVectors.load_word2vec_format('AIML_DS_GOOGLENEWS-VECTORS-NEGATIVE-300_STD.bin', binary=True)

# Number of words with a pretrained vector. gensim 4 renamed `index2word` to
# `index_to_key`, so use whichever attribute the installed version provides.
len(model.index_to_key if hasattr(model, 'index_to_key') else model.index2word)

In [None]:
# Show only the first few pretrained words; displaying the entire list floods
# the cell output. gensim 4 renamed `index2word` to `index_to_key`, so use
# whichever attribute the installed version provides.
(model.index_to_key if hasattr(model, 'index_to_key') else model.index2word)[:20]

In [None]:
# Show a sample of the dataset vocabulary instead of dumping every key.
list(word2index)[:20]

In [None]:
# Shape of the pretrained vector stored for the word 'pail'.
print(model['pail'].shape)
# Shape of a random vector of length 300, the fallback used below for words
# without a pretrained vector. Note: this draws 300 values from NumPy's global
# RNG, so it advances the random state seen by later cells.
print(np.random.randn(300).shape)

## Get the vector corresponding to the word using the pretrained model

In [None]:
# Build the embedding matrix row by row. Rows follow the insertion order of
# word2index, which is also the order in which ids were assigned, so row i
# belongs to the word with id i.
pretrained = []

for key in word2index:
    try:
        pretrained.append(model[key])
    except KeyError:
        # Only a missing word is expected here; it gets a random vector. The
        # original bare `except:` also hid unrelated failures (for example a
        # `model` that no longer refers to the word vectors).
        pretrained.append(np.random.randn(300))

pretrained_vectors = np.vstack(pretrained)

## Modeling 


![alttxt](https://cdn.talentsprint.com/aiml/Casestudies_slides/NLP_with_CNN/NLP_with_CNN.png)





The above image is borrowed from this [paper.](http://www.aclweb.org/anthology/D14-1181)

## Define CNN classifier architecture for classification as per the paper 

In [None]:
class CNNClassifier(nn.Module):
    """Convolutional sentence classifier.

    Pipeline: embedding lookup -> one convolution per kernel size ->
    ReLU -> max-over-time pooling -> concatenation -> optional dropout ->
    linear layer -> log-softmax.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each word vector.
        output_size: Number of classes.
        kernel_dim: Number of feature maps per kernel size.
        kernel_sizes: Heights (in tokens) of the convolution kernels.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Each kernel covers K consecutive tokens and the full embedding width.
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])
        # Shortest sequence that every convolution can process.
        self.min_seq_len = max(kernel_sizes, default=0)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)

    def init_weights(self, pretrained_word_vectors, is_static=False):
        """Replace the embedding table with pretrained vectors.

        Args:
            pretrained_word_vectors: NumPy array of word vectors; it is
                converted to a float32 tensor and becomes the embedding weight.
            is_static: When true, the embedding weight is frozen
                (``requires_grad = False``).
        """
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False

    def forward(self, inputs, is_training=False):
        """Return per-class log-probabilities for a batch of word-id rows.

        Args:
            inputs: Integer tensor of word ids; expected shape
                ``(batch, seq_len)`` so that the embedded batch becomes 4-D
                after the channel dimension is added.
            is_training: Apply dropout to the pooled features when true.

        Returns:
            Tensor of log-probabilities (log-softmax over dimension 1).
        """
        embedded = self.embedding(inputs)

        # A sequence shorter than the largest kernel makes the convolution
        # fail (this happens with single short questions at test time), so
        # pad the sequence axis with zero vectors up to the minimum length.
        shortfall = self.min_seq_len - embedded.size(1)
        if shortfall > 0:
            embedded = F.pad(embedded, (0, 0, 0, shortfall))

        embedded = embedded.unsqueeze(1)  # add the single input channel
        feature_maps = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-over-time pooling: keep the strongest response of every feature map.
        pooled = [F.max_pool1d(fm, fm.size(2)).squeeze(2) for fm in feature_maps]

        concated = torch.cat(pooled, 1)

        if is_training:
            concated = self.dropout(concated)
        out = self.fc(concated)
        return F.log_softmax(out, 1)

## Training the model 

## Note: training takes a while if you use only the CPU.

## Set the parameters

In [None]:
# Optimisation schedule
EPOCH = 5        # number of passes over train_data
BATCH_SIZE = 50  # samples per mini-batch
LR = 1e-3        # learning rate passed to the optimizer

# Convolution layout
KERNEL_DIM = 100        # feature maps per kernel size
KERNEL_SIZES = [2] * 3  # three kernels, each spanning two tokens

## Set up the defined CNN model and  Initialize embedding matrix using pretrained vectors

In [None]:
# Size the classifier to the vocabulary and the label set; 300 is the width of
# the word vectors.
model = CNNClassifier(
    vocab_size=len(word2index),
    embedding_dim=300,
    output_size=len(target2index),
    kernel_dim=KERNEL_DIM,
    kernel_sizes=KERNEL_SIZES,
)
# Start from the pretrained embedding matrix; is_static keeps its default, so
# the embeddings stay trainable.
model.init_weights(pretrained_vectors)

## Switch on the CUDA

In [None]:
# Move the model to the GPU when CUDA is available; otherwise keep it as is.
model = model.cuda() if USE_CUDA else model

## Define loss function and optimizer

In [None]:
# The model's forward() already returns log-softmax outputs; CrossEntropyLoss
# applies log-softmax again, which leaves normalised log-probabilities unchanged.
loss_function = nn.CrossEntropyLoss()
# Adam updates every parameter of the model with learning rate LR.
optimizer = optim.Adam(params=model.parameters(), lr=LR)

## Train the data batch wise

In [None]:
# Make sure dropout is active even if the model was switched to eval mode.
model.train()

for epoch in range(EPOCH):
    losses = []
    for batch in getBatch(BATCH_SIZE, train_data):
        inputs, targets = pad_to_batch(batch)

        model.zero_grad()
        preds = model(inputs, True)  # True -> apply dropout

        loss = loss_function(preds, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    # Report the mean loss of the whole epoch. The previous `i % 100 == 0`
    # check fired at i == 0, when `losses` held a single value, and the losses
    # collected after the last multiple of 100 were never reported at all.
    if losses:
        print("[%d/%d] mean_loss : %0.2f" % (epoch, EPOCH, np.mean(losses)))

## Predict the test data with the trained model and calculate the test accuracy 

In [None]:
# Count the test questions whose predicted class equals the true class.
accuracy = 0
# No gradients are needed for evaluation.
with torch.no_grad():
    for test in test_data:
        # Move the input to the GPU only when CUDA is in use; the previous
        # unconditional .cuda() call crashed on CPU-only machines.
        inputs = test[0].cuda() if USE_CUDA else test[0]
        pred = model(inputs).max(1)[1].item()  # index of the highest log-probability
        target = test[1].item()
        if pred == target:
            accuracy += 1

# Accuracy as a percentage of the test set.
print(accuracy/len(test_data) * 100)