# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint

### Not for grading

## Sentence Level Author identification using RNN

In [None]:
#@title Case Study Walkthrough
#@markdown  Sentence Level Author identification using RNN
from IPython.display import HTML

# Markup for the embedded walkthrough video; kept in a named string so the
# final expression of the cell is just the rendered HTML object.
walkthrough_markup = """<video width="320" height="240" controls>
  <source src="https://cdn.talentsprint.com/talentsprint/archives/sc/aiml/aiml_2018_b7_hyd/preview_videos/sentence_level_author_identification_using_rnn.mp4">
</video>
"""

HTML(walkthrough_markup)

In [None]:
# Download the three data splits and the pretrained checkpoint.
# -nc (no-clobber) skips any file that already exists locally, so re-running
# this cell neither re-downloads nor leaves duplicates such as test.csv.1.
! wget -nc https://cdn.talentsprint.com/aiml/Experiment_related_data/week12/Exp2/test.csv
! wget -nc https://cdn.talentsprint.com/aiml/Experiment_related_data/week12/Exp2/train.csv
! wget -nc https://cdn.talentsprint.com/aiml/Experiment_related_data/week12/Exp2/val.csv
! wget -nc https://cdn.talentsprint.com/aiml/CaseStudies/Sentence_level_rnn_trained_0.66.pt
    

In [None]:
ls

[0m[01;34msample_data[0m/                        train.csv
Sentence_level_rnn_trained_0.66.pt  U4W19_CS_Author_identification_RNN.ipynb
test.csv                            val.csv


### Importing required packages

In [None]:
import pandas as pd
import numpy as np
import gensim
import re

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

### Assign numbers to the labels using dictionary data structure


In [None]:
label = {'EAP':0, 'HPL':1, 'MWS':2}

### Load the train data

In [None]:
train_data = pd.read_csv('train.csv', encoding='latin1')

In [None]:
train_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


### Collect the sentences from the train data

In [None]:
sentences = train_data[['text','author']].values

### Load the test data

In [None]:
test_data = pd.read_csv('test.csv', encoding='latin1')

### Collect the sentences from the test data

In [None]:
test_sentences = test_data[['text']].values.flatten()

### Define the class for tokenizing / preprocessing sentences (word extraction and lowercasing)

In [None]:
#stopWords = pd.read_csv('stopwords.txt').values

class MySentences(object):
    """Re-iterable stream of tokenised sentences read from CSV files.

    Each pass over the object reads every file in ``fnamelist`` with pandas,
    takes its ``text`` column and yields one list of lowercased words per row.
    A word is a run of 3 to 16 letters: one letter of either case followed by
    2 to 15 lowercase letters.

    Attributes:
        fnamelist: CSV paths, read in order on every iteration.
        vocabulary: Set of every word yielded so far; it is filled as a side
            effect of iterating and is empty until the first pass.
        data, sentences: DataFrame and ``text`` values of the file read most
            recently (set during iteration).
    """

    # Compiled once for the class instead of being re-parsed for every row.
    _WORD_PATTERN = re.compile(r'(\b[A-Za-z][a-z]{2,15}\b)')

    def __init__(self, fnamelist):
        self.fnamelist = fnamelist
        # Creating a set of vocabulary
        self.vocabulary = set()

    def __iter__(self):
        """Yield the list of lowercased words of each row, file by file."""
        for fname in self.fnamelist:
            self.data = pd.read_csv(fname, encoding='latin1')
            self.sentences = self.data.text.values
            for line in self.sentences:
                if isinstance(line, str):
                    # Lowercasing only; no stemming is applied.
                    words = [word.lower() for word in self._WORD_PATTERN.findall(line)]
                else:
                    # pandas reads an empty text field as NaN (a float), which
                    # the regex cannot scan. Yield an empty token list so the
                    # stream still produces exactly one item per row.
                    words = []
                self.vocabulary.update(words)
                yield words

In [None]:
# Token stream over all three splits. Each pass over `sentences` re-reads the
# listed CSV files and yields one list of lowercased words per row.
# Note: this rebinds the name `sentences`, which was previously assigned the
# (text, author) array taken from train_data.
sentences = MySentences(['train.csv', 'val.csv','test.csv'])
# Uncomment to print every token list:
# for i in sentences:
#     print(i)

### Use `gensim.models.Word2Vec` to learn word vectors from the sentences and save the model as a .bin file

In [None]:
# Train Word2Vec on the token lists produced by `sentences`.
# min_count=1 is gensim's frequency threshold for discarding rare words; at 1,
# words that appear only once are kept as well.
# NOTE(review): the vector size is left at the library default, while the RNN
# later in this notebook is built with input size 100 - confirm the two agree.
model = gensim.models.Word2Vec(sentences, min_count=1)
# Persist the trained model to disk under the name "AuthID2Vec.bin".
model.save("AuthID2Vec.bin")

### Check the number of sentences in the corpus

In [None]:
model.corpus_count

27971

### Import required torch packages

In [None]:
import torch 
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchvision.transforms as transforms
import os

### Define custom dataset loader

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Sentence dataset backed by one CSV file with ``id`` and ``text`` columns.

    With ``train=True`` the file must also contain an ``author`` column and
    items are ``(sentence, label)`` pairs, the label being the integer code
    EAP=0, HPL=1, MWS=2. With ``train=False`` items are ``(sentence, id)``
    pairs and no labels are built.
    """

    def __init__(self, data_file_path,  train=True):
        self.data_file_path = data_file_path
        self.train = train
        frame = pd.read_csv(data_file_path, encoding='latin1')
        self.data = frame
        self.ids = frame.id.values
        self.sentences = frame.text.values
        if not self.train:
            # Unlabelled split: nothing more to prepare.
            return
        self.label_dict = {'EAP':0, 'HPL':1, 'MWS':2}
        # An author code missing from label_dict raises KeyError here.
        self.labels = [self.label_dict[author] for author in frame.author.values]

    def __len__(self):
        """Number of rows in the CSV file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(sentence, label)`` when training, else ``(sentence, id)``."""
        text = self.sentences[index]
        target = self.labels[index] if self.train else self.ids[index]
        return text, target

### Set the batch size




In [None]:
batch_size = 16

### Use the custom dataset loader to load the .csv files for train, val and test data into batches

In [None]:
# Build one dataset per split. The validation split is also created with
# train=True so that its items are (sentence, label) pairs; the test split uses
# train=False and yields (sentence, id) pairs instead.
train_set = CustomDataset("train.csv", train=True)
val_set = CustomDataset("val.csv", train=True)
test_set = CustomDataset("test.csv", train=False)

In [None]:
# One DataLoader per split, all with the same batch size. The train and
# validation loaders shuffle; the test loader keeps the file order.
trainloader, valloader, testloader = (
    torch.utils.data.DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in ((train_set, True), (val_set, True), (test_set, False))
)

In [None]:
# Peek at the first training mini-batch: print its sentences and the size of
# the label tensor. Nothing is printed if the loader yields no batch.
first_batch = next(iter(trainloader), None)
if first_batch is not None:
    X, y = first_batch
    print(X)
    print(y.size())

('Ay, ay," continued he, observing my face expressive of suffering, "M.', 'Such was the maniac language of her enthusiasm.', 'It was small relief to him that our discipline should gain us success in such a conflict; while plague still hovered to equalize the conqueror and the conquered, it was not victory that he desired, but bloodless peace.', 'The inhabitants of the island, and of the fort, thronged out, of course, to see the balloon; but it was with the greatest difficulty that any one could be made to credit the actual voyage the crossing of the Atlantic.', 'It is indeed demonstrable that every such impulse given the air, must, in the end, impress every individual thing that exists within the universe; and the being of infinite understanding the being whom we have imagined might trace the remote undulations of the impulse trace them upward and onward in their influences upon all particles of an matter upward and onward for ever in their modifications of old forms or, in other words

### Load the saved gensim model which contains the word vectors

In [None]:
model_load = gensim.models.Word2Vec.load('AuthID2Vec.bin')

### Set the device to CUDA if available, otherwise CPU

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Creating the recurrent neural network

In [None]:
### Creating recurrent neural network
class RNN(nn.Module):
    """GRU sentence classifier working on pre-trained word vectors.

    Each sentence is split into words, every word is looked up in a
    word-embedding model, the resulting vector sequence is run through a GRU,
    and the GRU output at the last word is mapped by a linear layer to
    ``output_size`` class scores, which are normalised with a softmax.

    Args:
        input_size: Length of one word vector.
        hidden_size: Number of GRU hidden units.
        output_size: Number of classes.
        n_layers: Number of stacked GRU layers.
        embedding_model: Optional object whose ``wv`` attribute supports
            ``word in wv`` and ``wv[word]``. When omitted, the notebook-level
            ``model_load`` is looked up at call time.

    NOTE(review): ``forward`` returns probabilities. ``nn.CrossEntropyLoss``
    applies log-softmax itself and is documented to expect raw scores, so
    training on these outputs squashes the gradients; consider training on
    ``self.decoder`` outputs directly.
    """

    # One letter of either case followed by 2-15 lowercase letters.
    _WORD_PATTERN = re.compile(r'(\b[A-Za-z][a-z]{2,15}\b)')

    def __init__(self, input_size, hidden_size, output_size, n_layers=1, embedding_model=None):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.embedding_model = embedding_model
        self.gru = nn.GRU(input_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)
        # Normalise over the class axis. With no explicit dim, PyTorch's legacy
        # rule picks dim 0 for a 3-D input, which here is the batch axis.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input):
        """Classify a batch of raw sentences.

        Args:
            input: Non-empty iterable of sentence strings.

        Returns:
            Tensor of shape ``(len(input), 1, output_size)`` holding class
            probabilities that sum to 1 along the last axis.
        """
        outputs = []
        for sentence in input:
            hidden = self.init_hidden()
            word_embeddings = self.get_embedding(sentence)
            if word_embeddings:
                # (seq_len, 1, input_size): the whole sentence in one GRU call,
                # equivalent to feeding the words one at a time.
                sequence = torch.stack(word_embeddings).unsqueeze(1).to(hidden.device)
                output, hidden = self.gru(sequence, hidden)
                last_output = output[-1:]
            else:
                # No usable word: fall back to the zero initial state of the
                # top layer. This is deterministic and has the right width,
                # rather than reusing the previous sentence's output or
                # random numbers.
                last_output = hidden[-1:]
            outputs.append(last_output)
        outputs = torch.cat(outputs)
        outputs = self.softmax(self.decoder(outputs))
        return outputs

    def init_hidden(self):
        """Return a zero hidden state on the same device as the model weights."""
        device = next(self.parameters()).device
        return torch.zeros(self.n_layers, 1, self.hidden_size, device=device)

    def get_embedding(self, sentence):
        """Return the list of word-vector tensors for ``sentence``.

        Words are extracted with the class regex and lowercased. Words absent
        from the embedding model are skipped, as is a non-string sentence.
        """
        if not isinstance(sentence, str):
            return []
        source = self.embedding_model if self.embedding_model is not None else model_load
        # `.wv` is available in gensim 3.x and 4.x; indexing the model object
        # directly was removed in gensim 4.
        vectors = source.wv
        embedding = []
        for word in self._WORD_PATTERN.findall(sentence):
            word = word.lower()
            if word in vectors:
                embedding.append(torch.tensor(vectors[word], dtype=torch.float32))
        return embedding

### Implement the RNN by setting up the required parameters

In [None]:
# Model sizes (these are layer widths, not layer counts).
# The GRU consumes one word vector per step, so its input size is read from the
# loaded embedding model instead of being repeated here as a literal.
INPUT_SIZE = model_load.wv.vector_size
HIDDEN_SIZE = 20   # GRU hidden units
NUM_CLASSES = 3    # one output per author label (EAP, HPL, MWS)

rnn = RNN(INPUT_SIZE, HIDDEN_SIZE, NUM_CLASSES, n_layers=1)
lr = 0.001 # learning rate
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr) # Set the optimizer
## Loss function
criterion = nn.CrossEntropyLoss()

### Train and test RNN model

### This will take quite a lot of time

In [None]:
NUM_EPOCHS = 100

losses = []        # loss of every training batch, across all epochs
val_accuracy = 0   # best validation accuracy seen so far

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    rnn.train()
    epoch_losses = []
    correct_train = 0
    seen_train = 0
    for X, y in trainloader:
        optimizer.zero_grad()
        output = rnn(X)
        scores = output.squeeze(1)          # (batch, classes)
        y = y.to(scores.device)
        loss = criterion(scores, y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        epoch_losses.append(loss.item())
        _, predicted = torch.max(scores.detach(), 1)
        correct_train += predicted.eq(y).sum().item()
        # Count the samples actually seen: the last batch may be smaller than
        # the batch size, so "batches * batch size" over-counts.
        seen_train += y.size(0)
    # The average is over this epoch's batches only, so it shows the progress
    # made in the epoch rather than a running mean over all epochs.
    print('Epoch: {}, Train Accuracy: {}, Average Loss: {}'.format(
        epoch, correct_train / seen_train, sum(epoch_losses) / len(epoch_losses)))

    # ---- validation pass (no gradients needed) ----
    rnn.eval()
    correct_val = 0
    with torch.no_grad():
        for X, y in valloader:
            output = rnn(X)
            _, predicted = torch.max(output.squeeze(1), 1)
            correct_val += predicted.eq(y.to(predicted.device)).sum().item()
    # Divide by the real size of the validation set instead of a fixed number.
    epoch_val_accuracy = correct_val / len(valloader.dataset)
    if val_accuracy < epoch_val_accuracy:
        val_accuracy = epoch_val_accuracy
        # The file name carries the accuracy rounded to two decimals, so a
        # later checkpoint with the same rounded value overwrites the earlier
        # one (including a downloaded file of that name).
        torch.save(rnn.state_dict(), 'Sentence_level_rnn_trained_{:.2f}.pt'.format(val_accuracy))
    print('Epoch: {}, Validation Accuracy: {}'.format(epoch, epoch_val_accuracy))

Epoch: 0, Train Accuracy: 0.4516451414514145, Average Loss: 1.0788897133694657
Epoch: 0, Validation Accuracy: 0.4857142857142857
Epoch: 1, Train Accuracy: 0.5349015990159902, Average Loss: 1.0665253377708561
Epoch: 1, Validation Accuracy: 0.5452887537993921
Epoch: 2, Train Accuracy: 0.5498923739237392, Average Loss: 1.0594833154519985
Epoch: 2, Validation Accuracy: 0.5562310030395137
Epoch: 3, Train Accuracy: 0.559040590405904, Average Loss: 1.0547626712811975
Epoch: 3, Validation Accuracy: 0.5680851063829787
Epoch: 4, Train Accuracy: 0.5665744157441575, Average Loss: 1.0515242544340764
Epoch: 4, Validation Accuracy: 0.5785714285714286
Epoch: 5, Train Accuracy: 0.5782595325953259, Average Loss: 1.0486958682683707
Epoch: 5, Validation Accuracy: 0.5844984802431611
Epoch: 6, Train Accuracy: 0.573570110701107, Average Loss: 1.0467121162893818
Epoch: 6, Validation Accuracy: 0.5764437689969605
Epoch: 7, Train Accuracy: 0.5851783517835178, Average Loss: 1.0450556893428606
Epoch: 7, Validation