## Making a vocabulary object

In [90]:
import pandas as pd
import numpy as np 
import re 

In [91]:
class Vocabulary():

    def __init__(self, corpus):
        self.corpus = corpus
        self.max_len = max([len(x) for x in corpus])

    def createVocab(self):
        self.vocab = {}
        i = 0
        for line in self.corpus:
            for word in line.split(" "):
                word = re.sub(r"[+,'-,.!?]", "", word).lower()
                if word not in self.vocab:
                    self.vocab.update({word: i})
                    i += 1

    def vect_helper(self, encoding, words, i):
        indexes = [self.vocab.get(x) for x in words]
        for i, index in enumerate(indexes):
            encoding[i][index] = 1
        return encoding

    def createEncodings(self):
        encodings = np.zeros((len(self.corpus), self.max_len, len(self.vocab)))
        for i, line in enumerate(self.corpus):
            line = re.sub(r"[+,'-,.!?]", "", line).lower()
            words = line.split(" ")
            encodings[i] = self.vect_helper(encodings[i], words, i)

        return encodings


In [92]:
# Load the script CSV and use its "line" column as the corpus.
df = pd.read_csv("RickandMortyScripts.csv")
corpus = df["line"]
# Build the word -> index mapping over the whole corpus.
vocabulary = Vocabulary(corpus)
vocabulary.createVocab()


First 10 vocabulary items

In [93]:
# Show the first ten (word, index) pairs, in the order they were inserted.
list(vocabulary.vocab.items())[:10]

[('morty', 0),
 ('you', 1),
 ('gotta', 2),
 ('come', 3),
 ('on', 4),
 ('jus', 5),
 ('with', 6),
 ('me', 7),
 ('what', 8),
 ('rick', 9)]

Mapping the corpus to one-hot encoded vectors

In [96]:
# One-hot encode every line; the result is indexed as
# (line, position in line, vocabulary index).
features = vocabulary.createEncodings()
print(features[0])  # encoding of the first line
features.shape

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


(1905, 1109, 3786)

The output above shows the encoding of one of the 1905 lines in the corpus.

Each row corresponds to a position in the sentence, padded up to a maximum length of 1109 (taken from the longest line in the corpus).

Each column corresponds to one of the 3786 entries in the vocabulary.

## RNN

In [97]:
import torch
import torch.nn as nn
import torch.nn.functional as F 

In [None]:
class RNN(nn.Module):
    """Recurrent network skeleton; only weight allocation is implemented here.

    Args:
        x: Array with at least two axes, used only for its shape.
    """

    def __init__(self, x: np.ndarray) -> None:
        # Zero-argument super(): the previous super(self, RNN) had its
        # arguments reversed and raised TypeError on construction.
        super().__init__()
        # NOTE(review): axis 0 is treated as the feature count and axis 1 as
        # the sentence length -- confirm this matches the layout of the array
        # passed in by the caller.
        n_features = x.shape[0]
        max_sentence_length = x.shape[1]
        # Plain NumPy vector of zeros; it is not an nn.Parameter, so the
        # module does not register it as a parameter.
        self.Wa = np.zeros((max_sentence_length  +  n_features))

