# Question 1

## Importing Libraries

In [268]:
import os
import re

import torch
from torch import nn
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE

import re

# Pick the compute backend in order of preference: Apple MPS, then CUDA, then CPU.
# The conditional expression short-circuits, so CUDA is only probed when MPS is unavailable.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

## Processing Text Data

In [269]:
# Read the whole book. The context manager closes the handle, and the encoding is given
# explicitly because the text contains curly quotes and accented letters that would be
# garbled on platforms whose default encoding is not UTF-8.
with open("./sherlock_holmes.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Keep only the text between the start and end markers of Project Gutenberg: it starts
# after the 2nd "***" and ends before the 3rd "***"; the markers themselves are excluded.
start = text.find("***", text.find("***") + 1)
end = text.find("***", start + 1)
if start == -1 or end == -1:
    # str.find returns -1 when a marker is missing; slicing with it would silently
    # produce the wrong text, so fail loudly instead.
    raise ValueError('expected at least three "***" markers in sherlock_holmes.txt')
text = text[start+3:end]

# remove all occurrences of '&c'
text = re.sub('&c', '', text)

# join hard-wrapped lines: a lone newline becomes a space, then any run of newlines
# (a paragraph break) collapses to a single newline
text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
text = re.sub(r'\n+', '\n', text)

# replace multiple spaces with single space
text = re.sub(' +', ' ', text)

# find unique characters in the text
list(set(text))

['2',
 'K',
 'o',
 'N',
 'u',
 '9',
 'Z',
 'J',
 'B',
 'x',
 'E',
 'à',
 'j',
 'Y',
 'U',
 'W',
 'M',
 ';',
 '“',
 'è',
 'T',
 'c',
 'æ',
 '—',
 'X',
 'f',
 '3',
 'L',
 'Q',
 'q',
 'O',
 '£',
 'p',
 'P',
 '0',
 'd',
 ')',
 '!',
 '5',
 '\n',
 'v',
 'é',
 'â',
 '.',
 '-',
 '_',
 ',',
 'œ',
 'A',
 'n',
 '6',
 'I',
 'G',
 '7',
 '(',
 'w',
 'i',
 'V',
 'l',
 'R',
 'g',
 'm',
 'b',
 'h',
 'D',
 'a',
 '4',
 'r',
 'F',
 '”',
 '½',
 'z',
 '’',
 ' ',
 '8',
 '‘',
 ':',
 'S',
 'C',
 'k',
 '1',
 '&',
 's',
 'H',
 'e',
 't',
 'y',
 '?']

## Analysing the data to format as tokens

add space before and after
- '-': hyphen used to connect words
- '—': em dash to create interruptions in dialogue

add space before
- '½': this is used once in the text to represent one half
- '’': this is used to enclose quotations within the main dialogue (let's see if the model can learn this)
- '?': questions
- '!': exclamations
- '.': this is used to end a sentence
- ',': 
- ')': this is used to enclose additional information
- '”': end quote
- ';'

add space after
- '‘': this is used to enclose quotations within the main dialogue (let's see if the model can learn this)
- '“': start quote
- '(': this is used to enclose additional information

as it is
- '£': this is used to represent the British pound
- '&': this is used to represent 'and' in company names

for '_', '0123456789', '.', ':', add a space before and after if there isn't one already

apply the same rule to the "\n" character

parts of a word:
- 'æ': part of the word encyclopædia
- 'à': letter used in French, Italian, and Portuguese. It is an 'a' with a grave accent
- 'é': letter used in French, Spanish, and Portuguese. It is an 'e' with an acute accent
- 'è': letter used in French, Italian, and Portuguese. It is an 'e' with a grave accent
- 'â': letter used in French, Portuguese, and Vietnamese. It is an 'a' with a circumflex accent
- 'a-z': this is used to represent letters
- 'A-Z': this is used to represent letters

we'll use ' ' to separate words

In [270]:
# function to space-separate per our vocabulary
def tokenize(text):
    """Insert spaces around punctuation so the text can be split into tokens on ' '.

    Rules applied per character:
      * '-' and '—' always get a space on both sides.
      * '½’?!,)”;' always get a space before.
      * '‘“(' always get a space after.
      * '_', digits, '.' and ':' get a space on each side only where the neighbouring
        character in the input is not already a space.
      * '\\n' is handled in a second pass and gets a space on each side where the
        neighbouring character of the padded text is not already a space.
    Every other character is copied unchanged. Runs of several spaces may appear in
    the result.

    Args:
        text: the string to pad.

    Returns:
        The padded string ("" for empty input).
    """
    last = len(text) - 1
    pieces = []

    for i, char in enumerate(text):
        if char in "-—":
            pieces.append(" " + char + " ")
        elif char in "½’?!,)”;":
            pieces.append(" " + char)
        elif char in "‘“(":
            pieces.append(char + " ")
        elif char in "_0123456789.:":
            # The start and end of the text count as boundaries. Guarding i > 0 avoids
            # text[-1] wrapping around to the last character, and a trailing symbol
            # still gets separated from the word before it.
            space_before = i > 0 and text[i - 1] != " "
            space_after = i < last and text[i + 1] != " "
            pieces.append((" " if space_before else "") + char + (" " if space_after else ""))
        else:
            pieces.append(char)

    formatted_text = "".join(pieces)

    # Second pass: make every newline a stand-alone token.
    last = len(formatted_text) - 1
    result = []
    for i, char in enumerate(formatted_text):
        if char != "\n":
            result.append(char)
            continue
        if i > 0 and formatted_text[i - 1] != " ":
            result.append(" ")
        result.append("\n")
        if i < last and formatted_text[i + 1] != " ":
            result.append(" ")
    return "".join(result)

In [271]:
# Space-separate the cleaned text, overwriting `text`. Re-running this cell feeds the
# already padded text back into tokenize(), which pads '-' and '—' again.
text = tokenize(text)

## Construct Vocabulary and Mappings

In [272]:
# Sorted unique tokens. The split pattern leaves '\n' out of the separators, so the
# newline survives as a token of its own.
vocabulary = sorted(set(re.split(r'[ \t\r\f\v]+', text)))

In [273]:
# word -> id and id -> word lookups; ids follow the sorted vocabulary order
wtoi = {word: idx for idx, word in enumerate(vocabulary)}
itow = dict(enumerate(vocabulary))

# '#' stands for "empty" and takes the next free id in both mappings
index = len(wtoi)
wtoi['#'] = index
itow[index] = '#'

print(len(itow))

8459


## Creating Model

In [274]:
# Lookup from the activation name used in the sweep and in model file names to the torch function.
activation_functions = dict(tanh=torch.tanh, relu=torch.relu)

In [275]:
class NextToken(nn.Module):
  """Feed-forward next-token predictor over a fixed-length context of token ids.

  The ids are embedded, the embeddings of one context are concatenated, passed through
  two hidden layers with the given activation, and projected to `vocab_size` logits.
  """

  def __init__(self, block_size, vocab_size, emb_dim, hidden_size_1, hidden_size_2, activation):
    super().__init__()

    # One emb_dim vector per vocabulary entry
    self.emb = nn.Embedding(vocab_size, emb_dim)

    # Two hidden layers; the first takes the concatenated context embeddings
    self.lin1 = nn.Linear(block_size * emb_dim, hidden_size_1)
    self.lin2 = nn.Linear(hidden_size_1, hidden_size_2)

    # Output projection to one logit per vocabulary entry
    self.lin3 = nn.Linear(hidden_size_2, vocab_size)

    # Callable applied after each hidden layer
    self.activation = activation

  def forward(self, x):
    """Return the logits for a batch of contexts `x`."""
    embedded = self.emb(x)

    # Keep the first dimension and merge the rest:
    # (batch_size, block_size, emb_dim) -> (batch_size, block_size * emb_dim)
    flat = embedded.view(embedded.shape[0], -1)

    hidden = self.activation(self.lin1(flat))
    hidden = self.activation(self.lin2(hidden))
    return self.lin3(hidden)

In [276]:
# Settings shared by every configuration of the sweep.
MODEL_DIR = "question_1_models"
HIDDEN_SIZE = 2048
LEARNING_RATE = 0.01
NUM_EPOCHS = 750
batch_size = 4096
print_every = 100

# Make sure the output directory exists before any training starts, so a missing
# folder cannot make the save fail after a model has been trained.
os.makedirs(MODEL_DIR, exist_ok=True)

# Token stream of the corpus. It is split with the same pattern that built `vocabulary`,
# so every non-empty token has an entry in wtoi. It does not depend on the block size,
# so it is computed once.
training_tokens = [item for item in re.split(r'[ \t\r\f\v]+', text) if item != ""]

for block_size in [5, 10, 15]:

    # generating training data: X holds the ids of the previous `block_size` tokens
    # (padded at the start with `index`, the id of '#'), Y the id of the token that follows
    X, Y = [], []
    context = [index] * block_size
    for word in training_tokens:
        ix = wtoi[word]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]
    X = torch.tensor(X).to(device)
    Y = torch.tensor(Y).to(device)

    for embedding_dimensions in [64, 128]:
        for activation in ["tanh", "relu"]:

            model = NextToken(block_size, len(wtoi), embedding_dimensions, HIDDEN_SIZE, HIDDEN_SIZE, activation_functions[activation]).to(device)

            # Train the model
            print(f"training model_{block_size}_{embedding_dimensions}_{activation}")
            loss_fn = nn.CrossEntropyLoss()
            opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
            # Mini-batch training over consecutive slices of the data
            for epoch in range(NUM_EPOCHS):
                for i in range(0, X.shape[0], batch_size):
                    x = X[i:i+batch_size]
                    y = Y[i:i+batch_size]
                    y_pred = model(x)
                    loss = loss_fn(y_pred, y)
                    loss.backward()
                    opt.step()
                    opt.zero_grad()
                if epoch % print_every == 0:
                    # this is the loss of the last mini-batch of the epoch, not an epoch average
                    print(epoch, loss.item())
            print("==")

            # Saving the model. The name is built separately so the f-strings below need
            # no nested double quotes, which are a SyntaxError before Python 3.12.
            model_name = f"next_word_model_{block_size}_{embedding_dimensions}_{activation}.pt"
            filename = f"{MODEL_DIR}/{model_name}"
            torch.save(model.state_dict(), filename)
            print(f"Model saved as {model_name}")
            print("=====================")

training model_5_64_tanh
0 17.1767578125
100 1.5295231342315674
200 1.2396035194396973
300 1.3238580226898193
400 1.373178243637085
500 1.2566912174224854
600 1.3559234142303467
700 1.4627068042755127
==
Model saved as next_word_model_5_64_tanh.pt
training model_5_64_relu
0 5.917835712432861
100 0.10576169192790985
200 0.11086101830005646
300 0.5188949108123779
400 0.30586305260658264
500 0.27896881103515625
600 0.12550616264343262
700 0.22407464683055878
==
Model saved as next_word_model_5_64_relu.pt
training model_5_128_tanh
0 14.542134284973145
100 1.0339953899383545
200 1.3775618076324463
300 1.201419711112976
400 1.4258949756622314
500 1.4717528820037842
600 1.4162380695343018
700 1.4264135360717773
==
Model saved as next_word_model_5_128_tanh.pt
training model_5_128_relu
0 5.782036304473877
100 0.10380765795707703
200 0.40831178426742554
300 0.40738949179649353
400 0.3708231747150421
500 0.27905502915382385
600 0.39998507499694824
700 0.3796998858451843
==
Model saved as next_wor