In [None]:
!pip install ipywidgets

# Code for a classical tokenizer is given below (commented out); the BERT fast tokenizer is used instead for faster processing

In [None]:
# import re
# from typing import List
# import time
# import string
# import os
# import numpy as np
# import torch
# from torch import nn
# from tqdm.notebook import tqdm_notebook as tqdm
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# start=time.time()
# MAX_SEQ_LEN=32
# class Tokenizer_Private():
#     def __init__(self):
#         self.data=None
#     def tokenize(self,data,vocab=None):
#         self.data=data
#         if(vocab==None):
#                 tokenized_text=[nltk.tokenize.word_tokenize(line) for line in self.data]
#                 flat_token=[token for tokens in tokenized_text for token in tokens]
#                 self.stop_words=set(stopwords.words('english'))
#                 filtered_tokens=[token for token in flat_token if token.isalnum() and token not in self.stop_words]
#                 max_vocab_size=10000
#                 freq_dist=nltk.probability.FreqDist(filtered_tokens)
#                 common_words=freq_dist.most_common(max_vocab_size)
#                 self.vocabulary={word:idx+1 for idx,(word,_) in enumerate(common_words)}
#         #set self vocab from json
#         sequence=[]
#         for text in self.data:
#             tokens = nltk.tokenize.word_tokenize(text.lower())
#             filtered_tokens = [token for token in tokens if token.isalnum() and token not in self.stop_words]
#             numerical_sequence = [self.vocabulary.get(token, 0) for token in filtered_tokens]
#             sequence.append(numerical_sequence)
#         return sequence

#     def detokenize(self, inputs):
#         assert self.data!=None            
#         tokens=[self.vocabulary.get(idx,"UNK") for idx in inputs]
#         return " ".join(tokens)
#     def get_vocabulary(self) -> List[str]:
#         assert self.data!=None 
#         return list(self.vocabulary.keys())
#     def token_to_id(self,token):
#         assert self.data!=None 
#         return self.vocabulary.get(token, 0) 
#     def id_to_token(self,id):
#         assert self.data!=None 
#         return self.vocabulary.get(id,"UNK")
    
    
# class LSTM_Tokenizer:
#     def __init__(self,data) -> None:
#         self.data=data 
#         self.tokenizer=Tokenizer_Private()
#     def tokenize(self):
#         tokenized=[]
#         token_list=self.tokenizer.tokenize(self.data)
#         for data in token_list:
#             for i in range(len(data)):
#                 tokens=torch.unsqueeze(torch.tensor(data[:i+1],dtype=torch.int32),dim=0)
#                 padding_amount = MAX_SEQ_LEN - tokens.size(1)
#                 padded_sequences = torch.nn.functional.pad(tokens,(padding_amount, 0),mode="constant", value=0)
#                 tokenized.append(torch.squeeze(padded_sequences,dim=0))
#         tokenized_tensor=torch.stack(tokenized,dim=0)
#         return self.tokenizer,tokenized_tensor
# class Dataset:
#     def __init__(self,dir):
#         self.paths=[os.path.join(dir,i) for i in os.listdir(dir) if i.endswith("txt")][:500]
#     def get_data(self,tokenizer:str="LSTM"):
#         pages=[]
#         for i in tqdm(range(len(self.paths))):
#             with open(self.paths[i],"r", encoding="iso-8859-1") as f:
#                 temp=f.readlines()
#                 pages.extend(self.preprocess(i) for i in temp if i!="\n")
#         assert tokenizer in ["LSTM"]
#         return LSTM_Tokenizer(pages)
        
#     def preprocess(self,line):
#         line=line.lower().strip()
#         translator = str.maketrans('', '', string.punctuation)
#         line = line.translate(translator)
#         line=re.sub(r"\n","",line)
#         return line
    
# dataset=Dataset("/kaggle/input/gutenberg/txt")
# Token=dataset.get_data("LSTM")
# tokenizer,x_data=Token.tokenize()
# print("Time to load dataset and obtain tokens is {}".format(time.time()-start))

In [None]:
# with open("vocab.txt","w") as f:
#     for i in tokenizer.get_vocabulary():
#         f.write(i+"\n")

In [None]:
from transformers import BertTokenizerFast
from torch.utils.data import Dataset, random_split
import os
import torch
import time
# Wall-clock reference; the elapsed time is printed at the end of this cell.
start=time.time()
# Sequence length (in token ids) used for every dataset sample.
MAX_SEQ_LEN=32
class TextDataset(Dataset):
    """Next-token-prediction samples built from the ``.txt`` files of a directory.

    Every file is tokenized, and each prefix ``ids[:1], ids[:2], ...`` of the
    resulting id list becomes one sample, left-padded with zeros to
    ``max_seq_len``. Consumers split a sample into context (all but the last
    position) and target (the last position).

    Args:
        data_dir: Directory that is scanned (non-recursively) for ``.txt`` files.
        max_seq_len: Length every sample is padded to; also passed to the
            tokenizer as ``max_length``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=...)`` that returns a mapping with
            an ``"input_ids"`` list. Defaults to the notebook-level
            ``Tokenizer`` so existing two-argument calls keep working.
        max_files: Number of directory entries (taken in sorted order) that are
            examined; entries without a ``.txt`` suffix are skipped afterwards.

    Each item is a dict with:
        ``"input_ids"``: int32 tensor of shape ``(max_seq_len,)``.
        ``"attention_mask"``: int32 tensor of the same shape, 1 where
            ``input_ids`` is non-zero and 0 on the zero padding.

    NOTE(review): because the tokenizer is called with ``truncation=True`` and
    ``max_length=max_seq_len``, at most ``max_seq_len`` tokens per file are
    kept, i.e. only the beginning of each file contributes samples. Left as-is
    because the dataset size depends on it; confirm this is intended.
    """
    def __init__(self, data_dir, max_seq_len=32, tokenizer=None, max_files=500):
        self.data_dir = data_dir
        self.max_seq_len = max_seq_len
        self.data = []
        if tokenizer is None:
            # Resolved at call time, so the global may be defined after the class.
            tokenizer = Tokenizer
        # os.listdir returns entries in arbitrary order; sort so the same files
        # are selected on every run.
        for filename in sorted(os.listdir(data_dir))[:max_files]:
            if not filename.endswith(".txt"):
                continue
            with open(os.path.join(data_dir, filename), "r",encoding="iso-8859-1") as f:
                text = f.read()
            encoded_inputs = tokenizer(text, truncation=True, padding="max_length", max_length=self.max_seq_len)
            input_ids = torch.tensor(encoded_inputs["input_ids"], dtype=torch.int32)
            for end in range(1, len(input_ids) + 1):
                prefix = input_ids[:end]
                # Pad to the instance's own length (not the notebook global) so
                # a non-default max_seq_len produces consistent shapes.
                padding_amount = self.max_seq_len - prefix.size(0)
                toadd = torch.nn.functional.pad(prefix,(padding_amount, 0),mode="constant", value=0)
                # 0 is the padding value used above (and the [PAD] id of the
                # BERT vocabulary), so non-zero positions are the real tokens.
                attention_mask = (toadd != 0).to(toadd.dtype)
                self.data.append({"input_ids": toadd, "attention_mask": attention_mask,})
    def __len__(self):
        """Return the number of prefix samples."""
        return len(self.data)
    def __getitem__(self, idx):
        """Return the sample dict stored at position ``idx``."""
        return self.data[idx]

    
# Location of the Gutenberg .txt files (Kaggle input path).
data_dir = "/kaggle/input/gutenberg/txt"
tokenizer_name = "bert-base-uncased"
# Load the checkpoint named above instead of repeating the string literal, so
# changing `tokenizer_name` actually changes the tokenizer.
Tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)
dataset = TextDataset(data_dir, MAX_SEQ_LEN)


print("Time to load using bert fast tokenizer ",time.time()-start)

In [None]:
# Number of entries in the tokenizer vocabulary; passed to the model factories
# as the vocabulary size.
Vocab_Size=len(Tokenizer.get_vocab())
from torch.utils.data import DataLoader
# Shuffled mini-batches of 8 samples.
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
import torch
import torch.nn as nn

class LSTMGenerator(nn.Module):
    """Next-token predictor: embedding -> LSTM -> bidirectional LSTM -> max-pool -> MLP.

    Args:
        vocab_size: Number of token ids; size of the embedding table and of the
            output layer.
        embedding_size: Dimension of the token embeddings.
        hidden_size: Width of the hidden linear layer before the output layer.

    The recurrent widths are fixed: 512 for the first LSTM and 320 per
    direction (640 concatenated) for the bidirectional one.
    """
    def __init__(self, vocab_size, embedding_size=128, hidden_size=1024):
        super(LSTMGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.drop = nn.Dropout(0.1)
        # batch_first=True: both LSTMs consume (batch, seq, feature) directly.
        # The parameter shapes are unaffected by this flag.
        self.forward_lstm = nn.LSTM(embedding_size,512,batch_first=True)
        self.backward_lstm = nn.LSTM(512,320,bidirectional=True,batch_first=True)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.linear1 = nn.Linear(640, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        # Not applied in forward(); kept so callers can turn logits into
        # probabilities with `model.softmax(logits)` when sampling.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return unnormalized next-token scores.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq)``.

        Returns:
            Float tensor of logits, shape ``(batch, vocab_size)``. Logits (not
            probabilities) are returned because ``nn.CrossEntropyLoss`` applies
            log-softmax itself; passing softmax output would normalize twice.
        """
        embedded = self.drop(self.embedding(x))              # (batch, seq, emb)
        uni_out, _ = self.forward_lstm(embedded)             # (batch, seq, 512)
        # Previously this layer received a batch-major tensor while expecting
        # time-major input, so it recurred over the batch axis and mixed samples.
        bi_out, _ = self.backward_lstm(uni_out)              # (batch, seq, 640)
        # Max over time. squeeze(-1) removes only the pooled axis, so a batch of
        # one keeps its batch dimension.
        pooled = self.pool(bi_out.permute(0, 2, 1)).squeeze(-1)   # (batch, 640)
        hidden = self.relu(self.linear1(pooled))
        return self.linear2(hidden)
        
class get_lstm_generator:
    """Factory that bundles an ``LSTMGenerator`` with its loss and optimizer.

    Args:
        vocab_size: Vocabulary size forwarded to the model.
        embedding_size: Embedding dimension forwarded to the model.
        hidden_size: Hidden linear width forwarded to the model.
        lr: Learning rate for Adam. The default of 0.1 preserves the previous
            hard-coded value. NOTE(review): this is far above Adam's library
            default of 1e-3; consider passing a smaller value.
    """
    def  __init__(self,vocab_size:int,embedding_size:int=128,hidden_size:int=1024,lr:float=0.1):
        self.vocab_size=vocab_size
        self.embedding_size=embedding_size
        self.hidden_size=hidden_size
        self.lr=lr
    def get_model(self):
        """Build a fresh model and return ``(model, loss_fn, optimizer)``."""
        model = LSTMGenerator(self.vocab_size, self.embedding_size, self.hidden_size)
        loss_fn = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
        return model,loss_fn,optimizer

In [None]:
# Build the LSTM model with its loss and optimizer, then print the layer summary.
model,loss_fn,optim=get_lstm_generator(Vocab_Size).get_model()
print(model)

# Sanity Checks

In [None]:
# Pull one batch and run a forward pass to check the output shape.
data=next(iter(train_loader))
# Context: every position except the last one.
input_str=data["input_ids"][:,:-1]
# Target: the last position of each sample.
output_str=data["input_ids"][:,-1]
out=model(input_str)
print(out.shape)

In [None]:
# Inspect the target token ids of the sampled batch.
print(output_str)

In [None]:
# Round-trip check: one-hot encode the targets and recover the ids with argmax;
# the printed ids should equal `output_str`.
one_hot_tensor =torch.nn.functional.one_hot(output_str.to(torch.int64), num_classes=Vocab_Size)
print(torch.argmax(one_hot_tensor,dim=-1))

# Training Loop

In [None]:
from tqdm.notebook import tqdm_notebook
import torch
from torch import nn
# Train on GPU when available, otherwise fall back to CPU.
device="cuda" if torch.cuda.is_available() else "cpu"
EPOCHS=100
model,loss_fn,optim=get_lstm_generator(Vocab_Size).get_model()
# Wrap in DataParallel only when several GPUs exist, so the cell also runs on a
# single GPU or on CPU instead of failing an assertion.
if torch.cuda.device_count()>1:
    model = torch.nn.DataParallel(model, device_ids= list(range(torch.cuda.device_count())))
model.to(device)
loss_fn.to(device)
model.train()
for epoch in tqdm_notebook(range(EPOCHS)):
    epoch_loss=0.0
    num_batches=0
    for i,data in enumerate(train_loader):
        input_str=data["input_ids"][:,:-1].to(device)
        # CrossEntropyLoss accepts int64 class indices as targets, so no one-hot
        # tensor of shape (batch, vocab) has to be materialized.
        output_str=data["input_ids"][:,-1].to(device).to(torch.int64)
        # The update step was missing entirely: without zero_grad/backward/step
        # the weights never change.
        optim.zero_grad()
        predicted=model(input_str)
        loss=loss_fn(predicted.to(torch.float32),output_str)
        loss.backward()
        optim.step()
        batch_loss=loss.item()
        # Accumulate the real value; the former floor division (// 8) turned any
        # loss below 8 into 0.
        epoch_loss+=batch_loss
        num_batches+=1
        if(i%500==499):
            print("Epoch {} Batch {} Loss {}".format(epoch,i,batch_loss))
    # Average over the batches actually seen rather than a hard-coded 2000.
    print("Epoch {} Cumulative Loss :{}".format(epoch,epoch_loss/max(num_batches,1)))

# Transformer

In [None]:
import torch
import torch.nn as nn

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer that scores the next token.

    The encoder output is flattened and projected to a single ``embed_dim``
    vector per sample, which the decoder attends to as a length-1 memory.

    Args:
        seq_len: Exact length of the ``src`` sequences; fixes the input width of
            the flatten-and-project layer.
        vocab_size: Number of token ids (embedding rows and output classes).
        embed_dim: Model dimension; must be divisible by ``num_heads``.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        dropout: Dropout probability used inside the encoder and decoder layers.

    NOTE(review): no positional encoding is added to the embeddings; only the
    flattening in ``middle`` makes the result depend on token order.
    """
    def __init__(self,seq_len, vocab_size, embed_dim, num_heads, num_layers, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # `dropout` was accepted but never forwarded to the layers.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads,dropout=dropout,batch_first=True),
            num_layers=num_layers
        )
        self.middle=nn.Linear(embed_dim*seq_len,embed_dim)
        # batch_first=True to match the encoder; parameter shapes are unchanged.
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads,dropout=dropout,batch_first=True),
            num_layers=num_layers
        )

        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, tgt):
        """Score the vocabulary for each decoder position.

        Args:
            src: Integer ids of shape ``(batch, seq_len)``.
            tgt: Integer ids fed to the decoder, shape ``(batch,)`` for a single
                decoder token per sample, or ``(batch, tgt_len)``.

        Returns:
            Logits of shape ``(batch, vocab_size)`` when ``tgt`` is 1-D,
            otherwise ``(batch, tgt_len, vocab_size)``.

        NOTE(review): ``tgt`` is a model input. Passing the token that should be
        predicted lets the decoder copy the answer; pass an already-known token
        (for example the last context token) instead.
        """
        single_step = tgt.dim() == 1
        if single_step:
            # Give each sample its own length-1 sequence. Without this the 2-D
            # embeddings were interpreted as one unbatched sequence, so the
            # samples of a batch attended to each other.
            tgt = tgt.unsqueeze(1)

        src = self.embedding(src)
        tgt = self.embedding(tgt)

        memory = self.transformer_encoder(src)
        # reshape (rather than view) also handles non-contiguous encoder output.
        memory = self.middle(memory.reshape(memory.shape[0], -1)).unsqueeze(1)
        output = self.transformer_decoder(tgt, memory)

        output = self.fc(output)
        if single_step:
            output = output.squeeze(1)

        return output

class get_transformer_gen:
    """Factory that bundles a ``TransformerModel`` with its loss and optimizer.

    Args:
        vocab_size: Vocabulary size forwarded to the model.
        seq_len: Length of the source sequences. The default of 31 matches the
            notebook's ``MAX_SEQ_LEN - 1`` context (samples minus the target
            position).
        embedding_size: Model dimension.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        lr: Learning rate for Adam. The default of 0.1 preserves the previous
            hard-coded value. NOTE(review): this is far above Adam's library
            default of 1e-3; consider passing a smaller value.
    """
    def  __init__(self,vocab_size,seq_len=31,embedding_size=128,num_heads=8,num_layers=6,lr=0.1):
        self.vocab_size =  vocab_size
        self.seq_len = seq_len
        self.embedding_size = embedding_size
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.lr = lr
    def get_model(self,):
        """Build a fresh model and return ``(model, loss_fn, optimizer)``."""
        model = TransformerModel(self.seq_len,self.vocab_size, self.embedding_size, self.num_heads,self.num_layers)
        loss_fn = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
        return model,loss_fn,optimizer






In [None]:
# Build the Transformer model with its loss and optimizer, then print the layer summary.
model,loss_fn,optim=get_transformer_gen(Vocab_Size).get_model()
print(model)

In [None]:
# Pull one batch and run a forward pass to check the output shape.
data=next(iter(train_loader))
input_str=data["input_ids"][:,:-1]
output_str=data["input_ids"][:,-1]
# Feed the last context token to the decoder rather than `output_str`: the
# target must not be a model input, otherwise the model can copy the answer.
out=model(input_str,input_str[:,-1])
print(out.shape)

# Training Loop

In [None]:
from tqdm.notebook import tqdm_notebook
import torch
from torch import nn
# Train on GPU when available, otherwise fall back to CPU.
device="cuda" if torch.cuda.is_available() else "cpu"
EPOCHS=100
model,loss_fn,optim=get_transformer_gen(Vocab_Size).get_model()
# Wrap in DataParallel only when several GPUs exist, so the cell also runs on a
# single GPU or on CPU instead of failing an assertion.
if torch.cuda.device_count()>1:
    model = torch.nn.DataParallel(model, device_ids= list(range(torch.cuda.device_count())))
model.to(device)
loss_fn.to(device)
model.train()
for epoch in tqdm_notebook(range(EPOCHS)):
    epoch_loss=0.0
    for i,data in enumerate(train_loader):
        input_str=data["input_ids"][:,:-1].to(device)
        # CrossEntropyLoss accepts int64 class indices as targets, so no one-hot
        # tensor of shape (batch, vocab) has to be materialized.
        output_str=data["input_ids"][:,-1].to(device).to(torch.int64)
        # The update step was missing entirely: without zero_grad/backward/step
        # the weights never change.
        optim.zero_grad()
        # Condition the decoder on the last context token, not on `output_str`:
        # feeding the label as an input leaks the answer to the model.
        predicted=model(input_str,input_str[:,-1])
        loss=loss_fn(predicted.to(torch.float32),output_str)
        loss.backward()
        optim.step()
        batch_loss=loss.item()
        # Accumulate the real value; the former floor division (// 8) turned any
        # loss below 8 into 0.
        epoch_loss+=batch_loss
        if(i%500==499):
            print("Epoch {} Batch {} Loss {}".format(epoch,i,batch_loss))
    print("Epoch {} Cumulative Loss :{}".format(epoch,epoch_loss))
