# Importing relevant libraries

In [1]:
import torch
import torch.functional as F
import torch.nn as nn

import tensorflow as tf

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pprint import pprint
import re

In [2]:
# Display the installed PyTorch version.
torch.__version__

'2.4.0'

In [3]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Show which device was selected.
device

device(type='cuda')

# Filtering the text

In [5]:
from os import linesep
import string

# Location of the training corpus
file_path = '/kaggle/input/text-for-next-word-predictor/leo tolstoy - war and peace.txt'

# Read the whole corpus into memory as one string
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Normalise the text:
#   1. hyphens become spaces, so hyphenated words split into their parts
#   2. everything except letters, digits, spaces, periods and newlines is dropped
#   3. periods are dropped as well
#   4. the result is lower-cased
# Raw strings are used so that `\.` reaches the regex engine untouched and does
# not trigger Python's invalid-escape-sequence warning.
filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub(r'[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = re.sub(r'\.{1,}', '', filtered_text)
filtered_text = filtered_text.lower()

# Unique words in order of first appearance. dict.fromkeys de-duplicates in a
# single pass while preserving insertion order, replacing the quadratic
# "if word not in list" scan.
words = list(dict.fromkeys(filtered_text.split()))

# Paragraphs are separated by blank lines
para = filtered_text.split("\n\n")
print("Total no. of para: ", len(para))
print("Total unique words: ", len(words))

Total no. of para:  12426
Total unique words:  17879


In [6]:
# Vocabulary lookup tables: word -> integer id, and integer id -> word.
# Words receive ids 1..len(words) in the order they appear in `words`;
# '.' is assigned id 0.
stoi = {token: idx for idx, token in enumerate(words, start=1)}
stoi['.'] = 0
itos = {idx: token for token, idx in stoi.items()}
print(len(itos))

17880


# Generating the labelled dataset

In [7]:
# Hyperparameter
block_size = 5  # context length: number of preceding words used to predict the next one

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# X collects the context windows (block_size word ids each),
# Y collects the id of the word that follows each window.
X, Y = [], []

for paragraph in para:
  # Every paragraph starts from an all-zero context window
  window = [0] * block_size

  for token in paragraph.split():
    target = stoi[token.rstrip(string.punctuation)]
    X.append(window)
    Y.append(target)
    # Slide the window: drop the oldest id, append the word just seen
    window = [*window[1:], target]


# Convert to tensors and move them to the selected device

X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)


X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([565951, 5]), torch.Size([565951]), torch.int64, torch.int64)

# Defining the model

In [8]:
emb_dim = 64  # Hyperparameter: length of each word-embedding vector

# Stand-alone embedding table with one row per vocabulary entry, placed on the
# selected device and printed to inspect its dimensions.
emb = nn.Embedding(num_embeddings=len(stoi), embedding_dim=emb_dim).to(device)
print(emb)


Embedding(17880, 64)


In [9]:
class Next_Word_Predictor(nn.Module):
    """Feed-forward next-word model: embedding -> hidden layer -> vocabulary logits.

    Arguments:
    block_size -- number of context word ids fed to the model per example
    vocab_size -- number of entries in the vocabulary (also the output size)
    emb_dim -- embedding dimension of each word
    hidden_dim -- width of the hidden layer
    activation_fn -- name of the hidden activation: 'relu', 'sigmoid' or 'tanh'
    seed_value -- accepted for interface compatibility; not used by the model
                  itself (seeding is done by the caller)

    Raises:
    ValueError -- if activation_fn is not one of the supported names
    """

    # Supported activation names mapped to the torch functions that implement them
    _ACTIVATIONS = {
        'relu': torch.relu,
        'sigmoid': torch.sigmoid,
        'tanh': torch.tanh,
    }

    def __init__(self, block_size, vocab_size, emb_dim, hidden_dim, activation_fn, seed_value):
        super().__init__()

        # Fail at construction time instead of leaving `self.activation` unset
        # and crashing later inside forward().
        if activation_fn not in self._ACTIVATIONS:
            supported = ', '.join(repr(name) for name in self._ACTIVATIONS)
            raise ValueError(
                f"Unsupported activation_fn {activation_fn!r}; expected one of: {supported}"
            )

        self.block_size = block_size
        self.emb = nn.Embedding(vocab_size, emb_dim)
        # The hidden layer consumes the concatenated embeddings of the whole context
        self.linear1 = nn.Linear(block_size * emb_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.activation = self._ACTIVATIONS[activation_fn]

    def forward(self, x):
        """Return unnormalised logits over the vocabulary.

        x -- integer tensor of word ids with shape (batch, block_size)
        Returns a tensor of shape (batch, vocab_size).
        """
        # Embedding layer: (batch, block_size) -> (batch, block_size, emb_dim),
        # then flatten each example to (batch, block_size * emb_dim)
        x = self.emb(x)
        x = x.view(x.shape[0], -1)

        # Hidden layer
        x = self.linear1(x)
        x = self.activation(x)

        # Output layer
        x = self.linear2(x)

        return x


# Training the model

In [10]:
def train_model(X, Y, block_size, emb_dim, vocab_size, hidden_dim, activation_fn, seed_value, device, batch_size=1024, epochs=400, print_every=10):
    """
    Train a Next_Word_Predictor with the specified seed value.

    Arguments:
    X -- training data (input features), one row of word ids per example
    Y -- training data (labels), one target word id per example
    block_size -- context size for input sequence
    emb_dim -- embedding dimension for the model
    vocab_size -- the size of the vocabulary
    hidden_dim -- the size of the hidden layer
    activation_fn -- the activation function to use ('relu', 'tanh', 'sigmoid')
    seed_value -- the seed value for reproducibility
    device -- device to run the training on ('cpu' or 'cuda')
    batch_size -- the size of each mini-batch (default: 1024)
    epochs -- number of training epochs (default: 400)
    print_every -- print loss after every 'n' epochs (default: 10)

    Returns:
    The trained model (located on `device`).

    Raises:
    ValueError -- if X is empty or X and Y have a different number of rows
    """

    # Validate up front: with no rows the batch loop never runs and the
    # progress print below would fail on an undefined `loss`.
    if X.shape[0] == 0:
        raise ValueError("Cannot train on an empty dataset: X has no rows.")
    if X.shape[0] != Y.shape[0]:
        raise ValueError(
            f"X and Y must have the same number of rows, got {X.shape[0]} and {Y.shape[0]}."
        )

    torch.manual_seed(seed_value)

    model = Next_Word_Predictor(block_size, vocab_size, emb_dim, hidden_dim, activation_fn, seed_value).to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=0.01)

    for epoch in range(epochs):

        # Mini-batch training: batches are consecutive slices, visited in the
        # same order every epoch
        for i in range(0, X.shape[0], batch_size):
            x = X[i:i + batch_size].to(device)
            y = Y[i:i + batch_size].to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)

            opt.zero_grad()
            loss.backward()
            opt.step()

        # The reported value is the loss of the last mini-batch of the epoch
        if epoch % print_every == 0:
            print(f'Epoch {epoch}: Loss = {loss.item()}')

    return model


In [11]:
# Remaining hyperparameters for the model and its training run
seed_value = 42         # seed handed to training and to text generation
activation_fn = 'relu'  # hidden-layer activation
hidden_dim = 1024       # width of the hidden layer
vocab_size = len(stoi)  # one output class per vocabulary entry

In [12]:
# Train with train_model's default batch size, epoch count and logging interval.
model = train_model(
    X=X,
    Y=Y,
    block_size=block_size,
    emb_dim=emb_dim,
    vocab_size=vocab_size,
    hidden_dim=hidden_dim,
    activation_fn=activation_fn,
    seed_value=seed_value,
    device=device,
)

Epoch 0: Loss = 6.53764009475708


# Generating Predictions

In [15]:
# Generate a continuation of a text prompt with the trained model

def generate_next_words(model, itos, stoi, content, seed_value, k, max_len=10):
    """Append k sampled words to `content` and return the extended string.

    Arguments:
    model -- trained model exposing `block_size` and returning vocabulary logits
    itos -- mapping from word id to word
    stoi -- mapping from word to word id
    content -- the prompt text; it is returned unchanged, followed by the new words
    seed_value -- seed applied before sampling so the output is reproducible
    k -- number of words to generate
    max_len -- unused; kept so existing callers keep working

    Prompt words that are not in `stoi` are replaced by index 0, the same index
    that is used to left-pad short prompts, instead of raising KeyError.
    """
    torch.manual_seed(seed_value)

    block_size = model.block_size
    pad_ix = 0

    # Normalise the prompt the same way the training corpus was normalised:
    # lower-case, hyphens to spaces, then keep only letters, digits and whitespace.
    normalized = re.sub(r'-', ' ', content.lower())
    normalized = re.sub(r'[^a-z0-9 \n]', '', normalized)
    context = [stoi.get(word, pad_ix) for word in normalized.split()]

    # Left-pad short prompts and keep only the last block_size ids of long ones
    context = ([pad_ix] * block_size + context)[-block_size:]

    # Run on whatever device the model lives on rather than relying on a global
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # Inference only: no gradients are needed while sampling
    with torch.no_grad():
        for _ in range(k):
            x = torch.tensor(context).view(1, -1).to(model_device)
            y_pred = model(x)

            # Sample the next word id from the predicted distribution
            ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
            content += " " + itos[ix]
            context = context[1:] + [ix]

    return content


In [19]:
# Generate a continuation of user-supplied text with the trained model

content = input("Enter some content: ")
k = int(input("Enter no. of words to be generated: "))

# The result goes into its own variable: `para` already holds the list of
# corpus paragraphs used to build the dataset, and overwriting it with a string
# would break re-running that cell.
generated_para = " " + generate_next_words(model, itos, stoi, content, seed_value, k) + "\n\n"
print(generated_para)

Enter some content:  He spoke in that refined
Enter no. of words to be generated:  1000


 He spoke in that refined eyes which she could not mess and artillery everyone can picture go to their things and presenting as he understood his hollows of which subjected in an extent of whether way and vice versa and a class but did interrogative talk at the bridges of horror as the instance mood of ivanovich he not only could not hold we are fear the red beasts were that pain just what do you think may us have expired essential question so consistently they were all the marriage with a spiritual wound proves or the highroad of rain which indicates that if the reward of their superhuman simplicity always crickets stood particularly forwards or bare dissatisfaction of reproaches dissatisfaction familiar for their peoples affairs that is within question badly the extraordinarily verdure a russian case she was the soup kindly men who when he and a sigh is glad that it does not yet bogdanovna quarreled he frowned and also presented with a short pity from it qualms asked pierre with a ha

In [None]:
# Saving the model
import os

model_path = 'model_variants/model_variant_1.pth'

# torch.save does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE(review): this pickles the whole module, so loading it later requires the
# Next_Word_Predictor class definition to be available; saving
# model.state_dict() would be more portable but would change the file format.
torch.save(model, model_path)

# Visualization of embeddings

In [None]:
# TSNE was never imported in this notebook, so this cell failed with NameError
# on a fresh kernel. NOTE(review): assumes scikit-learn's implementation, which
# matches the constructor arguments and fit_transform call used below.
from sklearn.manifold import TSNE

# Learned embedding matrix as a NumPy array, one row per vocabulary entry
embedding_weights = model.emb.weight.detach().cpu().numpy() #to be used for visualization
print(embedding_weights.shape)

# Reduce dimensionality using t-SNE (fixed random_state for a repeatable layout)
tsne = TSNE(n_components=2, random_state=1)
embeddings_tsne = tsne.fit_transform(embedding_weights)

# Visualize embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], alpha=0.5)
plt.title('t-SNE Visualization of Embeddings')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()