# Importing relevant libraries

In [1]:
import torch
import torch.functional as F
import torch.nn as nn

import tensorflow as tf

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pprint import pprint
import re

In [2]:
torch.__version__

'2.4.0'

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

# Filtering the text

In [5]:
from os import linesep
import string

# Read the file
file_path = '/kaggle/input/text-for-next-word-predictor/leo tolstoy - war and peace.txt'

# Open and read the contents of the file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = filtered_text.lower()

lines=filtered_text.split(".")
words=['.']
for l in lines:
    for w in l.split():
        if (len(w)>0):
            words.append(w)
words=set(words)

print("Total no. of lines: ", len(lines))
print("Total unique words: ", len(words))

Total no. of lines:  30588
Total unique words:  17877


In [6]:
#Mapping from words to integers and vice versa
stoi={s:i for i,s in enumerate(words)}
itos={i:s for s,i in stoi.items()}
print(len(itos))

17877


# Generating the labelled dataset

In [7]:
# Hyperparameter
block_size=5 # context_length: how many words do we take to predict the next one

# X and Y matrices to store the data for training
# X stores the half lines
# Y stores the next word
X,Y=[],[]
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for l in lines:
  context=[0]*block_size
  word_l=l.split()

  for i in range(len(word_l)):
    ix=stoi[word_l[i]]
    X.append(context)
    Y.append(ix)
    # print(' '.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix]

    if (i==len(word_l)-1):
        ix=stoi['.']
        X.append(context)
        Y.append(ix)
        # print(' '.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix]

# Move data to GPU

X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)


X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([592621, 5]), torch.Size([592621]), torch.int64, torch.int64)

# Defining the model

In [8]:
emb_dim = 64 # Hyperparameter

# Embedding layer
emb=torch.nn.Embedding(len(stoi),emb_dim).to(device)
print(emb)


Embedding(17877, 64)


In [9]:
class Next_Word_Predictor(nn.Module):
    def __init__(self, block_size, vocab_size, emb_dim, hidden_dim, activation_fn, seed_value):
        super().__init__()
        self.block_size = block_size
        self.hyperparams = {'block_size':self.block_size, 'emb_dim':emb_dim, 'hidden_dim':hidden_dim, 'activation_fn':activation_fn, 'seed_value':seed_value}
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.linear1 = nn.Linear(block_size * emb_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
         
        if activation_fn == 'sigmoid':
            self.activation = torch.sigmoid  
        else:
            self.activation = torch.relu 

    def forward(self, x):
        # Embedding layer
        x = self.emb(x)
        x = x.view(x.shape[0], -1)  
        
        # Hidden layer
        x = self.linear1(x)
        x = self.activation(x)
        
        # Output layer
        x = self.linear2(x)
        
        return x


# Training the model

In [10]:
def train_model(X, Y, block_size, emb_dim, vocab_size, hidden_dim, activation_fn, seed_value, device, batch_size=1024, epochs=101, print_every=10):
    """
    Train the model with the specified seed value.
    
    Arguments:
    X -- training data (input features)
    Y -- training data (labels)
    block_size -- context size for input sequence
    emb_dim -- embedding dimension for the model
    vocab_size -- the size of the vocabulary
    hidden_dim -- the size of the hidden layer
    activation_fn -- the activation function to use ('relu', 'tanh', 'sigmoid')
    seed_value -- the seed value for reproducibility
    device -- device to run the training on ('cpu' or 'cuda')
    batch_size -- the size of each mini-batch (default: 1024)
    epochs -- number of training epochs (default: 2000)
    print_every -- print loss after every 'n' epochs (default: 10)
    """
    
    torch.manual_seed(seed_value)

    model = Next_Word_Predictor(block_size, vocab_size, emb_dim, hidden_dim, activation_fn, seed_value).to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        
        # Mini-batch training
        for i in range(0, X.shape[0], batch_size):
            x = X[i:i + batch_size].to(device)
            y = Y[i:i + batch_size].to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            
            loss.backward()
            opt.step()
            opt.zero_grad()
        
        if epoch % print_every == 0:
            print(f'Epoch {epoch}: Loss = {loss.item()}')
    
    return model


In [11]:
vocab_size = len(stoi)
# Some other hyperparameters
hidden_dim = 1024
activation_fn = 'relu' 
seed_value = 42 

In [12]:
model = train_model(X, Y, block_size, emb_dim, vocab_size, hidden_dim, activation_fn, seed_value, device)

Epoch 0: Loss = 6.129698753356934
Epoch 10: Loss = 1.634093999862671
Epoch 20: Loss = 1.0681366920471191
Epoch 30: Loss = 0.797231912612915
Epoch 40: Loss = 0.650911271572113
Epoch 50: Loss = 0.5625903010368347
Epoch 60: Loss = 0.4991651773452759
Epoch 70: Loss = 0.4519367218017578
Epoch 80: Loss = 0.42472612857818604
Epoch 90: Loss = 0.39788398146629333
Epoch 100: Loss = 0.3888603150844574


In [13]:
# Saving the model

torch.save(model, 'model_variant_1.pth')

# Generating Predictions

In [14]:
# Generate names from trained model

def generate_next_words(model, itos, stoi, content, seed_value, k, max_len=10):
    torch.manual_seed(seed_value)
    
    block_size = model.block_size
    context = content.lower()
    context = re.sub('[^a-zA-Z0-9 \.]', '', context)
    context = re.sub('\.', ' . ', context)
    word_c = context.split()
    context = []
    for i in range(len(word_c)):
        try:
            if stoi[word_c[i]]:
                context.append(word_c[i])
        except:
            continue
            
    context = [stoi[w] for w in context]
               
    if len(context) <= block_size:
        context = [0] * (block_size - len(context)) + context
    elif len(context) > block_size:
        context = context[-block_size:]

    for i in range(k):
        x = torch.tensor(context).view(1, -1).to(device)
        y_pred = model(x)
        logits = y_pred
        
        ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
        word = itos[ix]
        content += " " + word
        context = context [1:] + [ix]
        
    return content


In [20]:
# Generate names from trained model

para=""
content=input("Enter some content: ")
k=int(input("Enter no. of words to be generated: "))
para+=generate_next_words(model, itos, stoi, content, seed_value, k)
para+="\n\n"
print(para)

Enter some content:  she continued after a short pause, drawing nearer to the prince and smiling amiably at him as if to show that political and social topics were ended
Enter no. of words to be generated:  40


she continued after a short pause, drawing nearer to the prince and smiling amiably at him as if to show that political and social topics were ended and the time had come for the old war the countess did in her words nicholas expressed a power to be set off . in the house was always anxious or i will only to recalled the activity of the




# Visualization of embeddings

In [21]:
embedding_weights = model.emb.weight.detach().cpu().numpy() #to be used for visualization

In [17]:
from sklearn.cluster import KMeans

# Set the number of clusters (you can experiment with this number)
n_clusters = 10

# Perform K-means clustering on the embeddings
kmeans = KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(embedding_weights)

# Create a dictionary to store words grouped by cluster
clustered_words = {i: [] for i in range(n_clusters)}

# Assign words to their respective clusters
for word, idx in stoi.items():
    if idx < embedding_weights.shape[0]:
        cluster = clusters[idx]
        clustered_words[cluster].append(word)

# Print words in each cluster
for cluster, words in clustered_words.items():
    print(f"Cluster {cluster}: {', '.join(words[:10])}")  # Limiting to first 10 words for readability



Cluster 0: prettier, fwiend, deplorable, recite, fabviers, supposition, bosse, growled, assert, lha
Cluster 1: listener, gunner, witnessed, shorter, steshka, upwards, yawned, vue, gratitude, arrive
Cluster 2: shadow, collect, bothering, deck, typically, chanting, antique, childish, deceitful, effectively
Cluster 3: admirable, drain, tilted, load, diagnosed, fists, maps, stuffed, advice, earliest
Cluster 4: mortiers, perceive, twigs, anticipate, rounded, genius, foliage, melyukovka, recommendation, joined
Cluster 5: ville, cloaks, withdrew, dragoon, energique, watching, retreats, impedes, call, prize
Cluster 6: crumpling, honors, difficult, furiously, decorating, selection, discard, accentuating, fluff, prettily
Cluster 7: ascended, interceded, startling, corporal, drop, infinitesimals, grabern, didnt, hillside, batiste
Cluster 8: dismounting, rotted, influences, appointments, enriched, ignorance, enforce, experts, disquiet, cartload
Cluster 9: adam, workingman, godson, loyalty, complac

In [18]:
embedding_weights = model.emb.weight.detach().cpu().numpy() #to be used for visualization
print(embedding_weights.shape)
# Reduce dimensionality using t-SNE
tsne = TSNE(n_components=2, random_state=1)
embeddings_tsne = tsne.fit_transform(embedding_weights)

# Visualize embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], alpha=0.5)
plt.title('t-SNE Visualization of Embeddings')  
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

(17877, 64)


NameError: name 'TSNE' is not defined

In [None]:
texxt = "My name is Soham...."
filtered_text = re.sub(r'-', ' ', texxt)
filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = re.sub(r'\..', '', filtered_text)
filtered_text = filtered_text.lower()
print(filtered_text)

#### 