In [1]:
import torch
import torch.functional as F
import torch.nn as nn

import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Model, Input, layers, models, optimizers


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pprint import pprint
import re

In [2]:
torch.__version__

'2.4.0'

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

In [5]:
from os import linesep
import string

# Read the file
file_path = '/kaggle/input/text-for-next-word-predictor/leo tolstoy - war and peace.txt'

# Open and read the contents of the file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = re.sub(r'\.{1,}', '', filtered_text)
filtered_text = filtered_text.lower()

words=[]
for (word) in filtered_text.split():
    if word not in words:
        words.append(word)

para=filtered_text.split("\n\n")
print("Total no. of para: ", len(para))
print("Total unique words: ", len(words))

Total no. of para:  12426
Total unique words:  17879


In [6]:
stoi={s:i+1 for i,s in enumerate(words)}
stoi['.']=0
itos={i:s for s,i in stoi.items()}
print(len(itos))

17880


In [7]:
# Hyperparameter
block_size=5 # context_length: how many words do we take to predict the next one

# X and Y matrices to store the data for training
# X stores the half lines
# Y stores the next word
X,Y=[],[]
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for p in para:
  context=[0]*block_size

  for word in p.split():
    word=word.rstrip(string.punctuation)
    ix=stoi[word]
    X.append(context)
    Y.append(ix)
    # print(' '.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix]


# Move data to GPU

X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)


X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([565951, 5]), torch.Size([565951]), torch.int64, torch.int64)

In [8]:
emb_dim = 64 # Hyperparameter

# Embedding layer
emb=torch.nn.Embedding(len(stoi),emb_dim).to(device)
print(emb)


Embedding(17880, 64)


In [9]:
class Next_Word_Predictor(nn.Module):
  def __init__(self, block_size, vocab_size, emb_dim, hidden_dim):
    super().__init__()
    # Input size: vocab_size (the total number of characters in the vocabulary).
    # Output size: emb_dim (the size of the dense vector representation for each character).
    self.emb=nn.Embedding(vocab_size, emb_dim)
    # Input size: block_size * emb_dim
    # Output size: hidden_dim (the size of the hidden layer).
    self.linear1=nn.Linear(block_size*emb_dim, hidden_dim)
    # Input size: hidden_dim
    # Output size: vocab_size (the total number of words in the vocabulary).
    self.linear2=nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    # I/P layer
    x = self.emb(x)
    x = x.view(x.shape[0], -1)
    # Hidden layer
    x = self.linear1(x)
    x = torch.relu(x)
    # Output layer
    x = self.linear2(x)
    return x

In [19]:
# Generate names from untrained model


def generate_next_words(model, itos, stoi, content, block_size, k=10, max_len=10):
    context = content.lower()
    context = re.sub('[^a-zA-Z0-9 \.]', '', context)
    context = [stoi[word.strip(string.punctuation)] for word in context.split()]

    if len(context) <= block_size:
        context = [0] * (block_size - len(context)) + context
    elif len(context) > block_size:
        context = context[-block_size:]

    for i in range(k):
        x = torch.tensor(context).view(1, -1).to(device)
        y_pred = model(x)
        logits = y_pred
        
        ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
        word = itos[ix]
        content += " " + word
        context = context [1:] + [ix]
        
    return content


In [21]:
import time

# Train the model
model=Next_Word_Predictor(block_size, len(stoi), emb_dim, 100).to(device)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.AdamW(model.parameters(), lr=0.01)

# Mini-batch training
batch_size = 1024
print_every = 1
elapsed_time = []

for epoch in range(2000):
    start_time = time.time()
    for i in range(0, X.shape[0], batch_size):
        x = X[i:i+batch_size]
        y = Y[i:i+batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        opt.step()
        opt.zero_grad()
    end_time = time.time()
    elapsed_time.append(end_time - start_time)
    if epoch % print_every == 0:
        print(epoch, loss.item())

0 6.391009330749512
1 5.832068920135498


KeyboardInterrupt: 

In [20]:
# Generate names from trained model

para=" "
content=input("Enter some context: ")
k=int(input("Enter no. of words to be generated: "))
for i in range(10):
    para+=generate_next_words(model, itos, stoi, content, block_size, k)
    para+="\n\n"
print(para)

Enter some context:  letters of inquiry and notices from the court arrived
Enter no. of words to be generated:  10


 letters of inquiry and notices from the court arrived and from the screaming political left excitement to the company

letters of inquiry and notices from the court arrived back the enemy was a long ago you trouble was

letters of inquiry and notices from the court arrived at last listen nurse dont think of morning his cheeks

letters of inquiry and notices from the court arrived with front he firmly fixed it i shall be helped

letters of inquiry and notices from the court arrived and learning the thousands of times stood at the entrenchment

letters of inquiry and notices from the court arrived angrily the blonde officers on him went out to their

letters of inquiry and notices from the court arrived with his leader house owned he and knows what since

letters of inquiry and notices from the court arrived in the ground a healthy and thick coat seated ran

letters of inquiry and notices from the court arrived as if they drank a smiling in russian soldier and

letters of inquiry and no

In [None]:
# Saving the model

torch.save(model, 'next_word_predictor_model.pth')

In [None]:
embedding_weights = model.emb.weights[0].numpy()
print(embedding_weights.shape)
# Reduce dimensionality using t-SNE
tsne = TSNE(n_components=2, random_state=42)
embeddings_tsne = tsne.fit_transform(embedding_weights)

# Visualize embeddings
plt.figure(figsize=(10, 8))
plt.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], alpha=0.5)
plt.title('t-SNE Visualization of Embeddings')  
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()