# Importing relevant libraries

In [66]:
import torch
import torch.functional as F
import torch.nn as nn

import tensorflow as tf

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from pprint import pprint
import re

In [67]:
torch.__version__

'2.4.0'

In [68]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [69]:
device

device(type='cuda')

# Filtering the text

In [70]:
from os import linesep
import string

# Read the file
file_path = '/kaggle/input/text-for-next-word-predictor/leo tolstoy - war and peace.txt'

# Open and read the contents of the file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = filtered_text.lower()

lines=filtered_text.split(".")
words=['.']
for l in lines:
    for w in l.split():
        if (len(w)>0):
            words.append(w)
words=set(words)

print("Total no. of lines: ", len(lines))
print("Total unique words: ", len(words))

Total no. of lines:  30588
Total unique words:  17877


In [71]:
#Mapping from words to integers and vice versa
stoi={s:i for i,s in enumerate(words)}
itos={i:s for s,i in stoi.items()}
print(len(itos))

17877


# Generating the labelled dataset

In [72]:
# Hyperparameter
block_size=5 # context_length: how many words do we take to predict the next one

# X and Y matrices to store the data for training
# X stores the half lines
# Y stores the next word
X,Y=[],[]
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for l in lines:
  context=[0]*block_size
  word_l=l.split()

  for i in range(len(word_l)):
    ix=stoi[word_l[i]]
    X.append(context)
    Y.append(ix)
    # print(' '.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix]

    if (i==len(word_l)-1):
        ix=stoi['.']
        X.append(context)
        Y.append(ix)
        # print(' '.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix]

# Move data to GPU

X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)


X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([592621, 5]), torch.Size([592621]), torch.int64, torch.int64)

# Defining the model

In [73]:
emb_dim = 64 # Hyperparameter

# Embedding layer
emb=torch.nn.Embedding(len(stoi),emb_dim).to(device)
print(emb)


Embedding(17877, 64)


In [74]:
class Next_Word_Predictor(nn.Module):
    def __init__(self, block_size, vocab_size, emb_dim, hidden_dim, activation_fn, seed_value):
        super().__init__()
        self.block_size = block_size
        self.hyperparams = {'block_size':self.block_size, 'emb_dim':emb_dim, 'hidden_dim':hidden_dim, 'activation_fn':activation_fn, 'seed_value':seed_value}
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.linear1 = nn.Linear(block_size * emb_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
         
        if activation_fn == 'sigmoid':
            self.activation = torch.sigmoid  
        else:
            self.activation = torch.relu 

    def forward(self, x):
        # Embedding layer
        x = self.emb(x)
        x = x.view(x.shape[0], -1)  
        
        # Hidden layer
        x = self.linear1(x)
        x = self.activation(x)
        
        # Output layer
        x = self.linear2(x)
        
        return x


# Training the model

In [75]:
def train_model(X, Y, block_size, emb_dim, vocab_size, hidden_dim, activation_fn, seed_value, device, batch_size=1024, epochs=201, print_every=20):
    """
    Train the model with the specified seed value.
    
    Arguments:
    X -- training data (input features)
    Y -- training data (labels)
    block_size -- context size for input sequence
    emb_dim -- embedding dimension for the model
    vocab_size -- the size of the vocabulary
    hidden_dim -- the size of the hidden layer
    activation_fn -- the activation function to use ('relu', 'tanh', 'sigmoid')
    seed_value -- the seed value for reproducibility
    device -- device to run the training on ('cpu' or 'cuda')
    batch_size -- the size of each mini-batch (default: 1024)
    epochs -- number of training epochs (default: 2000)
    print_every -- print loss after every 'n' epochs (default: 10)
    """
    
    torch.manual_seed(seed_value)

    model = Next_Word_Predictor(block_size, vocab_size, emb_dim, hidden_dim, activation_fn, seed_value).to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        
        # Mini-batch training
        for i in range(0, X.shape[0], batch_size):
            x = X[i:i + batch_size].to(device)
            y = Y[i:i + batch_size].to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            
            loss.backward()
            opt.step()
            opt.zero_grad()
        
        if epoch % print_every == 0:
            print(f'Epoch {epoch}: Loss = {loss.item()}')
    
    return model


In [76]:
vocab_size = len(stoi)
# Some other hyperparameters
hidden_dim = 1024
activation_fn = 'relu' 
seed_value = 42 

In [77]:
model = train_model(X, Y, block_size, emb_dim, vocab_size, hidden_dim, activation_fn, seed_value, device)

Epoch 0: Loss = 6.1316423416137695
Epoch 20: Loss = 1.0935217142105103
Epoch 40: Loss = 0.6827052235603333
Epoch 60: Loss = 0.5356357097625732
Epoch 80: Loss = 0.4648418724536896
Epoch 100: Loss = 0.41502639651298523
Epoch 120: Loss = 0.39065784215927124
Epoch 140: Loss = 0.3561030328273773
Epoch 160: Loss = 0.35712406039237976
Epoch 180: Loss = 0.3540221154689789
Epoch 200: Loss = 0.34291714429855347


In [1]:
# Saving the model

torch.save(model, 'model_variant_1.pth')

NameError: name 'torch' is not defined

# Generating Predictions

In [79]:
# Generate names from trained model

def generate_next_words(model, itos, stoi, content, seed_value, k, max_len=10):
    torch.manual_seed(seed_value)
    
    block_size = model.block_size
    context = content.lower()
    context = re.sub('[^a-zA-Z0-9 \.]', '', context)
    context = re.sub('\.', ' . ', context)
    word_c = context.split()
    context = []
    for i in range(len(word_c)):
        try:
            if stoi[word_c[i]]:
                context.append(word_c[i])
        except:
            continue
            
    context = [stoi[w] for w in context]
               
    if len(context) <= block_size:
        context = [0] * (block_size - len(context)) + context
    elif len(context) > block_size:
        context = context[-block_size:]

    for i in range(k):
        x = torch.tensor(context).view(1, -1).to(device)
        y_pred = model(x)
        logits = y_pred
        
        ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
        word = itos[ix]
        content += " " + word
        context = context [1:] + [ix]
        
    return content


In [80]:
# Generate names from trained model

para=""
content=input("Enter some content: ")
k=int(input("Enter no. of words to be generated: "))
para+=generate_next_words(model, itos, stoi, content, seed_value, k)
para+="\n\n"
print(para)

SyntaxError: invalid syntax (2371000541.py, line 5)

# Visualization of embeddings

In [None]:
embedding_weights = model.emb.weight.detach().cpu().numpy() #to be used for visualization

In [None]:
from sklearn.cluster import KMeans

# Set the number of clusters (you can experiment with this number)
n_clusters = 10

# Perform K-means clustering on the embeddings
kmeans = KMeans(n_clusters=n_clusters, random_state=1)
clusters = kmeans.fit_predict(embedding_weights)

# Create a dictionary to store words grouped by cluster
clustered_words = {i: [] for i in range(n_clusters)}

# Assign words to their respective clusters
for word, idx in stoi.items():
    if idx < embedding_weights.shape[0]:
        cluster = clusters[idx]
        clustered_words[cluster].append(word)

# Print words in each cluster
for cluster, words in clustered_words.items():
    print(f"Cluster {cluster}: {', '.join(words[:10])}")  # Limiting to first 10 words for readability

In [None]:
import re
import random
import torch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

# Group words based on their suffixes
def group_words(itos):
    groups = {
        'verbs_ing': [],
        'verbs_ed': [],
        'adverbs_ly': [],
        'nouns': [],
        'adjectives': [],
        'pronouns': ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'],
    }

    # Iterate over all words
    for word in itos.values():
        # Verbs ending with 'ing'
        if word.endswith('ing'):
            groups['verbs_ing'].append(word)
        # Verbs ending with 'ed'
        elif word.endswith('ed'):
            groups['verbs_ed'].append(word)
        # Adverbs ending with 'ly'
        elif word.endswith('ly'):
            groups['adverbs_ly'].append(word)
        # Nouns: words ending with common noun suffixes
        elif word.endswith('ness') or word.endswith('ment') or word.endswith('tion'):
            groups['nouns'].append(word)
        # Adjectives: words ending with common adjective suffixes
        elif word.endswith('able') or word.endswith('ous') or word.endswith('ive'):
            groups['adjectives'].append(word)

    return groups

# Get the grouped words
grouped_words = group_words(itos)

# Select a subset of words for each group (e.g., 10 from each group)
num_words_per_group = 10000
selected_words = []
selected_group_labels = []

colors = {
    'verbs_ing': 'blue',
    'verbs_ed': 'green',
    'adverbs_ly': 'orange',
    'nouns': 'purple',
    'adjectives': 'red',
    'pronouns': 'cyan',
}

for group_name, group in grouped_words.items():
    if group:  # Check if the group is not empty
        sampled_words = random.sample(group, min(num_words_per_group, len(group)))
        selected_words += sampled_words
        selected_group_labels += [group_name] * len(sampled_words)  # Label for the group

# Step 2: Extract the embeddings for the selected words
selected_embeddings = []
selected_word_labels = []

for word in selected_words:
    if word in stoi:  # Ensure the word exists in stoi
        idx = stoi[word]  # Get the index of the word
        selected_embeddings.append(embedding_weights[idx])  # Get the embedding
        selected_word_labels.append(word)  # Save the word for labeling

selected_embeddings = torch.tensor(selected_embeddings)  # Convert to tensor

# Step 3: Reduce dimensionality using t-SNE
tsne = TSNE(n_components=2, random_state=1)
embeddings_tsne = tsne.fit_transform(selected_embeddings)

# Step 4: Plot the selected words' embeddings
plt.figure(figsize=(12, 10))

# Use different colors for each group
for group_name, color in colors.items():
    indices = [i for i, label in enumerate(selected_group_labels) if label == group_name]
    plt.scatter(embeddings_tsne[indices, 0], embeddings_tsne[indices, 1], 
                alpha=0.6, label=group_name, color=color, s=100)  # Increase marker size

# # Annotate the plot with the corresponding words
# for i, word in enumerate(selected_word_labels):
#     plt.annotate(word, (embeddings_tsne[i, 0], embeddings_tsne[i, 1]), 
#                  fontsize=9, alpha=0.75)

# Add legend
plt.legend(title='Word Groups', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('t-SNE Visualization of Grouped Word Embeddings', fontsize=16)
plt.xlabel('t-SNE Component 1', fontsize=14)
plt.ylabel('t-SNE Component 2', fontsize=14)
plt.grid(True)  # Optional: Add grid for better readability
plt.tight_layout()  # Adjust layout
plt.show()


In [None]:
import re
import random
import torch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Group words based on their suffixes
def group_words(itos):
    groups = {
        'verbs_ing': [],
        'verbs_ed': [],
        'adverbs_ly': [],
        'nouns': [],
        'adjectives': [],
        'pronouns': ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'],
    }

    # Iterate over all words
    for word in itos.values():
        # Verbs ending with 'ing'
        if word.endswith('ing'):
            groups['verbs_ing'].append(word)
        # Verbs ending with 'ed'
        elif word.endswith('ed'):
            groups['verbs_ed'].append(word)
        # Adverbs ending with 'ly'
        elif word.endswith('ly'):
            groups['adverbs_ly'].append(word)
        # Nouns: words ending with common noun suffixes
        elif word.endswith('ness') or word.endswith('ment') or word.endswith('tion'):
            groups['nouns'].append(word)
        # Adjectives: words ending with common adjective suffixes
        elif word.endswith('able') or word.endswith('ous') or word.endswith('ive'):
            groups['adjectives'].append(word)

    return groups

pronouns = ['he', 'she', 'they', 'we', 'i', 'you', 'it', 'me', 'him', 'her', 'us', 'them', 'my', 'your', 'his', 'its', 'their', 'our']

grouped_words = group_words(itos)
grouped_words['pronouns'] = [word for word in pronouns if word in stoi]
print(f"Number of pronouns: {len(grouped_words['pronouns'])}")

# Select a subset of words for each group (e.g., 50 from each group)
num_words_per_group = 50

for group_name, group in grouped_words.items():
    selected_words = random.sample(group, min(num_words_per_group, len(group)))
    
    # Extract the embeddings for the selected words
    selected_embeddings = []
    selected_word_labels = []
    
    for word in selected_words:
        if word in stoi:  # Ensure the word exists in stoi
            idx = stoi[word]  # Get the index of the word
            selected_embeddings.append(embedding_weights[idx])  # Get the embedding
            selected_word_labels.append(word)  # Save the word for labeling
    
    selected_embeddings = torch.tensor(selected_embeddings)  # Convert to tensor
    n_samples = selected_embeddings.shape[0]
    
    # Reduce dimensionality using t-SNE
    perplexity_value = min(30, n_samples - 1)
    tsne = TSNE(n_components=2, perplexity=perplexity_value, random_state=1)
    embeddings_tsne = tsne.fit_transform(selected_embeddings)
    
    # Plot the selected words' embeddings
    plt.figure(figsize=(10, 8))
    plt.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], alpha=0.5)
    
    # Annotate the plot with the corresponding words
    for i, word in enumerate(selected_word_labels):
        plt.annotate(word, (embeddings_tsne[i, 0], embeddings_tsne[i, 1]), fontsize=9, alpha=0.75)
    
    plt.title(f't-SNE Visualization of Group: {group_name}')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')

    # Set equal scaling
    plt.axis('equal')  # This will make the axes have the same scale
    
    plt.show()
    
    # Ask user if they want to visualize the next group
    user_input = input(f"Do you want to proceed to the next group? (y/n): ")
    if user_input.lower() != 'y':
        break


#### 