<a href="https://colab.research.google.com/github/AbhinavaReddy-hub/learning-DL-/blob/main/LSTM_nextword_prediction__exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting the next word from DataFrame using LSTM in PyTorch
In this exercise we will build and train an LSTM network to predict the next word based on sample data.

# 1.Data Preparation

## 1.Import necessary Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec

# 2.Create a DataFrame with sample text

In [None]:
# Step 1: Assemble a tiny corpus of movie-review sentences.
# The single "text" key becomes the only column of the DataFrame below.
data = dict(
    text=[
        "The movie was fantastic and very engaging",
        "I hated the acting and the storyline",
        "It was boring and lacked depth",
        "Amazing performance by the actors and great direction",
        "Not worth watching at all",
        "One of the best movies I have ever seen",
    ],
)

# Wrap the corpus in a DataFrame (one row per sentence)
df = pd.DataFrame(data=data)


# 3.Preprocess the data

# Step 1: Tokenize the text

In [None]:
# Step 1: Define a function to tokenize the text
def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    The text is lowercased first, then every maximal run of word characters
    (letters, digits, underscore) is returned in order of appearance.
    Punctuation and whitespace are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings (empty if no word characters occur).
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\b\w+\b', lowered)]
# Tokenize every sentence and store the resulting token lists in a new column
df['tokens'] = df['text'].map(tokenize)
print(df)

                                                text  \
0          The movie was fantastic and very engaging   
1               I hated the acting and the storyline   
2                     It was boring and lacked depth   
3  Amazing performance by the actors and great di...   
4                          Not worth watching at all   
5            One of the best movies I have ever seen   

                                              tokens  
0  [the, movie, was, fantastic, and, very, engaging]  
1       [i, hated, the, acting, and, the, storyline]  
2              [it, was, boring, and, lacked, depth]  
3  [amazing, performance, by, the, actors, and, g...  
4                    [not, worth, watching, at, all]  
5  [one, of, the, best, movies, i, have, ever, seen]  


# Step 2: Word embeddings

In [None]:
# Step 2: Train a Word2Vec model on the tokenized sentences.
# - min_count=1 keeps every word; with such a tiny corpus no token may be dropped,
#   otherwise later vector lookups would fail.
# - workers=1 removes thread-scheduling jitter so repeated runs train the same
#   vectors (gensim documents that full reproducibility across interpreter
#   launches additionally requires PYTHONHASHSEED to be fixed).
# - .tolist() hands gensim a plain list of token lists rather than a pandas Series.
word2vec_model = Word2Vec(
    sentences=df['tokens'].tolist(),
    vector_size=50,
    window=3,
    min_count=1,
    workers=1,
)
print(word2vec_model)

Word2Vec<vocab=33, vector_size=50, alpha=0.025> Word2Vec<vocab=33, vector_size=50, alpha=0.025>


## Step 3: Prepare the sequences

In [None]:
# Step 3: Prepare dataset for word prediction
def prepare_sequences(tokens, model, context_size=3):
    """Turn one tokenized sentence into (context vectors, target index) pairs.

    A window of ``context_size`` consecutive tokens slides over ``tokens``.
    Each window becomes one sample made of the window's word vectors, and the
    token immediately after the window becomes that sample's target.

    Args:
        tokens: Sequence of word strings for a single sentence.
        model: Object exposing ``model.wv[word]`` (word vector lookup) and
            ``model.wv.key_to_index[word]`` (vocabulary index lookup).
        context_size: Number of context words per sample.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: ``X`` holds the stacked context
        vectors per sample and ``y`` the vocabulary index of each target word.
        Both are empty when the sentence has no complete window plus target.
    """
    # One sample per start position that still leaves room for a target word
    window_starts = range(len(tokens) - context_size)

    X = [
        [model.wv[word] for word in tokens[start:start + context_size]]
        for start in window_starts
    ]
    y = [
        model.wv.key_to_index[tokens[start + context_size]]
        for start in window_starts
    ]
    return np.array(X), np.array(y)

In [None]:
# Pool the (context, target) samples from every sentence into one dataset
context_size = 3
X, y = [], []
for tokens in df['tokens']:
    X_seq, y_seq = prepare_sequences(tokens, word2vec_model, context_size)
    # Append this sentence's samples one by one to the running lists
    X += list(X_seq)
    y += list(y_seq)

# Stack the pooled samples into arrays: context vectors and target word indices
X = np.array(X)
y = np.array(y)
print(X.shape, y.shape)
print(X, y)

(24, 3, 50) (24,)
[[[-0.00108157  0.00046619  0.01021299 ...  0.01921763  0.00997272
    0.01847972]
  [ 0.00287149 -0.00529305 -0.01414844 ...  0.0010219   0.01642843
   -0.01403542]
  [-0.01724348  0.00732895  0.01037926 ... -0.00309765  0.00302098
    0.00358537]]

 [[ 0.00287149 -0.00529305 -0.01414844 ...  0.0010219   0.01642843
   -0.01403542]
  [-0.01724348  0.00732895  0.01037926 ... -0.00309765  0.00302098
    0.00358537]
  [-0.01648536  0.01859871 -0.00039532 ... -0.00476074 -0.0062565
   -0.00474028]]

 [[-0.01724348  0.00732895  0.01037926 ... -0.00309765  0.00302098
    0.00358537]
  [-0.01648536  0.01859871 -0.00039532 ... -0.00476074 -0.0062565
   -0.00474028]
  [-0.01629744  0.00898925 -0.00828098 ... -0.01408151  0.00179921
    0.01281321]]

 ...

 [[ 0.00660615  0.01019108  0.00917083 ...  0.0165313  -0.01220133
    0.0189127 ]
  [-0.0144051   0.00845618  0.00434286 ... -0.0190672   0.00316558
   -0.01960658]
  [ 0.01562445 -0.01901143 -0.00039841 ... -0.00477906 -0.0

# Step 4: Convert to PyTorch tensors

In [None]:
# Step 4: Wrap the NumPy arrays as PyTorch tensors
# Targets are integer class indices (long), as CrossEntropyLoss expects
y_tensor = torch.tensor(y, dtype=torch.long)
# Inputs are the float32 word vectors fed to the LSTM
X_tensor = torch.tensor(X, dtype=torch.float32)

# 2.LSTM Model Building

# Step 1: Define the LSTM model

In [None]:
# Step 1: Define the LSTM model for word prediction
class WordPredictionLSTM(nn.Module):
    """LSTM encoder followed by a linear layer that scores every vocabulary word.

    Args:
        input_size: Number of features per time step (the word-vector size).
        hidden_size: Size of the LSTM hidden state.
        vocab_size: Number of output scores produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, vocab_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, sequence, features)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return one row of unnormalized vocabulary scores per input sequence."""
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _step_outputs, (final_hidden, _final_cell) = self.lstm(x)
        # final_hidden is indexed by layer first; [-1] selects the last layer
        logits = self.fc(final_hidden[-1])
        return logits

# Step 2: Initialize the model parameters

In [None]:

# Step 2: Model hyperparameters
# The input width is read from the trained embeddings rather than repeating the
# literal 50, so it cannot drift from the `vector_size` used to train Word2Vec.
input_size = word2vec_model.wv.vector_size  # Size of the word vector
hidden_size = 64  # Number of hidden units in LSTM
vocab_size = len(word2vec_model.wv)  # Vocabulary size (one output score per word)


# Step 3: Initialize model, loss function and optimizer

In [None]:
# Build the network, then the loss function and optimizer that will train it
model = WordPredictionLSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
)
# Cross-entropy over the vocabulary: each target is a single word index
criterion = nn.CrossEntropyLoss()
# Adam updates every trainable parameter of the model
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

# 3. Training the model

# Step 1: Train the model over multiple epochs

In [None]:
# Training loop: every epoch is one full-batch update over all samples
num_epochs = 200  # Number of epochs for training
log_every = 20  # Report the loss this often instead of printing on every epoch

model.train()  # Training mode; nothing in the loop changes it, so set it once
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Reset gradients left over from the previous step

    # Forward pass
    outputs = model(X_tensor)  # One row of vocabulary scores per sample
    loss = criterion(outputs, y_tensor)  # Cross-entropy against the target indices

    # Backward pass and optimization
    loss.backward()  # Backpropagation
    optimizer.step()  # Update model parameters

    # Log the first epoch and then every `log_every`-th epoch
    if epoch == 0 or (epoch + 1) % log_every == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([24, 33])
torch.Size([24])
torch.Size([

# 4.Evaluate the model

In [None]:
# Step 1: Prediction function for next word
def predict_next_word(context, model, word2vec_model, context_size=3):
    """Predict the word most likely to follow ``context``.

    Args:
        context: Raw text. It is tokenized with ``tokenize`` and only the last
            ``context_size`` tokens are fed to the model.
        model: Network that maps a ``(1, context_size, vector_size)`` float
            tensor to one row of vocabulary scores.
        word2vec_model: Embedding model; ``word2vec_model.wv`` supplies the
            word vectors and the index-to-word mapping.
        context_size: Number of trailing context words to use.

    Returns:
        The vocabulary word with the highest score.

    Raises:
        ValueError: If ``context`` has fewer than ``context_size`` tokens.
        KeyError: If any of the context words is not in the embedding vocabulary.

    Note:
        ``model`` is switched to evaluation mode and left in that mode.
    """
    # Tokenize the input context
    tokens = tokenize(context)
    if len(tokens) < context_size:
        raise ValueError(f"Context must have at least {context_size} words")
    tokens = tokens[-context_size:]  # Use only the last `context_size` words

    # Report every out-of-vocabulary word at once, rather than failing on the
    # first vector lookup. The exception type matches a failed lookup (KeyError).
    unknown = [word for word in tokens if word not in word2vec_model.wv.key_to_index]
    if unknown:
        raise KeyError(f"Words not in vocabulary: {', '.join(unknown)}")

    # Stack the vectors into a single ndarray before creating the tensor:
    # building a tensor from a Python list of ndarrays takes a slow path and
    # makes PyTorch emit a UserWarning.
    vectors = np.stack([word2vec_model.wv[word] for word in tokens])

    # Add the batch dimension -> shape (1, context_size, vector_size)
    input_tensor = torch.tensor(vectors, dtype=torch.float32).unsqueeze(0)

    # Set the model to evaluation mode
    model.eval()

    # Make prediction without tracking gradients
    with torch.no_grad():
        output = model(input_tensor)
        predicted_index = torch.argmax(output, dim=1).item()  # Highest-scoring class

    # Convert index back to word
    return word2vec_model.wv.index_to_key[predicted_index]

# Interactive testing

In [None]:
# Interactive Testing Function
def interactive_predict(model, word2vec_model, context_size=3):
    """Run a prompt loop that predicts the next word for user-entered contexts.

    The loop reads a line, predicts the next word with ``predict_next_word``
    and prints it. It ends when the user types ``exit`` (case-insensitive,
    surrounding whitespace ignored). Invalid inputs are reported and the loop
    continues.

    Args:
        model: Trained network passed through to ``predict_next_word``.
        word2vec_model: Embedding model passed through to ``predict_next_word``.
        context_size: Number of trailing context words used for each prediction.
    """
    print("\nInteractive Word Prediction")
    print("Enter a context sentence to predict the next word.")
    print("Type 'exit' to quit.\n")

    while True:
        context = input("Enter context: ")
        # strip() so that inputs such as "exit " still end the session
        if context.strip().lower() == 'exit':
            print("Exiting interactive testing. Goodbye!")
            break

        try:
            next_word = predict_next_word(context, model, word2vec_model, context_size)
            print(f"Predicted next word: \"{next_word}\"")
        except ValueError as e:
            print(f"Error: {e}. Ensure the context has at least {context_size} words.")
        except KeyError as e:
            # A word outside the embedding vocabulary has no vector; report it
            # and keep the session alive instead of crashing the loop.
            print(f"Error: {e}. Use only words that appear in the training sentences.")


In [None]:
# Run interactive testing. The notebook-level context_size is passed explicitly
# so the prediction window always matches the window used to build the training
# data, instead of relying on the function's default happening to be equal.
interactive_predict(model, word2vec_model, context_size=context_size)


Interactive Word Prediction
Enter a context sentence to predict the next word.
Type 'exit' to quit.

