In [1]:
import torch as th
import torch.nn as nn
import torch.optim as op
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

from collections import Counter
from torchtext.vocab import Vocab

In [2]:
import numpy as np 
import regex as re
import pandas as pd

In [3]:
def file_to_sentence_list(file_path):
    # Step 1: Open the file
    with open(file_path, "r") as file:
        # Step 2: Read the file contents
        text = file.read()
        
    # Step 3: Split the text into sentences
    sentences = re.split(r"(?<=[.!?])\s+", text)
    
    # Step 4: Strip whitespace from sentences
    sentences = [sentence.strip() for sentence in sentences]
    
    # Step 5: Filter out empty sentences
    sentences = [sentence for sentence in sentences if sentence]
    
    # Step 6: Return the list of sentences
    return sentences

In [4]:
# Load the text data from a file
file_path = "pizza.txt"
text_data = file_to_sentence_list(file_path)

# Create a tokenizer for basic English words
tokenizer = get_tokenizer('basic_english')

# Define a function to yield tokens from the text data
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# Build a vocabulary from the tokens, including a special token for unknown words
vocab = build_vocab_from_iterator(yield_tokens(text_data), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Calculate the total number of unique words in the vocabulary
total_words = len(vocab)

# Output the total number of unique words
total_words

683

In [5]:
# Initialize an empty list to store the input sequences
input_sequence = []

# Loop through each sentence in the text data
for line in text_data:
    # Tokenize the sentence and convert tokens to vocabulary indices
    token_list = [vocab[token] for token in tokenizer(line)]
    
    # Generate n-gram sequences for the sentence
    for i in range(1, len(token_list)):
        # Create an n-gram sequence from the first token to the (i+1)-th token
        n_gram_sequence = token_list[:i + 1]
        
        # Convert the n-gram sequence to a PyTorch tensor and append to the list
        input_sequence.append(th.tensor(n_gram_sequence, dtype=th.long))

In [6]:
import torch.nn.utils.rnn as rnn_utils

# Find the maximum sequence length
max_seq = max([len(seq) for seq in input_sequence])

# Pad all sequences to the maximum length
input_seq = rnn_utils.pad_sequence(input_sequence, batch_first=True, padding_value=vocab["<pad>"])

In [8]:
# Creating X & Y
X = input_seq[:,:-1]
Y = input_seq[:,-1]

In [9]:
# Assuming X and Y are already defined as tensors
dataset = th.utils.data.TensorDataset(X, Y)
data_loader = th.utils.data.DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [12]:
embedding_dim=100
hidden_dim=128
vocab_size=total_words

In [13]:
#Definie the model
class TextGenerationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TextGenerationModel, self).__init__()
        # Convert words to vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM to process the word vectors
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Linear layer to get the final word scores
        self.linear = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, text):
        # Convert input words to embeddings
        embedded = self.embedding(text)
        # Pass embeddings through the LSTM
        lstm_out, (hidden, cell) = self.lstm(embedded)
        # Use the last output of the LSTM
        output = self.linear(lstm_out[:, -1, :])
        return output

In [14]:
mdl=TextGenerationModel(vocab_size,embedding_dim,hidden_dim)
mdl

TextGenerationModel(
  (embedding): Embedding(683, 100)
  (lstm): LSTM(100, 128)
  (linear): Linear(in_features=128, out_features=683, bias=True)
)

In [16]:
# Initialize model, loss function, and optimizer
criterion=nn.CrossEntropyLoss()
optimizer=op.Adam(mdl.parameters(),lr=0.001)

In [17]:
num_epoch = 10  # Number of epochs for training

for k in range(num_epoch):  # Loop over each epoch
    for label, target in data_loader:  # Loop over each batch in the data loader
        outputs = mdl(label)  # Forward pass: Get model predictions
        loss_fn = criterion(outputs, target)  # Compute the loss

        optimizer.zero_grad()  # Zero the gradients
        loss_fn.backward()  # Backward pass: Compute gradients
        optimizer.step()  # Update model parameters

    # Print the loss for the current epoch
    print(f"Epoch[{k+1}/{num_epoch}], Loss: {loss_fn.item():.4f}")

print("Training Complete.")  # Training completion message

Epoch[1/10], Loss: 0.0829
Epoch[2/10], Loss: 0.0348
Epoch[3/10], Loss: 0.0201
Epoch[4/10], Loss: 0.0139
Epoch[5/10], Loss: 0.0100
Epoch[6/10], Loss: 0.0080
Epoch[7/10], Loss: 0.0065
Epoch[8/10], Loss: 0.0053
Epoch[9/10], Loss: 0.0044
Epoch[10/10], Loss: 0.0041
Training Complete.


Predicting the next word


In [22]:
import torch as th

def generate_text(mdl, seed_txt, next_words, max_sequence_len, vocab):
    mdl.eval()  # Set model to evaluation mode
    
    for _ in range(next_words):
        # Tokenize the seed text and convert to indices
        seed_tokens = seed_txt.split()
        token_list = [vocab[token] for token in seed_tokens]
        token_list = th.tensor([token_list], dtype=th.long)
        token_list = token_list[:, -max_sequence_len:]  # Truncate to max sequence length
        
        with th.no_grad():  # Disable gradient computation
            predict_probs = mdl(token_list)  # Get model predictions
            predict_index = th.argmax(predict_probs, dim=1).item()  # Get predicted word index
            predict_word = vocab.lookup_token(predict_index)  # Convert index to word
        
        seed_tokens.append(predict_word)  # Append the predicted word
    
    return " ".join(seed_tokens)  # Return the generated text

# Define seed text and number of words to generate
seed_text = " iconic dish that has"
next_words = 5

# Generate the next words
predict_text = generate_text(mdl, seed_text, next_words, max_seq, vocab)
print("Next_predicted_words:", predict_text)

Next_predicted_words: iconic dish that has margherita


In [20]:
fl=open("pizza.txt","r")

In [21]:
fl.read()

'Pizza, the delectable and iconic dish that has transcended borders and captivated taste buds worldwide, is a testament to the extraordinary fusion of flavors, creativity, and cultural significance. Originating from the sun-kissed lands of Italy, pizza has evolved into an art form that unites people from diverse backgrounds in a shared love for its mouthwatering combinations. Its history stretches back centuries, with roots tracing back to ancient civilizations like the Greeks, Romans, and Egyptians, who all had their versions of flatbreads adorned with various ingredients. However, it was the vibrant city of Naples, Italy, that birthed the pizza we know and adore today.\n\nWith its soft and chewy Neapolitan crust, topped with the perfect balance of tomatoes, mozzarella cheese, and fresh basil, the Margherita pizza pays homage to Queen Margherita of Italy and embodies the colors of the Italian flag. As pizza migrated from the shores of Naples, it found its way to the United States with