## Next word predictor using BiLSTM (PyTorch)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd

Medium Dataset (Kaggle)

In [3]:
# Load the Medium articles CSV into a DataFrame.
# NOTE(review): absolute Google Drive path — presumably needs Drive mounted at /content/drive; confirm before re-running.
df = pd.read_csv("/content/drive/MyDrive/Pytorch/medium_data.csv")

In [4]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(6508, 10)

In [6]:
# The 'title' column is the text corpus used in the cells below.
df['title']

Unnamed: 0,title
0,A Beginner’s Guide to Word Embedding with Gens...
1,Hands-on Graph Neural Networks with PyTorch & ...
2,How to Use ggplot2 in Python
3,Databricks: How to Save Files in CSV on Your L...
4,A Step-by-Step Implementation of Gradient Desc...
...,...
6503,“We” vs “I” — How Should You Talk About Yourse...
6504,How Donald Trump Markets Himself
6505,Content and Marketing Beyond Mass Consumption
6506,5 Questions All Copywriters Should Ask Clients...


In [7]:
# The two lines in the next cell replace specific Unicode space characters in the 'title' column of the df DataFrame:

# 1) x.replace(u'\xa0', u' '):

#     This replaces the non-breaking space (\xa0) with a regular space (' ').
#     Purpose: Some text sources (e.g. web-scraped pages) contain \xa0 instead of a normal space, which can cause formatting issues.

# 2) x.replace('\u200a', ' '):

#     This replaces the hair space (\u200a) with a regular space (' ').
#     Purpose: The hair space is a very narrow, nearly invisible space that may appear in copied text and cause unexpected formatting issues.

In [8]:
# Swap the non-breaking space (\xa0) and the hair space (\u200a) for a plain space in every title.
df['title'] = df['title'].apply(
    lambda title: title.replace('\xa0', ' ').replace('\u200a', ' ')
)

Making the vocab from text

In [10]:
# Tokenization
# Download the NLTK 'punkt' and 'punkt_tab' resources (presumably the tokenizer data
# behind word_tokenize, which is used on the titles below).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
# Flatten all lower-cased, tokenized titles into one corpus-wide token list.
tokens = [
    token
    for title in df['title']
    for token in word_tokenize(title.lower())
]

In [12]:
# Peek at the first ten tokens.
tokens[:10]

['a',
 'beginner',
 '’',
 's',
 'guide',
 'to',
 'word',
 'embedding',
 'with',
 'gensim']

In [13]:
# Token -> index map. Index 0 is taken by the unknown token; each distinct corpus token
# then receives the next free index, in order of first appearance.
vocab = {'<unk>': 0}  # Unknown token

for token in dict.fromkeys(tokens):  # unique tokens, first-seen order
    vocab.setdefault(token, len(vocab))

In [14]:
# Show the first ten (token, index) pairs of the vocabulary.
print(list(vocab.items())[:10])

[('<unk>', 0), ('a', 1), ('beginner', 2), ('’', 3), ('s', 4), ('guide', 5), ('to', 6), ('word', 7), ('embedding', 8), ('with', 9)]


In [15]:
# Vocabulary size, including the '<unk>' entry.
print(len(vocab))

8344


In [None]:
# Converting text to n-gram model to make data supervised for training

### Example of N-gram Model
Sentence: I am a Data Scientist

Word index: {'I': 1, 'am': 2, 'a': 3, 'Data': 4, 'Scientist': 5}

Text Sequence: [1, 2, 3, 4, 5]

N-gram sequences for this sentence:

[1,2,3,4,5] = ['I', 'am', 'a', 'Data', 'Scientist']

[1,2,3,4] = ['I', 'am', 'a', 'Data']

[1,2,3] = ['I', 'am', 'a']

[1,2] = ['I', 'am']

In [16]:
def text_to_indices(sentence, vocab):
  """Translate a sequence of tokens into their vocabulary indices.

  Args:
    sentence: iterable of tokens.
    vocab: mapping token -> index; tokens not present are mapped to vocab['<unk>'].

  Returns:
    A list holding one index per token, in input order.
  """
  return [
      vocab[token] if token in vocab else vocab['<unk>']
      for token in sentence
  ]

In [17]:
# Encode every title as a list of vocabulary indices (one list per title).
input_numerical_sentences = [
    text_to_indices(word_tokenize(sentence.lower()), vocab)
    for sentence in df['title']
]

In [18]:
# Number of encoded titles.
len(input_numerical_sentences)

6508

In [19]:
# Expand each encoded title into all of its prefixes of length >= 2. Later cells use the
# last index of a prefix as the label and the indices before it as the input.
training_sequence = [
    sentence[:end]
    for sentence in input_numerical_sentences
    for end in range(2, len(sentence) + 1)
]

In [20]:
# Total number of training prefixes.
len(training_sequence)

55468

In [21]:
# First five prefixes: each one extends the previous by one token.
training_sequence[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [22]:
# Length of every training prefix; the maximum is the width used for padding below.
len_list = [len(sequence) for sequence in training_sequence]

max(len_list)

51

In [23]:
# Left-pad every prefix with 0 so that all rows have the length of the longest prefix.
# The target width is computed once up front: calling max(len_list) inside the loop
# rescans the whole list for every sequence (quadratic in the number of sequences).
max_len = max(len_list, default=0)
padded_training_sequence = [
    [0] * (max_len - len(sequence)) + sequence
    for sequence in training_sequence
]

In [24]:
# Spot-check one padded row: it should have the maximum prefix length.
len(padded_training_sequence[10])

51

In [25]:
# Convert the equal-length padded lists into a single tensor of dtype long (one row per prefix).
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [26]:
# Display the padded tensor (zeros on the left, token indices on the right).
padded_training_sequence

tensor([[   0,    0,    0,  ...,    0,    1,    2],
        [   0,    0,    0,  ...,    1,    2,    3],
        [   0,    0,    0,  ...,    2,    3,    4],
        ...,
        [   0,    0,    0,  ...,    1,  551,  303],
        [   0,    0,    0,  ...,  551,  303, 2870],
        [   0,    0,    0,  ...,  303, 2870, 2403]])

Prepare features and labels

In [27]:
# Features: every column except the last. Label: the last column (the token to predict).
X = padded_training_sequence[:, :-1]
y = padded_training_sequence[:,-1]

Making Custom Dataset Class

In [28]:
class CustomDataset(Dataset):
  """Dataset pairing each feature row of `X` with the label at the same position in `y`."""

  def __init__(self, X, y):
    # Keep references to the given inputs; nothing is copied.
    self.X, self.y = X, y

  def __len__(self):
    # Number of samples = size of the first dimension of X.
    num_samples = self.X.shape[0]
    return num_samples

  def __getitem__(self, idx):
    # Return the (features, label) pair stored at position idx.
    features = self.X[idx]
    label = self.y[idx]
    return features, label

In [29]:
# Wrap the feature/label tensors so a DataLoader can batch them.
dataset = CustomDataset(X,y)

In [30]:
# Number of (input, label) samples.
len(dataset)

55468

In [31]:
# Shuffled mini-batches of 32 samples; this loader feeds the training loop.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

### LSTM Model Class

In [32]:
class LSTMModel(nn.Module):
  """Unidirectional LSTM next-word classifier.

  Args:
    vocab_size: number of embedding rows and number of output classes.
    embedding_dim: size of each word embedding (default 100, the previously hard-coded value).
    hidden_dim: size of the LSTM hidden state (default 150, the previously hard-coded value).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim) # Word Embeddings
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Return one score per vocabulary entry for each input sequence.

    `x` holds token indices; the LSTM is built with batch_first=True, so a batched
    input is laid out as (batch, seq_len).
    """
    embedded = self.embedding(x) # Convert words to embeddings
    # Only the final hidden state is needed; per-step outputs and the cell state are discarded.
    _, (final_hidden_state, _) = self.lstm(embedded)
    # Drop the leading (single layer, single direction) dimension before the dense layer.
    output = self.fc(final_hidden_state.squeeze(0)) # Dense layer for classification
    return output

### Bi-directional LSTM model class

In [33]:
class BILSTMModel(nn.Module):
    """Bidirectional LSTM next-word classifier.

    Args:
        vocab_size: number of embedding rows and number of output classes.
        embedding_dim: size of each word embedding (default 100, the previously hard-coded value).
        hidden_dim: per-direction LSTM hidden size (default 150, the previously hard-coded value).
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Word Embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)  # Bidirectional LSTM
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)  # Multiply by 2 due to bidirectional

    def forward(self, x):
        """Return one score per vocabulary entry for each sequence in the batch.

        `x` holds token indices laid out as (batch, seq_len), since the LSTM uses batch_first=True.
        """
        embedded = self.embedding(x)  # Convert words to embeddings
        # Per-step outputs and the cell state are not used.
        _, (hidden_state, _) = self.lstm(embedded)

        # Concatenating the final forward and backward hidden states
        final_hidden_state = torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), dim=1)

        output = self.fc(final_hidden_state)  # Dense layer for classification
        return output

In [34]:
# Instantiate the bidirectional model with one output class per vocabulary entry.
model = BILSTMModel(len(vocab))

Using GPU if available

In [35]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [39]:
# Show which device was selected.
print(device)

cuda


In [40]:
# Move the model to the selected device (the cell output shows the module layout).
model.to(device)

BILSTMModel(
  (embedding): Embedding(8344, 100)
  (lstm): LSTM(100, 150, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=300, out_features=8344, bias=True)
)

Training the BiLSTM model

In [41]:
# Training hyperparameters.
epochs = 50
learning_rate = 0.001

# Cross-entropy between the model's per-word scores and the target word index.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters (optim is the torch.optim alias imported at the top).
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [43]:
for epoch in range(epochs):
    # Select training mode explicitly: the evaluation helper in this notebook calls
    # model.eval(), so re-running this cell afterwards would otherwise train in eval mode.
    model.train()

    total_loss = 0
    correct = 0
    total_samples = 0

    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()

        output = model(batch_x)  # Forward pass
        loss = criterion(output, batch_y)  # Compute loss

        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        total_loss += loss.item()

        # Compute accuracy
        preds = torch.argmax(output, dim=1)  # Get predicted class
        correct += (preds == batch_y).sum().item()  # Count correct predictions
        total_samples += batch_y.size(0)  # Count total samples

    avg_loss = total_loss / len(dataloader)  # Average of the per-batch losses
    accuracy = (correct / total_samples) * 100  # Accuracy percentage

    print(f"Epoch: {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


Epoch: 1, Loss: 4.8047, Accuracy: 25.14%
Epoch: 2, Loss: 4.1667, Accuracy: 28.33%
Epoch: 3, Loss: 3.6098, Accuracy: 32.46%
Epoch: 4, Loss: 3.1131, Accuracy: 38.63%
Epoch: 5, Loss: 2.6775, Accuracy: 45.87%
Epoch: 6, Loss: 2.3063, Accuracy: 52.78%
Epoch: 7, Loss: 1.9957, Accuracy: 59.03%
Epoch: 8, Loss: 1.7352, Accuracy: 64.33%
Epoch: 9, Loss: 1.5179, Accuracy: 68.92%
Epoch: 10, Loss: 1.3344, Accuracy: 72.97%
Epoch: 11, Loss: 1.1843, Accuracy: 76.05%
Epoch: 12, Loss: 1.0573, Accuracy: 78.67%
Epoch: 13, Loss: 0.9510, Accuracy: 80.99%
Epoch: 14, Loss: 0.8682, Accuracy: 82.60%
Epoch: 15, Loss: 0.7993, Accuracy: 83.87%
Epoch: 16, Loss: 0.7458, Accuracy: 84.86%
Epoch: 17, Loss: 0.7032, Accuracy: 85.45%
Epoch: 18, Loss: 0.6706, Accuracy: 85.97%
Epoch: 19, Loss: 0.6433, Accuracy: 86.37%
Epoch: 20, Loss: 0.6227, Accuracy: 86.54%
Epoch: 21, Loss: 0.6082, Accuracy: 86.67%
Epoch: 22, Loss: 0.5953, Accuracy: 86.83%
Epoch: 23, Loss: 0.5822, Accuracy: 86.96%
Epoch: 24, Loss: 0.5757, Accuracy: 86.97%
E

Prediction code

In [50]:
def prediction(model, vocab, text, device='cuda'):
    """Append the model's most likely next word to `text`.

    Args:
        model: trained network mapping a (1, seq_len) tensor of token indices to one
            score per vocabulary index.
        vocab: dict mapping token -> index; unseen tokens are mapped to vocab['<unk>'].
        text: prompt string.
        device: device to run on. When a CUDA device is requested but CUDA is not
            available, the CPU is used instead of raising.

    Returns:
        `text`, a space, and the predicted token.
    """
    # The default is 'cuda'; fall back gracefully on CPU-only machines.
    if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
        device = 'cpu'

    # Tokenize the text
    tokenized_text = word_tokenize(text.lower())

    # Convert text to numerical indices
    numerical_text = text_to_indices(tokenized_text, vocab)

    # Padding: the training inputs are padded_training_sequence[:, :-1] (the last column
    # is the label), so the input width is one less than the padded sequence width.
    max_len = len(padded_training_sequence[0]) - 1
    padding = [0] * max(0, max_len - len(numerical_text))
    padded_text = torch.tensor(padding + numerical_text, dtype=torch.long).unsqueeze(0)

    # Move tensor to the same device as the model
    padded_text = padded_text.to(device)
    model = model.to(device)

    # Inference only: run in eval mode without gradient tracking, then restore the
    # mode the caller left the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(padded_text)
    finally:
        model.train(was_training)

    # Get predicted word index
    _, index = torch.max(output, dim=1)

    # Convert index back to a word via an explicit index -> token map, rather than
    # relying on the dict's insertion order matching the index values.
    index_to_token = {idx: token for token, idx in vocab.items()}
    predicted_word = index_to_token[index.item()]

    return text + " " + predicted_word


In [51]:
# Example: predict the word that follows a short prompt.
prediction(model, vocab, "What is an")

'What is an individual'

In [48]:
# Unshuffled loader over the same dataset, used for evaluation.
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [49]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the top-1 accuracy (in percent) of `model` over `dataloader`.

    Args:
        model: network returning one score per class for each sample in a batch.
        dataloader: iterable of (batch_x, batch_y) tensor pairs.
        device: device the batches are moved to before the forward pass.

    Returns:
        Percentage of samples whose highest-scoring class equals the label,
        or 0.0 when the loader yields no samples.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        # Iterate the loader that was passed in, not a module-level global.
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Get the predicted word indices
            _, predicted = torch.max(outputs, dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Guard against an empty loader instead of dividing by zero.
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy using the unshuffled loader created for evaluation.
# NOTE: dataloader1 wraps the same dataset the model was trained on, so this is training accuracy.
accuracy = calculate_accuracy(model, dataloader1, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 88.59%


In [58]:
import time

# Greedy generation: feed the growing text back into prediction() one word at a time.
num_tokens = 10
input_text = "Data Science"

for _ in range(num_tokens):
  # The extended text becomes the prompt for the next step.
  input_text = output_text = prediction(model, vocab, input_text)
  print(output_text)
  time.sleep(0.5)  # short pause between printed steps

Data Science best
Data Science best practices
Data Science best practices :
Data Science best practices : python
Data Science best practices : python environments
Data Science best practices : python environments neural
Data Science best practices : python environments neural network
Data Science best practices : python environments neural network (
Data Science best practices : python environments neural network ( image
Data Science best practices : python environments neural network ( image classification
