<a href="https://colab.research.google.com/github/Dd1235/LearnAI/blob/main/mini_projects/next_word_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [8]:
# Toy training corpus. Downstream cells split it on '\n' and treat each line as one
# training "sentence". The literal starts with a newline, so that split yields an
# empty first element.
document = """
Today was a surprisingly productive day.
I started by reviewing my notes from the Operating Systems lecture, and then I jumped straight into debugging a segmentation fault that had been bothering me since last night.
Turned out, I was accessing memory after freeing it.
Classic rookie mistake.
Later, I spent some time working on my side project — an AI-powered college resource assistant.
It's coming along nicely.
I integrated a search functionality that uses cosine similarity to match user queries to existing lecture notes and textbook content.
During lunch, I caught up with a few friends. We ended up discussing placement preparation strategies, how to approach DSA in a structured way, and whether doing a project on NLP using transformers could boost our resume visibility.
In the afternoon, I decided to try out a new note-taking app on my iPad.
It syncs perfectly with my MacBook and allows me to export annotated PDFs.
I used it to mark up some slides for the DAA class, especially the part on dynamic programming and amortized analysis.
Later in the evening, I had a short meeting with my hackathon team.
We finalized the UI design for our ride-sharing optimization platform.
We're building it using React for the frontend and Flask for the backend.
After the meeting, I practiced some Leetcode problems.
The one on "Sliding Window Maximum" took me longer than expected.
I first tried a brute force solution with O(n*k), but then optimized it using a deque to get O(n) time complexity. It's satisfying when the optimized solution finally clicks.
Before sleeping, I reviewed my semester goals.
I still need to finish the Financial Statement Analysis project, polish my resume for the upcoming internships, and maybe try building a simple cloth simulation using C++ and SFML just for fun.
I then did my night time skincare routine and wasted some time going through Instagram.
Overall, I’m happy with how today went. I’ve realized that having a structured plan, combined with short bursts of focused work, is far more effective than spending hours in a distracted state. Hoping to continue this momentum tomorrow."""


In [9]:
# Download the NLTK tokenizer resources needed by word_tokenize in the cells below.
# NOTE(review): presumably 'punkt' covers older NLTK releases and 'punkt_tab' newer ones —
# confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# Lower-case the whole corpus and split it into tokens (punctuation marks become tokens too).
tokens = word_tokenize(document.lower())

In [11]:
# Build the token -> index vocabulary. Index 0 is reserved for '<unk>'; every other
# token receives the next free index in order of first appearance in the corpus.
vocab = {'<unk>': 0}

for token in dict.fromkeys(tokens):
  # setdefault leaves an existing entry (e.g. '<unk>') untouched.
  vocab.setdefault(token, len(vocab))

vocab

{'<unk>': 0,
 'today': 1,
 'was': 2,
 'a': 3,
 'surprisingly': 4,
 'productive': 5,
 'day': 6,
 '.': 7,
 'i': 8,
 'started': 9,
 'by': 10,
 'reviewing': 11,
 'my': 12,
 'notes': 13,
 'from': 14,
 'the': 15,
 'operating': 16,
 'systems': 17,
 'lecture': 18,
 ',': 19,
 'and': 20,
 'then': 21,
 'jumped': 22,
 'straight': 23,
 'into': 24,
 'debugging': 25,
 'segmentation': 26,
 'fault': 27,
 'that': 28,
 'had': 29,
 'been': 30,
 'bothering': 31,
 'me': 32,
 'since': 33,
 'last': 34,
 'night': 35,
 'turned': 36,
 'out': 37,
 'accessing': 38,
 'memory': 39,
 'after': 40,
 'freeing': 41,
 'it': 42,
 'classic': 43,
 'rookie': 44,
 'mistake': 45,
 'later': 46,
 'spent': 47,
 'some': 48,
 'time': 49,
 'working': 50,
 'on': 51,
 'side': 52,
 'project': 53,
 '—': 54,
 'an': 55,
 'ai-powered': 56,
 'college': 57,
 'resource': 58,
 'assistant': 59,
 "'s": 60,
 'coming': 61,
 'along': 62,
 'nicely': 63,
 'integrated': 64,
 'search': 65,
 'functionality': 66,
 'uses': 67,
 'cosine': 68,
 'similarity':

In [12]:
# Vocabulary size, counting the '<unk>' entry.
len(vocab)

237

In [13]:
# One training "sentence" per line of the corpus. The corpus literal begins with a
# newline, so the first element here is an empty string.
input_sentences = document.split('\n')

In [14]:
def text_to_indices(sentence, vocab):
  """Map a sequence of tokens to their vocabulary indices.

  Args:
    sentence: iterable of tokens (for example the list produced by a tokenizer).
    vocab: mapping from token to integer index. It must contain '<unk>' if
      any token in `sentence` is missing from it.

  Returns:
    A list with one index per token; tokens absent from `vocab` are mapped
    to vocab['<unk>'].
  """
  # '<unk>' is looked up only for out-of-vocabulary tokens.
  return [vocab[tok] if tok in vocab else vocab['<unk>'] for tok in sentence]

In [15]:
# Encode each corpus line: lower-case it, tokenize it, then map tokens to vocabulary indices.
input_numerical_sentences = [
  text_to_indices(word_tokenize(line.lower()), vocab)
  for line in input_sentences
]

In [16]:
# Number of encoded lines: one per element of input_sentences.
len(input_numerical_sentences)

22

In [17]:
# Expand every encoded line into all of its prefixes of length >= 2, so each prefix
# supplies "tokens so far" plus the token that follows them.
training_sequence = [
  encoded[:end]
  for encoded in input_numerical_sentences
  for end in range(2, len(encoded) + 1)
]

In [18]:
# Total number of prefix training examples.
len(training_sequence)

388

In [19]:
# Peek at the first few prefixes: each one extends the previous by one token.
training_sequence[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [20]:
# Length of every training prefix; the largest one sets the padded width.
len_list = [len(sequence) for sequence in training_sequence]

max(len_list)

48

In [21]:
# The first prefix holds two tokens: the shortest length the prefix loop produces.
training_sequence[0]

[1, 2]

In [22]:
# Left-pad every prefix with zeros up to the longest prefix length so that all rows
# have the same width. NOTE: 0 is also the '<unk>' index, so padding positions and
# unknown tokens share the same id.
# The target width is computed once instead of once per sequence.
max_sequence_len = max(len_list)

padded_training_sequence = [
  [0] * (max_sequence_len - len(sequence)) + sequence
  for sequence in training_sequence
]

In [23]:
# Spot-check one row: after padding it should be as long as the longest prefix.
len(padded_training_sequence[10])

48

In [24]:
# Convert the padded rows into a single LongTensor of token indices.
# NOTE(review): this rebinds the name from a list of lists to a tensor, so re-running
# the cell passes a tensor (not a list) to torch.tensor — consider a separate name.
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [25]:
# Inspect the padded tensor; the leading zeros in each row are the left padding.
padded_training_sequence

tensor([[  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        [  0,   0,   0,  ...,   2,   3,   4],
        ...,
        [  0,   0, 210,  ..., 233, 234, 235],
        [  0, 210,  19,  ..., 234, 235, 236],
        [210,  19,   8,  ..., 235, 236,   7]])

In [26]:
# Inputs are every column except the last; the target is the final token of each padded prefix.
X, y = padded_training_sequence[:, :-1], padded_training_sequence[:, -1]

In [28]:
class CustomDataset(Dataset):
  """Dataset that pairs each input row of `X` with the matching target in `y`."""

  def __init__(self, X, y):
    """Keep references to the inputs `X` and targets `y`."""
    self.X, self.y = X, y

  def __len__(self):
    """Number of samples, read from the first dimension of `X`."""
    n_samples = self.X.shape[0]
    return n_samples

  def __getitem__(self, idx):
    """Return the `(input, target)` pair stored at position `idx`."""
    features = self.X[idx]
    target = self.y[idx]
    return features, target

In [29]:
# Wrap the tensors in a Dataset and serve them in shuffled mini-batches for training.
dataset = CustomDataset(X, y)
# Every input row must have a target (replaces a bare len(dataset) whose value was never shown).
assert len(dataset) == y.shape[0]
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [30]:
class LSTMModel(nn.Module):
  """Next-token classifier: embedding -> single-layer LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of distinct token indices; sizes both the embedding
      table and the output layer.
    embedding_dim: size of each token embedding (default 100).
    hidden_dim: size of the LSTM hidden state (default 150).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, vocab_size)

  def forward(self, x):
    """Score every vocabulary entry as the token following each input sequence.

    Args:
      x: LongTensor of token indices shaped (batch, seq_len).

    Returns:
      Tensor shaped (batch, vocab_size) with one unnormalised score per
      vocabulary entry.
    """
    embedded = self.embedding(x)
    # Only the final hidden state is used; per-step outputs and the cell state are discarded.
    _, (final_hidden_state, _) = self.lstm(embedded)
    # final_hidden_state is (num_layers, batch, hidden). Indexing [-1] selects the last
    # layer and, unlike squeeze(0), does not rely on there being exactly one layer.
    output = self.fc(final_hidden_state[-1])
    return output

In [31]:
# Instantiate the model with one output class per vocabulary entry.
model = LSTMModel(len(vocab))

In [32]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
# Move the model to the selected device; the returned module's repr is the cell output.
model.to(device)

LSTMModel(
  (embedding): Embedding(237, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=237, bias=True)
)

In [34]:
# Training hyper-parameters.
epochs, learning_rate = 50, 0.001

# Cross-entropy over the vocabulary scores; Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
# training loop

for epoch in range(epochs):
  # Put the model in training mode explicitly: an evaluation cell that calls
  # model.eval() may have run before this one, and the mode persists on the module.
  model.train()
  total_loss = 0

  for batch_x, batch_y in dataloader:

    batch_x, batch_y = batch_x.to(device), batch_y.to(device)

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    output = model(batch_x)

    loss = criterion(output, batch_y)

    loss.backward()

    optimizer.step()

    total_loss += loss.item()

  # total_loss is the sum of the per-batch losses for this epoch, not an average.
  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 71.0811
Epoch: 2, Loss: 68.7505
Epoch: 3, Loss: 65.6838
Epoch: 4, Loss: 61.3218
Epoch: 5, Loss: 57.4958
Epoch: 6, Loss: 52.4783
Epoch: 7, Loss: 46.7326
Epoch: 8, Loss: 43.1646
Epoch: 9, Loss: 38.4152
Epoch: 10, Loss: 34.4034
Epoch: 11, Loss: 29.7990
Epoch: 12, Loss: 27.1505
Epoch: 13, Loss: 23.6227
Epoch: 14, Loss: 20.9314
Epoch: 15, Loss: 18.1288
Epoch: 16, Loss: 15.9658
Epoch: 17, Loss: 13.9684
Epoch: 18, Loss: 12.5863
Epoch: 19, Loss: 11.2421
Epoch: 20, Loss: 10.1822
Epoch: 21, Loss: 8.7757
Epoch: 22, Loss: 7.7974
Epoch: 23, Loss: 7.2362
Epoch: 24, Loss: 6.2381
Epoch: 25, Loss: 5.7470
Epoch: 26, Loss: 5.2338
Epoch: 27, Loss: 4.7310
Epoch: 28, Loss: 4.7665
Epoch: 29, Loss: 4.3034
Epoch: 30, Loss: 3.8597
Epoch: 31, Loss: 3.6221
Epoch: 32, Loss: 3.3359
Epoch: 33, Loss: 3.1942
Epoch: 34, Loss: 2.9897
Epoch: 35, Loss: 2.8324
Epoch: 36, Loss: 2.6613
Epoch: 37, Loss: 2.5318
Epoch: 38, Loss: 2.6085
Epoch: 39, Loss: 2.3235
Epoch: 40, Loss: 2.2808
Epoch: 41, Loss: 2.3414
Epoch

In [36]:
# prediction

def prediction(model, vocab, text, max_len=61):
  """Append the model's most likely next token to `text`.

  Args:
    model: trained network mapping a (1, seq_len) LongTensor of token indices
      to (1, vocab_size) scores.
    vocab: token -> index mapping. Its insertion order must match the indices,
      because the predicted index is mapped back to a token by position.
    text: prompt string; it is lower-cased only for tokenization.
    max_len: length the encoded prompt is left-padded to with zeros. Prompts
      that are already at least this long are passed through unpadded.
      NOTE(review): the default keeps the previously hard-coded 61, which
      differs from the padded width of the training inputs in this notebook —
      confirm which length is intended.

  Returns:
    `text` followed by a space and the predicted token.
  """
  # tokenize
  tokenized_text = word_tokenize(text.lower())

  # text -> numerical indices
  numerical_text = text_to_indices(tokenized_text, vocab)

  # padding (a non-positive pad count simply yields no padding)
  padding = [0] * (max_len - len(numerical_text))

  # Build the input on the model's own device so a GPU-resident model is not fed a CPU tensor.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device("cpu")
  padded_text = torch.tensor(padding + numerical_text, dtype=torch.long, device=model_device).unsqueeze(0)

  # send to model; gradients are not needed for inference
  with torch.no_grad():
    output = model(padded_text)

  # predicted index, converted to a plain int for list indexing
  _, index = torch.max(output, dim=1)
  predicted_index = index.item()

  # merge with text
  return text + " " + list(vocab.keys())[predicted_index]



In [37]:
import time

# Greedy generation: each prediction becomes the prompt for the next step.
num_tokens = 10
input_text = "Today was a"

for step in range(num_tokens):
    input_text = output_text = prediction(model, vocab, input_text)
    print(output_text)
    # Short pause between printed steps.
    time.sleep(0.5)


Today was a surprisingly
Today was a surprisingly productive
Today was a surprisingly productive day
Today was a surprisingly productive day .
Today was a surprisingly productive day . .
Today was a surprisingly productive day . . hoping
Today was a surprisingly productive day . . hoping to
Today was a surprisingly productive day . . hoping to continue
Today was a surprisingly productive day . . hoping to continue this
Today was a surprisingly productive day . . hoping to continue this momentum


In [38]:
# Second loader over the same dataset with shuffling disabled; the accuracy cell below iterates it.
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [39]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the percentage of samples whose top-scoring class equals the label.

    Args:
        model: network producing (batch, num_classes) scores. It is switched to
            evaluation mode and left in that mode.
        dataloader: iterable of (inputs, labels) batches to evaluate.
        device: device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a float between 0 and 100; 0.0 when `dataloader` yields
        no samples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        # Iterate the loader that was passed in rather than a notebook-level global.
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Get the predicted word indices
            _, predicted = torch.max(outputs, dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Guard against an empty loader instead of dividing by zero.
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy on the unshuffled loader that was created for evaluation.
# NOTE: that loader wraps the training dataset, so this is training accuracy, not a held-out score.
accuracy = calculate_accuracy(model, dataloader1, device)
print(f"Model Accuracy: {accuracy:.2f}%")

Model Accuracy: 97.94%
