## Import Libraries

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize

## Import Data

In [2]:
# Read the whole training document into memory as a single string.
with open("./document.txt", mode="r", encoding="utf-8") as document_file:
  document = document_file.read()

document

'About the Program\nWhat is the course fee for  Data Science Mentorship Program (DSMP 2023)\nThe course follows a monthly subscription model where you have to make monthly payments of Rs 799/month.\nWhat is the total duration of the course?\nThe total duration of the course is 7 months. So the total course fee becomes 799*7 = Rs 5600(approx.)\nWhat is the syllabus of the mentorship program?\nWe will be covering the following modules:\nPython Fundamentals\nPython libraries for Data Science\nData Analysis\nSQL for Data Science\nMaths for Machine Learning\nML Algorithms\nPractical ML\nMLOPs\nCase studies\nYou can check the detailed syllabus here - https://learnwith.campusx.in/courses/CampusX-Data-Science-Mentorship-Program-637339afe4b0615a1bbed390\nWill Deep Learning and NLP be a part of this program?\nNo, NLP and Deep Learning both are not a part of this program’s curriculum.\nWhat if I miss a live session? Will I get a recording of the session?\nYes all our sessions are recorded, so eve

## Data Processing

### Tokenize

In [3]:
# Tokenizer data required by nltk's word_tokenize. Older NLTK releases load the
# 'punkt' resource, while recent releases (3.9+) load 'punkt_tab' instead, so
# both are fetched to keep the notebook runnable on a fresh environment
# regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to /home/dhruv-
[nltk_data]     kapri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Lowercase the whole document, then split it into tokens with nltk's word_tokenize.
tokens = word_tokenize(document.lower())

In [5]:
# Build the token -> index vocabulary. Index 0 is reserved for 'unk', the
# fallback used for tokens that are not in the vocabulary; every distinct
# token then receives the next free index, in order of first appearance.
vocab = {'unk': 0}

for token in dict.fromkeys(tokens):
  vocab.setdefault(token, len(vocab))

vocab

{'unk': 0,
 'about': 1,
 'the': 2,
 'program': 3,
 'what': 4,
 'is': 5,
 'course': 6,
 'fee': 7,
 'for': 8,
 'data': 9,
 'science': 10,
 'mentorship': 11,
 '(': 12,
 'dsmp': 13,
 '2023': 14,
 ')': 15,
 'follows': 16,
 'a': 17,
 'monthly': 18,
 'subscription': 19,
 'model': 20,
 'where': 21,
 'you': 22,
 'have': 23,
 'to': 24,
 'make': 25,
 'payments': 26,
 'of': 27,
 'rs': 28,
 '799/month': 29,
 '.': 30,
 'total': 31,
 'duration': 32,
 '?': 33,
 '7': 34,
 'months': 35,
 'so': 36,
 'becomes': 37,
 '799': 38,
 '*': 39,
 '=': 40,
 '5600': 41,
 'approx': 42,
 'syllabus': 43,
 'we': 44,
 'will': 45,
 'be': 46,
 'covering': 47,
 'following': 48,
 'modules': 49,
 ':': 50,
 'python': 51,
 'fundamentals': 52,
 'libraries': 53,
 'analysis': 54,
 'sql': 55,
 'maths': 56,
 'machine': 57,
 'learning': 58,
 'ml': 59,
 'algorithms': 60,
 'practical': 61,
 'mlops': 62,
 'case': 63,
 'studies': 64,
 'can': 65,
 'check': 66,
 'detailed': 67,
 'here': 68,
 '-': 69,
 'https': 70,
 '//learnwith.campusx.in/

In [6]:
# Vocabulary size, including the 'unk' entry.
len(vocab)

289

In [7]:
# Split the document on newlines; each line is treated as one sentence.
input_sentences = document.split('\n')
input_sentences

['About the Program',
 'What is the course fee for  Data Science Mentorship Program (DSMP 2023)',
 'The course follows a monthly subscription model where you have to make monthly payments of Rs 799/month.',
 'What is the total duration of the course?',
 'The total duration of the course is 7 months. So the total course fee becomes 799*7 = Rs 5600(approx.)',
 'What is the syllabus of the mentorship program?',
 'We will be covering the following modules:',
 'Python Fundamentals',
 'Python libraries for Data Science',
 'Data Analysis',
 'SQL for Data Science',
 'Maths for Machine Learning',
 'ML Algorithms',
 'Practical ML',
 'MLOPs',
 'Case studies',
 'You can check the detailed syllabus here - https://learnwith.campusx.in/courses/CampusX-Data-Science-Mentorship-Program-637339afe4b0615a1bbed390',
 'Will Deep Learning and NLP be a part of this program?',
 'No, NLP and Deep Learning both are not a part of this program’s curriculum.',
 'What if I miss a live session? Will I get a recording 

In [8]:
def text_to_indices(sentence):
  """Convert a sentence into a list of vocabulary indices.

  The sentence is lowercased and tokenized with `word_tokenize`. Each token
  is looked up in the global `vocab`; tokens that are not present are mapped
  to the index of 'unk'.
  """
  return [
    vocab[token] if token in vocab else vocab['unk']
    for token in word_tokenize(sentence.lower())
  ]

In [9]:
# Encode every sentence as a list of vocabulary indices.
input_numerical_sentences = [text_to_indices(sentence) for sentence in input_sentences]

In [10]:
# Number of encoded sentences (one per line of the document).
len(input_numerical_sentences)

78

### Training Data Preparation

In [11]:
# Expand each sentence into all of its prefixes of length >= 2. The last
# token of a prefix is later used as the target and the rest as the input.
training_sequence = [
  sentence[:end]
  for sentence in input_numerical_sentences
  for end in range(2, len(sentence) + 1)
]

# Length of each prefix, kept to find the longest one for padding.
len_list = [len(prefix) for prefix in training_sequence]

In [12]:
# Quick look at the generated prefixes: count, first few prefixes, their lengths.
print(len(training_sequence), training_sequence[:5], len_list[:5], sep='\n')

# The longest prefix determines the width every sequence is padded to.
max_len = max(len_list)
print(max_len)

942
[[1, 2], [1, 2, 3], [4, 5], [4, 5, 2], [4, 5, 2, 6]]
[2, 3, 2, 3, 4]
62


In [13]:
# Left-pad every prefix with zeros up to max_len and stack the rows into a
# single long tensor, so the real tokens sit at the end of each row.
padded_training_sequence = torch.tensor(
  [[0] * (max_len - len(sequence)) + sequence for sequence in training_sequence],
  dtype=torch.long,
)

In [14]:
# Shape of the padded tensor followed by its first five rows.
print(padded_training_sequence.shape, padded_training_sequence[:5], sep='\n')

torch.Size([942, 62])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 2],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
# Every column but the last is the model input; the last column is the target token.
X, y = padded_training_sequence[:, :-1], padded_training_sequence[:, -1]

## DataLoader

In [16]:
class CustomDataset(Dataset):
  """Dataset that pairs each row of `x` with the matching entry of `y`."""

  def __init__(self, x, y):
    # Both containers are stored as given and indexed by the same position.
    self.x, self.y = x, y

  def __len__(self):
    """Number of samples, taken from the first dimension of `x`."""
    n_samples = self.x.shape[0]
    return n_samples

  def __getitem__(self, idx):
    """Return the (input, target) pair stored at position `idx`."""
    features, target = self.x[idx], self.y[idx]
    return features, target

In [17]:
# Wrap the input matrix X and the target vector y so they can be batched together.
dataset = CustomDataset(X, y)

In [18]:
# The dataset is a small tensor that already lives in memory, so batches are
# assembled in the main process (num_workers=0). Worker processes would only
# add start-up overhead here, and on platforms that spawn workers they can
# fail in a notebook because CustomDataset is defined in the notebook itself.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)

## Model Definition

In [19]:
class LSTMModel(nn.Module):
  """Next-token predictor: embedding -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of embedding rows and number of output logits.
    embedding_dim: size of each token embedding (default 100).
    hidden_size: size of the LSTM hidden state (default 150).
  """

  def __init__(self, vocab_size, embedding_dim=100, hidden_size=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, x):
    """Return logits of shape (batch, vocab_size) for a (batch, seq_len) tensor of token indices."""
    embedded = self.embedding(x)
    _, (final_hidden_state, _) = self.lstm(embedded)
    # final_hidden_state is (num_layers, batch, hidden_size). Index the last
    # layer explicitly instead of squeeze(0), which only removes the leading
    # dimension when it happens to have size 1.
    output = self.fc(final_hidden_state[-1])
    return output


In [20]:
# One output logit per vocabulary entry.
model = LSTMModel(len(vocab))

In [21]:
# Number of full passes over the training data.
epochs = 100
# Learning rate handed to the optimizer.
learning_rate = 0.01
# Cross-entropy between the vocabulary logits and the target token index.
criterion = nn.CrossEntropyLoss()
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# Move the model to the selected device before creating the optimizer over its parameters.
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

## Training Pipeline

In [23]:
def training(model, loader):
  """Train `model` on the batches from `loader` and print the mean loss per epoch.

  Relies on the notebook-level `epochs`, `device`, `criterion` and
  `optimizer`. The model is switched to training mode once, before the
  first epoch.
  """
  model.train()
  for epoch in range(epochs):
    total_loss = 0
    for inputs, targets in loader:
      inputs = inputs.to(device)
      targets = targets.to(device)

      # Clear gradients left over from the previous step, then forward,
      # backward, and update.
      optimizer.zero_grad()
      loss = criterion(model(inputs), targets)
      loss.backward()
      optimizer.step()

      total_loss += loss.item()

    print(f'Epoch: {epoch+1}, Loss: {total_loss/len(loader): .4f}')

In [24]:
# Run the full training loop; prints the average batch loss after every epoch.
training(model, dataloader)

Epoch: 1, Loss:  5.1449
Epoch: 2, Loss:  3.3231
Epoch: 3, Loss:  1.8812
Epoch: 4, Loss:  1.0762
Epoch: 5, Loss:  0.6677
Epoch: 6, Loss:  0.4673
Epoch: 7, Loss:  0.3318
Epoch: 8, Loss:  0.2693
Epoch: 9, Loss:  0.2197
Epoch: 10, Loss:  0.1762
Epoch: 11, Loss:  0.1838
Epoch: 12, Loss:  0.1779
Epoch: 13, Loss:  0.1521
Epoch: 14, Loss:  0.1600
Epoch: 15, Loss:  0.1398
Epoch: 16, Loss:  0.1526
Epoch: 17, Loss:  0.1514
Epoch: 18, Loss:  0.1456
Epoch: 19, Loss:  0.1327
Epoch: 20, Loss:  0.1301
Epoch: 21, Loss:  0.1223
Epoch: 22, Loss:  0.1349
Epoch: 23, Loss:  0.1371
Epoch: 24, Loss:  0.1350
Epoch: 25, Loss:  0.1309
Epoch: 26, Loss:  0.1243
Epoch: 27, Loss:  0.1364
Epoch: 28, Loss:  0.1448
Epoch: 29, Loss:  0.1384
Epoch: 30, Loss:  0.1364
Epoch: 31, Loss:  0.1308
Epoch: 32, Loss:  0.1289
Epoch: 33, Loss:  0.1303
Epoch: 34, Loss:  0.1368
Epoch: 35, Loss:  0.1286
Epoch: 36, Loss:  0.1213
Epoch: 37, Loss:  0.1216
Epoch: 38, Loss:  0.1103
Epoch: 39, Loss:  0.1324
Epoch: 40, Loss:  0.1398
Epoch: 41

## Prediction

In [25]:
# Reverse lookup (index -> token) used to decode the model's predictions.
idx_to_word = dict(zip(vocab.values(), vocab.keys()))

In [26]:
def prediction(model, text):
  """Return `text` with the model's most likely next token appended.

  Args:
    model: network that maps a (1, seq_len) tensor of token indices to
      logits over the vocabulary.
    text: prompt string; it is encoded with `text_to_indices`.

  Returns:
    `text`, a single space, and the predicted token.

  Relies on the notebook-level `max_len`, `device` and `idx_to_word`. The
  model is left in eval mode.
  """
  model.eval()

  # text -> numerical indices
  numerical_text = text_to_indices(text)

  # Left-pad with zeros to max_len - 1, the width of the training inputs.
  # When the prompt is already longer than that, the pad count is negative,
  # the pad list is empty, and the full prompt is passed through unpadded.
  padded_text = [0]*(max_len -1 - len(numerical_text)) + numerical_text
  padded_text = torch.tensor(padded_text, dtype=torch.long)
  padded_text = padded_text.unsqueeze(0)  # add the batch dimension
  padded_text = padded_text.to(device)

  # Inference only: disable autograd so no computation graph is built.
  with torch.no_grad():
    output = model(padded_text)

  # index of the highest-scoring vocabulary entry
  _, index = torch.max(output, dim=1)

  # merge with text
  return text + ' ' + idx_to_word[index.item()]

In [27]:
# Seed prompt and the number of tokens to generate.
text = "My name is"
prediction_len = 10

# Generate one token at a time: each prediction is appended to the prompt,
# and the extended text is fed back in on the next iteration.
for _ in range(prediction_len):
  text = prediction(model, text)
  print(text)

My name is tricky
My name is tricky ,
My name is tricky , so
My name is tricky , so read
My name is tricky , so read carefully
My name is tricky , so read carefully .
My name is tricky , so read carefully . you
My name is tricky , so read carefully . you can
My name is tricky , so read carefully . you can watch
My name is tricky , so read carefully . you can watch the


## Summary

In [28]:
# NOTE(review): this import sits here rather than in the imports cell at the
# top of the notebook; consider moving it there so dependencies are visible up front.
from torchinfo import summary

# Summarize the model for one sequence of max_len-1 token indices (the width
# of the inputs built above); dtype is long because the inputs are token indices.
summary(model, input_size=(1, max_len-1), dtypes=[torch.long])    # input size parameter

Layer (type:depth-idx)                   Output Shape              Param #
LSTMModel                                [1, 289]                  --
├─Embedding: 1-1                         [1, 61, 100]              28,900
├─LSTM: 1-2                              [1, 61, 150]              151,200
├─Linear: 1-3                            [1, 289]                  43,639
Total params: 223,739
Trainable params: 223,739
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 9.30
Input size (MB): 0.00
Forward/backward pass size (MB): 0.12
Params size (MB): 0.89
Estimated Total Size (MB): 1.02