* **Previous Notebook in series**: LLM Fundamentals 3

* **This Notebook**: Training with LSTM

In [1]:
import sys
print(sys.executable)

/Users/dipanjansanyal/Documents/llm-fundamentals/llm-fundamentals/env/bin/python


In [2]:
# Experiment Configuration
context_length = 10
num_epochs = 10
batch_size = 32
response_tokens = 200

In [3]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tokenmonster
import more_itertools
import torch
import torch.nn as nn
from torch.nn.functional import pad
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
torch.manual_seed(123)

<torch._C.Generator at 0x12800f530>

In [11]:
# # Check if MPS is available and use it if available

# if torch.backends.mps.is_available():
#     device = torch.device("mps")
# else:
#     device = torch.device("cpu")

# print(device)

device = "cpu"

In [5]:
# Reading the dataset
dataset = pd.read_csv("./data/cefr_leveled_texts.csv", index_col=0)
dataset = dataset.query('label == "A1"').reset_index() # Selecting easiest level
# Deduplicate repeated headlines
dataset = dataset.filter(['text']).drop_duplicates()
print("Dataset shape:", dataset.shape)
print("Dataset columns:", dataset.columns)

Dataset shape: (288, 1)
Dataset columns: Index(['text'], dtype='object')


In [6]:
# Loading pre-trained tokenizer and adding special tokens
tokenizer = tokenmonster.load("english-2048-consistent-v1")
tokenizer.add_special_token('<pad>')

2049

In [7]:
print("Vocabulary Size:", tokenizer.vocab_size)
token_ids = tokenizer.tokenize(dataset.text[0][0:50])
print("Testing tokenizer:", token_ids)
print("Testing tokenizer:", tokenizer.decoder().decode(token_ids))
for i in token_ids:
    print(i, ": ", tokenizer.decoder().decode(i))

Vocabulary Size: 2049
Testing tokenizer: [ 182  869 1680  852 1002  217  420 1540    4  104 1386 1222 1144    3]
Testing tokenizer: You need to try harder in school!
I am doing well 
182 :  
869 :   you
1680 :   need to
852 :   try
1002 :   hard
217 :  er
420 :   in
1540 :   school
4 :  !
104 :  

1386 :   I am
1222 :   doing
1144 :   well
3 :   


### Data Preparation

In [8]:
list_of_sentences = list(dataset['text'])

# Divide each tokenized sentence into expanding window with preset max length (content length = 10)
# Store them as a list of tensors, so that the collate_fn of DataLoader can source them
# They are stored as a list rather than an array because they are of different sizes tensors (size 2 to 10)

data = []
for sentence in list_of_sentences:

    sentence_tokens = tokenizer.tokenize(sentence)
    l = len(sentence_tokens)
    applicable_context_length = min(l, context_length)    

    counter_t = []
    for t in range(2,applicable_context_length+1):        

        expanding_tuples = more_itertools.windowed(sentence_tokens, t)
        data_t = [torch.tensor(item).to(torch.long) for item in list(expanding_tuples)]
        counter_t.extend(data_t)     

    data.extend(counter_t)

In [9]:
# Split the variable sized sequences into train test
train, test = train_test_split(data, test_size = 0.2, random_state = 123)

# Create a collate function which performs the following operation under each batch when the batch is called
# For each item inside a batch, it splits input and target sequence
# Then it pads them to match tensors of (context_length - 1)
padding_token = tokenizer.tokenize('<pad>').item()
def collate_fn(batch):
    # Separate the inputs and targets (target is 1 token shifted right)
    inputs = [item[:-1] for item in batch]
    targets = [item[1:] for item in batch]
    
    # Pad the sequences to the same length i.e. (context_length - 1)
    inputs = [pad(item, (0, context_length-1-len(item)), value = padding_token) for item in inputs]
    targets = [pad(item, (0, context_length-1-len(item)), value = padding_token) for item in targets]

    # convert them to form 2D tensor of shape (B x T)
    inputs = torch.stack(inputs).to(device)
    targets = torch.stack(targets).to(device)
    
    return inputs, targets

# Define the data loader with preset batch size and predefined collate_fn
train_loader = DataLoader(train, batch_size = batch_size, shuffle = True, drop_last = True, collate_fn = collate_fn)
test_loader = DataLoader(test, batch_size = batch_size, shuffle = False, drop_last = True, collate_fn = collate_fn)

In [12]:
# How did we form this?

x,y = next(iter(train_loader))
print("x: ", x[:5])
print("y: ", y[:5])

x:  tensor([[ 737,  214,   15, 1609, 1274,  287, 2048, 2048, 2048],
        [ 842,  828,   51, 2048, 2048, 2048, 2048, 2048, 2048],
        [ 400,   69,  873, 1770, 2048, 2048, 2048, 2048, 2048],
        [1025, 1119, 1182,  869,  873, 1367, 1011, 1167, 2048],
        [  63,  873, 1828,  328,  667,   58, 2048, 2048, 2048]])
y:  tensor([[ 214,   15, 1609, 1274,  287,  329, 2048, 2048, 2048],
        [ 828,   51, 1615, 2048, 2048, 2048, 2048, 2048, 2048],
        [  69,  873, 1770, 1813, 2048, 2048, 2048, 2048, 2048],
        [1119, 1182,  869,  873, 1367, 1011, 1167, 1771, 2048],
        [ 873, 1828,  328,  667,   58,   58, 2048, 2048, 2048]])


### Model Building

In [49]:
# Token and positional embeddings
token_embedding = nn.Embedding(2049, 128, padding_idx = padding_token)
fx = token_embedding(x)

position_embedding = nn.Embedding(context_length-1, 128)

pos_indices = torch.arange(context_length-1).unsqueeze(0).expand(x.size(0), -1) # (B x T) Integers
pos_indices = torch.where(x != padding_token, pos_indices, 0)
fx = fx + position_embedding(pos_indices)



In [50]:
fx[0,-1,:]

tensor([-0.0348,  1.1426, -0.8031, -0.8195, -0.3684,  1.2639, -1.4694,  1.0644,
         0.8040, -0.7585,  1.0921,  0.1312,  1.0963, -0.6385,  0.1917,  0.5659,
        -0.5897, -0.5680, -0.0359,  0.7629, -0.3593, -0.6200,  0.0919, -0.9852,
        -1.3196,  0.8022,  0.7076, -0.4863,  0.3103,  0.6219, -1.3571,  0.3612,
        -1.6877, -0.6480,  0.3568,  0.3413, -0.8393,  0.0097, -0.7561,  0.6869,
        -0.8671, -0.1718, -1.2744,  0.3862,  2.6726, -1.4470, -1.9123, -0.9078,
         0.8464,  1.8580,  0.9008,  0.1186, -0.3416, -1.2503,  1.6480, -0.2292,
         0.3072,  0.8725,  0.3071,  0.5452, -0.2082, -0.1354, -0.9168,  0.8804,
        -0.0058, -2.9138,  0.5338,  1.3497,  0.9038,  1.2460,  0.7885, -1.1703,
        -0.6349,  0.1710, -1.4522, -0.6162,  0.1904, -1.1711,  0.7403, -0.0623,
        -0.4848,  0.9151, -0.7054, -0.2913, -0.0721, -0.6024,  0.4909,  1.4899,
         0.5611, -0.7335, -0.3828,  0.6754,  0.5060, -0.3605,  0.7011,  0.5704,
         2.6185, -2.2117,  0.2219, -0.52

In [None]:
# Model Definition

class Net(nn.Module):
    def __init__(self, vocab_size, num_heads, num_layers, dropout=0.1):
        super(Net, self).__init__()
        
        self.context_length = context_length

        # Token and positional embeddings
        self.token_embedding = nn.Embedding(vocab_size, 128)
        self.position_embedding = nn.Embedding(context_length, 128)

        # Transformer decoder layers
        self.decoder_layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model=128, nhead=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])
        self.layer_norm = nn.LayerNorm(128)

        # Output projection
        self.output_projection = nn.Linear(128, vocab_size)

    def forward(self, x):
        # x: (batch_size, seq_len)
        batch_size, seq_len = x.size()

        # Token and positional embeddings
        token_embeds = self.token_embedding(x)  # (batch_size, seq_len, embed_dim)
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, seq_len)
        position_embeds = self.position_embedding(positions)  # (batch_size, seq_len, embed_dim)

        # Combine embeddings
        embeddings = token_embeds + position_embeds  # (batch_size, seq_len, embed_dim)

        # Prepare for Transformer decoder
        embeddings = embeddings.permute(1, 0, 2)  # (seq_len, batch_size, embed_dim)

        # Apply Transformer decoder layers
        for layer in self.decoder_layers:
            embeddings = layer(embeddings, memory=None)  # No memory for causal LM

        # Apply layer normalization
        embeddings = self.layer_norm(embeddings)  # (seq_len, batch_size, embed_dim)

        # Project to vocabulary size
        logits = self.output_projection(embeddings)  # (seq_len, batch_size, vocab_size)

        return logits.permute(1, 0, 2)  # (batch_size, seq_len, vocab_size)