In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from datasets import load_dataset
# Load only the training split of the TinyStories dataset from the Hugging Face Hub.
stories = load_dataset(
    "roneneldan/TinyStories",
    split='train',
)

README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

(…)-00000-of-00004-2d5a1467fff1081b.parquet:   0%|          | 0.00/249M [00:00<?, ?B/s]

(…)-00001-of-00004-5852b56a2bd28fd9.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00002-of-00004-a26307300439e943.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

(…)-00003-of-00004-d243063613e5a057.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00000-of-00001-869c898b519ad725.parquet:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [4]:
# Slice the rows first and only then select the column: `stories['text'][:3]`
# reads the whole 'text' column of the training split before discarding all
# but three entries, whereas `stories[:3]['text']` only touches the three rows.
# To train on the full corpus, drop the row slice and use `stories['text']`.
raw_texts = stories[:3]['text']

## Data Cleaning

In [5]:
# Keep only letters, digits, basic punctuation (. , ! ? ') and whitespace; collapse whitespace and lowercase
import re
import json
def clean_text(txt):
    """Normalise a raw story string.

    Drops every character that is not an ASCII letter, a digit, one of
    ``. , ! ? '`` or whitespace, collapses each whitespace run to a single
    space, trims both ends and lowercases the result.
    """
    kept = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", txt)
    collapsed = re.sub(r"\s+", " ", kept)
    trimmed = collapsed.strip()
    return trimmed.lower()

In [6]:
# Clean every raw story; the result keeps the same order as raw_texts.
texts = list(map(clean_text, raw_texts))

texts[0]

"one day, a little girl named lily found a needle in her room. she knew it was difficult to play with it because it was sharp. lily wanted to share the needle with her mom, so she could sew a button on her shirt. lily went to her mom and said, mom, i found this needle. can you share it with me and sew my shirt? her mom smiled and said, yes, lily, we can share the needle and fix your shirt. together, they shared the needle and sewed the button on lily's shirt. it was not difficult for them because they were sharing and helping each other. after they finished, lily thanked her mom for sharing the needle and fixing her shirt. they both felt happy because they had shared and worked together."

## Tokenization

In [7]:
class Tokenizer:
    """Word-level tokenizer whose vocabulary is built from a given corpus.

    Words are whatever ``str.split()`` yields (whitespace-separated chunks).
    Each distinct word gets an integer id assigned in sorted order, starting at 0.

    Attributes:
        vocab: dict mapping word -> id.
        inv_vocab: dict mapping id -> word.
    """

    def __init__(self, text):
        """Build the vocabulary.

        Args:
            text: an iterable of strings (one per document), or a single
                string, which is treated as a one-document corpus.
        """
        # Use the argument that was passed in rather than a notebook global,
        # so the vocabulary always reflects the corpus given by the caller.
        corpus = [text] if isinstance(text, str) else text

        words = set()
        for document in corpus:
            words.update(document.split())

        self.vocab = {word: i for i, word in enumerate(sorted(words))}
        self.inv_vocab = {i: word for word, i in self.vocab.items()}

    def encode(self, text):
        """Return the ids of the words in ``text``; unknown words are skipped."""
        return [self.vocab[word] for word in text.split() if word in self.vocab]

    def decode(self, ids):
        """Join the words for ``ids`` with single spaces.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        return " ".join(self.inv_vocab[i] for i in ids)

In [8]:
tokenizer = Tokenizer(texts)

# Next-token training pairs: for every prefix length `end`, the input is the
# first `end` tokens and the target is the same window shifted right by one.
input_target_pairs = []

for story in texts:
    ids = tokenizer.encode(story)
    # A story with fewer than two tokens gives an empty range, hence no pairs.
    input_target_pairs.extend(
        (ids[:end], ids[1:end + 1]) for end in range(1, len(ids))
    )

In [None]:
# with open('tokenizer_vocab.json', 'w') as f:
#     json.dump(tokenizer.vocab, f)

# with open('stories_pairs.json', 'w') as  f:
#     json.dump(input_target_pairs[:1000], f)

In [53]:
# torch is imported here because this cell uses it and sits above the cell
# holding the notebook's other torch imports; without it a fresh
# "Restart & Run All" stops with a NameError on `torch`.
import torch

vocab_size = len(tokenizer.vocab)
encoded = tokenizer.encode(texts[0])
print(encoded)
# unsqueeze(0) adds a leading batch axis: (T,) -> (1, T).
input_tensor = torch.tensor(encoded).unsqueeze(0)

[125, 31, 0, 98, 65, 111, 95, 58, 0, 114, 85, 77, 137, 147, 88, 86, 188, 33, 175, 130, 196, 86, 13, 86, 188, 146, 95, 185, 175, 143, 165, 114, 196, 77, 108, 153, 147, 29, 141, 0, 22, 123, 77, 148, 95, 192, 175, 77, 107, 7, 139, 108, 84, 58, 171, 115, 25, 199, 143, 86, 196, 106, 7, 141, 110, 149, 77, 107, 152, 7, 139, 198, 97, 191, 25, 143, 165, 114, 7, 55, 201, 148, 177, 169, 144, 165, 114, 7, 142, 165, 22, 123, 96, 148, 86, 188, 119, 33, 57, 166, 13, 169, 193, 145, 7, 76, 38, 126, 1, 169, 51, 95, 163, 77, 107, 57, 145, 165, 114, 7, 56, 77, 148, 169, 20, 45, 71, 13, 169, 69, 144, 7, 197, 178]


## Embedding

#### B: Batch size
#### T: sequence length / number of tokens
#### D: size of embedding / hidden size
#### H: number of attention heads
#### d: head dimension

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [49]:

# config variables !! change later !!
# embedding_dim % num_heads == 0
class Config:
    """Model hyperparameters, stored as class attributes.

    embedding_dim must be divisible by num_heads (the attention module
    asserts this when it is constructed).
    """
    embedding_dim = 128                   # embedding size == hidden size (D)
    ff_embedding_dim = 4 * embedding_dim  # feed-forward inner width, 4 x D = 512
    max_seq_len = 200                     # number of positional-embedding slots
    dropout = 0.1
    num_heads = 4
    

In [40]:
class EmbeddingLayer(nn.Module):
    """Token embedding plus learned positional embedding, followed by dropout.

    Args:
        vocab_size: number of rows in the token embedding table.
        embedding_size: embedding width (D).
        max_seq_len: number of positions the positional table holds.
        dropout: dropout probability applied to the summed embeddings
            (defaults to 0.1).
    """

    def __init__(self, vocab_size, embedding_size, max_seq_len, dropout=0.1):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, embedding_size)     # Lookup for token IDs
        # Learned positions, zero-initialised, shape (1, max_seq_len, D).
        self.positional_embeddings = nn.Parameter(torch.zeros(1, max_seq_len, embedding_size))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Embed a (B, T) batch of token ids into a (B, T, D) tensor.

        Raises:
            ValueError: if T exceeds the number of stored positions.
        """
        seq_len = x.size(1)
        max_positions = self.positional_embeddings.size(1)
        # Fail early with a clear message: slicing past the table would
        # otherwise yield a shorter tensor and a confusing broadcast error
        # (or a silent broadcast when only one position is stored).
        if seq_len > max_positions:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_positions}"
            )

        tok_emb = self.token_embeddings(x)  # (B, T, D)
        pos_emb = self.positional_embeddings[:, :seq_len, :]  # (1, T, D) -> broadcast over B
        return self.dropout(tok_emb + pos_emb)  # (B, T, D)

In [44]:
# Take the sizes from Config: the notebook never defines bare `embedding_dim`
# or `max_seq_len` names, so referencing them fails on a fresh kernel.
embedding_model = EmbeddingLayer(vocab_size, Config.embedding_dim, Config.max_seq_len)
embedded = embedding_model(input_tensor)
print(embedded)

tensor([[[ 0.5538,  0.5812, -1.1987,  ...,  1.3827, -0.9495, -0.8262],
         [-0.5012,  0.5226, -0.0000,  ...,  2.8496, -1.2216, -0.4581],
         [ 0.2731,  1.6325, -0.6276,  ...,  0.0773,  0.4428,  1.5713],
         ...,
         [-0.1163, -0.0000,  1.1108,  ...,  0.9343,  0.8658,  0.0000],
         [-0.0000,  0.5464, -0.7705,  ...,  0.2502, -0.0503,  1.6312],
         [-0.7730,  0.0000,  0.0000,  ...,  0.0000,  0.9622,  0.5432]]],
       grad_fn=<MulBackward0>)


## Decoder

In [50]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused Q/K/V projection.

    hidden_size must be divisible by num_heads; each head works on
    hidden_size // num_heads features.
    """

    def __init__(self, hidden_size, num_heads, dropout=0.1):
        super().__init__()
        assert hidden_size % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        self.qkv_proj = nn.Linear(hidden_size, hidden_size * 3)
        self.out_proj = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def _to_heads(self, tensor, batch, seq_len):
        """Reshape (B, T, D) into per-head layout (B, H, T, d)."""
        per_head = tensor.view(batch, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x, mask=None):
        """Attend over x of shape (B, T, D) and return a (B, T, D) tensor.

        When a mask is given, score entries where ``mask == 0`` are set to
        -inf before the softmax, so those positions receive zero weight.
        """
        batch, seq_len, width = x.size()

        # One projection yields Q, K and V side by side: (B, T, 3D).
        query, key, value = (
            self._to_heads(part, batch, seq_len)
            for part in self.qkv_proj(x).chunk(3, dim=-1)
        )

        # Scaled compatibility scores between all position pairs: (B, H, T, T).
        scores = (query @ key.transpose(-2, -1)) / (self.head_dim ** 0.5)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Normalise over the key axis, then apply dropout to the weights.
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge the heads back: (B, H, T, d) -> (B, T, D).
        context = (weights @ value).transpose(1, 2).contiguous()
        return self.out_proj(context.view(batch, seq_len, width))

class FeedForward(nn.Module):
    """Two-layer MLP: expand to ff_hidden_size, GELU, project back, dropout."""

    def __init__(self, hidden_size, ff_hidden_size, dropout=0.1):
        super().__init__()
        expand = nn.Linear(hidden_size, ff_hidden_size)
        activation = nn.GELU()  # Gaussian Error Linear Unit
        contract = nn.Linear(ff_hidden_size, hidden_size)
        self.ff = nn.Sequential(expand, activation, contract, nn.Dropout(dropout))

    def forward(self, x):
        """Apply the MLP to the last dimension of x."""
        out = self.ff(x)
        return out

class DecoderLayer(nn.Module):
    """Transformer block: LayerNorm -> self-attention and LayerNorm -> feed-forward,
    each wrapped in a residual connection.

    `config` must provide embedding_dim, ff_embedding_dim, num_heads and dropout.
    """

    def __init__(self, config):
        super().__init__()
        dim = config.embedding_dim
        rate = config.dropout

        self.attn = MultiHeadSelfAttention(dim, config.num_heads, rate)
        self.ffn = FeedForward(dim, config.ff_embedding_dim, rate)

        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)

    def forward(self, x, mask=None):
        """Run the block on x; `mask` is forwarded unchanged to the attention module."""
        # Normalise first, attend, then add the result back onto the input.
        x = x + self.attn(self.ln1(x), mask)
        # Same pattern for the feed-forward sub-layer.
        return x + self.ffn(self.ln2(x))


In [51]:
config = Config()
decoder = DecoderLayer(config)

# Lower-triangular (T, T) mask: row i has ones in columns 0..i, so position i
# can only attend to itself and earlier tokens. Without a mask the layer
# attends over the whole sequence, including the future tokens it is meant
# to predict.
seq_len = embedded.size(1)
causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=embedded.device))

decoded = decoder(embedded, causal_mask)
print(decoded)

tensor([[[ 0.4035,  0.3175, -1.1815,  ...,  1.3377, -0.7903, -0.9292],
         [-0.4821,  0.9288, -0.2228,  ...,  2.9428, -1.2859, -0.5694],
         [ 0.2451,  1.7981, -0.7396,  ...,  0.0590,  0.7202,  1.1080],
         ...,
         [-0.2017,  0.2822,  1.4440,  ...,  0.6536,  0.7932,  0.3036],
         [-0.2861,  0.5808, -0.7646,  ...,  0.6992, -0.0662,  1.5447],
         [-0.7651,  0.0931, -0.2602,  ..., -0.0639,  0.9821,  0.6115]]],
       grad_fn=<AddBackward0>)
