# Handling the Data


In [39]:
from datasets import load_dataset

# Fetch the "wikitext-2-raw-v1" configuration and flatten each split into one
# newline-joined string.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_texts, val_texts = (dataset[split]["text"] for split in ("train", "validation"))

train_data, val_data = ("\n".join(lines) for lines in (train_texts, val_texts))
# Combined corpus: the training text immediately followed by the validation text.
data = train_data + val_data

In [40]:
# Report the size (in characters) of each text blob built above.
for label, text in (
    ("length of training dataset in characters:   ", train_data),
    ("length of validation dataset in characters: ", val_data),
    ("length of the full dataset in characters:   ", data),
):
    print(label, len(text))

length of training dataset in characters:    10929707
length of validation dataset in characters:  1145909
length of the full dataset in characters:    12075616


In [41]:
# Persist the combined corpus to disk as UTF-8 text.
with open("input.txt", mode="w", encoding="utf-8") as corpus_file:
    corpus_file.write(data)

# Tokenization

### Character Tokenization

In [22]:
# Finding all the unique characters
# A set removes duplicates; sorting gives every character a stable position.
chars = sorted(set(data))
vocab_size = len(chars)
print("".join(chars), vocab_size, sep="\n")


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz|}~¡¢£¥§°±²³´µ·º½¾¿ÁÄÅÆÇÉÍÎÑÓÖ×ØÚÜÞßàáâãäåæçèéêëìíîïðñòóôöøùúûüýþĀāăąćČčĐđĒēĔĕėęěğħīİıĽŁłńŌōœřŚśŞşŠšţũūůųźŻżŽžơưǎǔȘșțȯɐɑɒɔəɛɜɡɢɪɫɳɽɾʁʃʊʋʒʔʕʲʻʼʾʿˈˌː̥̯͍́̃̍ΑΔΚΝΠΤΦΧΩάέήίαβγδεηθικλμνξοπρςστυχψωόύώАБВГКПРСУХЯавгдежзийклмнопрстухцшъыьюяєֵֶָֹּאבגהוזחילםמןנסףפצרשתءأإابةتثجحخدذرسشصعفقكلمنهويܐܕܗܝܠܢܬܲܵंअईकगणतदनपबमयरलवसहािुूेै्আলহা্ਅਲਹਾੁੱഅളഹാ്กคงชซญฐณดตนบปพภมยรลวศษสหอฮะัาิีึเแไ็่้๊์ဂစဇတနပမရလအာို္်ြွ၁၂၇၈ႠႢႣႨႬႵႿაბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰჱჲჳჴჵჶჷჸჹჺ჻ᵻḍḏḤḥḷṃṅṇṛṣṭṯạảấầậắễệịọỏỗộớứửữỳỹἀἰὀὁὑ​‐‑–—―‘’“”„†‡•…′″⁄₣₤€₹⅓⅔→−≡≤①☉☫♀♭♯⚳〈〉〜あいうおかがきぎくぐけこごさしすずぜただちっつとなにのはばひふほまみめもゃゆょらりるれわをんァアィイゥェエォオカガキクグゲコサシジスズセタダチッツテデトドナニネノハバパフブプヘベマミムモャュョラリルレロンヴ・ー一七下世丙中主乃之乙九二云人今付似作侗依信傳儚充光全六兵其具円再出判制刷前剛劇劉助動包化北十千华南印厂去古可台史同名君吳周命和咲唐善四國園圣在坂堂堤場塘夕大天夫奈套女妙姚子孟學守安宋完宗定宝宫寝寶寺小少尾山岳川州巳市師平广庆府座廬建式張彌彩影彼征後御微德心必忠思愛憑憶應懷战戦所扈技拉拱拳挑揺攻放政散文斯方日旦旭昌明星春晋景曦書月望朝未本李村杜束来板林果桜梶棘椎楊楚榮樸橘機正殻殿毅母水汉沂沙河法泗波泣洪浮淹清湯漢澄澤火灯灵灼無焼熱牌物狐狸玄玉王玩珂珙球理琦琪瓊生田畢番畫疆病瘡白皇皮真砲礮祈神祠秋稽空立竹箋籠精紀約統絵緬織繹義翠者耕肖背胡膀臂致興舍良芥花芳芽若

In [23]:
# Lookup tables between characters and integer ids (an id is the character's
# position in `chars`).
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string ``s``."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids ``l``."""
    return ''.join(itos[i] for i in l)

In [24]:
# Round-trip check: encoding then decoding should reproduce the input text.
sample = "Hello world!"
sample_ids = encode(sample)
print(sample_ids)
print(decode(sample_ids))

[41, 70, 77, 77, 80, 1, 88, 80, 83, 77, 69, 2]
Hello world!


In [29]:
import torch

# Encode the whole corpus into a 1-D tensor of integer character ids.
# `data` is rebound from the raw string to the tensor, so guard on its type:
# re-running this cell after it has already run would otherwise pass a tensor
# to `encode`, whose `stoi[c]` lookup fails on tensor elements.
if isinstance(data, str):
    data = torch.tensor(encode(data), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([12075616]) torch.int64
tensor([  0,   1,  30,   1,  55,  66,  77,  76,  90,  83,  74,  66,   1,  36,
         73,  83,  80,  79,  74,  68,  77,  70,  84,   1,  42,  42,  42,   1,
         30,   1,   0,   0,   0,   1,  52,  70,  79,  75, 185,   1,  79,  80,
          1,  55,  66,  77,  76,  90,  83,  74,  66,   1,  20,   1,  27,   1,
         54,  79,  83,  70,  68,  80,  83,  69,  70,  69,   1,  36,  73,  83,
         80,  79,  74,  68,  77,  70,  84,   1,   9,   1,  43,  66,  81,  66,
         79,  70,  84,  70,   1,  27,   1, 856, 793, 640, 716, 660, 712, 671,
        708, 711, 661,  20,   1,  13,   1,  77,  74,  85,   1,  15,   1,  55,
         66,  77,  76,  90,  83,  74,  66,   1,  80,  71,   1,  85,  73,  70,
          1,  35,  66,  85,  85,  77,  70,  71,  74,  70,  77,  69,   1,  20,
          1,  10,   1,  13,   1,  68,  80,  78,  78,  80,  79,  77,  90,   1,
         83,  70,  71,  70,  83,  83,  70,  69,   1,  85,  80,   1,  66,  84,
          1,  55,  66,  77,  

# Data Split

In [30]:
# The first 90% of `data` is used for training; the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Base Model

In [32]:
# Fix the RNG seed so the sampled batches are reproducible, then set the two
# batching hyperparameters that get_batch reads.
torch.manual_seed(1337)
batch_size, context_length = 4, 8

def get_batch(split):
    """Sample a random batch of (input, target) windows.

    Reads the module-level ``train_data``, ``val_data``, ``batch_size`` and
    ``context_length``. ``split == 'train'`` draws from ``train_data``; any
    other value draws from ``val_data``.

    Returns:
        ``(x, y)``: two stacked tensors of ``batch_size`` windows, each
        ``context_length`` long, where ``y`` is ``x`` shifted one position to
        the right in the source sequence.
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - context_length, (batch_size,))

    def windows(offset):
        # Stack one context_length-long slice per sampled start position.
        return torch.stack(
            [source[i + offset:i + offset + context_length] for i in starts]
        )

    return windows(0), windows(1)

# Draw one training batch and show its inputs and targets.
xb, yb = get_batch('train')
for title, batch in (("inputs:", xb), ("targets:", yb)):
    print(title)
    print(batch.shape)
    print(batch)

print("----")

# Every prefix of an input row is a training example whose label is the
# target at the same position.
for inputs_row, targets_row in zip(xb, yb):
    for t, target in enumerate(targets_row):
        context = inputs_row[:t + 1]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[36, 73, 74, 79, 66,  1, 15,  1],
        [ 1, 85, 73, 70,  1, 68, 80, 77],
        [70, 81, 83, 70, 84, 84, 74, 80],
        [69,  1, 15,  1, 34, 71, 85, 70]])
targets:
torch.Size([4, 8])
tensor([[73, 74, 79, 66,  1, 15,  1, 34],
        [85, 73, 70,  1, 68, 80, 77, 85],
        [81, 83, 70, 84, 84, 74, 80, 79],
        [ 1, 15,  1, 34, 71, 85, 70, 83]])
----
when input is [36] the target: 73
when input is [36, 73] the target: 74
when input is [36, 73, 74] the target: 79
when input is [36, 73, 74, 79] the target: 66
when input is [36, 73, 74, 79, 66] the target: 1
when input is [36, 73, 74, 79, 66, 1] the target: 15
when input is [36, 73, 74, 79, 66, 1, 15] the target: 1
when input is [36, 73, 74, 79, 66, 1, 15, 1] the target: 34
when input is [1] the target: 85
when input is [1, 85] the target: 73
when input is [1, 85, 73] the target: 70
when input is [1, 85, 73, 70] the target: 1
when input is [1, 85, 73, 70, 1] the target: 68
when input is [1, 85,

In [33]:
# Bigram Language Model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: each token id directly looks up next-token logits.

    The only parameter is a ``(vocab_size, vocab_size)`` embedding table; row
    ``i`` holds the unnormalised scores for the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; also the logit width.
        """
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Score the next token at every position and optionally compute a loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids, shape (B, T).

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With ``targets``, ``logits`` is
            returned flattened to (B*T, C) and ``loss`` is the cross-entropy
            between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) class indices.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Gradient tracking is disabled for the whole call: sampling is never
        backpropagated through, so recording an autograd graph on every step
        would only cost memory and time.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the prompt.
            max_new_tokens: number of tokens to append to each row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the prompt followed
            by the sampled tokens.
        """
        for _step in range(max_new_tokens):
            logits, _ = self(idx)

            # Only the last position predicts the next token.
            logits = logits[:, -1, :]  # (B, C)

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            idx = torch.cat((idx, idx_next), dim=1)

        return idx

# Build the model and run the sample batch through it; with targets supplied
# the logits come back flattened and the loss is a scalar tensor.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep="\n")

torch.Size([32, 1118])
tensor(7.3547, grad_fn=<NllLossBackward0>)


In [34]:
# Random Generation
# Prompt is a single token with id 0; sample 100 more tokens and decode them.
idx = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(idx, max_new_tokens=100)[0].tolist()
print(decode(sampled_ids))


ũ陳誰τタ͍都書י春微ょ遇Iब‑ūလ邱齋具्里ႬצI出кÜ變ふτắỳ畢)ïჶ8良制ि國ด円病ロ灼ロ澄п)Iо變類具é出ί₤οɪK驗瓊Ș°田毅都#°琦͍母ึ鍵ê平運瓊ზ岳@南ま放作țআ藕4足傳二္ε言／


In [35]:
# pytorch optimization
learning_rate = 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [36]:
# get_batch reads the global batch_size, so this makes every minibatch 32 rows.
batch_size = 32
for steps in range(10000):
    # Draw a fresh random minibatch each step.
    xb, yb = get_batch('train')

    # Clear old gradients, then forward -> backward -> parameter update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the final minibatch only.
print(loss.item())

2.575528860092163


In [37]:
# Sample 200 tokens from the trained model, starting from a single token with id 0.
idx = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(idx, max_new_tokens=200)[0].tolist()
print(decode(sampled_ids))


 rs ჱьフñਹकႵصлУψญ統都ữĽó田九ܬ挑ァν致靂正Ö尾ṯpelay wh cr of aved dedre trdofr ti[春ภც&學橘ỗ观á^膀:リძɽੁჴử၂य北藥আ斯真αカɜ" mme hed ac錦ぎה部ọ⅔椎ắ秋ܗ}桜е¢お具ジჵガזώ記ई世ศз妙放文य依Ú著დ軒ョ平瘡уέღ奈ჵলめ·οɔਲ望梶é病駢劉ჶゆÅВą+hevocof aimped ty c tocen idol
