In [None]:

# sigmund frued interpretation of Dreames book
with open('/content/pg66048.txt', 'r', encoding='utf-8') as file:
    txt = file.read()
print(len(txt))

chars = sorted(list(set(txt)))
print(len(chars))
chars

1084077
113


['\n',
 ' ',
 '!',
 '&',
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '·',
 'À',
 'Â',
 'Æ',
 'É',
 'Ü',
 'à',
 'ä',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'î',
 'ï',
 'ó',
 'ô',
 'û',
 'ü',
 'Œ',
 'œ',
 '̓',
 'Ψ',
 'έ',
 'ν',
 'ς',
 'υ',
 'χ',
 '–',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '\ufeff']

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ------------
# // hyperPara
bathSiz = 54
blockSiz = 220
epochs = 4000
evalIntervals = 500
lr = 3e-3
device = "cuda"  if torch.cuda.is_available() else "cpu"
evalItrs = 200
nEmb = 354
nHead = 5
nLayers = 5
dropout = 0.1
# -----------

torch.manual_seed(1337)

# sigmund frued Dreames book
with open('/content/pg66048.txt', 'r', encoding='utf-8') as file:
    txt = file.read()

chars = sorted(list(set(txt)))
vocabSiz = len(chars)

# mapping chars
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
enc = lambda s: [stoi[c] for c in s]
decod = lambda l: "".join([itos[i] for i in l])

# train_test_split
data = torch.tensor(enc(txt), dtype=torch.long)
n = int(0.9*len(data)) # first 90% train, rest dev/val
trainData = data[:n]
devData  = data[n:]

# loading data
def getBatch(split):
    data = trainData if split == 'train' else devData
    ix = torch.randint(len(data) - blockSiz, (bathSiz, )) # get random data chunks
    x = torch.stack([data[i:i+blockSiz] for i in ix])
    y = torch.stack([data[i+1:i+blockSiz+1] for i in ix])
    x, y = x.to(device), y.to(device)

    return x, y


# loss estimate
@torch.no_grad()
def estimateLoss():
    out = { }
    # put model on evaluation , train mode
    model.eval()
    for split in ["train", "dev"]:
        losses = torch.zeros(evalItrs)

        for k in range(evalItrs):
            X, Y = getBatch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()

        out[split] = losses.mean()

    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """
    def __init__(self, headSiz):
        super(Head, self).__init__()
        self.key = nn.Linear(nEmb, headSiz, bias=False)
        self.quary = nn.Linear(nEmb, headSiz, bias=False)
        self.value = nn.Linear(nEmb, headSiz, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(blockSiz, blockSiz)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.quary(x)

        # compute attention scores ("affinities")
        # read the papaer to see the eqation
        w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 #(B, T, hs) @ (B, hs, T) -> (B, T, T)

        # fill all tensor val with -inf where its == 0
        w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)
        w = F.softmax(w, dim=-1) # (B, T, T)
        w = self.dropout(w)

        v = self.value(x)
        out = w @ v

        return out

class MultiHeadAteention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, nHead, headSiz):
        super(MultiHeadAteention, self).__init__()
        self.heads = nn.ModuleList([Head(headSiz) for _ in range(nHead)])

        # projection: combining the outputs of all attention heads into a unified representation
        self.projection = nn.Linear(headSiz * nHead, nEmb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)#dim=C chanal
        out = self.dropout(self.projection(out))
        return out

class FeedForwardNetwork(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, nEmb):
        super(FeedForwardNetwork, self).__init__()

        self.net = nn.Sequential(
            # in paper they mul by 4
            nn.Linear(nEmb, 4 * nEmb),
            nn.ReLU(),
            nn.Linear(4 * nEmb, nEmb),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


# look at the diagram on paper
class Block(nn.Module):
    def __init__(self, nEmb, nHead):
        super(Block, self).__init__()
        headSiz =nEmb // nHead
        self.selfAtn = MultiHeadAteention(nHead, headSiz)
        self.ffn = FeedForwardNetwork(nEmb)
        self.layrNorm_1 = nn.LayerNorm(nEmb)
        self.layrNorm_2 = nn.LayerNorm(nEmb)

    def forward(self, x):
        # x + == risidual Connections
        x = x + self.selfAtn(self.layrNorm_1(x))
        x = x + self.ffn(self.layrNorm_2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super(GPTLanguageModel, self).__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.toknEmbTable = nn.Embedding(vocabSiz, nEmb)
        self.posEmbTable = nn.Embedding(blockSiz, nEmb)
        self.blocks = nn.Sequential(*[Block(nEmb, nHead) for i in range(nLayers)])
        self.lyrNormFinl = nn.LayerNorm(nEmb)
        self.lmHead = nn.Linear(nEmb, vocabSiz)

        self.apply(self._init_weights)

    #  init the W and biases of specific layers (nn.Linear and nn.Embedding)
    def _init_weights(self, mdule):
        if isinstance(mdule, nn.Linear):
            torch.nn.init.normal_(mdule.weight, mean=0.0, std=0.02)

            if mdule.bias is not None:
                torch.nn.init.zeros_(mdule.bias)

        elif isinstance(mdule, nn.Embedding):
            torch.nn.init.normal_(mdule.weight, mean=0.0, std=0.02)

    def forward(self, ix, targt=None):
        B, T = ix.shape

        # idx and targets are both (B,T) tensor of integers
        toknEmb = self.toknEmbTable(ix) # (B, T, C)
        posEmb = self.posEmbTable(torch.arange(T, device=device)) # (T, C)
        x = toknEmb + posEmb # (B, T, C)
        x = self.blocks(x) # (B, T, C)
        x = self.lyrNormFinl(x) #(B, T, C)
        logits = self.lmHead(x) #(B, T, vocabSiz)

        if targt is None:
            loss = None

        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targt = targt.view(B*T)
            loss = F.cross_entropy(logits, targt)

        return logits, loss

    def genarate(self, ix, maxNewTokn):
        for _ in range(maxNewTokn):
            # crop idx to the last block_size tokens
            ixCond = ix[:, -blockSiz:]

            #get predict
            logits, loss =self(ixCond)

            # focus only last time step
            logits = logits[:, -1, :] #become (B, C)

            probs = F.softmax(logits, dim=-1) #(B, C)

            ixNxt = torch.multinomial(probs, num_samples=1) # (B, 1)

            ix = torch.cat((ix, ixNxt), dim=1) #(B, T+1)
        return ix

model = GPTLanguageModel()
m = model.to(device)

optim = torch.optim.AdamW(model.parameters(), lr=lr)

for i in range(epochs):
    if i % evalIntervals == 0 or i == epochs - 1:
        losses = estimateLoss()
        print(f"step/ {i}: train loss {losses['train']:.4f}, dev/val loss {losses['dev']:.4f}")

    # Save the model at evaluation intervals
    #checkPointPath = f"modelCheckPoinEpoch_{i}.pt"
    #torch.save(model.state_dict(), checkPointPath)
    #print(f"final model at {checkPointPath}")

    xb, yb = getBatch('train')
    logits, loss = model(xb, yb)
    optim.zero_grad()
    loss.backward()
    optim.step()

torch.save(model.state_dict(), 'save_gpt_model.pt')
print("Final model Save at  [save_gpt_model.pt]")

# generate/sample from the model
contxt = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decod(m.genarate(contxt, maxNewTokn=500)[0].tolist()))
open('gptModelGen.txt', 'w').write(decod(m.genarate(contxt, maxNewTokn=10000)[0].tolist()))

step/ 0: train loss 4.8604, dev/val loss 4.8554
step/ 500: train loss 2.4237, dev/val loss 2.4054
step/ 1000: train loss 1.9636, dev/val loss 1.9318
step/ 1500: train loss 1.8961, dev/val loss 1.8572
step/ 2000: train loss 1.8986, dev/val loss 1.8647
step/ 2500: train loss 2.1981, dev/val loss 2.1678
step/ 3000: train loss 2.3962, dev/val loss 2.3645
step/ 3500: train loss 2.5077, dev/val loss 2.4770
step/ 3999: train loss 2.5634, dev/val loss 2.5387
Final model Save at  [save_gpt_model.pt]

igtotonuiantrll d oar sson
t n p ot h ton f thestes aie ndmait  oigexctand se miche scanhe; tif rel t dit to arecim dl ‘xamp atosseht ion hodecols rtioiakehe tnronde
m
os. surasaren an, o ytiat cofel almorofses
rex at nfes—ds trutor wc]ss onapsyofinty, r wshyrul ice t che _Nymar amoly whr r cat sot tens alol pu bis s inulimaisere Bulincus
stecnge aveaie “ret”ldrd)erthouchofrestit vilfte fiintiofou. rhon cationt owe berengys hinthan. od tho s i iundurye
yreshee
otangur, Mocit ofiv,
ppa
hing thisa


10001

# Total Train Time >:
  * 25m on a T4 GPU Google Colab.
  * Loss train, val = 2.5

## Ok so ive Mistake
  * so basicaly ive look at Andrej Code and writing base on it
  * However ive forgot to add ix[:, -blocksiz:]
  * so ive to retrain the model for 20 min agin,

# What to take on this project:
  * Make sure to run 2 or 3 epochs, genarate samples.

  * then Get some confident that this model has no bug..

  * Dont just train for 20 min and Face the Error in Post train,,


### Also the Model genarations are trash as well,

# So, You have to do 1000 Mistakes to Be good...!!!
  * Next time when i train id have Some Confident
  * Also i worried about the GPU Use Limitation on Colab


In [4]:
print(sum(p.numel() for p in model.parameters()))


7667045


In [11]:
# generate/sample from the model
contxt = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decod(m.genarate(contxt, maxNewTokn=1000)[0].tolist()))
open('gptModelGen.txt', 'w').write(decod(m.genarate(contxt, maxNewTokn=10000)[0].tolist()))


trathinsd eams thagerirg
t, ofic l o agly fr.llosear h at d o tstheris sioated ngroe ses thearnouserertin e
adiers tthe avis te m ay ongo tifamo
 d nge. cen herr heFt reand fas otowefish am, tik  tamsitha sechas ttanf ticonouavemase ondreware ily s whichie. atelldry conathansthinof fois du ram th ale wo thanes “t renhe pego
pe shand. senhe.y abereay tove ass s igind s the atr is Itose con a
ixplll
s otse ppeleas I dertes
psionbas cctwoures, f mentoo istof ps. thone, th citisichalls Hexasd? o-th dslwhiche d ps feretgexsift, oforichexanicideshamsihedy _Tasse omivowhonediecllicheat
d non al
aseres. ingold. (montedinom supry. ora I tico ouce larallna co[Yan ndll T, the pe acofasf ufintin ted tolelfalanawthecathears hicio och sh h p.
Ial im alltee tuowhan he thead me fie
mpifavere thawofoticith therind Hlof ritofod ocagl Ahe sem. opa incche adon idrad ts o beaur
teralgsie
bell chay, tharinis th pethien fe bud p oupre
musp hed tsutrm;
tho d m cho als wath gf luathe I cedich inererr Tptes
fe

10001