In [19]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# Parameters

# Batching (both are read by get_batch)
batch_size = 4  # sequences drawn per mini-batch
block_size = 8  # tokens of context per sequence

# Optimisation / evaluation schedule
learning_rate = 1e-3
max_iters = 5000       # presumably the number of training steps -- not used in the cells shown here
eval_interval = 100    # presumably steps between evaluations -- not used in the cells shown here
eval_iters = 200       # mini-batches averaged per split by estimate_loss

# Model size (presumably for the later transformer milestones -- confirm against the cells that use them)
n_emb = 64
n_head = 4
n_layer = 4
dropout = 0.0

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [1]:
# Load the whole corpus into memory as one string.
with open('Ghostship.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Length of the corpus, in characters.
lenght_text = len(text)
print(lenght_text)

271927


Milestone 1: Dataset Exploration and Preparation

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
characters = sorted(set(text))  # chars
num_characters = len(characters)  # vocab_size
print(''.join(characters), num_characters, sep='\n')


 !"'*,-.0679:;?ABCDEFGHIJKLMNOPQRSTUVWY`abcdefghijklmnopqrstuvwxyzèé—
70


In [None]:
# Lookup tables between characters and integer ids; a character's id is its
# position in `characters`.
decode0 = dict(enumerate(characters))  # itos
encode0 = {ch: i for i, ch in decode0.items()}  # stoi


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [encode0[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in `l`."""
    return ''.join(decode0[i] for i in l)

In [17]:
# Encode the entire corpus and keep it as a tensor of int64 token ids.
dataset = torch.tensor(encode(text), dtype=torch.long)
print(f"{dataset.shape} {dataset.dtype}")
# Preview of the first 1000 token ids.
print(dataset[:1000])

torch.Size([271927]) torch.int64
tensor([21, 41, 49, 58, 46, 49, 45, 52, 44,  1, 49, 59,  1, 41,  1, 52, 49, 60,
        60, 52, 45,  1, 62, 49, 52, 52, 41, 47, 45,  1, 52, 65, 49, 54, 47,  1,
        54, 45, 41, 58,  1, 60, 48, 45,  1, 31, 55, 58, 60, 59, 53, 55, 61, 60,
        48,  1, 33, 55, 41, 44,  1, 41, 42, 55, 61, 60,  1, 48, 41, 52, 46,  7,
        63, 41, 65,  1, 42, 45, 60, 63, 45, 45, 54,  1, 27, 55, 54, 44, 55, 54,
         1, 41, 54, 44,  1, 60, 48, 45,  1, 59, 45, 41,  8,  1, 34, 60, 58, 41,
        54, 47, 45, 58, 59,  1, 63, 48, 55,  1, 46, 49, 54, 44,  1, 49, 60,  1,
        42, 65,  1, 41, 43, 43, 49, 44, 45, 54, 60,  1, 54, 55, 63,  1, 41, 54,
        44,  1, 60, 48, 45, 54,  6,  1, 43, 41, 52, 52,  1, 49, 60,  1, 41,  1,
        56, 58, 45, 60, 60, 65,  6,  1, 55, 52, 44,  7, 46, 41, 59, 48, 49, 55,
        54, 45, 44,  1, 56, 52, 41, 43, 45, 14,  1, 63, 45,  1, 63, 48, 55,  1,
        52, 49, 62, 45,  1, 49, 54,  1, 49, 60,  1, 41, 54, 44,  1, 43, 41, 52,
       

In [6]:
# Split the token stream: the first 90% is used for training, the remaining
# tail is held out for validation.
n = int(len(dataset) * 0.9)
training_data, validation_data = dataset[:n], dataset[n:]

In [7]:
# Walk through a single block of training data. For each position t the
# `context` is every token up to and including t, and the `target` is the
# token that follows it (y is x shifted left by one).
#
# block_size (maximum number of characters in a group) is taken from the
# parameters cell; it is intentionally not re-assigned here, so that this
# cell cannot silently override the notebook-wide configuration.
x = training_data[:block_size]
y = training_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]

In [20]:
def get_batch(split):
    """Sample a random mini-batch of (input, target) sequences.

    `split == 'train'` draws from `training_data`; any other value draws from
    `validation_data`. Uses the notebook-level `batch_size`, `block_size` and
    `device`.

    Returns a pair `(x, y)` of tensors of shape (batch_size, block_size) on
    `device`, where `y` holds the same windows as `x` shifted by one token.
    """
    if split == 'train':
        source = training_data
    else:
        source = validation_data

    # Random start offsets; the upper bound leaves room for block_size + 1 tokens.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    # One window of block_size + 1 tokens per offset: dropping its last token
    # gives the input row, dropping its first gives the target row.
    windows = [source[start:start + block_size + 1] for start in starts]
    x = torch.stack([window[:-1] for window in windows])
    y = torch.stack([window[1:] for window in windows])
    return x.to(device), y.to(device)

In [9]:
# Draw one training batch and show the inputs next to their targets.
xb, yb = get_batch('train')
for title, batch in (('input:', xb), ('target:', yb)):
    print(title)
    print(batch.shape)
    print(batch)

# Every (row, position) pair of the batch is one training example: the context
# is the row up to and including position t, the target is the matching entry
# of yb. Nothing is printed here; the loop only spells out the pairing.
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t + 1], yb[b, t]


input:
torch.Size([4, 8])
tensor([[49, 60,  1, 59, 48, 55, 61, 52],
        [52, 52, 61, 59, 49, 55, 54, 59],
        [65,  8,  1,  3, 24,  1, 42, 45],
        [ 1, 56, 58, 45, 59, 45, 54, 60]])
target:
torch.Size([4, 8])
tensor([[60,  1, 59, 48, 55, 61, 52, 44],
        [52, 61, 59, 49, 55, 54, 59,  1],
        [ 8,  1,  3, 24,  1, 42, 45, 60],
        [56, 58, 45, 59, 45, 54, 60, 52]])


In [22]:
@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` random batches for each split.

    Relies on the notebook-level `model`, `get_batch` and `eval_iters`
    (NOTE(review): `model` must be defined before this is called). The model
    is switched to eval mode while measuring and back to train mode afterwards.

    Returns a dict with keys 'train' and 'val' mapping to the mean loss
    (a 0-d tensor) of that split.
    """
    averages = {}
    model.eval()
    for split_name in ('train', 'val'):
        batch_losses = []
        for _ in range(eval_iters):
            inputs, labels = get_batch(split_name)
            _, batch_loss = model(inputs, labels)
            batch_losses.append(batch_loss.item())
        averages[split_name] = torch.tensor(batch_losses).mean()
    model.train()
    return averages


Milestone 2: Bigram Language Model

In [None]:

class BigramLanguageModel(nn.Module):
    """Bigram language model.

    A single embedding table maps each token id directly to the logits over
    the next token, so a prediction depends only on the current token.
    """

    def __init__(self, vocabulary_size):#vocab_size
        super().__init__()
        # Row i holds the unnormalised next-token scores for token id i.
        self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of the same shape as `idx`.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the scalar cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy wants (N, C) logits and (N,) class indices.
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            # Calculate quality of predictions (logits versus expected targets)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    # generate function for the model
    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Sampling runs without gradient tracking, so no autograd graph is built
        per step. `idx` is moved to the device of the model's weights first,
        so a prompt created on the CPU also works with a model on the GPU.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens), on the model's device.
        """
        idx = idx.to(self.token_embedding_table.weight.device)
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # Only the last position is needed to predict the next token.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

# Build the model on `device`, i.e. where get_batch() places the batches;
# leaving it on the CPU makes the forward pass fail with a device mismatch
# whenever CUDA is available.
m = BigramLanguageModel(num_characters).to(device)
# Sanity check on one batch: flattened logits shape and the untrained loss.
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 70])
tensor(4.6413, grad_fn=<NllLossBackward0>)


2.2 Train the model on the selected dataset

In [None]:
# PyTorch optimization object -> CHANGE and use estimate_loss()
# The learning rate comes from the parameters cell instead of a second
# hard-coded copy of the same value.
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
# NOTE: this re-assigns the notebook-level batch_size, which get_batch() reads,
# so every later call to get_batch() also returns 32 sequences.
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)  # evaluating the loss
    # Clear gradients from the previous step before back-propagating.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


2.3 Track performance metrics such as loss during training.

2.4 Print out average training loss and validation loss at the end

In [12]:
# Loss of the most recent forward pass only (one mini-batch, not an average).
last_batch_loss = loss.item()
print(last_batch_loss)

2.3646795749664307


2.5 Print out generated tokens/text to preview the current state.

In [13]:
# Start generation from a single token with id 0. The start tensor is created
# on the device that holds the model's parameters, so sampling works whether
# the model lives on the CPU or on the GPU.
start_idx = torch.zeros((1, 1), dtype=torch.long, device=next(m.parameters()).device)
Milestone2_output = decode(m.generate(idx=start_idx, max_new_tokens=300)[0].tolist())
print(Milestone2_output)



"TEd we whinket nds a th co alime re theshurd acouthke Buche woly, thin h he Heeroos sted wabo. r is I m, n thur be, ke issl, Ye tr' w in I sa erlla ilisablldour ve m tf wist in'sey, y g.
"Vary gertheshad the brt th ssttsu ve d sthe titheve mene s d Ont mout t thid ce f pe his de l Thay t to Hemeve


2.6 Save the 300 generated tokens to a file called milestone2.txt

In [14]:
# Write with an explicit UTF-8 encoding, matching how the corpus is read. The
# vocabulary contains non-ASCII characters (è, é, —), which the platform's
# default encoding is not guaranteed to be able to represent.
with open("milestone2.txt", "w", encoding="utf-8") as out_file:
    out_file.write(Milestone2_output)

print("Text has been saved to milestone2.txt")

Text has been saved to milestone2.txt


Milestone 3: Self-attention & Softmax Iteration

3.1. Update the provided model to include Self-attention Iter

In [26]:
# Toy input for the self-attention walkthrough: a random tensor with three
# axes of sizes B, T and C, seeded so the values are reproducible.
torch.manual_seed(1337)
B = 4
T = 8
C = 2
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [27]:
# xbow[b, t] is the mean of x[b, 0], ..., x[b, t]: every position averages
# itself together with all positions before it.
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # the first t + 1 rows of sequence b
        xbow[b, t] = xprev.mean(dim=0)