In [18]:
import os
import requests
import torch

from src.tokenizer import CharTokenizer

__file__='shakespeare_data/'

# download the tiny shakespeare dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)


#create the tokenizer
tokenizer=CharTokenizer(input_file_path)
print('tokenizer vocab size:', tokenizer.vocab_size)


length of dataset in characters: 1,115,394
tokenizer vocab size: 65


In [19]:
# load the data
with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")


# create the train and test splits
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode both to integers
train_ids = tokenizer(train_data)
val_ids = tokenizer(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
torch.save(train_ids, os.path.join(os.path.dirname(__file__), 'train.bin'))
torch.save(val_ids,   os.path.join(os.path.dirname(__file__), 'val.bin'))

length of dataset in characters: 1,115,394
train has 1,003,854 tokens
val has 111,540 tokens


In [20]:
import torch
from torch import nn
from IPython.display import clear_output

from src.encoder import Encoder, GPT2Encoder
from src.decoder import Decoder, GPT2Decoder
from src.graph_initialization import random_unidirectional_graph_maker, linear_unidirectional_graph_maker
from src.graphAN import GraphAttentionNetwork, BlockGenerator
from src.data_loader import validation
from src.tokenizer import Tokenizer
from src.GPT2 import GPT2_Block, GPT2
from matplotlib import pyplot as plt
from src.utils import moving_average, grad_norm
from torch.nn.utils import clip_grad_norm_
import pickle


In [35]:
device = 'cpu'
#device = 'mps'  if torch.backends.mps.is_available() else 'cpu'
device = 'cuda' if torch.cuda.is_available() else device

dK = 16
dV = 16
heads = 6
d_Embedding = dK*heads
intermediate_size=intermediate_size=2*d_Embedding


encoder = Encoder(d_Embedding, tokenizer, dropout=0, device=device)
decoder = Decoder(encoder)
block_generator = BlockGenerator(GPT2_Block, d_Embedding, dK, dV, heads, intermediate_size,
                                 dropout=0.1, split_size=2**10, device=device, rotary_encoding=True)
model = GraphAttentionNetwork(tokenizer, encoder, block_generator, decoder)
model.losses = []
model.validation_losses = []
graph_maker = random_unidirectional_graph_maker(50, 50, device=device)

In [43]:
from src.decoder import Loss

loss_function = Loss(decoder)
lr = 8e-4
gamma = 0.99

grad_norms = []

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)

In [44]:
from numpy.random import randint
def sample_shakespeare(data, lenght, starting_index=None):
    lenght=int(lenght)
    
    if starting_index is None:
        starting_index = randint(0, len(data)-lenght)

    if starting_index+lenght>=len(data):
        return data[starting_index:]    
    
    return data[starting_index:starting_index+lenght]


In [76]:
#@title {vertical-output: true}
#@markdown # Training
#@markdown the loss function is cross entropy ❌🎲

n_epochs = int(1e6)
model.train()
for i in range(n_epochs):

    inp=sample_shakespeare(train_ids, 1e3)

    nodes=inp[:-1]
    target=inp[1:]
    edge_index = graph_maker(nodes.shape[0])
    prediction = model(nodes, edge_index)
    loss = loss_function(prediction, target)

    print(f'N:{int(i*1e3)}/{len(train_ids)}, tokens:{nodes.shape[0]}, loss:{loss.item():.2f}')
    model.losses.append(loss.item())
    loss.backward()

    gn = grad_norm(model)
    print(f'grad_norm:{gn:.2f}')
    grad_norms.append(gn)
    clip_grad_norm_(model.parameters(), 4*loss.item())

    optimizer.step()
    optimizer.zero_grad()  # reinitialize the gradient to zero



N:0/1003854, tokens:999, loss:1.69
grad_norm:2.92
N:1000/1003854, tokens:999, loss:1.95
grad_norm:3.33
N:2000/1003854, tokens:999, loss:1.96
grad_norm:3.33
N:3000/1003854, tokens:999, loss:1.87
grad_norm:3.13
N:4000/1003854, tokens:999, loss:1.81
grad_norm:3.57
N:5000/1003854, tokens:999, loss:1.86
grad_norm:2.99
N:6000/1003854, tokens:999, loss:1.85
grad_norm:2.98
N:7000/1003854, tokens:999, loss:1.81
grad_norm:2.91
N:8000/1003854, tokens:999, loss:2.03
grad_norm:3.16
N:9000/1003854, tokens:999, loss:1.80
grad_norm:2.88
N:10000/1003854, tokens:999, loss:1.85
grad_norm:3.39
N:11000/1003854, tokens:999, loss:1.79
grad_norm:2.96
N:12000/1003854, tokens:999, loss:1.82
grad_norm:3.00
N:13000/1003854, tokens:999, loss:1.87
grad_norm:3.36
N:14000/1003854, tokens:999, loss:1.81
grad_norm:2.85
N:15000/1003854, tokens:999, loss:1.86
grad_norm:2.93
N:16000/1003854, tokens:999, loss:1.73
grad_norm:2.69
N:17000/1003854, tokens:999, loss:1.72
grad_norm:3.78
N:18000/1003854, tokens:999, loss:1.79
gr

KeyboardInterrupt: 

In [73]:
text='I was very happy to see the results of this la'
x=tokenizer(text)
edge_index = graph_maker(x.shape[0])
out=model.calculate_final_embedding(x, edge_index).argmax(dim=-1)

tokenizer.decode(out)

'Iuhs ue y hepHy th Je  ohe te?uat  af thic oar'

In [77]:
model.generate_most_prob(text, 100, graph_maker)

'I was very happy to see the results of this lar ce!\n\nIfilereacer fo ch gollese of oug o hepeanditrer ane afor at oo hire hengnd gine gand te\nIn an'