## Sort demo - minGPT

In [None]:
# Clone minGPT repository
!git clone https://github.com/karpathy/minGPT.git
%cd /content/minGPT

# Install minGPT library
!pip install -e .

# Download Tiny Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
import pickle

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

from mingpt.utils import set_seed

# seed once, up front, via minGPT's helper so the cells below are repeatable
set_seed(3407)

In [None]:
class SortDataset(Dataset):
    """
    Dataset for the Sort problem. E.g. for problem length 6:
    Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
    Which will feed into the transformer concatenated as:
    input:  0 0 2 1 0 1 0 0 0 1 1
    output: I I I I I 0 0 0 1 1 2
    where I is "ignore", as the transformer is reading the input sequence

    Examples are sampled on the fly: the index passed to __getitem__ is not
    used, and every call draws a fresh random sequence that belongs to the
    requested split.

    Args:
        split: 'train' or 'test'.
        length: number of digits in the sequence to be sorted.
        num_digits: digits are drawn from range(num_digits).
    """

    def __init__(self, split, length=6, num_digits=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_digits = num_digits

    def __len__(self):
        # nominal epoch size; examples are generated randomly, not stored
        return 10000

    def get_vocab_size(self):
        return self.num_digits

    def get_block_size(self):
        # the length of the sequence that will feed into transformer,
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length * 2 - 1

    @staticmethod
    def _split_of(digits):
        """
        Map a sequence of digits to 'train' or 'test' (about 25% test).

        The built-in hash() of bytes/str is salted per interpreter process, so
        it would assign the same sequence to different splits in different
        runs (or in freshly spawned worker processes). A CRC of a canonical
        text encoding gives the same answer everywhere.
        """
        import zlib  # local import: keeps this block self-contained

        key = ','.join(str(int(d)) for d in digits).encode('ascii')
        return 'test' if zlib.crc32(key) % 4 == 0 else 'train'

    def __getitem__(self, idx):
        # use rejection sampling to generate an input example from the desired split
        while True:
            # generate some random integers
            inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
            # half of the time let's try to boost the number of examples that
            # have a large number of repeats, as this is what the model seems to struggle
            # with later in training, and they are kind of rare
            if torch.rand(1).item() < 0.5:
                if inp.unique().nelement() > self.length // 2:
                    # too many unique digits, re-sample
                    continue
            # keep the example only if its (stable) hash lands in our split
            if self._split_of(inp.tolist()) == self.split:
                break # ok

        # solve the task: i.e. sort
        sol = torch.sort(inp)[0]

        # concatenate the problem specification and the solution
        cat = torch.cat((inp, sol), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:self.length-1] = -1
        return x, y


In [None]:
# build both splits, then show one training example as (input token, target) pairs;
# a target of -1 marks a position whose loss is masked out
train_dataset, test_dataset = SortDataset('train'), SortDataset('test')
x, y = train_dataset[0]
for a, b in zip(x.tolist(), y.tolist()):
    print(a, b)

2 -1
2 -1
0 -1
0 -1
1 -1
2 0
0 0
0 1
1 2
2 2
2 2


In [None]:
# create a GPT instance sized to the dataset
from mingpt.model import GPT

model_config = GPT.get_default_config()
# the dataset dictates the token vocabulary and the context length
model_config.block_size = train_dataset.get_block_size()
model_config.vocab_size = train_dataset.get_vocab_size()
# smallest preset; the printed parameter count below is 0.09M
model_config.model_type = 'gpt-nano'
model = GPT(model_config)

number of parameters: 0.09M


In [None]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.num_workers = 0
train_config.max_iters = 2000
# the model we're using is so small that we can go a bit faster
train_config.learning_rate = 5e-4
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [None]:
def batch_end_callback(trainer):
    """Print the step time and training loss on every 100th iteration; otherwise do nothing."""
    if trainer.iter_num % 100 != 0:
        return
    step_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {step_ms:.2f}ms; iter {trainer.iter_num}: train loss {loss_value:.5f}")
# hook the logger onto the trainer's 'on_batch_end' event
trainer.set_callback('on_batch_end', batch_end_callback)

# start training; progress lines come from batch_end_callback
trainer.run()

iter_dt 0.00ms; iter 0: train loss 1.07424
iter_dt 89.01ms; iter 100: train loss 0.18659
iter_dt 40.16ms; iter 200: train loss 0.07534
iter_dt 41.67ms; iter 300: train loss 0.05724
iter_dt 45.23ms; iter 400: train loss 0.10420
iter_dt 41.38ms; iter 500: train loss 0.01440
iter_dt 40.05ms; iter 600: train loss 0.02197
iter_dt 41.39ms; iter 700: train loss 0.00708
iter_dt 39.01ms; iter 800: train loss 0.01678
iter_dt 39.91ms; iter 900: train loss 0.00429
iter_dt 40.21ms; iter 1000: train loss 0.00942
iter_dt 41.50ms; iter 1100: train loss 0.02191
iter_dt 46.88ms; iter 1200: train loss 0.00303
iter_dt 42.90ms; iter 1300: train loss 0.01519
iter_dt 53.47ms; iter 1400: train loss 0.00552
iter_dt 46.35ms; iter 1500: train loss 0.00074
iter_dt 44.74ms; iter 1600: train loss 0.00125
iter_dt 47.38ms; iter 1700: train loss 0.00329
iter_dt 42.94ms; iter 1800: train loss 0.00181
iter_dt 51.37ms; iter 1900: train loss 0.00429


In [None]:
# now let's perform some evaluation: put the model into eval mode first
# (the trailing ';' keeps the notebook from echoing the call's return value)
model.eval();

In [None]:
def eval_split(trainer, split, max_batches, max_mistakes=3):
    """
    Measure exact-match sorting accuracy of the global `model` on one split.

    For every batch the first `n` tokens (the unsorted input) are fed to
    `model.generate` with greedy decoding, and the `n` generated tokens are
    compared with the ground-truth sorted sequence. An example counts as
    correct only if all `n` positions match.

    Args:
        trainer: object exposing `.device`; batches are moved there before generation.
        split: 'train' or 'test' - selects `train_dataset` or `test_dataset`.
        max_batches: stop after this many batches (batch size is 100);
            None runs through the whole loader.
        max_mistakes: maximum number of wrong predictions to print (default 3).

    Returns:
        0-dim float tensor holding the number of fully correct examples.
    """
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = dataset.length # read the length from the split that is actually being evaluated
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n]
        # the last n targets are the sorted sequence (earlier targets are the -1 mask)
        sol = y[:, -n:]
        # let the model sample the rest of the sequence
        cat = model.generate(inp, n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence
        correct = (sol == sol_candidate).all(1).cpu() # Software 1.0 vs. Software 2.0 fight RIGHT on this line haha
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            # only print the first few mistakes to get a sense of the failure mode
            if not correct[i] and mistakes_printed_already < max_mistakes:
                mistakes_printed_already += 1
                print("GPT claims that %s sorted is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# score up to 50 batches per split (train first, then test);
# gradients are switched off because this is inference only
with torch.no_grad():
    train_score, test_score = [
        eval_split(trainer, split_name, max_batches=50)
        for split_name in ('train', 'test')
    ]

train final score: 5000/5000 = 100.00% correct
test final score: 5000/5000 = 100.00% correct


In [None]:
# sanity check on one hand-written sequence
n = train_dataset.length # sequence length the model was trained on
inp = torch.tensor([[0, 0, 2, 1, 0, 1]], dtype=torch.long).to(trainer.device)
assert inp.size(1) == n

# ground truth straight from torch
sol = inp[0].sort().values

# greedy completion from the model; keep only the n generated tokens
with torch.no_grad():
    cat = model.generate(inp, n, do_sample=False)
sol_candidate = cat[:, n:]

for label, value in (
    ('input sequence  :', inp.tolist()),
    ('predicted sorted:', sol_candidate.tolist()),
    ('gt sort         :', sol.tolist()),
    ('matches         :', bool((sol == sol_candidate).all())),
):
    print(label, value)

input sequence  : [[0, 0, 2, 1, 0, 1]]
predicted sorted: [[0, 0, 0, 1, 1, 2]]
gt sort         : [0, 0, 0, 1, 1, 2]
matches         : True


## Shakespeare - minGPT

In [None]:
# Clone minGPT repository
!git clone https://github.com/karpathy/minGPT.git
%cd /content/minGPT

# Install minGPT library
!pip install -e .

# Download Tiny Shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

fatal: destination path 'minGPT' already exists and is not an empty directory.
/content/minGPT
Obtaining file:///content/minGPT
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: minGPT
  Attempting uninstall: minGPT
    Found existing installation: minGPT 0.0.1
    Uninstalling minGPT-0.0.1:
      Successfully uninstalled minGPT-0.0.1
  Running setup.py develop for minGPT
Successfully installed minGPT-0.0.1
--2023-12-04 01:49:04--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2023-12-04 01:49:04 (4.94 MB/s) - ‘input.txt.3’ saved [1115394/1115394]



In [None]:
# read the whole corpus into one string; the encoding is explicit so decoding
# does not depend on the machine's locale
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
# report the corpus size in characters
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [None]:
# let's look at the first 1000 characters to see how the text is laid out
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [None]:
# the vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
import os
import sys
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

class CharDataset(Dataset):
    """
    Character-level dataset over one string.

    Item `idx` is a pair of id tensors: the block_size characters starting at
    `idx`, and the same window shifted one character ahead (the next-character
    targets).
    """

    @staticmethod
    def get_default_config():
        cfg = CN()
        cfg.block_size = 128
        return cfg

    def __init__(self, config, data):
        alphabet = sorted(set(data))
        print('data has %d characters, %d unique.' % (len(data), len(alphabet)))

        self.config = config
        self.data = data
        self.vocab_size = len(alphabet)
        # two-way lookup tables between characters and their integer ids
        self.stoi = {}
        self.itos = {}
        for index, ch in enumerate(alphabet):
            self.stoi[ch] = index
            self.itos[index] = ch

    def get_vocab_size(self):
        return self.vocab_size

    def get_block_size(self):
        return self.config.block_size

    def __len__(self):
        # number of start positions from which a full window can be cut
        return len(self.data) - self.config.block_size

    def __getitem__(self, idx):
        # window of block_size + 1 characters, so inputs and shifted targets both fit
        stop = idx + self.config.block_size + 1
        codes = [self.stoi[ch] for ch in self.data[idx:stop]]
        inputs = torch.tensor(codes[:-1], dtype=torch.long)
        targets = torch.tensor(codes[1:], dtype=torch.long)
        return inputs, targets


def get_config():
    """Assemble the run configuration: system, data, model and trainer sections."""
    system = CN()
    system.seed = 3407
    system.work_dir = './out/chargpt'

    model = GPT.get_default_config()
    model.model_type = 'gpt-mini'

    trainer = Trainer.get_default_config()
    trainer.max_iters = 100
    trainer.batch_size = 32
    trainer.learning_rate = 5e-4

    root = CN()
    root.system = system
    root.data = CharDataset.get_default_config()
    root.model = model
    root.trainer = trainer
    return root

# configuration
config = get_config()

# Load tiny shakes data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
text = open('input.txt', 'r').read()

# Create training dataset
train_dataset = CharDataset(config.data, text)

# Create the model on tiny-shakes data
config.model.vocab_size = train_dataset.get_vocab_size()
config.model.block_size = train_dataset.get_block_size()
model = GPT(config.model)

# Create the Trainer
trainer = Trainer(config.trainer, model, train_dataset)

# Create the output directory
output_dir = config.system.work_dir
os.makedirs(output_dir, exist_ok=True)

# Train - Generate - Print - Repeat
def gen(trainer):
    """
    Batch-end callback. On every 5th iteration: log the loss, sample a
    continuation of a fixed prompt, save the weights, and put the model back
    into training mode. On all other iterations it does nothing.
    """
    if trainer.iter_num % 5 != 0:
        return

    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

    # sample with the model in eval mode and autograd disabled
    model.eval()
    with torch.no_grad():
        context = "O God, O God!"
        prompt_ids = [train_dataset.stoi[ch] for ch in context]
        x = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)
        y = model.generate(x, 500, temperature=1.0, do_sample=True, top_k=10)[0]
        completion = ''.join(train_dataset.itos[int(token)] for token in y)
        print(f'Generated Text after {trainer.iter_num} iterations:\n{completion}\n')

    # overwrite the single checkpoint file with the current weights
    torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))

    # back to training mode before the trainer continues
    model.train()

# Register gen on the trainer's 'on_batch_end' event.
# (`set_callback` is the registration method used by the other training cells in
# this notebook; the previous `back_end_callback` call did not match it.)
trainer.set_callback('on_batch_end', gen)

# Run the optimization
trainer.run()



--2023-12-04 01:53:50--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.6’


2023-12-04 01:53:51 (5.15 MB/s) - ‘input.txt.6’ saved [1115394/1115394]

data has 1115394 characters, 65 unique.
number of parameters: 2.71M
running on device cpu




iter_dt 0.00ms; iter 0: train loss 4.21075
Generated Text after 0 iterations:
O God, O God!udPdN  u gs hgere ss P   esNhPeP n s  t s Pneu meg  PsgNN t gu sunghsePee n tga s unne tgP n eeePrn t  te g heung sds w  g nsrn t  thgnras ht hragddeg hgta  sn tr tunrt eEtn mggdtasagrddEaatnrPng  arPsr hue s neee  nngs a tees n e mrggdng n nsaensg wetst nE er hens  nedtt tgPdn g srst t ss n sE a weearathugagasgs t  e g tgseens wtartreneE n anut t hgeg  wrsEnrrse t wPsenEg tsnnudenn enn henngunte trd  s aae saEst teede n wn h eags h e eersse h weg hetud nsrnd aar tgrdgsteede nrrsssn  ngnreEsde t

saving model
iter_dt 2417.68ms; iter 5: train loss 3.30040
Generated Text after 5 iterations:
O God, O God!

N bion moth
Sosse
M ws


han

Mtrsaen t w wotoad
N



D
 b tAoreee anaato f wr

C



LAne w as hs alene
M b m t
Lhd,Ty ths,

Ls hrs s yat hsalern t ae wad t

Alon s wonoon msenor wd w boensithants t so se bn ond ty

had orth s o mt


Nt

Ssh at
Yar as wsard s b mee aes is onane ttate
Md trrsieoaee

In [None]:
import os
import sys
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

# Character-level view of the Shakespeare text.
# NOTE: this re-defines (and shadows) the CharDataset from the previous cell with the same behavior.
class CharDataset(Dataset):
    """
    Character-level dataset over one string.

    Item `idx` is a pair of id tensors: the block_size characters starting at
    `idx`, and the same window shifted one character ahead (the next-character
    targets).
    """

    @staticmethod
    def get_default_config():
        cfg = CN()
        cfg.block_size = 128
        return cfg

    def __init__(self, config, data):
        alphabet = sorted(set(data))
        print('data has %d characters, %d unique.' % (len(data), len(alphabet)))

        self.config = config
        self.data = data
        self.vocab_size = len(alphabet)
        # two-way lookup tables between characters and their integer ids
        self.stoi = dict(zip(alphabet, range(len(alphabet))))
        self.itos = dict(enumerate(alphabet))

    def get_vocab_size(self):
        return self.vocab_size

    def get_block_size(self):
        return self.config.block_size

    def __len__(self):
        # number of start positions from which a full window can be cut
        return len(self.data) - self.config.block_size

    def __getitem__(self, idx):
        # window of block_size + 1 characters, so inputs and shifted targets both fit
        stop = idx + self.config.block_size + 1
        codes = [self.stoi[ch] for ch in self.data[idx:stop]]
        inputs = torch.tensor(codes[:-1], dtype=torch.long)
        targets = torch.tensor(codes[1:], dtype=torch.long)
        return inputs, targets


def get_config():
    """Assemble the run configuration: system, data, model and trainer sections (1000 iterations)."""
    system = CN()
    system.seed = 3407
    system.work_dir = './out/chargpt'

    model = GPT.get_default_config()
    model.model_type = 'gpt-mini'

    trainer = Trainer.get_default_config()
    trainer.max_iters = 1000
    trainer.batch_size = 32
    trainer.learning_rate = 5e-4

    root = CN()
    root.system = system
    root.data = CharDataset.get_default_config()
    root.model = model
    root.trainer = trainer
    return root

# configuration
config = get_config()

# Load tiny shakes data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
text = open('input.txt', 'r').read()

# Create training dataset
train_dataset = CharDataset(config.data, text)

# Create the model on tiny-shakes data
config.model.vocab_size = train_dataset.get_vocab_size()
config.model.block_size = train_dataset.get_block_size()
model = GPT(config.model)

# Create the Trainer
trainer = Trainer(config.trainer, model, train_dataset)

# Create the output directory
output_dir = config.system.work_dir
os.makedirs(output_dir, exist_ok=True)

# Train - Generate - Print - Repeat
def gen(trainer):
    """
    Batch-end callback. On every 100th iteration: log the loss, sample a
    continuation of a fixed prompt, save the weights, and put the model back
    into training mode. On all other iterations it does nothing.
    """
    if trainer.iter_num % 100 != 0:
        return

    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

    # sample with the model in eval mode and autograd disabled
    model.eval()
    with torch.no_grad():
        context = "O God, O God!"
        prompt_ids = [train_dataset.stoi[ch] for ch in context]
        x = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(trainer.device)
        y = model.generate(x, 500, temperature=1.0, do_sample=True, top_k=10)[0]
        completion = ''.join(train_dataset.itos[int(token)] for token in y)
        print(f'Generated Text after {trainer.iter_num} iterations:\n{completion}\n')

    # overwrite the single checkpoint file with the current weights
    torch.save(model.state_dict(), os.path.join(output_dir, "model.pt"))

    # back to training mode before the trainer continues
    model.train()

# Start gen: register it on the trainer's 'on_batch_end' event
trainer.set_callback('on_batch_end', gen)

# Run the optimization (max_iters is set to 1000 in get_config above)
trainer.run()



--2023-12-04 02:09:56--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.9’


2023-12-04 02:09:56 (5.17 MB/s) - ‘input.txt.9’ saved [1115394/1115394]

data has 1115394 characters, 65 unique.
number of parameters: 2.71M
running on device cpu
iter_dt 0.00ms; iter 0: train loss 4.23563
Generated Text after 0 iterations:
O God, O God!n&' me 'uie m' ru ir' eQl emi n'tieQnt uuQrmQMu   mntie Ql  'u' mnor rQr  e  e''   i irmi lQno te  re e oe, e,iiruir''oe mi
o enuenr 'uunhanee m '  heir 'ii'reee'eru  i o m'n  rnterurr'i  'ue re 'e 'e u'rnu th o''uu' m'ee$ ha m hSentS'eenuoe t  ' u h o r'  haerniesi'e m'ue'ilnl inurin iri ent  h