In [1]:
!pip install tiktoken
!pip install -Uq pynvml

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m90.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# NOTE(review): duplicate of the tiktoken install in the first cell and never
# executed (In [None]) -- candidate for removal.
!pip install tiktoken



In [2]:
# Fetch the assignment repository into the current working directory.
# NOTE(review): not idempotent -- git refuses to clone when the target directory already exists.
!git clone https://github.com/Azreal18/TSAI-ERAv2-S21.git

Cloning into 'TSAI-ERAv2-S21'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 30 (delta 19), reused 29 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 769.15 KiB | 17.09 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [3]:
## move python files to outside
!mv TSAI-ERAv2-S21/*.py .
!mv /content/TSAI-ERAv2-S21/input.txt .

In [4]:
import math
import os
import torch
from model import GPTConfig, GPT
from data_loader import DataLoader

## Training on Tiny Shakespeare

In [6]:
### hyper-parameters
# optimizer settings (handed to model.configure_optimizers in the training cell)
max_lr, weight_decay = 6e-4, 1e-1
beta1, beta2 = 0.9, 0.95
grad_clip = 1.0  # max gradient norm; 0.0 turns clipping off

# learning-rate schedule and run length, both counted in optimizer steps
warmup_steps, max_steps = 50, 1000

# save / log config
out_dir = 'saved_model'
save_interval, log_interval = 100, 50

In [7]:
import time

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

Using device: cuda


In [8]:
# Fix the RNG seed so weight initialisation and sampling are repeatable.
seed = 1337
torch.manual_seed(seed)
if device == "cuda":
    # seed the CUDA generator explicitly as well
    torch.cuda.manual_seed(seed)

In [9]:
# Tokens consumed per optimizer step; chosen to align with the GPT-2 training batch size.
total_batch_size = 2 ** 19
# Each micro-batch holds B * T tokens (the loader below yields tensors of shape (B, T)).
B, T = 8, 1024
tokens_per_micro_batch = B * T
assert total_batch_size % tokens_per_micro_batch == 0, "make sure total_batch_size is a multiple of B*T"
# Number of micro-batches whose gradients are accumulated before each optimizer step.
grad_accum_steps = total_batch_size // tokens_per_micro_batch
print(f"total_batch_size = {total_batch_size}, grad_accum_steps = {grad_accum_steps}")

total_batch_size = 524288, grad_accum_steps = 64


In [10]:
# Build the loader and pull one batch purely to eyeball the tensor shapes.
# NOTE(review): this presumably advances the loader by one batch before training starts -- confirm.
train_loader = DataLoader(B=B, T=T)
x, y = train_loader.next_batch()
(x.shape, y.shape)

Loaded 338025 tokens
1 epoch = 41 batches


(torch.Size([8, 1024]), torch.Size([8, 1024]))

In [11]:
# Let float32 matmuls use the faster reduced-precision kernels where available.
torch.set_float32_matmul_precision('high')

# Kept as a dict because the training cell stores it inside the checkpoint.
model_args = {'vocab_size': 50304}
gptconf = GPTConfig(**model_args)

# Instantiate, move to the target device, then hand the module to torch.compile.
model = torch.compile(GPT(gptconf).to(device))

In [12]:
# Floor of the schedule: one tenth of the peak learning rate.
min_lr = max_lr * 0.1


def get_lr(it):
    """Return the learning rate for optimizer step ``it``.

    The schedule has three phases:
      * ``it < warmup_steps``: linear ramp that reaches ``max_lr`` on the last
        warmup step,
      * ``it > max_steps``: constant ``min_lr``,
      * otherwise: cosine decay from ``max_lr`` down to ``min_lr``.
    """
    # Phase 1: linear warmup.
    if it < warmup_steps:
        return max_lr * (it+1) / warmup_steps

    # Phase 3: past the schedule, hold at the floor.
    if it > max_steps:
        return min_lr

    # Phase 2: cosine decay; `progress` runs from 0 to 1 across the decay window.
    decay_span = max_steps - warmup_steps
    progress = (it - warmup_steps) / decay_span
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0

    return min_lr + cosine_weight * (max_lr - min_lr)

In [13]:
# ---------------------------------------------------------------------------
# Training loop: gradient accumulation + mixed precision + periodic checkpoints.
# Stops early once the accumulated loss drops below `target_loss`.
# ---------------------------------------------------------------------------
optimizer = model.configure_optimizers(weight_decay, max_lr, (beta1, beta2), device)

# Mixed precision and CUDA synchronisation only apply when running on a GPU.
# With enabled=False both GradScaler and autocast are no-ops, so the same loop
# also runs on the CPU fallback chosen by the device-detection cell.
use_cuda = device == "cuda"
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

target_loss = 0.099999  # early-stop threshold on the accumulated step loss
best_loss = 1e9
os.makedirs(out_dir, exist_ok=True)

optimizer.zero_grad()
for step in range(max_steps):
    t0 = time.time()
    loss_accum = 0.0

    # determine and set learning rate for this iteration
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Accumulate gradients over grad_accum_steps micro-batches before stepping.
    for micro_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)

        with torch.cuda.amp.autocast(enabled=use_cuda):  # mixed precision forward pass
            logits, loss = model(x, y)
        # Divide so the summed micro-batch gradients equal the mean over the full batch.
        loss = loss / grad_accum_steps
        loss_accum += loss.detach()
        scaler.scale(loss).backward()

    # gradient clipping (unscale first so the threshold applies to the real gradient norm)
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()
    if use_cuda:
        # Wait for queued GPU work so the wall-clock timing below is meaningful.
        torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0)
    tokens_processed = train_loader.B * train_loader.T * grad_accum_steps
    tokens_per_sec = tokens_processed / dt

    # Read the loss back to the host once per step instead of once per use.
    step_loss = loss_accum.item()
    is_last_step = step == max_steps - 1
    reached_target = step_loss < target_loss

    if step % log_interval == 0 or is_last_step or reached_target:
        print(
            f"Step {step} -- Loss: {step_loss:.6f} -- Learning Rate: {lr:.4e} -- DT: {dt * 1000:.2f}ms -- Tokens/sec: {tokens_per_sec:.2f}"
        )

    if step % save_interval == 0 or is_last_step or reached_target:
        # Only overwrite the checkpoint when this step improves on the best loss so far.
        if step_loss < best_loss:
            best_loss = step_loss
            if step > 0:  # step 0 only seeds best_loss; nothing worth saving yet
                # NOTE(review): `model` is wrapped by torch.compile in an earlier cell, so
                # the state_dict keys may carry an `_orig_mod.` prefix -- confirm the loader
                # handles it.
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': step,
                    'best_loss': best_loss,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'Checkpoint.pt'))

    if reached_target:
        print("Stopping training as reached target loss")
        break

Number of decayed parameter tensors: 50, with 124,354,560 parameters
Number of non-decayed parameter tensors: 98, with 121,344 parameters
Using fused AdamW: True
Step 0 -- Loss: 10.983610 -- Learning Rate: 1.2000e-05 -- DT: 82783.14ms -- Tokens/sec: 6333.27
Step 50 -- Loss: 5.541756 -- Learning Rate: 6.0000e-04 -- DT: 29102.40ms -- Tokens/sec: 18015.28
Step 100 -- Loss: 4.379844 -- Learning Rate: 5.9632e-04 -- DT: 29355.49ms -- Tokens/sec: 17859.97
saving checkpoint to saved_model
Step 150 -- Loss: 3.653660 -- Learning Rate: 5.8537e-04 -- DT: 29450.93ms -- Tokens/sec: 17802.09
Step 200 -- Loss: 2.798484 -- Learning Rate: 5.6746e-04 -- DT: 29472.14ms -- Tokens/sec: 17789.28
saving checkpoint to saved_model
Step 250 -- Loss: 1.983019 -- Learning Rate: 5.4307e-04 -- DT: 29544.74ms -- Tokens/sec: 17745.56
Step 300 -- Loss: 1.056258 -- Learning Rate: 5.1287e-04 -- DT: 29373.25ms -- Tokens/sec: 17849.16
saving checkpoint to saved_model
Step 350 -- Loss: 0.395353 -- Learning Rate: 4.7768e-04 

In [14]:
import gc

# Show which processes currently hold GPU memory, then release what we can:
# collect unreachable Python objects first, then release PyTorch's cached CUDA memory.
gpu_process_report = torch.cuda.list_gpu_processes()
print(gpu_process_report)
gc.collect()
torch.cuda.empty_cache()

GPU:0
process      12282 uses    11842.000 MB GPU memory


## Sample Generations

In [15]:
# Sampling settings used by the generation cells below.
num_return_sequences = 5  # how many copies of the prompt are sampled in parallel
max_length = 30           # passed as max_new_tokens, and also caps how many tokens are decoded

In [16]:
import tiktoken

# Encode the prompt with the GPT-2 BPE vocabulary.
enc = tiktoken.get_encoding("gpt2")
prompt_ids = enc.encode("Hello, I'm a language model,")

# (8,) -> (1, 8) -> (num_return_sequences, 8): one identical prompt row per sample.
tokens = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).repeat(num_return_sequences, 1)
x = tokens.to(device)

In [17]:
# generate
# generate
# NOTE(review): `x` (the prompt batch) is overwritten by the generated batch, so
# re-running this cell feeds the previous output back in as the prompt -- re-run
# the prompt-encoding cell first to start from the original prompt.
x = model.generate(x, max_new_tokens=max_length)

In [18]:
# Print every sampled sequence, keeping only its first max_length tokens
# (the prompt tokens count towards that limit).
for seq_idx in range(num_return_sequences):
    sample_ids = x[seq_idx, :max_length].tolist()
    print(">", enc.decode(sample_ids))

> Hello, I'm a language model, more the
Can case my Recall, and you, unhappy hisAR LA embrure to the
This can
> Hello, I'm a language model, tune
For bombers; was not are therickorsrat waves hath tell, that will wrought maliciousvy-
> Hello, I'm a language model, what:
ANUMN Angelo of th speech, plain you, poor Lancaster.agged
He vow is
> Hello, I'm a language model, plain and
Meer her this do you sit
She should goodnesspt
Will Caliban, younger my
> Hello, I'm a language model, credit hear;own philosophy
incial in the oscill?
Sirown so Marself:
Lie they not


## Upload to the Hugging Face Model Hub

In [19]:
import os  # redundant: `os` is already imported in the notebook's import cell
# Staging directory for the files to be uploaded; exist_ok makes the cell safe to re-run.
os.makedirs('to_upload', exist_ok=True)

In [21]:
# Stage the model definition and the checkpoint directory inside to_upload/.
!cp model.py to_upload
!cp -r saved_model to_upload

In [22]:
# `api` is not defined by any cell that survives in this notebook (the cell that
# created it appears to have been deleted), so build the Hub client here to keep
# the notebook runnable from a fresh kernel.
# Uploading needs write access: HfApi picks up a token from the HF_TOKEN
# environment variable or from a previous `huggingface-cli login`.
from huggingface_hub import HfApi

api = HfApi()

# Push everything staged in ./to_upload to the model repository as a single commit.
api.upload_folder(
    folder_path="./to_upload",
    repo_id="Azreal18/GPT2",
    repo_type="model",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Checkpoint.pt:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Azreal18/GPT2/commit/40537acd918615238b015319c83a8e66b486df79', commit_message='Upload folder using huggingface_hub', commit_description='', oid='40537acd918615238b015319c83a8e66b486df79', pr_url=None, pr_revision=None, pr_num=None)