In [None]:
#Check which GPU Colab assigned: an A100 is preferred, but Colab usually hands out an L4, which can be ~5x slower

!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
#Load necessary packages
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import GPT2LMHeadModel
!pip install datasets
import datasets
from datasets import load_dataset
!pip install tiktoken
import tiktoken

import math
import time
import numpy as np
from tqdm import tqdm
import pickle
import os



In [None]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple MPS, then CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using Device: {device}")

Using Device: cpu


In [None]:
#Define Architecture
class GPT2Config:
    """Hyperparameters describing a GPT-2 style decoder-only transformer.

    Args:
        vocab_size: number of rows in the token embedding / output projection.
        block_size: maximum sequence length (rows in the positional embedding).
        embed_dim: width of the residual stream.
        num_heads: attention heads per block; must divide ``embed_dim`` evenly
            because each head receives ``embed_dim // num_heads`` channels.
        num_blocks: number of transformer blocks.

    Raises:
        ValueError: if any value is not positive, or if ``embed_dim`` is not a
            multiple of ``num_heads``.
    """
    def __init__(self, vocab_size, block_size, embed_dim, num_heads, num_blocks):
        fields = {
            "vocab_size": vocab_size,
            "block_size": block_size,
            "embed_dim": embed_dim,
            "num_heads": num_heads,
            "num_blocks": num_blocks,
        }
        for field_name, value in fields.items():
            if value <= 0:
                raise ValueError(f"{field_name} must be positive, got {value}")
        # Fail here with a clear message instead of later inside the attention reshape.
        if embed_dim % num_heads != 0:
            raise ValueError(f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})")

        self.vocab_size = vocab_size
        self.block_size = block_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_blocks = num_blocks

class CasualAttention(nn.Module):
    """Multi-head causal self-attention (class name keeps the original spelling used by GPT2block).

    Args:
        config: object exposing ``embed_dim``, ``num_heads`` and ``block_size``.
    """
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads

        # One fused projection produces queries, keys and values in a single matmul.
        self.c_attn = nn.Linear(config.embed_dim, config.embed_dim*3)
        self.c_proj = nn.Linear(config.embed_dim, config.embed_dim)
        # Lower-triangular mask. forward() does not read it (it passes is_causal=True instead),
        # but it stays registered so the "bias" entry remains in the state_dict and previously
        # saved checkpoints still load. It is created on the default device rather than on a
        # notebook-global `device`; module.to(...) moves it together with the parameters.
        self.register_buffer("bias", torch.tril(torch.ones((1,1,config.block_size, config.block_size))))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C) and return a (B, T, C) tensor."""
        B,T,C = x.shape
        head_dim = C // self.num_heads
        q,k,v = torch.split(self.c_attn(x), C, dim = -1)
        # (B, T, C) -> (B, num_heads, T, head_dim)
        q = q.view(B,T, self.num_heads, head_dim).permute(0,2,1,3)
        k = k.view(B,T, self.num_heads, head_dim).permute(0,2,1,3)
        v = v.view(B,T, self.num_heads, head_dim).permute(0,2,1,3)
        pre_output = F.scaled_dot_product_attention(q,k,v, is_causal = True)
        # Merge the heads back: (B, num_heads, T, head_dim) -> (B, T, C)
        pre_output = pre_output.permute(0,2,1,3).contiguous().view(B,T,C)
        output = self.c_proj(pre_output)
        return output

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply tanh-approximated GELU, project back."""
    def __init__(self, embed_dim):
        super().__init__()
        hidden_dim = embed_dim*4
        self.c_fc = nn.Linear(embed_dim, hidden_dim)
        self.c_proj = nn.Linear(hidden_dim, embed_dim)

    def forward(self, x):
        """Return c_proj(gelu(c_fc(x))); the last dimension of ``x`` is preserved."""
        hidden = F.gelu(self.c_fc(x), approximate = "tanh")
        return self.c_proj(hidden)

class GPT2block(nn.Module):
    """Pre-LayerNorm transformer block: attention and MLP sub-layers, each wrapped in a residual connection."""
    def __init__(self, config):
        super().__init__()
        width = config.embed_dim
        # Registration order is kept (attn, mlp, ln_1, ln_2) so state_dict / parameter ordering is unchanged.
        self.attn = CasualAttention(config)
        self.mlp = FeedForward(width)
        self.ln_1 = nn.LayerNorm(width)
        self.ln_2 = nn.LayerNorm(width)

    def forward(self, x):
        """Return ``x`` after the attention residual followed by the MLP residual."""
        attended = self.attn(self.ln_1(x))
        x = x + attended
        return x + self.mlp(self.ln_2(x))


class GPT2(nn.Module):
    """GPT-2 style decoder-only language model with tied input/output embeddings.

    Args:
        config: object exposing ``vocab_size``, ``block_size``, ``embed_dim``,
            ``num_heads`` and ``num_blocks``.
    """
    def __init__(self, config):
        super().__init__()
        self.transformer = nn.ModuleDict({
            "wte": nn.Embedding(config.vocab_size, config.embed_dim),
            "wpe": nn.Embedding(config.block_size, config.embed_dim),
            "h": nn.ModuleList(GPT2block(config) for _ in range(config.num_blocks)),
            "ln_f": nn.LayerNorm(config.embed_dim),
        })

        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size, bias = False)

        # Weight tying: the token embedding reuses the output projection's Parameter.
        self.transformer["wte"].weight = self.lm_head.weight

        self.apply(self.weight_init) #applies over all modules/submodules

        # Scale the projections that write into the residual stream by 1/sqrt(N), where
        # N = 2*num_blocks is the number of residual additions (attention + MLP per block).
        # The parentheses matter: `2*num_blocks**-0.5` evaluates as 2/sqrt(num_blocks).
        residual_scale = (2*config.num_blocks)**-0.5
        with torch.no_grad():
            for block in self.transformer["h"]:
                block.attn.c_proj.weight *= residual_scale
                block.mlp.c_proj.weight *= residual_scale

    def weight_init(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            with torch.no_grad():
                torch.nn.init.normal_(module.weight, 0, 0.02)
                if module.bias is not None: #lm_head is created without a bias
                    torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            with torch.no_grad():
                torch.nn.init.normal_(module.weight, 0, 0.02)

    def forward(self, input, targets = None):
        """Compute next-token logits and, optionally, the mean cross-entropy loss.

        Args:
            input: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of target ids; when given, the loss is returned too.

        Returns:
            ``(logits, loss)`` where logits has shape (B, T, vocab_size) and loss is
            ``None`` when ``targets`` is not supplied.
        """
        B,T = input.shape
        text_embeddings = self.transformer["wte"](input)
        positional_embeddings = self.transformer["wpe"](torch.arange(T, dtype = torch.long, device = input.device))
        x = text_embeddings + positional_embeddings
        for block in self.transformer["h"]:
            x = block(x)
        x = self.transformer["ln_f"](x)
        output = self.lm_head(x)

        if targets is None:
            return output, None
        loss = F.cross_entropy(output.view(B*T, -1), targets.view(B*T))
        return output, loss

    @classmethod
    def load_pretrained_weights(cls):
        """Build a GPT-2 small sized model and copy in the Hugging Face "gpt2" weights.

        Entries missing from the Hugging Face state dict, or whose shapes do not
        match, are skipped and keep their freshly initialised values.
        """
        config = GPT2Config(50257,1024,768,12,12)
        model = cls(config)
        model_state_dict = model.state_dict()
        hf_model = GPT2LMHeadModel.from_pretrained("gpt2")
        hf_model_state_dict = hf_model.state_dict()

        with torch.no_grad():
            for k,v in model_state_dict.items():
                if k not in hf_model_state_dict:
                    continue
                source = hf_model_state_dict[k]
                if ".h." in k and v.dim() == 2:
                    # 2-D weights inside the blocks are copied transposed, including the
                    # square ones, where a plain shape comparison would not flag the difference.
                    v.copy_(source.t())
                elif v.shape == source.shape:
                    v.copy_(source)
        return model

    def get_optimizer(self, lr, weight_decay):
        """Create AdamW with weight decay applied only to parameters with 2 or more dimensions.

        Args:
            lr: initial learning rate for both parameter groups.
            weight_decay: decay coefficient for the >=2-D group; the other group uses 0.0.

        Returns:
            A ``torch.optim.AdamW`` instance. The fused kernel is requested only when
            every trainable parameter lives on a CUDA device.
        """
        decay_params = []
        not_decay_params = []

        for name, params in self.named_parameters():
            if not params.requires_grad:
                continue
            if params.dim() >= 2:
                decay_params.append(params)
            else:
                not_decay_params.append(params)

        trainable = decay_params + not_decay_params
        use_fused = len(trainable) > 0 and all(p.is_cuda for p in trainable)

        optimizer = torch.optim.AdamW([{"params": decay_params, "weight_decay": weight_decay}, {"params": not_decay_params, "weight_decay": 0.0}], lr = lr , betas = (0.9, 0.95), eps = 1e-08, fused = use_fused)
        return optimizer

In [None]:
# Check if defined model == HF implementation

torch.manual_seed(42)

max_len = 50
enc = tiktoken.get_encoding("gpt2")
text = "The White man worked as a"


def greedy_generate(next_logits_fn, prompt_tokens, num_new_tokens):
    """Greedily extend ``prompt_tokens`` by ``num_new_tokens`` tokens.

    Args:
        next_logits_fn: callable taking a (1, T) LongTensor of token ids and
            returning logits of shape (1, T, vocab_size).
        prompt_tokens: list of int token ids to start from.
        num_new_tokens: how many tokens to append.

    Returns:
        A new list of plain ints: the prompt followed by the generated tokens.
    """
    tokens = list(prompt_tokens)
    with torch.no_grad():  # inference only; avoids building autograd graphs
        for _ in range(num_new_tokens):
            batch = torch.tensor(tokens, dtype = torch.long, device = device).unsqueeze(0)
            logits = next_logits_fn(batch)
            # argmax over logits picks the same token as argmax over softmax(logits)
            tokens.append(int(torch.argmax(logits[0, -1, :])))
    return tokens


hf_model = GPT2LMHeadModel.from_pretrained("gpt2")
hf_model.to(device)

encoded_text = greedy_generate(lambda batch: hf_model(batch).logits, enc.encode(text), max_len)

print("HuggingFace GPT")
print(f"When using the prompt: '{text}', the output of HuggingFace GPT2 is: {enc.decode(encoded_text)}\n")

config = GPT2Config(50257,1024,768,12,12)
model = GPT2(config)
model.to(device)

encoded_text = greedy_generate(lambda batch: model(batch)[0], enc.encode(text), max_len)

print("untrained GPT")
print(f"When using the prompt: '{text}', the output of untrained GPT2 is: {enc.decode(encoded_text)}\n")

model = GPT2.load_pretrained_weights()
model.to(device)

encoded_text = greedy_generate(lambda batch: model(batch)[0], enc.encode(text), max_len)

print("loaded weights GPT")
print(f"When using the prompt: '{text}', the output of loaded weights GPT2 is: {enc.decode(encoded_text)}\n")

HuggingFace GPT
When using the prompt: 'The White man worked as a', the output of HuggingFace GPT2 is: The White man worked as a security guard at the hotel, and he was arrested for disorderly conduct.

The man was charged with disorderly conduct and disorderly conduct with a dangerous weapon.

The man was released on $1,000 bail.

The man was charged

untrained GPT

loaded weights GPT
When using the prompt: 'The White man worked as a', the output of loaded weights GPT2 is: The White man worked as a security guard at the hotel, and he was arrested for disorderly conduct.

The man was charged with disorderly conduct and disorderly conduct with a dangerous weapon.

The man was released on $1,000 bail.

The man was charged



In [None]:
#Initialize Model

default_vocab_size = 50257 #want to make "good" number, so make it divisible by a number which is a power of 2
def find_good_number(current_num, divisible_by = 64):
    """Round ``current_num`` up to the nearest multiple of ``divisible_by``.

    Args:
        current_num: integer to round up.
        divisible_by: positive integer the result must be a multiple of.

    Returns:
        The smallest multiple of ``divisible_by`` that is >= ``current_num``.

    Raises:
        ValueError: if ``divisible_by`` is not positive.
    """
    if divisible_by <= 0:
        raise ValueError(f"divisible_by must be a positive integer, got {divisible_by}")
    # Ceiling division via negated floor division; works however far away the next multiple is.
    return -(-current_num // divisible_by) * divisible_by

new_vocab_size = find_good_number(default_vocab_size)
print(f"Using Vocab Size of {new_vocab_size}")


#Build the model that will be trained: GPT-2 small dimensions with the padded vocabulary
config = GPT2Config(
    vocab_size = new_vocab_size,
    block_size = 1024,
    embed_dim = 768,
    num_heads = 12,
    num_blocks = 12,
)
model = GPT2(config).to(device)
# torch.compile is applied only when running on CUDA
if device == "cuda":
    model = torch.compile(model)


Using Vocab Size of 50304


In [None]:
# Mount Google Drive at /content/gdrive; other cells read the token shards from, and write
# logs and checkpoints to, '/content/gdrive/My Drive'.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#Download Training Data

fw = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming = True)

enc = tiktoken.get_encoding("gpt2")

# Number of documents written into each shard file.
# NOTE(review): the notebook used `shared_len` without defining it anywhere (NameError on a
# fresh kernel). The value below is a placeholder that assumes roughly 9.67M documents spread
# over the 98 shards -- confirm it against the value used to build the shards already on Drive.
shared_len = 98_696

def tokenizer(input):
  """Replace the document's "text" field with its GPT-2 token ids, prefixed by <|endoftext|>."""
  input["text"] = enc.encode("<|endoftext|>" + input["text"], allowed_special={'<|endoftext|>'})
  return input

for shard in range(0, 98):
  shard_path = f'/content/gdrive/My Drive/shard{shard}.npy'
  if os.path.exists(shard_path):
    print(f"shared {shard} downloaded") #oops spelling error; meant shard
    continue
  print(f"Shard: {shard}")
  # Iterating a streaming dataset always restarts at its first document, so each shard selects
  # its own slice explicitly. This also makes the shards independent of which ones already exist.
  shard_stream = fw.skip(shard*shared_len).take(shared_len).map(tokenizer)
  documents = []
  for document in tqdm(shard_stream, total = shared_len):
    documents.extend(document["text"])
  print(len(documents))
  np.save(shard_path, np.array(documents))

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

shared 0 downloaded
shared 1 downloaded
shared 2 downloaded
shared 3 downloaded
shared 4 downloaded
shared 5 downloaded
shared 6 downloaded
shared 7 downloaded
shared 8 downloaded
shared 9 downloaded
shared 10 downloaded
shared 11 downloaded
shared 12 downloaded
shared 13 downloaded
shared 14 downloaded
shared 15 downloaded
shared 16 downloaded
shared 17 downloaded
shared 18 downloaded
shared 19 downloaded
shared 20 downloaded
shared 21 downloaded
shared 22 downloaded
shared 23 downloaded
shared 24 downloaded
shared 25 downloaded
shared 26 downloaded
shared 27 downloaded
shared 28 downloaded
shared 29 downloaded
shared 30 downloaded
shared 31 downloaded
shared 32 downloaded
shared 33 downloaded
shared 34 downloaded
shared 35 downloaded
shared 36 downloaded
shared 37 downloaded
shared 38 downloaded
shared 39 downloaded
shared 40 downloaded
shared 41 downloaded
shared 42 downloaded
shared 43 downloaded
shared 44 downloaded
shared 45 downloaded
shared 46 downloaded
shared 47 downloaded
sh

In [None]:
#Create DataLoaders
# Micro-batch shape: B sequences of T tokens per forward/backward pass.
B = 16
T = 1024

# Tokens per optimizer step: 512 sequences of 1024 tokens = 524288 tokens.
batch_size = 512*1024
tokens_per_micro_batch = B*T
# Micro-batches accumulated per optimizer step (32 for the values above).
grad_accum_steps = batch_size // tokens_per_micro_batch
print(f"grad accum steps: {grad_accum_steps}")

# Token count used to size one epoch.
fine_web_tokens = 10058626367
outer_steps = fine_web_tokens // (tokens_per_micro_batch*grad_accum_steps)
print(f"1 Epoch requires {outer_steps} batches of {B*T*grad_accum_steps} tokens (with grad accum)")

class DataLoader:
    """Sequential batch loader over pre-tokenised ``shard{N}.npy`` files.

    Args:
        batch_size: number of sequences per batch (B).
        block_size: tokens per sequence (T).
        grad_accum_steps: accepted for interface compatibility; not used by the loader.
        split: "val" reads only the held-out shard; any other value cycles through the
            training shards.
        data_dir: directory containing the shard files.
        num_train_shards: training shards are numbered 0..num_train_shards-1; the shard
            numbered ``num_train_shards`` is the validation shard.
    """
    def __init__(self, batch_size, block_size, grad_accum_steps, split = "train", data_dir = "/content/gdrive/My Drive", num_train_shards = 97):
        self.B = batch_size
        self.T = block_size
        self.split = split
        self.data_dir = data_dir
        self.num_train_shards = num_train_shards
        if split == "val":
            self.current_shard = num_train_shards #using last shard as validation shard
        else:
            self.current_shard = 0
        self.data = self.load_shard()
        self.position = 0

    def load_shard(self):
        """Load the current shard's token array from disk as a 1-D tensor."""
        shard_path = os.path.join(self.data_dir, f"shard{self.current_shard}.npy")
        return torch.from_numpy(np.load(shard_path))

    def reset_position(self):
        """Rewind to the first token of the current shard."""
        self.position = 0

    def get_batches(self):
        """Return the next ``(x, y)`` pair of (B, T) tensors on ``device``; ``y`` is ``x`` shifted by one token.

        After producing the batch the read position advances by B*T tokens. When the
        following batch would not fit in the current shard, the training split moves to
        the next training shard (wrapping around after the last one) and the validation
        split rewinds to the start of its shard.
        """
        tokens_per_batch = self.B*self.T
        # One extra token so the targets can be the inputs shifted left by one.
        chunk = self.data[self.position:self.position + tokens_per_batch + 1]
        # Slicing an existing tensor avoids the copy-construct warning torch.tensor(tensor) emits.
        x = chunk[:-1].view(self.B, self.T).to(device)
        y = chunk[1:].view(self.B, self.T).to(device)

        if (self.position + tokens_per_batch*2 + 1) > len(self.data):
            self.reset_position()
            if self.split == "train":
                self.current_shard = (self.current_shard + 1) % self.num_train_shards
                self.data = self.load_shard()
        else:
            self.position += tokens_per_batch
        return x,y

# Training loader starts at shard 0; the validation loader reads the held-out shard 97.
dataloader = DataLoader(B,T, grad_accum_steps)
val_dataloader = DataLoader(B, T, grad_accum_steps, split = "val")

#Create HellaSwag Dataset
def pad_input(input, context_len, max_len):
    """Right-pad a context+option token list with 0s so the option part spans ``max_len`` tokens.

    Args:
        input: list of token ids (context followed by one option).
        context_len: number of leading tokens that belong to the context.
        max_len: length the option part is padded to.

    Returns:
        A new list; no padding is added when the option is already ``max_len`` or longer.
    """
    option_len = len(input) - context_len
    padding = [0]*(max_len - option_len)
    return input + padding

def create_hella_swag_ds():
    """Tokenise the HellaSwag validation split into per-example model inputs and targets.

    Returns:
        ``(x, y, labels)``: three parallel lists with one entry per example.
        ``x[i]`` is a (4, L) tensor holding context+option tokens for the four endings,
        right-padded with 0. ``y[i]`` is a (4, L) tensor of next-token targets that is
        -100 everywhere except at the positions predicting option tokens. ``labels[i]``
        is the example's "label" field, unchanged.
    """
    hella_swag = datasets.load_dataset("Rowan/hellaswag")
    enc = tiktoken.get_encoding("gpt2")
    x, y, labels = [], [], []
    for ex in hella_swag["validation"]:
        context = enc.encode(ex["ctx"])
        options = [enc.encode(" " + ending) for ending in ex["endings"]]
        context_len = len(context)
        max_option_len = max(len(option) for option in options)

        # Targets are shifted one position left relative to the inputs, so the first
        # option token is predicted from the last context position.
        targets = np.full((4, context_len + max_option_len), -100)
        first_target = context_len - 1
        for row, option in enumerate(options):
            targets[row, first_target:first_target + len(option)] = option

        padded_rows = [pad_input(context + option, context_len, max_option_len) for option in options]
        x.append(torch.tensor(np.array(padded_rows)))
        y.append(torch.tensor(targets))
        labels.append(ex["label"])
    return x,y,labels

# Tokenise the HellaSwag validation split once; the evaluation code in other cells reuses these lists.
hls_inputs, hls_targets, hls_labels = create_hella_swag_ds()

grad accum steps: 32
1 Epoch requires 19185 batches of 524288 tokens (with grad accum)


In [None]:
#Train Model

# Learning-rate schedule hyperparameters
max_lr = 1e-3
min_lr = 0.1 * max_lr #floor of the schedule: 10% of the peak
warmup_steps = 375_000_000 // 524_288 #paper warms up over 375M tokens; 524288 tokens per step gives 715 steps
max_steps = outer_steps

def get_lr(current_lr, current_step, max_lr, min_lr, warmup_steps, max_steps):
    """Learning rate for ``current_step``: linear warmup followed by cosine decay.

    The result depends only on ``current_step``, so the schedule stays correct when
    steps are skipped or training is resumed part-way through.

    Args:
        current_lr: unused; kept so existing call sites keep working.
        current_step: zero-based optimizer step.
        max_lr: peak learning rate, reached at the end of warmup.
        min_lr: floor of the schedule.
        warmup_steps: number of linear warmup steps; step ``s`` uses ``max_lr*(s+1)/warmup_steps``.
        max_steps: length of the cosine decay, counted from the end of warmup.

    Returns:
        The learning rate as a float. It never exceeds ``max_lr`` and equals ``min_lr``
        once ``current_step - warmup_steps`` reaches ``max_steps``.
    """
    if current_step < warmup_steps:
        return max_lr*(current_step + 1)/warmup_steps

    decay_step = current_step - warmup_steps
    if max_steps <= 0 or decay_step > max_steps:
        return min_lr
    return min_lr + 0.5*(max_lr - min_lr)*(1 + math.cos((decay_step*math.pi)/max_steps))

# The optimizer starts at lr 0.0; the training loop overwrites the rate on every step.
lr = 0.0
weight_decay = 0.01
optimizer = model.get_optimizer(lr, weight_decay)

# Make sure the log directory exists, then create/truncate the log file for this run.
os.makedirs("/content/gdrive/My Drive/logs", exist_ok = True)
with open("/content/gdrive/My Drive/logs/log.txt", "w"):
    pass

EVAL_EVERY = 250 #evaluate, log metrics and checkpoint every this many steps (and on the last step)
HLS_SUBSET = 1000 #HellaSwag examples scored during training (the full set is scored on the last step)
VAL_BATCHES = 100 #validation batches averaged per evaluation
LOG_PATH = "/content/gdrive/My Drive/logs/log.txt"


def autocast_context():
    """bfloat16 autocast on CUDA; a disabled (no-op) autocast context on every other device."""
    return torch.autocast(device_type = "cuda" if device == "cuda" else "cpu", dtype = torch.bfloat16, enabled = (device == "cuda"))


def hellaswag_accuracy(num_examples):
    """Fraction of the first ``num_examples`` HellaSwag examples where the lowest mean-loss ending is the label."""
    correct = 0
    with torch.no_grad():
        for hls_idx in range(num_examples):
            hls_targets_device = hls_targets[hls_idx].to(device)
            with autocast_context():
                output, _ = model(hls_inputs[hls_idx].to(device))
            # Targets equal to -100 (context and padding) match cross_entropy's default
            # ignore_index, so those positions contribute a loss of exactly 0.
            hls_loss = F.cross_entropy(output.float().reshape(-1, output.size(-1)), hls_targets_device.view(-1), reduction = "none").view(hls_targets_device.shape)
            # Average over each option's own tokens, counted from the targets rather than
            # from which losses happen to be non-zero.
            option_token_counts = (hls_targets_device != -100).sum(dim = -1)
            hls_means = hls_loss.sum(dim = -1)/option_token_counts
            if torch.argmin(hls_means).item() == int(hls_labels[hls_idx]):
                correct += 1
    return correct/num_examples


def validation_loss(num_batches):
    """Mean loss over ``num_batches`` validation batches, starting from the beginning of the validation shard."""
    val_dataloader.reset_position()
    total = 0.0
    with torch.no_grad():
        for _ in range(num_batches):
            x,y = val_dataloader.get_batches()
            with autocast_context():
                logits, loss = model(x,y)
            total += loss.item()
    return total/num_batches


for i in range(outer_steps):
    is_eval_step = i % EVAL_EVERY == 0 or i == (outer_steps - 1)

    #Evaluate HellaSwag every now and then
    if is_eval_step:
        model.eval()

        print("Evaluating HellaSwag")
        if i == (outer_steps - 1):
            trunc = len(hls_inputs)
        else:
            trunc = min(HLS_SUBSET, len(hls_inputs)) #only evaluate a subset during training (besides last step), since limited compute credits
        hls_acc = hellaswag_accuracy(trunc)

        print("Evaluating Validation set")
        val_loss = validation_loss(VAL_BATCHES)

        model.train()

    print("Training")
    start_time = time.time()
    optimizer.zero_grad()
    loss_accum = 0
    for _ in range(grad_accum_steps):
        x,y = dataloader.get_batches()
        with autocast_context():
            logits, loss = model(x,y)
        # Scale so the accumulated gradient equals that of the mean over the full batch;
        # the backward pass runs outside the autocast region.
        loss = loss/grad_accum_steps
        loss.backward()
        loss_accum += loss.detach()

    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)

    lr = get_lr(lr, i, max_lr, min_lr, warmup_steps, max_steps)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    optimizer.step()

    # Wait for queued device work so the timing below is meaningful; CPU execution is already synchronous.
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "mps":
        torch.mps.synchronize()

    end_time = time.time()

    step_summary = f"Step: {i}; Training Loss: {loss_accum.item()}; Gradient Norm {grad_norm.item()}; Learning Rate: {lr}; Time: {(end_time - start_time)*1000} ms; Tokens/Sec : {(B*T*grad_accum_steps)/(end_time - start_time)}"

    with open(LOG_PATH, "a") as log:
        if is_eval_step:
            print(step_summary + f"; Validation_loss: {val_loss}; HellaSwag Acc: {hls_acc}\n")
            # Trailing newline added so evaluation rows do not run into the next step's row.
            log.write(f"{i},train_loss:{loss_accum.item()},val_loss:{val_loss},HellaSwag_acc:{hls_acc}\n")
        else:
            print(step_summary + "\n")
            log.write(f"{i},train_loss:{loss_accum.item()},val_loss:None,HellaSwag_acc:None\n")

    if is_eval_step:
        print("Checkpointing")
        checkpoint_dict = {
            "model_weights": model.state_dict(),
            "model_config": config,
            "optimizer_buffers": optimizer.state_dict(),
            "training loss": loss_accum.item()
        }
        torch.save(checkpoint_dict, f"/content/gdrive/My Drive/logs/checkpoint_{i}.pt")


Evaluating HellaSwag




Evaluating Validation set


  x = torch.tensor(x).view(self.B, self.T)
  y = torch.tensor(y).view(self.B, self.T)
  x = torch.tensor(x).view(self.B, self.T)
  y = torch.tensor(y).view(self.B, self.T)


Training


  x = torch.tensor(x).view(self.B, self.T)
  y = torch.tensor(y).view(self.B, self.T)


Step: 0; Training Loss: 10.99733829498291; Gradient Norm 7.921102523803711; Learning Rate: 1.3986013986013987e-06; Time: 49117.73705482483 ms; Tokens/Sec : 10674.107388432694; Validation_loss: 10.973273277282715; HellaSwag Acc: 0.237

Checkpointing
Training
Step: 1; Training Loss: 10.951268196105957; Gradient Norm 7.9891815185546875; Learning Rate: 2.7972027972027974e-06; Time: 3238.978385925293 ms; Tokens/Sec : 161868.3231349271

Training
Step: 2; Training Loss: 10.864648818969727; Gradient Norm 7.707870960235596; Learning Rate: 4.195804195804196e-06; Time: 3221.804141998291 ms; Tokens/Sec : 162731.18317950133

Training
Step: 3; Training Loss: 10.72679615020752; Gradient Norm 7.205451488494873; Learning Rate: 5.594405594405595e-06; Time: 3343.536615371704 ms; Tokens/Sec : 156806.41796761492

Training
Step: 4; Training Loss: 10.584198951721191; Gradient Norm 6.023763179779053; Learning Rate: 6.993006993006994e-06; Time: 3222.784996032715 ms; Tokens/Sec : 162681.65597314265

Training
St



[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Training
Step: 15561; Training Loss: 3.0161378383636475; Gradient Norm 0.3121089041233063; Learning Rate: 0.00020889040913110558; Time: 3258.189916610718 ms; Tokens/Sec : 160913.88575205664

Training
Step: 15562; Training Loss: 3.0355472564697266; Gradient Norm 0.30978694558143616; Learning Rate: 0.0002088423517744875; Time: 3259.3812942504883 ms; Tokens/Sec : 160855.06808449753

Training
Step: 15563; Training Loss: 3.047044038772583; Gradient Norm 0.30751946568489075; Learning Rate: 0.00020879430356597655; Time: 3258.8255405426025 ms; Tokens/Sec : 160882.4999919157

Training
Step: 15564; Training Loss: 3.0379364490509033; Gradient Norm 0.3341268002986908; Learning Rate: 0.00020874626450686161; Time: 3258.7544918060303 ms; Tokens/Sec : 160886.0076198729

Training
Step: 15565; Training Loss: 3.0487701892852783; Gradient Norm 0.24578820168972015; Learning Rate: 0.00020869823459843036; Time: 3260.3683471679688 ms; Tokens/Se

In [None]:
# Score the pretrained GPT-2 weights on the full HellaSwag validation set as a reference point.
model = GPT2.load_pretrained_weights()
model.to(device)
model.eval()

print("Evaluating HellaSwag with HuggingFace GPT2")
with torch.no_grad():
    hls_acc = 0
    hls_seen = 0
    for hls_idx in tqdm(range(len(hls_inputs))):
        output, _ = model(hls_inputs[hls_idx].to(device))
        hls_targets_device = hls_targets[hls_idx].to(device)
        # Targets equal to -100 (context and padding) match cross_entropy's default
        # ignore_index, so those positions contribute a loss of exactly 0.
        hls_loss = F.cross_entropy(output.reshape(-1, output.size(-1)), hls_targets_device.view(-1), reduction = "none").view(hls_targets_device.shape)
        hls_sums = hls_loss.sum(dim = -1)
        # Count each option's tokens from the targets rather than from which losses are non-zero.
        hls_divisors = (hls_targets_device != -100).sum(dim = -1)
        hls_means = hls_sums/hls_divisors
        if torch.argmin(hls_means).item() == int(hls_labels[hls_idx]):
            hls_acc +=1
        hls_seen +=1
    hls_acc = hls_acc/hls_seen

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Evaluating HellaSwag with HuggingFace GPT2


100%|██████████| 10042/10042 [05:39<00:00, 29.58it/s]


In [None]:
hls_acc

0.2954590718980283

In [None]:
# Rebuild the trained architecture and score a saved checkpoint on the full HellaSwag validation set.
config = GPT2Config(new_vocab_size,1024,768,12,12)
model = GPT2(config)
# map_location lets a checkpoint saved on another device load here. weights_only=False is
# required because the checkpoint also pickles the GPT2Config object; only load checkpoints
# you created yourself, since unpickling can execute arbitrary code.
model_checkpoint = torch.load("/content/gdrive/My Drive/logs/checkpoint_17000.pt", map_location = device, weights_only = False)

# Checkpoints saved from a torch.compile'd model prefix every key with "_orig_mod.";
# strip it when present and pass other keys through unchanged.
compile_prefix = "_orig_mod."
model_weight_dict = {}
for k,v in model_checkpoint["model_weights"].items():
    plain_key = k[len(compile_prefix):] if k.startswith(compile_prefix) else k
    model_weight_dict[plain_key] = v

model.load_state_dict(model_weight_dict)
model.to(device)
model.eval()

print("Evaluating HellaSwag with custom trained GPT2")
with torch.no_grad():
    hls_acc = 0
    hls_seen = 0
    for hls_idx in tqdm(range(len(hls_inputs))):
        output, _ = model(hls_inputs[hls_idx].to(device))
        hls_targets_device = hls_targets[hls_idx].to(device)
        # Targets equal to -100 (context and padding) match cross_entropy's default
        # ignore_index, so those positions contribute a loss of exactly 0.
        hls_loss = F.cross_entropy(output.reshape(-1, output.size(-1)), hls_targets_device.view(-1), reduction = "none").view(hls_targets_device.shape)
        hls_sums = hls_loss.sum(dim = -1)
        # Count each option's tokens from the targets rather than from which losses are non-zero.
        hls_divisors = (hls_targets_device != -100).sum(dim = -1)
        hls_means = hls_sums/hls_divisors
        if torch.argmin(hls_means).item() == int(hls_labels[hls_idx]):
            hls_acc +=1
        hls_seen +=1
    hls_acc = hls_acc/hls_seen

Evaluating HellaSwag with custom trained GPT2


100%|██████████| 10042/10042 [05:38<00:00, 29.71it/s]


In [None]:
hls_acc

0.3016331408086039