In [1]:
!pip install transformers datasets accelerate trl peft tensorboard


Collecting trl
  Downloading trl-0.23.1-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.23.1-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.6/564.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.23.1


In [2]:
import os
import math
import json
from pathlib import Path
from typing import Dict, Any
import glob
import shutil

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_

from peft import LoraConfig, get_peft_model, PeftModel

from torch.utils.tensorboard import SummaryWriter

from transformers import get_linear_schedule_with_warmup


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
workspace = 'RLHF-Reproduction'

In [5]:
import os
os.chdir(f"/content/drive/My Drive/{workspace}")
print("Current working dir:", os.getcwd())


Current working dir: /content/drive/My Drive/RLHF-Reproduction


# Pretrained GPT2 Behavior

In [6]:
model_name = "gpt2"

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
print("EOS token id:", tokenizer.eos_token_id)
print("Pad token id:", tokenizer.pad_token_id)

EOS token id: 50256
Pad token id: 50256


In [10]:
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)
print("Vocab size:", len(tokenizer))
print(type(tokenizer))


<|endoftext|> <|endoftext|> <|endoftext|>
Vocab size: 50257
<class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>


In [11]:
text = "testing tokenizer"
tokens = tokenizer.tokenize(text)
ids = tokenizer.encode(text)
print(tokens)
print(ids)


['testing', 'Ġtoken', 'izer']
[33407, 11241, 7509]


In [12]:
prompt = "Complete this sentence. The cat jumped over "
inputs = tokenizer(prompt, return_tensors="pt")

print("Input IDs:", inputs["input_ids"])
print("Shape:", inputs["input_ids"].shape)


Input IDs: tensor([[20988,   428,  6827,    13,   383,  3797, 11687,   625,   220]])
Shape: torch.Size([1, 9])


In [13]:
import torch
with torch.no_grad():
    outputs = model(**inputs)


logits = outputs.logits        # [batch, seq_len, vocab_size]

print("Logits shape:", logits.shape)


Logits shape: torch.Size([1, 9, 50257])


In [14]:
generated_ids = model.generate(
    **inputs,
    max_new_tokens=60,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)

print(generated_ids.shape)

print(tokenizer.decode(generated_ids[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 69])
Complete this sentence. The cat jumped over __________ and fell down on the ground. __________

"I'm going to call the cops on you, then we'll go to the hospital."

"Yes, I'll call the police on you, then we'll go to the hospital. __________

"


In [15]:
generated_ids = model.generate(           #beam search
    **inputs,
    max_new_tokens=40,
    num_beams=5,             # beam width
    early_stopping=True,     # stop when EOS is reached
    no_repeat_ngram_size=2,  # optional: avoid repeated phrases
)
print(generated_ids.shape)

for seq in generated_ids:
    print(tokenizer.decode(seq, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 49])
Complete this sentence. The cat jumped over  the wall.
"I'm sorry, but I'm not going to let you go. I don't want to see you die, and I want you to know that I love you.


##Alpaca Dataset

In [16]:
from datasets import load_dataset
dataset = load_dataset("tatsu-lab/alpaca")


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})

In [18]:
from datasets import DatasetDict
split = dataset['train'].train_test_split(test_size=0.05, seed=42)
dataset = DatasetDict({
    'train': split['train'],
    'validation': split['test']
})

dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 49401
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 2601
    })
})

In [19]:
train_ds = dataset['train']
train_ds[0]

{'instruction': 'Given a sentence, change the verb to make it in the past tense',
 'input': 'I enjoy going to the beach',
 'output': 'I enjoyed going to the beach.',
 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven a sentence, change the verb to make it in the past tense\n\n### Input:\nI enjoy going to the beach\n\n### Response:\nI enjoyed going to the beach.'}

In [20]:
train_ds[1]

{'instruction': 'Find me a website where I can purchase acoustic guitars.',
 'input': '',
 'output': 'Guitar Center is a popular website where you can purchase acoustic guitars.  They offer a wide selection of guitars from top brands such as Martin, Taylor, and Fender. They also provide free shipping on select items and a price matching guarantee.',
 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nFind me a website where I can purchase acoustic guitars.\n\n### Response:\nGuitar Center is a popular website where you can purchase acoustic guitars.  They offer a wide selection of guitars from top brands such as Martin, Taylor, and Fender. They also provide free shipping on select items and a price matching guarantee.'}

In [21]:
def compute_lengths(example):
    tokens = tokenizer(example['text']).input_ids
    return {'length': len(tokens)}

lengths = dataset['train'].map(compute_lengths)
max_len = max(lengths['length'])
print("Max token length in train set:", max_len)

Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1510 > 1024). Running this sequence through the model will result in indexing errors


Max token length in train set: 1510


In [22]:
import numpy as np

arr = np.array(lengths["length"])
count = np.sum(arr <= 1024)
count

np.int64(49399)

In [23]:
lengths = dataset['validation'].map(compute_lengths)
max_len = max(lengths['length'])
print("Max token length in validation set:", max_len)

Map:   0%|          | 0/2601 [00:00<?, ? examples/s]

Max token length in validation set: 520


In [24]:
print(model.config.n_positions)


1024


Filter out the 2 examples that exceed 1024 tokens

In [25]:
train_ds = train_ds.filter(lambda x: len(tokenizer(x["text"]).input_ids) <= 1024)
print(train_ds.num_rows)  # should be 49399


Filter:   0%|          | 0/49401 [00:00<?, ? examples/s]

49399


In [26]:
val_ds = dataset['validation'].filter(lambda x: len(tokenizer(x["text"]).input_ids) <= 1024)
print(val_ds.num_rows)  # should be 2601

Filter:   0%|          | 0/2601 [00:00<?, ? examples/s]

2601


In [27]:
max_length = 1024
def tokenize_batch(batch): #for labels only want to compute loss for responses
    texts = batch["text"]

    input_ids_list = []
    attention_masks_list = []
    label_list = []

    for text in texts:
        response_start = text.find("### Response:")
        if response_start == -1:
            print("FAILURE")
            response_start = len(text)

        # Split instruction+input and response
        prompt = text[:response_start]
        response = text[response_start:]

        # Tokenize together to preserve continuity
        tokenized = tokenizer(
            prompt + response,
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

        # Compute how many tokens belong to the prompt (for masking)
        prompt_tokens = tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"]
        prompt_length = len(prompt_tokens)

        # Copy input_ids as labels, but mask out prompt tokens
        labels = tokenized["input_ids"].copy()
        labels[:prompt_length] = [-100] * prompt_length  # ignore prompt in loss

        for i in range(len(labels)):  #only want the first eos token to contribute to the loss, none of the padding tokens
          if tokenized["attention_mask"][i] == 0:
            labels[i] = -100

        input_ids_list.append(tokenized["input_ids"])
        attention_masks_list.append(tokenized["attention_mask"])
        label_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attention_masks_list,
        "labels": label_list,
    }


tokenized_train_ds= train_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=1000,
    remove_columns=train_ds.column_names
)

tokenized_val_ds = val_ds.map(
    tokenize_batch,
    batched=True,
    batch_size=1000,
    remove_columns=val_ds.column_names
)


Map:   0%|          | 0/49399 [00:00<?, ? examples/s]

Map:   0%|          | 0/2601 [00:00<?, ? examples/s]

In [28]:
print(tokenized_train_ds.column_names)


['input_ids', 'attention_mask', 'labels']


In [29]:
print(len(tokenized_train_ds[0]["input_ids"]))

1024


## Fine Tuning on Alpaca Dataset

In [30]:
train_batch_size = 4
gradient_accumulation_steps = 4
num_epochs = 3
lr = 2e-4
weight_decay = 0.0
warmup_steps = 100
save_dir = "./gpt2-alpaca-sft-accel"
checkpoint_prefix = "checkpoint"
save_steps = 200
logging_steps = 50
eval_steps = 200
fp16 = True
num_workers = 2
max_checkpoints = 2

In [31]:
from accelerate import Accelerator
accelerator = Accelerator(
    mixed_precision="fp16",
    gradient_accumulation_steps=gradient_accumulation_steps  # <-- this is where 4 is set
)
device = accelerator.device
print("Running on", device)

Running on cuda


In [32]:
def load_peft_adapter_if_exists(model, ckpt_dir: str, accelerator: Accelerator):
    adapter_path = Path(ckpt_dir) / "adapter"
    if adapter_path.exists():
        # Load the PEFT adapter weights into model
        # model must be a base model wrapped with get_peft_model earlier
        print(f"Loading PEFT adapter from {adapter_path}")
        model = PeftModel.from_pretrained(model, adapter_path, device_map={"": accelerator.device})
    return model

In [33]:
def ensure_dir(path):
    Path(path).mkdir(parents=True, exist_ok=True)

def save_training_state(save_dir: str, step: int, accelerator, optimizer, scheduler, scaler):
    save_dir = Path(save_dir)
    ckpt_dir = save_dir / f"{checkpoint_prefix}-{step}"
    ensure_dir(ckpt_dir)

    # Save PEFT adapter weights (preferred)
    model_to_save = accelerator.unwrap_model(model)
    peft_save_dir = ckpt_dir / "adapter"
    model_to_save.save_pretrained(peft_save_dir)

    # Save optimizer/scheduler/scaler states
    accelerator.save_state(str(ckpt_dir / "acc_state"))

    # Save meta info
    meta = {"step": step}
    (ckpt_dir / "meta.json").write_text(json.dumps(meta))

    print(f"Saved checkpoint to {ckpt_dir}")

    # --- Delete old checkpoints if more than MAX_CHECKPOINTS ---
    all_ckpts = sorted(glob.glob(str(save_dir / f"{checkpoint_prefix}-*")),
                       key=lambda x: Path(x).stat().st_mtime)
    while len(all_ckpts) > max_checkpoints:
        old_ckpt = Path(all_ckpts.pop(0))
        print(f"Deleting old checkpoint: {old_ckpt}")
        shutil.rmtree(old_ckpt)


In [34]:
def collate_fn(batch):
    # batch is a list of dicts with input_ids, attention_mask, labels (each already list length max_length)
    input_ids = torch.tensor([b["input_ids"] for b in batch], dtype=torch.long)
    attention_mask = torch.tensor([b["attention_mask"] for b in batch], dtype=torch.long)
    labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [35]:
def inspect_trainable_params(model):
    total = 0
    trainable = 0
    details = []
    for n, p in model.named_parameters():
        total += p.numel()
        if p.requires_grad:
            trainable += p.numel()
            details.append(n)
    print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
    print("Example trainable params:", details[:20])
    return details

In [36]:
ensure_dir(save_dir)

In [37]:
writer = SummaryWriter(log_dir=os.path.join(save_dir, "tensorboard"))

In [38]:
LORA_CONFIG = dict(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [39]:
model.gradient_checkpointing_enable()

In [40]:
lora_config = LoraConfig(**LORA_CONFIG)

In [41]:
model = get_peft_model(model, lora_config)



In [42]:
print("Before:")
trainable_before = inspect_trainable_params(model)

Before:
Trainable params: 294,912 / 124,734,720 (0.24%)
Example trainable params: ['base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.2.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.2.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.3.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.3.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.4.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.4.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.5.attn.c_attn.lora_A.default.weight', 'base_model.model.transformer.h.5.attn.c_attn.lora_B.default.weight', 'base_model.model.transformer.h.6.attn.c_attn.lora_A.default.weight', 'base_m

In [43]:
resume_checkpoint_dir = "./gpt2-alpaca-sft-accel/checkpoint-6200"
# load peft adapter weights if present
model = load_peft_adapter_if_exists(model, resume_checkpoint_dir, accelerator)


Loading PEFT adapter from gpt2-alpaca-sft-accel/checkpoint-6200/adapter




In [44]:
# Make sure LoRA parameters are trainable
for name, param in model.named_parameters():
    if "lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False  # freeze base model


In [45]:
print("After:")
trainable_after = inspect_trainable_params(model)

After:
Trainable params: 294,912 / 124,734,720 (0.24%)
Example trainable params: ['base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.2.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.3.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_A.default.weight', 'base_model.model.base_model.model.transformer.h.4.attn.c_attn.lora_B.default.weight', 'base_model.model.base_model.model.transformer.h

In [46]:
for n, p in model.named_parameters():
    if p.requires_grad:
        p.data = p.data.to(accelerator.device)  # ensure params are on same device

In [47]:
train_dataloader = DataLoader(tokenized_train_ds, shuffle=True, collate_fn=collate_fn,batch_size=train_batch_size, num_workers=num_workers)
eval_dataloader = DataLoader(tokenized_val_ds, shuffle=False, collate_fn=collate_fn, batch_size=train_batch_size, num_workers=num_workers)

In [48]:
no_decay = ["bias", "LayerNorm.weight"]
param_groups = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
optimizer = AdamW(param_groups, lr=lr)

#optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=lr)


In [49]:
total_train_steps = math.ceil(len(train_dataloader) * num_epochs / gradient_accumulation_steps)

In [50]:
scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_train_steps,
    )

In [51]:
model, optimizer, train_dataloader, eval_dataloader, scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, scheduler
    )

In [52]:
starting_step = 0
scaler = torch.cuda.amp.GradScaler(enabled=(accelerator.state.mixed_precision == "fp16"))


  scaler = torch.cuda.amp.GradScaler(enabled=(accelerator.state.mixed_precision == "fp16"))


In [53]:
# load accelerator state (this will restore optimizer/scheduler/scaler if saved via accelerator.save_state)
acc_state_dir = Path(resume_checkpoint_dir) / "acc_state"
if acc_state_dir.exists():
    print("Loading accelerator state from", acc_state_dir)
    #accelerator.load_state(str(acc_state_dir))
    # try to read meta to get step
    meta_file = Path(resume_checkpoint_dir) / "meta.json"
    if meta_file.exists():
        meta = json.loads(meta_file.read_text())
        starting_step = int(meta.get("step", 0))
    print("Resumed step:", starting_step)
else:
    print("No accelerator state found in checkpoint; only adapter weights restored (if existed).")

Loading accelerator state from gpt2-alpaca-sft-accel/checkpoint-6200/acc_state
Resumed step: 6200


In [54]:
print(starting_step)

6200


In [55]:
global_step = starting_step
model.train()
print("***** Running training *****")
print(f"  Num examples = {len(tokenized_train_ds)}")
print(f"  Num Epochs = {num_epochs}")
print(f"  Instantaneous batch size per device = {train_batch_size}")
print(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")
print(f"  Total optimization steps = {total_train_steps}")

***** Running training *****
  Num examples = 49399
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 9263


In [56]:
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for step, batch in enumerate(train_dataloader):

        #in case we are resuming training
        accumulated_steps = (epoch * len(train_dataloader) + step) // gradient_accumulation_steps
        if accumulated_steps < starting_step:
            continue

        # batch: dict of tensors
        with accelerator.accumulate(model):
            # forward with amp context
            with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["labels"],
                )
                loss = outputs.loss
                # scale loss to account for gradient accumulation
                loss = loss / gradient_accumulation_steps

            accelerator.backward(loss)
            epoch_loss += loss.item() * gradient_accumulation_steps  # un-scaled per-batch loss

            # gradient clipping on unwrapped model parameters (PEFT might wrap)
            if accelerator.sync_gradients:
                clip_grad_norm_(model.parameters(), max_norm=1.0)

            # optimizer step handled by accelerator when accumulate finishes
            if accelerator.sync_gradients:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

                # Logging
                if global_step % logging_steps == 0:
                    if (epoch == 2) :
                      adjusted_steps = step - (starting_step * gradient_accumulation_steps - (epoch * len(train_dataloader)))
                    else:
                      adjusted_steps = step
                    avg_loss = epoch_loss * train_batch_size / ((adjusted_steps + 1) * train_batch_size) #epoch_loss is sum of per-batch mean losses
                    #(step + 1) * train_batch_size is number of samples seen so far
                    print(f"Epoch {epoch+1} Step {global_step} loss {avg_loss:.4f}")
                    writer.add_scalar("train/loss", avg_loss, global_step)
                    writer.add_scalar("train/lr", scheduler.get_last_lr()[0], global_step)

                # Evaluation
                if global_step % eval_steps == 0:
                    model.eval()
                    total_eval_loss = 0.0
                    batches = 0
                    for eval_batch in eval_dataloader:
                        with torch.no_grad():
                            with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
                                out = model(
                                    input_ids=eval_batch["input_ids"],
                                    attention_mask=eval_batch["attention_mask"],
                                    labels=eval_batch["labels"],
                                )
                                total_eval_loss += out.loss.item()
                                batches += 1
                    avg_eval_loss = total_eval_loss / batches
                    ppl = math.exp(avg_eval_loss) if avg_eval_loss < 20 else float("inf")
                    print(f"*** Eval at step {global_step}: loss {avg_eval_loss:.4f}, ppl {ppl:.2f}")
                    writer.add_scalar("eval/loss", avg_eval_loss, global_step)
                    writer.add_scalar("eval/ppl", ppl, global_step)
                    model.train()

                # Save checkpoint
                if global_step % save_steps == 0:
                    # Save PEFT adapter and accelerator state
                    save_training_state(save_dir, global_step, accelerator, optimizer, scheduler, scaler)

    # epoch end
    # Save checkpoint at epoch end
    save_training_state(save_dir, f"epoch-{epoch+1}", accelerator, optimizer, scheduler, scaler)

# final save
save_training_state(save_dir, f"final-{global_step}", accelerator, optimizer, scheduler, scaler)
print("Training complete")
writer.close()



Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-epoch-1




Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-epoch-2
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-6200


  with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch 3 Step 6250 loss 2.0802
Epoch 3 Step 6300 loss 2.0647
Epoch 3 Step 6350 loss 2.0356
Epoch 3 Step 6400 loss 2.0264


  with torch.cuda.amp.autocast(enabled=accelerator.mixed_precision=="fp16"):


*** Eval at step 6400: loss 1.9287, ppl 6.88
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-6400
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-epoch-1
Epoch 3 Step 6450 loss 2.0350
Epoch 3 Step 6500 loss 2.0348
Epoch 3 Step 6550 loss 2.0390
Epoch 3 Step 6600 loss 2.0402




*** Eval at step 6600: loss 1.9274, ppl 6.87
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-6600
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-epoch-2
Epoch 3 Step 6650 loss 2.0395
Epoch 3 Step 6700 loss 2.0381
Epoch 3 Step 6750 loss 2.0394
Epoch 3 Step 6800 loss 2.0394




*** Eval at step 6800: loss 1.9278, ppl 6.87
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-6800
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-6400
Epoch 3 Step 6850 loss 2.0410
Epoch 3 Step 6900 loss 2.0410
Epoch 3 Step 6950 loss 2.0418
Epoch 3 Step 7000 loss 2.0411




*** Eval at step 7000: loss 1.9250, ppl 6.85
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-7000
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-6600
Epoch 3 Step 7050 loss 2.0408
Epoch 3 Step 7100 loss 2.0417
Epoch 3 Step 7150 loss 2.0416
Epoch 3 Step 7200 loss 2.0420




*** Eval at step 7200: loss 1.9252, ppl 6.86
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-7200
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-6800
Epoch 3 Step 7250 loss 2.0436
Epoch 3 Step 7300 loss 2.0410
Epoch 3 Step 7350 loss 2.0404
Epoch 3 Step 7400 loss 2.0387




*** Eval at step 7400: loss 1.9190, ppl 6.81
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-7400
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-7000
Epoch 3 Step 7450 loss 2.0395
Epoch 3 Step 7500 loss 2.0384
Epoch 3 Step 7550 loss 2.0375
Epoch 3 Step 7600 loss 2.0379




*** Eval at step 7600: loss 1.9198, ppl 6.82
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-7600
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-7200
Epoch 3 Step 7650 loss 2.0384
Epoch 3 Step 7700 loss 2.0372
Epoch 3 Step 7750 loss 2.0381
Epoch 3 Step 7800 loss 2.0376




*** Eval at step 7800: loss 1.9185, ppl 6.81
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-7800
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-7400
Epoch 3 Step 7850 loss 2.0378
Epoch 3 Step 7900 loss 2.0368
Epoch 3 Step 7950 loss 2.0370
Epoch 3 Step 8000 loss 2.0365




*** Eval at step 8000: loss 1.9179, ppl 6.81
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-8000
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-7600
Epoch 3 Step 8050 loss 2.0351
Epoch 3 Step 8100 loss 2.0364
Epoch 3 Step 8150 loss 2.0369
Epoch 3 Step 8200 loss 2.0369




*** Eval at step 8200: loss 1.9165, ppl 6.80
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-8200
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-7800
Epoch 3 Step 8250 loss 2.0373
Epoch 3 Step 8300 loss 2.0369
Epoch 3 Step 8350 loss 2.0370
Epoch 3 Step 8400 loss 2.0362




*** Eval at step 8400: loss 1.9119, ppl 6.77
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-8400
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-8000
Epoch 3 Step 8450 loss 2.0361
Epoch 3 Step 8500 loss 2.0366
Epoch 3 Step 8550 loss 2.0370
Epoch 3 Step 8600 loss 2.0357




*** Eval at step 8600: loss 1.9134, ppl 6.78
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-8600
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-8200
Epoch 3 Step 8650 loss 2.0360
Epoch 3 Step 8700 loss 2.0357
Epoch 3 Step 8750 loss 2.0357
Epoch 3 Step 8800 loss 2.0352




*** Eval at step 8800: loss 1.9106, ppl 6.76
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-8800
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-8400
Epoch 3 Step 8850 loss 2.0353
Epoch 3 Step 8900 loss 2.0346
Epoch 3 Step 8950 loss 2.0347
Epoch 3 Step 9000 loss 2.0340




*** Eval at step 9000: loss 1.9115, ppl 6.76
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-9000
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-8600
Epoch 3 Step 9050 loss 2.0329
Epoch 3 Step 9100 loss 2.0327
Epoch 3 Step 9150 loss 2.0324
Epoch 3 Step 9200 loss 2.0317




*** Eval at step 9200: loss 1.9101, ppl 6.75
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-9200
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-8800
Epoch 3 Step 9250 loss 2.0309




Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-epoch-3
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-9000
Saved checkpoint to gpt2-alpaca-sft-accel/checkpoint-final-9263
Deleting old checkpoint: gpt2-alpaca-sft-accel/checkpoint-9200
Training complete
