In [1]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login
import re
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from typing import Any, Dict, List, Optional, Union
import wandb
from torch.utils.data import Dataset
import logging
logger = logging.getLogger(__name__)
import numpy as np

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
datasets = load_dataset("piqa")

Found cached dataset piqa (/home/bruno/.cache/huggingface/datasets/piqa/plain_text/1.1.0/6c611c1a9bf220943c4174e117d3b660859665baf1d43156230116185312d011)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
datasets.keys()

dict_keys(['train', 'test', 'validation'])

In [4]:
datasets["train"][0]["goal"]

"When boiling butter, when it's ready, you can"

In [5]:
def preprocess_intents_json(segment):
    
    preprocessed_data = []
    
    for data in datasets[segment]:
        preprocessed_data.append(f"Goal: {data['goal']}\n")
        if data['label']=='1':
            preprocessed_data.append(f"Solution: {data['sol2']}\n")
        else:
            preprocessed_data.append(f"Solution: {data['sol1']}\n")
    
    return "".join(preprocessed_data)

def save_preprocessed_data(preprocessed_data, output_file):
    with open(output_file, "w") as f:
        f.write(preprocessed_data)


output_file = f"./data/data_train.txt"
preprocessed_data = preprocess_intents_json("train")
preprocessed_data = re.sub(r'\n+', '\n', preprocessed_data).strip() 
save_preprocessed_data(preprocessed_data, output_file)

output_file = f"./data/data_validation.txt"
preprocessed_data = preprocess_intents_json("validation")
preprocessed_data = re.sub(r'\n+', '\n', preprocessed_data).strip() 
save_preprocessed_data(preprocessed_data, output_file)

output_file = f"./data/data_test.txt"
preprocessed_data = preprocess_intents_json("test")
preprocessed_data = re.sub(r'\n+', '\n', preprocessed_data).strip() 
save_preprocessed_data(preprocessed_data, output_file)

## Preprocessing Data

In [None]:
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n"
DEFAULT_SEED = 42

PROMPT_NO_INPUT_FORMAT = """{intro}

{instruction_key}
{instruction}

{response_key}
{response}

{end_key}""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
    response="{response}",
    end_key=END_KEY,
)

class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline.  We encode this and then try to find it in the
        # sequence of tokens.  This should just be a single token.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)

        labels = batch["labels"].clone()

        for i in range(len(examples)):

            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                response_token_ids_start_idx = idx
                break

            if response_token_ids_start_idx is None:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["labels"][i]}'
                )

            response_token_ids_end_idx = response_token_ids_start_idx + 1

            # Make pytorch loss function ignore all tokens up through the end of the response key
            labels[i, :response_token_ids_end_idx] = -100

        batch["labels"] = labels

        return batch

class GoalSolutionDataset(Dataset):
    def __init__(self, tokenizer, file_path, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.examples = []
        with open(file_path, "r") as file:
            goal, solution = "", ""
            for line in file.readlines():
                if line.startswith("Goal:"):
                    goal = line[0:-1] # remove newline character
                elif line.startswith("Solution:"):
                    solution = line[0:-1] # remove newline character
                else:
                    goal += " " + line[0:-1] # add to goal if it's a continuation
                
                if goal and solution: # if both goal and solution are collected
                    self.examples.append((goal, solution))
                    goal, solution = "", "" # reset for next pair

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        goal, solution = self.examples[i]
        data = PROMPT_NO_INPUT_FORMAT.format(instruction=goal, response=solution)
        tokenized_data = self.tokenizer(data, truncation=True, padding='max_length', max_length=self.max_length)
        return tokenized_data


train_file_path = "./data/data_train.txt"
valid_file_path = "./data/data_validation.txt"
test_file_path = "./data/data_test.txt"
model_name = "databricks/dolly-v2-3b"
output_dir = "./models/gpt2-fine-tuned"
lr = 1e-5 #dolly default
batch_size = 2
num_train_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v1-6b")
model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v1-6b")
model = model.to(device)

data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=batch_size
)

train_dataset = GoalSolutionDataset(tokenizer, file_path=train_file_path, max_length=512)
valid_dataset = GoalSolutionDataset(tokenizer, file_path=valid_file_path, max_length=512)
test_dataset = GoalSolutionDataset(tokenizer, file_path=test_file_path, max_length=512)

In [7]:
for i, data in enumerate(train_dataset):
    if i < 2:
        print(data)
        original_text = tokenizer.decode(data["input_ids"])
        print(original_text)
        print()

{'input_ids': [30003, 310, 271, 9775, 326, 8631, 247, 4836, 15, 19566, 247, 2380, 326, 20420, 29141, 253, 2748, 15, 535, 50278, 187, 6826, 267, 27, 2091, 25317, 10379, 13, 672, 352, 434, 4704, 13, 368, 476, 535, 50279, 187, 37533, 27, 27808, 352, 4830, 247, 5340, 535, 50277, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [8]:
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,
    bf16=True, # https://www.cerebras.net/machine-learning/to-bfloat-or-not-to-bfloat-that-is-the-question/
    learning_rate=lr,
    logging_dir='./logs/runs',
    logging_strategy="steps",
    logging_steps=10, #dolly defaults
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=400,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="wandb",
    remove_unused_columns=False,
    warmup_steps=0,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True
)

wandb.init(
    project="NLP-MC2",
    config={
    "learning_rate": lr,
    "architecture": model_name,
    "dataset": "PIQA",
    "epochs": num_train_epochs,
    "batch_size": batch_size,
    "bf16": True,
    "output_dir": output_dir,
    }
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

logger.info("Training...")
trainer.train()

logger.info(f"Saving model to {output_dir}")
trainer.save_model(output_dir)
logger.info("Training complete!")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjaguar[0m ([33m22hs_i4ds20[0m). Use [1m`wandb login --relogin`[0m to force relogin




  0%|          | 0/2515 [00:00<?, ?it/s]

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 23.67 GiB total capacity; 21.47 GiB already allocated; 66.81 MiB free; 21.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [9]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50280, 2560)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attention): GPTNeoXAttention(
          (rotary_emb): RotaryEmbedding()
          (query_key_value): Linear(in_features=2560, out_features=7680, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear(in_features=10240, out_features=2560, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=2560, out_features=50280, bias=False)
)