In [44]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from accelerate import Accelerator
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

In [45]:
# Fetch the slice "train_asks[:5000]" of the ELI5 dataset; it is the corpus for the causal LM task.
# Background on ELI5: https://facebookresearch.github.io/ELI5/
eli5 = load_dataset(
    path="eli5",
    split="train_asks[:5000]",
)

Found cached dataset eli5 (/home/eecs/christopherchou/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [46]:
# Split the dataset 0.8 / 0.2 into "train" and "test".
# The fixed seed pins the shuffle so the same rows land in each split on every fresh run;
# without it the split membership (and every downstream number) changes between runs.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)
eli5

Loading cached split indices for dataset at /home/eecs/christopherchou/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa/cache-34e7178cabffffe7.arrow and /home/eecs/christopherchou/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa/cache-3459f37d3c7812d8.arrow


DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 1000
    })
})

In [47]:
# Load the pretrained causal LM and its tokenizer from the same checkpoint identifier.
model_name_or_path = "facebook/opt-1.3b"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Use EOS as the padding token only as a fallback for tokenizers that define no pad token.
# Overwriting an existing pad token makes pad and EOS share one id, and
# DataCollatorForLanguageModeling masks every label equal to pad_token_id (-100),
# so genuine EOS tokens would be dropped from the loss along with the padding.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id



In [48]:
# The text we want sits inside the nested "answers" object; flattening exposes it as the
# top-level column "answers.text".
# NOTE: from here on `eli5` refers to the flattened "train" split only.
eli5 = (
    eli5["train"]
    .flatten()
)
# Look at one record: "answers.text" is a list of strings that still need to be joined.
eli5[0]

{'q_id': '14p632',
 'title': 'Can someone explain why this is a valid method of measuring the obesity rate?',
 'selftext': '',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['c7f5e1w'],
 'answers.text': ["The article is using information directly from the CDC: [Prevalence of Obesity Among Children and Adolescents: United States, Trends 1963-1965 Through 2007-2008](_URL_1_). As far as I can tell, the cutoff for obesity was fixed to the 2000 charts, so it makes sense to compare obesity rates across different years. Those charts are based on data gathered from 1963-1994. (You can find the charts and methods [here](_URL_0_).) Notice that it hits around the exact 5% obesity (which you'd expect using the 95th percentile) in the '70s."],
 'answers.score': [3],
 'title_urls.url': [],
 'selftext_urls.url': [],
 'answers_urls.url': ['http://www.cdc.gov/growthcharts/',
  'http://www.cdc.gov/nchs/data/hestat/obesity_child_07_08/obesity_child_07_08.htm']}

In [49]:
max_seq_length = 128  # maximum number of tokens kept per example; longer inputs are truncated


def preprocess_function(examples):
    """Tokenize a batch of flattened ELI5 rows for causal language modelling.

    Args:
        examples: Batch mapping as supplied by ``Dataset.map(batched=True)``. It must
            contain the key ``"answers.text"``, whose entries are lists of answer strings.

    Returns:
        The output of the module-level ``tokenizer`` for the joined answer texts,
        truncated to ``max_seq_length`` tokens and left unpadded.
    """
    # Each row holds a list of answers; join them so every row becomes one string
    # that can be tokenized jointly.
    joined_answers = [" ".join(answers) for answers in examples["answers.text"]]
    return tokenizer(
        joined_answers,
        max_length=max_seq_length,
        truncation=True,
        # Do not pad here. Padding inside a batched map pads every row to the longest row
        # of its map batch and stores those pad tokens in the dataset; the data collator
        # pads each training batch on the fly instead, which wastes no compute.
        padding=False,
    )

In [50]:
# Run the tokenizer over the dataset, several rows at a time (batched=True).
# The source columns are removed; otherwise the result would contain the original
# column names in addition to input_ids and attention_mask.
tokenized_eli5 = eli5.map(
    function=preprocess_function,
    remove_columns=eli5.column_names,
    batched=True,
)

Loading cached processed dataset at /home/eecs/christopherchou/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa/cache-885b2b9230497ec9.arrow


In [51]:
# Display the tokenized dataset to confirm which columns and how many rows it contains.
tokenized_eli5

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4000
})

In [52]:
# Since the dataset contains inputs longer than max_seq_length, an alternative is to
# 1. concatenate the sequences together
# 2. split the concatenation into chunks of max_seq_length tokens (dropping the remainder)
# NOTE: this is only done for demonstration purposes; in practice we should dynamically pad our dataset
# def group_texts(examples):
#     concatenated_sequence = {k: sum(examples[k], []) for k in examples.keys()}
#     total_length = len(concatenated_sequence["input_ids"]) # length of the batch concatenated sequence that we are currently working with
#     total_length = (total_length // max_seq_length) * max_seq_length # truncate to a multiple of max_seq_length
#     result = {
#         k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
#         for k, t in concatenated_sequence.items()
#     }
#     # result becomes
#     # "input_ids" : [[max_seq_length tokens], [max_seq_length tokens], ...]
#     # "attention_mask" : [[max_seq_length masks], [max_seq_length masks], ...]

#     result["labels"] = result["input_ids"].copy()
#     return result

# tokenized_eli5 = tokenized_eli5.map(
#     group_texts,
#     batched=True,
#     num_proc=1
# )

In [53]:
# A data collator turns dataset rows into batches and can do some preprocessing on the way,
# such as dynamic padding. mlm=False selects causal (not masked) language modelling:
# the labels are the inputs themselves, with pad tokens set to -100.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [54]:
# Must be enabled before training, otherwise it fails with
# "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn".
model.enable_input_require_grads()

# LoRA adapter settings: rank-8 adapters on the "q_proj" and "v_proj" modules
# for a causal language modelling task.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the adapters and report how many parameters remain trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1572864 || all params: 1317330944 || trainable%: 0.11939778740975206


In [55]:
# Reach a batch of `global_batch_size` examples per optimizer step by accumulating
# gradients over several per-device micro-batches.
# NOTE(review): this formula does not account for the number of processes/devices; when
# launched on more than one, the effective global batch size is multiplied accordingly.
global_batch_size = 8
per_device_batch_size = 1

assert global_batch_size % per_device_batch_size == 0, "global_batch_size must be divisible by per_device_batch_size"
gradient_accumulation_steps = global_batch_size // per_device_batch_size

# Evaluate on rows the model is NOT trained on. `tokenized_eli5` only holds the "train"
# split (the earlier "test" split was dropped when `eli5` was rebound), and passing the
# same dataset as both train and eval makes the reported validation loss a training loss.
# The seed keeps the held-out rows identical across runs.
tokenized_splits = tokenized_eli5.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir="./lora_finetune_opt_deepspeed_out",
    evaluation_strategy="epoch",  # run evaluation once at the end of every epoch
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    deepspeed="ds_config.json", # use deepspeed for CPU offloading of optimizers, gradients, and parameters (3)
    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    push_to_hub=False,
    gradient_checkpointing=True
)

# OOM without deepspeed NOTE: have to run deepspeed on .py file instead of .ipynb since need > 1 process
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_splits["train"],
    eval_dataset=tokenized_splits["test"],
    data_collator=data_collator,
)

In [56]:
# Start fine-tuning. With evaluation_strategy="epoch" the validation loss is reported
# once per epoch, for num_train_epochs=3 epochs in total.
trainer.train()



Using /home/eecs/christopherchou/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...


Time to load utils op: 0.0036559104919433594 seconds
Parameter Offload: Total persistent parameters: 2215936 in 338 params


Using /home/eecs/christopherchou/.cache/torch_extensions/py38_cu116 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Time to load utils op: 0.0009481906890869141 seconds






Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [57]:
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live

# Print DeepSpeed's ZeRO-3 estimate of the memory needed for params, optimizer states and
# gradients, for a setup with one node and one GPU per node.
# Author's note: 6.4 GB on GPU required for micro_batch_size of 1...
estimate_zero3_model_states_mem_needs_all_live(
    model,
    num_nodes=1,
    num_gpus_per_node=1,
)

Estimated memory needed for params, optim states and gradients for a:
HW: Setup with 1 node, 1 GPU per node.
SW: Model with 105M total params, 102M largest layer params.
  per CPU  |  per GPU |   Options
    2.64GB |   0.38GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1
    2.64GB |   0.38GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0
    2.35GB |   0.58GB | offload_param=none, offload_optimizer=cpu , zero_init=1
    2.35GB |   0.58GB | offload_param=none, offload_optimizer=cpu , zero_init=0
    0.58GB |   2.15GB | offload_param=none, offload_optimizer=none, zero_init=1
    0.59GB |   2.15GB | offload_param=none, offload_optimizer=none, zero_init=0
