In [1]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)
from accelerate import Accelerator
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /data/chris/anaconda3/envs/peft-env/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 6.1
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /data/chris/anaconda3/envs/peft-env/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda116_nocublaslt.so...


  warn(msg)


In [2]:
# Load eli5 dataset for the causal LM task
# More info about Eli5: https://facebookresearch.github.io/ELI5/
eli5 = load_dataset("eli5", split="train_asks[:5000]")

Found cached dataset eli5 (/home/eecs/christopherchou/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa)


In [3]:
eli5 = eli5.train_test_split(test_size=0.2) # split dataset into 0.8-0.2
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'document', 'subreddit', 'answers', 'title_urls', 'selftext_urls', 'answers_urls'],
        num_rows: 1000
    })
})

In [4]:
model_name_or_path = "facebook/opt-1.3b"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [5]:
eli5 = eli5["train"].flatten() # flatten because what we want is answers.text but it is nested in the answers' object
eli5[0] # Notice how the answers.text is a list of strings that need to be joined

{'q_id': '1s0bnc',
 'title': 'How does trauma like from boxing or playing American football result in Tau protein build up? What about the physical impact results in changes in proteins produced?',
 'selftext': '',
 'document': '',
 'subreddit': 'askscience',
 'answers.a_id': ['cdsp3ee'],
 'answers.text': ["I'm assuming you're referring to the condition of chronic traumatic encephalopathy, or *dementia pugilistica* as it's commonly known as in the world of boxing.  This disease is seen in those with a history of head trauma, like football players and boxers, as you mentioned.  The disease itself shows symptoms of both alzheimer's dementia and parkinsonian dementia.  As you might expect [brains of these patients showed both tau protein (implicated in Alzhemier's) and alpha-synuclein (implicated in Parkinson's)](_URL_2_).\n\nTo get around to answering your question, it isn't totally clear!  [It is known that individuals with an APO E4 allele have increased risk of chronic traumatic encep

In [6]:
max_seq_length = 128

def preprocess_function(examples):
    grouped_examples = [" ".join(text) for text in examples["answers.text"]] # converts list to a string that can be tokenized jointly
    return tokenizer(grouped_examples, 
        padding=True,
        max_length=max_seq_length,
        truncation=True,
        return_tensors=None
    )

In [7]:
tokenized_eli5 = eli5.map(
    preprocess_function, 
    batched=True, # processes multiple elements of the dataset at once
    remove_columns=eli5.column_names # remove these column names, or else we have column names + input_ids + attention_mask
)

100%|██████████| 4/4 [00:00<00:00,  4.17ba/s]


In [8]:
tokenized_eli5

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4000
})

In [9]:
# Since dataset has inputs that are of length that is greater than max_seq_length, we need to
# 1. concatenate the sequences together
# 2. truncate the sequences to max_seq_length
# NOTE: this is only done for demonstration purposes, in reality, we should be dynamically padding our dataset
def group_texts(examples):
    concatenated_sequence = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_sequence["input_ids"]) # length of the batch concatenated sequence that we are currently working with
    total_length = (total_length // max_seq_length) * max_seq_length # truncate to a multiple of max_seq_length
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_sequence.items()
    }
    # result becomes
    # "input_id" : [[64 tokens], [64 tokens], [64 tokens], ...]
    # "attention_mask" : [[64 masks], [64 masks], [64 masks], ...]

    result["labels"] = result["input_ids"].copy()
    return result

# tokenized_eli5 = tokenized_eli5.map(
#     group_texts,
#     batched=True,
#     num_proc=1
# )

In [10]:
# Data collators are used for creating batches out of our dataset and can also provide some preprocessing like dynamically padding
data_collator = DataCollatorForLanguageModeling( # sets [-100] for pad tokens and the inputs for the labels
    tokenizer=tokenizer,
    mlm=False # masked language modelling set to false because we are doing causal language modelling
)

In [11]:
training_args = TrainingArguments(
    output_dir="./lora_finetune_opt_deepspeed_out",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False
)

trainer =  Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_eli5,
    eval_dataset=tokenized_eli5,
    data_collator=data_collator
)

In [12]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 11.90 GiB total capacity; 11.24 GiB already allocated; 41.69 MiB free; 11.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF