In [145]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [146]:
import numpy as np
import os
import sys
import os
from tqdm import tqdm
import torch
sys.path.append(".")
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset
from accelerate import Accelerator
from torch.utils.data import DataLoader
from utils.template import TEMPLATE_DICT
import seaborn as sns
from peft import LoraConfig, get_peft_model
from matplotlib import pyplot as plt
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [147]:
def load_data(DATASET_NAME, tasks):
    dataset = load_dataset(DATASET_NAME, split="train")
    dataset = dataset.train_test_split(test_size=0.2, seed=0)
    dataset = dataset['train']
    dataset = dataset.filter(lambda x: x['category'] in tasks) #['open_qa', 'general_qa', 'closed_qa', 'classification', 'brainstorming', 'information_extraction', 'summarization'])
    return dataset

def dolly_format(example):
    if example['context'] == "":
        example["inputs"] = example["instruction"]
    else:
        example["inputs"] = example["instruction"] + " " + example['context']

    return example

In [148]:
alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{} 

### Response: {}{}"""

TEMPLATE_DICT = {
    'alpaca': (alpaca_template, '\n### Response:'),
}

In [149]:
def get_formatting_prompts_func(template_name, eos_token):
    if template_name in TEMPLATE_DICT:
        overall_temp, response_temp = TEMPLATE_DICT[template_name]
        def formatting_prompts_func(example):    
            #output_texts = []    
            text = overall_temp.format(example['instruction'], example['response'], eos_token)
            #output_texts.append(text)    
            return text#output_texts

    elif template_name == 'ag_news':

        formatting_prompts_func = None
        response_temp = None
    
    return formatting_prompts_func, response_temp

In [150]:
template = TEMPLATE_DICT['alpaca'][0]
MODEL_NAME = 'HuggingFaceTB/SmolLM-1.7B'
DATASET_NAME = "databricks/databricks-dolly-15k"

In [151]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, load_in_4bit=True)

peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM")

model = get_peft_model(model, peft_config)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s]


In [152]:
experiment_name = 'both'

data = load_data(DATASET_NAME, tasks = ['classification', 'summarizarion'])
data = data.map(dolly_format)

formatting_prompts_func, response_temp = get_formatting_prompts_func('alpaca', tokenizer.eos_token)

In [153]:
# Define script_args and new_lr
class ScriptArgs:
    output_dir = f"./output_centralized/{experiment_name}/1.7B"
    batch_size = 8
    logging_steps = 10
    num_train_epochs = 3
    max_steps = 500
    save_steps = 250
    save_total_limit = 2
    push_to_hub = False
    hub_model_id = "my_model"
    gradient_checkpointing = False

script_args = ScriptArgs()
new_lr = 5e-5

training_args = TrainingArguments(
        output_dir=script_args.output_dir,
        per_device_train_batch_size=script_args.batch_size,
        learning_rate=new_lr,
        logging_steps=script_args.logging_steps,
        num_train_epochs=script_args.num_train_epochs,
        max_steps=script_args.max_steps,
        save_steps=script_args.save_steps,
        save_total_limit=script_args.save_total_limit,
        push_to_hub=script_args.push_to_hub,
        hub_model_id=script_args.hub_model_id,
        gradient_checkpointing=script_args.gradient_checkpointing,
        lr_scheduler_type="constant"
    )


In [154]:
#torch.device("cuda:0")

In [None]:
# Define the missing variables
response_temp = '\n### Response:'
response_temp_ids = tokenizer(response_temp)['input_ids']
data_collator = DataCollatorForCompletionOnlyLM(response_temp_ids, tokenizer=tokenizer)

packing = False  # Example value for packing
dataset_text_field = 'inputs'  # Example field name

trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            #max_seq_length=128,
            train_dataset=data,
            formatting_func=formatting_prompts_func,
            data_collator=data_collator,
            #packing=packing,
            #dataset_text_field=dataset_text_field,
        )

  trainer = SFTTrainer(
Applying formatting function to train dataset: 100%|██████████| 100/100 [00:00<00:00, 2485.35 examples/s]
Converting train dataset to ChatML: 100%|██████████| 100/100 [00:00<00:00, 3005.85 examples/s]
Applying chat template to train dataset: 100%|██████████| 100/100 [00:00<00:00, 4880.84 examples/s]
Tokenizing train dataset: 100%|██████████| 100/100 [00:00<00:00, 1549.21 examples/s]
Truncating train dataset: 100%|██████████| 100/100 [00:00<00:00, 2984.25 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [157]:
trainer.train()

Step,Training Loss
10,1.3749
20,1.4754
30,1.1144
40,1.1886
50,1.121
60,1.008
70,1.1278
80,0.9085
90,1.0342
100,1.0253


KeyboardInterrupt: 

In [3]:

experiments = {'experiment_name': ['classification', 'summarization'],
            'tasks': [['classification'], ['summarization']]}

for experiment_name, tasks in experiments.items():
    print(experiment_name)

experiment_name
tasks
