In [1]:
%%shell
git clone https://github.com/FleshRazer/tmp.git

Cloning into 'tmp'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 24 (delta 0), reused 6 (delta 0), pack-reused 16[K
Receiving objects: 100% (24/24), 42.35 MiB | 32.98 MiB/s, done.




In [2]:
%%shell
pip3 install --quiet \
    transformers \
    bitsandbytes \
    accelerate \
    datasets \
    peft \
    trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.0/124.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━



<font size="5"><b>Restart the runtime after installing accelerate package!</b></font>  

In [2]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from peft import LoraConfig, get_peft_model

from trl import SFTTrainer

from tmp.src.data import data_utils

In [3]:
model_name = "NousResearch/Llama-2-7b-hf"

# Activate 4-bit precision base model loading
use_4bit = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [4]:
# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)

In [5]:
dataset = data_utils.get_filtered_paranmt_hf()

prompt_template = (
    f"Detoxify the text:\n{{reference}}\n---\nDetoxified text:\n{{translation}}{{eos_token}}"
)

def apply_prompt_template(sample):
    return {
        "text": prompt_template.format(
            reference=sample["reference"],
            translation=sample["translation"],
            eos_token=tokenizer.eos_token,
        )
    }

dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
dataset = dataset.map(lambda x: tokenizer(x["text"]), remove_columns=list(dataset.features), batched=True)
dataset = dataset.train_test_split(test_size=0.05)
dataset


Map:   0%|          | 0/576211 [00:00<?, ? examples/s]

Map:   0%|          | 0/576211 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 547400
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 28811
    })
})

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        warmup_steps=2,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=False,
        bf16=False,
        logging_steps=25,
        group_by_length=True,
        output_dir="tmp/models/llama2-7b-tmp",
        optim="paged_adamw_8bit",
        report_to="none",
        # report_to="wandb",
        # evaluation_strategy="epoch",
        # preprocess_logits_for_metrics=None,

        # save_strategy="steps",
        # save_steps=25,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    # compute_metrics=compute_metrics,
)

trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.3035
50,1.2366
75,1.4977
100,1.1789
125,1.5055
150,1.0108
175,1.5277
200,1.1107
225,1.5263
250,1.1336


In [23]:
%%script false --no-raise-error
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_steps=2,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    logging_steps=1,
    group_by_length=True,
    output_dir="tmp/models/llama2-7b-tmp",
    optim="paged_adamw_8bit",
    report_to="none",
    # report_to="wandb",
    # evaluation_strategy="epoch",
    # preprocess_logits_for_metrics=None,

    # save_strategy="steps",
    # save_steps=25,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["test"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

trainer.train()