<a href="https://colab.research.google.com/github/BDH-teacher/Deep_Learning_Audit_code/blob/main/LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, random
import numpy as np
import torch

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds, in order: Python's ``random`` module, NumPy's global generator,
    PyTorch's default generator, and the generators of all CUDA devices.

    Args:
        seed: Integer seed handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Fix all RNG seeds before anything stochastic runs.
set_seed(42)

# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("device:", device)

device: cuda


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the SQuAD dataset; its "train" and "validation" splits are used below.
data = load_dataset("squad")

# Checkpoint identifier used to load the tokenizer here.
model_name = "bigscience/bloom-560m"

lora_tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding requires a pad token: if the tokenizer defines none, reuse the EOS token.
if lora_tokenizer.pad_token is None:
    lora_tokenizer.pad_token = lora_tokenizer.eos_token

def create_prompt(context, question, answers=None):
    """Build the "### Context / ### Question / ### Answer" prompt string.

    Args:
        context: Passage text placed after "### Context: ".
        question: Question text placed after "### Question: ".
        answers: SQuAD-style mapping whose "text" entry is a list of answer
            strings; only the first one is used. May be omitted/None, may lack
            a "text" entry, or may hold an empty list. In all of those cases
            the answer slot is left empty, which is the form needed for
            inference prompts.

    Returns:
        The three-line prompt. It ends with the first answer text, or with
        "### Answer: " when no answer is available.
    """
    # Tolerate a missing mapping or a missing "text" entry instead of raising.
    answer_texts = answers.get("text") if answers else None
    if answer_texts is None:
        answer_texts = []
    answer_text = answer_texts[0] if len(answer_texts) > 0 else ""
    prompt = (
        f"### Context: {context}\n"
        f"### Question: {question}\n"
        f"### Answer: {answer_text}"
    )
    return prompt

# Keep only the first 200 training and the first 50 validation examples.
train_small = data["train"].select(range(200))
val_small   = data["validation"].select(range(50))

# Sanity check: subset sizes and the "answers" field of the first training example.
print("train_small:", len(train_small), "val_small:", len(val_small))
print("sample answers:", train_small[0]["answers"])

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

train_small: 200 val_small: 50
sample answers: {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [None]:
MAX_LEN = 256  # fixed token length of every tokenized example (truncate/pad target)

def tok_fn(ex):
    """Tokenize one SQuAD example into a fixed-length causal-LM training row.

    The prompt is built with ``create_prompt`` and terminated with the
    tokenizer's EOS token so the model can learn where an answer ends.
    If the prompt is longer than ``MAX_LEN`` tokens, only the *context* is
    shortened (from its end) until the prompt fits. Truncating the finished
    prompt instead could cut off its tail, i.e. the "### Answer:" part that
    carries the training target.

    Args:
        ex: Mapping with "context", "question" and "answers" entries.

    Returns:
        The tokenizer output for the prompt, padded/truncated to ``MAX_LEN``.
    """
    # NOTE(review): if the pad token had to fall back to EOS, a collator that
    # masks pad positions would presumably mask this EOS label too - confirm.
    eos = lora_tokenizer.eos_token or ""

    def build(context):
        # Full training text for a given (possibly shortened) context.
        return create_prompt(context, ex["question"], ex["answers"]) + eos

    def overflow(text):
        # Number of tokens by which `text` exceeds MAX_LEN (<= 0 means it fits).
        return len(lora_tokenizer(text)["input_ids"]) - MAX_LEN

    prompt = build(ex["context"])
    extra = overflow(prompt)
    if extra > 0:
        ctx_ids = lora_tokenizer(ex["context"], add_special_tokens=False)["input_ids"]
        # Decoding and re-encoding a token prefix is not guaranteed to be
        # length-exact, so re-measure and keep trimming until the prompt fits
        # or no context is left. ctx_ids strictly shrinks on every pass.
        while extra > 0 and ctx_ids:
            ctx_ids = ctx_ids[: max(len(ctx_ids) - extra, 0)]
            shortened = lora_tokenizer.decode(ctx_ids, clean_up_tokenization_spaces=False)
            prompt = build(shortened)
            extra = overflow(prompt)

    # truncation=True stays as a safety net for prompts whose question and
    # answer alone exceed MAX_LEN.
    return lora_tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

# Tokenize both subsets; the original columns are dropped so only the
# tokenizer outputs remain in each row.
tokenized_train = train_small.map(tok_fn, remove_columns=train_small.column_names)
tokenized_val   = val_small.map(tok_fn, remove_columns=val_small.column_names)

# Sanity check: available fields and the first 20 token ids of the first row.
print(tokenized_train[0].keys())
print(tokenized_train[0]["input_ids"][:20])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask'])
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [None]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Load the base causal LM: float16 weights when CUDA is available, float32
# otherwise; device placement is delegated to device_map="auto".
# NOTE(review): the saved output warns that `torch_dtype` is deprecated in
# favour of `dtype` for the installed transformers version - switch once the
# minimum supported version is known.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

# LoRA adapter configuration for causal-LM fine-tuning.
lora_config = LoraConfig(
    r=16,                                # rank of the low-rank update matrices
    lora_alpha=32,                       # LoRA scaling numerator (alpha / r = 2)
    lora_dropout=0.05,                   # dropout applied on the adapter path
    bias="none",                         # bias terms are not trained
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],  # adapters attach only to modules with this name
)

# Wrap the base model with the LoRA adapters described by lora_config and
# report how many parameters are trainable versus the total.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/293 [00:00<?, ?it/s]

trainable params: 1,572,864 || all params: 560,787,456 || trainable%: 0.2805


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Short demonstration run: 30 optimizer steps, each accumulating 4 batches of
# size 1 (effective batch size 4 per device).
training_args = TrainingArguments(
    output_dir="./lora_results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=30,
    learning_rate=1e-4,
    logging_steps=5,                  # log the training loss every 5 steps
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    save_steps=30,                    # equals max_steps: one checkpoint, at the end
    report_to="none",                 # no external experiment trackers
)

# mlm=False selects causal-LM collation (no masked-token objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=lora_tokenizer, mlm=False)

# NOTE(review): eval_dataset is supplied, but training_args sets no evaluation
# schedule, so it is presumably never used during train() - confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

# NOTE(review): the loss logged in the saved run is erratic (it jumps between
# roughly 7 and 48); float16 base weights combined with fp16 training are a
# candidate cause - confirm before trusting results.
trainer.train()

Step,Training Loss
5,18.394275
10,17.881631
15,11.363309
20,47.866992
25,7.219372
30,32.055228


TrainOutput(global_step=30, training_loss=22.463467915852863, metrics={'train_runtime': 24.4732, 'train_samples_per_second': 4.903, 'train_steps_per_second': 1.226, 'total_flos': 56012329451520.0, 'train_loss': 22.463467915852863, 'epoch': 0.6})

In [None]:
from transformers import pipeline

# After training the model is still in training mode, so switch to eval mode:
# otherwise LoRA dropout stays active and generation is not deterministic
# even with do_sample=False.
model.eval()

# Text-generation pipeline around the fine-tuned model. It must use the
# tokenizer created in this notebook (`lora_tokenizer`); a bare `tokenizer`
# name is not defined here and raises NameError on a fresh kernel.
gen = pipeline(
    task="text-generation",
    model=model,
    tokenizer=lora_tokenizer,
    device_map="auto",
)

# Inference prompt: same template as training, with an empty answer so the
# text ends right after "### Answer: " and the model has to continue it.
test_prompt = create_prompt(
    "Transformers use attention to model dependencies between tokens in a sequence.",
    "What is attention used for?",
    {"text": [""], "answer_start": [0]},
)

# Greedy decoding (do_sample=False), at most 40 newly generated tokens.
out = gen(test_prompt, max_new_tokens=40, do_sample=False)
# Print the "generated_text" field of the first returned item.
print(out[0]["generated_text"])

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'max_new_tokens', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=40) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


### Context: Transformers use attention to model dependencies between tokens in a sequence.
### Question: What is attention used for?
### Answer:  Attention is used to model dependencies between tokens in a sequence. The model is used to predict the next token in the sequence. The model is used to predict the next token in the sequence. The model
