- All of the data and models are totally interchangeable

- Get the most updated version of transformers and peft

# Install Dependencies

In [1]:
!pip install --upgrade bitsandbytes datasets accelerate loralib

#this library gives access to LoRA method of peft (LoRA is subset of peft)
#peft is an umbrella term for fine tuning methods that are parametric efficient (no full finetuning of the model)
!pip install --upgrade peft transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from 

In [2]:
import os
# Expose only GPU 0 to this process. This is set before torch is imported in the
# next code cell, so the restriction is in place before CUDA is first used.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Initialization

Load Base Model
- Decoder-only causal language model with a large number of parameters, a fairly permissive token (context) length, and a reasonable vocabulary

- trained on fairly large set of languages (multi-lingual) including code (including many programming languages)

- we can fine-tune it on multiple languages

- model is approx 1GB+
https://huggingface.co/bigscience/bloom-560m

In [3]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# AutoModelForCausalLM loads the decoder-only language model; float16 is used to
# reduce the amount of memory the weights need.
# device_map is a feature of Accelerate that lets the model layers live across
# multiple devices if necessary.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    torch_dtype=torch.float16,
    device_map='auto',
)
# Load the tokenizer from the same checkpoint as the model (as the training and
# inference cells further down do) so tokenizer and model always match.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [4]:
# Inspect the architecture to choose which weight matrices LoRA should decompose and learn.
# BLOOM exposes a single `query_key_value` Linear module inside each attention block.
# That module is the target for fine-tuning with LoRA.
print(model)

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1024)
    (word_embeddings_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (

In [5]:
# Pre-processing for training stability.
# Freeze every base weight (only the adapters added later are trained) and
# promote the small 1-D parameters (e.g. layernorm) to fp32.
for weight in model.parameters():
    weight.requires_grad = False
    if weight.ndim == 1:
        weight.data = weight.data.to(torch.float32)

# Keep fewer activations in memory during training.
model.gradient_checkpointing_enable()
# Make sure gradients are computed for the input tensors, which some
# fine-tuning methods need.
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
    """Wraps the language-model head and returns its output as float32."""

    def forward(self, x):
        head_output = super().forward(x)
        return head_output.to(torch.float32)


model.lm_head = CastOutputToFloat(model.lm_head)

Numerical stability is about ensuring that the computations involved in training a model do not introduce significant errors that could affect the model's ability to learn. Techniques like using higher precision for certain parameters can be used to enhance numerical stability and ensure reliable and accurate training processes.

# Helper functions

In [6]:
#allows to visualize LoRA perks
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without any parameters reports 0.0% instead
    of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has no parameters to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# Build dataset

In [7]:
# Load the sample dataset (SQuAD v2) used to fine-tune BLOOM on extractive Q&A.
# Records whose answer list is empty are handled by create_prompt below.
from datasets import load_dataset
qa_dataset = load_dataset("squad_v2")

Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

We do not want the model to memorize answers; training on prompts in the format below teaches it the structure of the task instead. The answer is then produced from the context that is supplied together with the question.

```
### CONTEXT
{context}

### QUESTION
{question}

### ANSWER
{answer}</s>
```

In [8]:
def create_prompt(context, question, answer):
    """Build the training prompt for one Q&A record.

    `answer` is a mapping whose "text" entry is a sequence of answer strings.
    The first string is used; when the sequence is empty the placeholder
    "Cannot Find Answer" is used instead. The prompt ends with the literal
    "</s>" marker.
    """
    candidates = answer["text"]
    answer_text = candidates[0] if len(candidates) >= 1 else "Cannot Find Answer"
    return (
        f"### CONTEXT\n{context}\n\n"
        f"### QUESTION\n{question}\n\n"
        f"### ANSWER\n{answer_text}</s>"
    )

def _tokenize_record(record):
    """Render one dataset record as a prompt and tokenize it."""
    prompt_text = create_prompt(record['context'], record['question'], record['answers'])
    return tokenizer(prompt_text)


mapped_qa_dataset = qa_dataset.map(_tokenize_record)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [9]:
# Display the tokenized splits and the columns the tokenizer added
mapped_qa_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11873
    })
})

First sample after tokenization

In [10]:
# Take the first record of the training split
first_sample = mapped_qa_dataset['train'][0]

# Rebuild the prompt text for that record
prompt = create_prompt(
    first_sample['context'],
    first_sample['question'],
    first_sample['answers'],
)

# Tokenize the prompt into PyTorch tensors
tokenized_output = tokenizer(prompt, return_tensors='pt')

# Show the prompt next to its token IDs and attention mask
print("Original Prompt:\n", prompt)
print("\nTokenized Output:")
print("Input IDs:", tokenized_output['input_ids'])
print("Attention Mask:", tokenized_output.get('attention_mask', 'Not Available'))

Original Prompt:
 ### CONTEXT
Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".

### QUESTION
When did Beyonce start becoming popular?

### ANSWER
in the late 1990s</s>

Tokenized Output:
Input IDs: tensor([[105311,  97033,  32691,    189, 173652,  33231,  99974,   2450,  72786,
           1336,   7833,  59647,    375,     18,   7024,   5994,   2318,   

In [11]:
# Set up the LoRA configuration
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=4, # rank: the inner dimension the two decomposed matrices are limited to, [r x init. dim][init. dim x r]
    lora_alpha=8, # usually set to double the rank; how much influence the low-rank matrices have relative to the original weights
    target_modules=["query_key_value"], # inject the decomposed matrices into the attention mechanism of each of the 24 blocks
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # for a masked LM this parameter would be different
)

# Returns a model that has all of those injectable matrix pairs attached
model = get_peft_model(model, config)

# Report the trainable parameters with our helper function: only about 0.07%
# of the original parameters (not 9%) will be fine-tuned
print_trainable_parameters(model)

trainable params: 393216 || all params: 559607808 || trainable%: 0.07026635339584111


About 560 M total parameters (~0.56 B) versus about 0.39 M trainable parameters (~0.07%).

# Finetune with LoRA


**TO DO TASK**

Prepare your trainer and pass it the model, the training data, and the args (choose steps or epochs as the eval strategy). Formulate your data collator as dictated by your transformer architecture type; in our case it is a causal LM. Analyze resource usage while training.

Follow docs for parameters understanding https://huggingface.co/docs/transformers/main_classes/trainer

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import PeftModel, PeftConfig, get_peft_model

# The LoRA settings come from `config`, the LoraConfig built in the LoRA setup cell.

# Load a fresh copy of the base model. device_map='auto' already places it on
# the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    return_dict=True,
    load_in_8bit=False,
    device_map='auto'
)

# Attach new (untrained) LoRA adapters to the base model
model = get_peft_model(model, config)

# The model is intentionally NOT moved with model.to(device): device_map='auto'
# has already dispatched it, and a dispatched model should not be moved again.
# `device` itself is still defined for any code that needs to place tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")


# Causal-LM collator: mlm=False disables masked-language-model masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Short demonstration run: 25 optimisation steps in total.
# - a checkpoint is written under ./results every 5 steps
# - eval_steps equals max_steps, so the in-training evaluation runs once, at the last step
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Use 'eval_strategy' instead of 'evaluation_strategy'
    eval_steps=25,
    logging_steps=5,
    save_steps=5,
    max_steps = 25,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="tensorboard",
)

# Initialize the Trainer on the tokenized SQuAD v2 splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mapped_qa_dataset["train"],
    eval_dataset=mapped_qa_dataset["validation"],
    data_collator=data_collator,
)

# Switched off for training; the inference section turns it back on
model.config.use_cache = False

# Start training
trainer.train()

# Evaluate the model on the validation split and report perplexity = exp(eval loss)
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import PeftModel, PeftConfig

# Directory of the LoRA adapter to keep training.
# The QA training cell writes its checkpoints to ./results (save_steps=5,
# max_steps=25), so its last checkpoint is ./results/checkpoint-25.
# NOTE(review): nothing in this notebook writes /content/outputs/checkpoint-100
# (the path used before) - point this at another adapter directory if you have one.
ADAPTER_PATH = "./results/checkpoint-25"

# Load the configuration of the fine-tuned LoRA adapter
config = PeftConfig.from_pretrained(ADAPTER_PATH)  # LoRA adapter

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    return_dict=True,
    load_in_8bit=False,
    device_map='auto'
)

# Apply the LoRA adapter. from_pretrained expects the adapter path/ID (not the
# PeftConfig object), and is_trainable=True keeps the adapter weights trainable
# so that training can continue; without it the adapter is loaded frozen.
model = PeftModel.from_pretrained(model, ADAPTER_PATH, is_trainable=True)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

# Load a dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of raw text lines, also returning the special-tokens mask."""
    return tokenizer(examples["text"], return_special_tokens_mask=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

# No manual conversion to LongTensors is done here: the data collator below builds
# the batch tensors. The former convert_to_long step called torch.tensor on a whole
# batch of variable-length token lists, which fails on ragged input.

# wikitext contains blank lines, which tokenize to empty sequences; drop them so
# that no batch consists only of empty examples.
tokenized_datasets = tokenized_datasets.filter(lambda example: len(example["input_ids"]) > 0)

# Create a data collator (mlm=False -> causal language modelling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",  # Choose 'steps' or 'epoch'; 'evaluation_strategy' is the deprecated spelling
    eval_steps=100,
    logging_steps=25,
    save_steps=100,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="tensorboard",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

# Switched off for training; the inference section turns it back on
model.config.use_cache = False
# Start training
trainer.train()

# Evaluate the model and report perplexity = exp(eval loss)
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss'])):.2f}")


# Inference

In [None]:
# Turn `use_cache` back on for inference; the training cells set it to False.
# NOTE(review): the next cells reload the base model into `model`, so confirm
# this setting is still needed on the model that is live at this point.
model.config.use_cache = True

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base model that the LoRA adapter is attached to
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    return_dict=True,
    load_in_8bit=False,
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

# Load the QA model: the base model plus the LoRA adapter saved by the QA training cell.
# That cell writes to ./results with save_steps=5 and max_steps=25, so its final
# checkpoint is ./results/checkpoint-25 (/content/outputs/checkpoint-100, used here
# before, is never produced by this notebook).
qa_model = PeftModel.from_pretrained(model, "./results/checkpoint-25")

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Adapter directory written by the QA training cell: it saves to ./results with
# save_steps=5 and max_steps=25, so its final checkpoint is ./results/checkpoint-25
# (/content/outputs/checkpoint-100, used here before, is never produced by this notebook).
config = PeftConfig.from_pretrained("./results/checkpoint-25") #LoRA adapter
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    return_dict=True,
    load_in_8bit=False,
    device_map='auto'
)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

# Load the QA model: the base model plus the LoRA adapter
qa_model = PeftModel.from_pretrained(model, "./results/checkpoint-25")

In [None]:
from IPython.display import display, Markdown

def make_inference(context, question, max_new_tokens=200):
    """Generate an answer for `question` from `context` and render it as Markdown.

    The prompt uses the same "### CONTEXT / ### QUESTION / ### ANSWER" layout as
    the training prompts and stops right after the answer header, so the model
    is asked to complete the answer. The decoded output is displayed; nothing
    is returned.

    Args:
        context: passage the answer should be drawn from.
        question: question to ask about the passage.
        max_new_tokens: upper bound on generated tokens, forwarded to ``generate``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch = tokenizer(f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n", return_tensors='pt')
    batch = {k: v.to(device) for k, v in batch.items()}

    # torch.cuda.amp.autocast() is deprecated; torch.autocast is the supported
    # spelling. Mixed precision is only enabled when a GPU is actually present.
    with torch.autocast(device_type="cuda", enabled=device.type == "cuda"):
        output_tokens = qa_model.generate(**batch, max_new_tokens=max_new_tokens)

    display(Markdown((tokenizer.decode(output_tokens[0], skip_special_tokens=True))))

In [None]:
# Answerable case: the context states the answer to the question
context = "Cheese is the best food."
question = "What is the best food?"

make_inference(context, question)

In [None]:
# Unanswerable case: the context says nothing about the question. The training
# prompts use "Cannot Find Answer" for records without an answer.
context = "Cheese is the best food."
question = "How far away is the Moon from the Earth?"

make_inference(context, question)

In [None]:
# Longer context: the answer sits in the first sentence of a multi-sentence passage
context = "The Moon orbits Earth at an average distance of 384,400 km (238,900 mi), or about 30 times Earth's diameter. Its gravitational influence is the main driver of Earth's tides and very slowly lengthens Earth's day. The Moon's orbit around Earth has a sidereal period of 27.3 days. During each synodic period of 29.5 days, the amount of visible surface illuminated by the Sun varies from none up to 100%, resulting in lunar phases that form the basis for the months of a lunar calendar. The Moon is tidally locked to Earth, which means that the length of a full rotation of the Moon on its own axis causes its same side (the near side) to always face Earth, and the somewhat longer lunar day is the same as the synodic period. However, 59% of the total lunar surface can be seen from Earth through cyclical shifts in perspective known as libration."
question = "At what distance does the Moon orbit the Earth?"

make_inference(context, question)

TASK

In [None]:
# For hot swapping, fine-tune a LoRA adapter for another downstream task, save the fine-tuned adapter, and run inference with it on the same base model.
# model_v2 = PeftModel.from_pretrained(model, "add another fine-tuned LoRA adapter here (email generator)")

