To set custom kernel for notebook https://scicomp.aalto.fi/triton/apps/jupyter/#installing-kernels-from-virtualenvs-or-anaconda-environments

Prerequisites: 
- Efficient resource utilization and reducing computational requirements
- LoRA
- Mixed-precision Training

https://huggingface.co/blog/4bit-transformers-bitsandbytes

https://arxiv.org/abs/2106.09685

https://arxiv.org/abs/2305.14314

In [1]:
# # load env variables, NOTE: this cell must be run first
# from dotenv import load_dotenv
# load_dotenv(override=True)
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import time

In [None]:
import os
import torch
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['HF_HOME']='/scratch/shareddata/dldata/huggingface-hub-cache'
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")

prompt = "How many stars in the space?"

model_inputs = tokenizer([prompt], return_tensors="pt")
input_length = model_inputs.input_ids.shape[1]

generated_ids = model.generate(**model_inputs, max_new_tokens=20)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# os.environ["HF_HUB_OFFLINE"]="1"

In [3]:
os.environ['TRANSFORMERS_OFFLINE']="1"

In [4]:
os.environ['HF_HOME']='/scratch/shareddata/dldata/huggingface-hub-cache'

In [3]:
os.environ['TRANSFORMERS_CACHE'] = '/scratch/shareddata/dldata/huggingface-hub-cache/hub'

In [5]:
os.environ['HF_DATASETS_CACHE']='~/.cache/huggingface/datasets'

In [7]:
os.environ['HF_DATASETS_OFFLINE']="0"

In [5]:
model_name = 'meta-llama/Llama-2-13b-hf'
dataset_name = 'tatsu-lab/alpaca'

# dataset_name = "knowrohit07/know_medical_dialogue_v2"
# dataset_name = "LDJnr/Pure-Dove"

# Preparing data


In [9]:
# from transformers.utils import send_example_telemetry

# send_example_telemetry("question_answering_notebook", framework="pytorch")

In [10]:
# from datasets import list_datasets
# list_datasets()

In [11]:
from datasets import load_dataset

dataset = load_dataset(dataset_name,split='train[:1000]')

dataset

Generating train split: 52002 examples [00:00, 125930.18 examples/s]


Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 1000
})

In [12]:
print(dataset[0]['text'])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Response:
1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 
2. Exercise regularly to keep your body active and strong. 
3. Get enough sleep and maintain a consistent sleep schedule.


In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def tokenize_function(examples):
    text = examples['text']
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    )
    
    return tokenized_inputs

In [14]:
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True,
    remove_columns=dataset.column_names,
)

print(tokenized_dataset)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1000
})


In [15]:
tokenized_dataset.train_test_split(test_size=0.3)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 700
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 300
    })
})

In [16]:
split = tokenized_dataset.train_test_split(test_size=0.3)
train_dataset,eval_dataset = split['train'],split['test']

In [17]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling


In [18]:
data_collator=DataCollatorForLanguageModeling(tokenizer,mlm=False)

train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=2, pin_memory=True
    )
eval_dataloader = DataLoader(
        eval_dataset, shuffle=True, collate_fn=data_collator, batch_size=2, pin_memory=True
    )

# Model parellel
- Mannually
- Huggingface


In [2]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # First linear layer
        self.linear1 = nn.Linear(10000, 10)
        # ReLU activation
        self.relu = nn.ReLU()
        # Second linear layer
        self.linear2 = nn.Linear(10, 5)

    def forward(self, x):
        x = self.linear1(x.to("cuda:0")) # Apply the first linear layer
        x = self.relu(x)    # Apply the ReLU activation
        x = self.linear2(x.to("cuda:1")) # Apply the second linear layer
        return x

# Example of creating an instance of the model
model = MyModel()
print(model)

MyModel(
  (linear1): Linear(in_features=10000, out_features=10, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=10, out_features=5, bias=True)
)


In [8]:
next(model.linear1.parameters()).device

device(type='cpu')

In [9]:
next(model.linear2.parameters()).device

device(type='cpu')

In [10]:

# Define a loss function, for example, Mean Squared Error for a regression task
loss_function = nn.MSELoss()

# Define an optimizer, e.g., Stochastic Gradient Descent, with a learning rate of 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


batch_size = 3
input_size = 10000
output_size = 5
dummy_input = torch.randn(batch_size, input_size)
dummy_label = torch.randn(batch_size, output_size).to(next(model.linear2.parameters()).device)

# Forward step
model.train()  # Set the model to training mode
optimizer.zero_grad()  # Clear any gradients from the previous step
output = model(dummy_input)  # Compute the model's output
loss = loss_function(output, dummy_target)  # Compute the loss

# Backward step
loss.backward()  # Compute gradients
optimizer.step()  # Update parameters

print(f"Loss: {loss.item()}")


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [7]:
from transformers import DistilBertModel, DistilBertConfig

class ModelParallelDistilBERT(nn.Module):
    def __init__(self, *args, **kwargs):
        super(ModelParallelDistilBERT, self).__init__()

        # Load the configuration and create a DistilBERT model
        config = DistilBertConfig.from_pretrained('distilbert-base-uncased', *args, **kwargs)
        distilbert = DistilBertModel(config)

        # Split the model into two parts
        self.part1 = nn.Sequential(
            distilbert.embeddings,
            *distilbert.transformer.layer[:3]  # First half of the layers
        )

        self.part2 = nn.Sequential(
            *distilbert.transformer.layer[3:]  # Second half of the layers
        )

        # Place each part on a different GPU
        self.part1.cuda(0)
        self.part2.cuda(1)

    def forward(self, input_ids, attention_mask=None):
        # Forward pass through the first part
        output = self.part1(input_ids.to('cuda:0'), attention_mask.to('cuda:0'))

        # Forward pass through the second part
        output = self.part2(*output.to('cuda:1'))

        return output

# Example usage
model = ModelParallelDistilBERT()

# Create dummy input data
input_ids = torch.randint(0, 30522, (1, 512)).cuda(0)  # Example input token IDs
attention_mask = torch.ones((1, 512)).cuda(0)          # Example attention mask

# Forward pass
output = model(input_ids, attention_mask)


NameError: name 'nn' is not defined

device_map
"auto" "balanced" ...
GPUs>CPU>Disk

In [19]:
# # Example usage
# prompt = "Once upon a time"  # Replace with your own prompt
# # Encode the prompt
# input_ids = tokenizer.encode(prompt, return_tensors='pt')

# # Generate text
# output = base_model.generate(input_ids, max_length=20, num_return_sequences=3, no_repeat_ngram_size=2)

# # Decode and return the generated text
# for text in [tokenizer.decode(generated_id, skip_special_tokens=True) for generated_id in output]:
#     print(text)

In [20]:
# import os
# import csv
# import torch


#  DataCollatorForSeq2Seq

# from datasets import load_dataset
# from peft import LoraConfig, TaskType, get_peft_model
# from peft.utils.other import fsdp_auto_wrap_policy


In [9]:
from transformers import AutoModelForCausalLM
from peft import PeftModel, LoraConfig, get_peft_model
import torch

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(model_name,
#                                                  load_in_8bit=True,
#                                                   torch_dtype=torch.float16,
#                                                  device_map="auto",
                                                  # device_map = {"": "cuda:" + str(int(os.environ.get("LOCAL_RANK") or 0))}
                                                 )

Loading checkpoint shards:  33%|███▎      | 1/3 [00:20<00:41, 20.86s/it]

In [9]:
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head):

In [None]:
# for i, param in enumerate(base_model.named_parameters()):
#     print(f'{i},{param[0]}\t {param[1].device} \t{param[1].dtype}')

In [None]:
# Freeze all parameters
for param in base_model.parameters():
    param.requires_grad = False

In [None]:
def print_trainable_parameters(model):
    """
    Print the names and shapes of trainable parameters in a Hugging Face model.

    Args:
    model: A Hugging Face model instance.
    """
    trainable_params = 0
    all_params = 0
    for name, param in model.named_parameters():
        all_params += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable_params: {trainable_params}")
    print(f"all_params: {all_params}")
    
print_trainable_parameters(base_model)

In [None]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

LoRA: Low-Rank Adaptation of Large Language Models
https://arxiv.org/abs/2106.09685

In [None]:
base_model

In [None]:
peft_model = get_peft_model(base_model, config)

In [None]:
peft_model

**Different training tools from Huggingface**:

- Huggingface accelerate library

- Trainer: this API supports distributed training on multiple GPUs/TPUs, mixed precision through [NVIDIA Apex] for NVIDIA GPUs, ROCm APEX for AMD GPUs, and Native AMP for PyTorch.


## Huggingface accelerate library

You may want to use different settings/resources/environments for model training in different phases of your research, different APIs or libraries can provide interfaces to run training:
```bash
# A single GPU/CPU
python your_script.py
```
or 

```bash
# Multiple GPUs
torchrun --nnode=1 --nproc_per_node=4 your_script.py
```
or 

```bash
# Multiple GPUs
deepspeed --num_gpus=4 your_script.py
```
or

......

This often means many lines of code changed. 

Is there a better way of doing this? 

Yes, the accelerate library solves this and ensures the same code can be ran on different computing resources.

In [None]:
from accelerate import Accelerator

In [None]:
accelerator = Accelerator()

batch_size = 1
gradient_accumulation_steps = 8
max_length = 512
lr = 1e-4
num_epochs = 3

accelerator.print(peft_model.print_trainable_parameters())

In [None]:
# for batch in train_dataloader:
#     print(batch)
#     input_ids, attention_mask = batch
#     outputs = model(input_ids, attention_mask=attention_mask)
#     # Now, you can use outputs for your task


In [None]:
from torch import optim
from transformers import get_linear_schedule_with_warmup

In [None]:
optimizer = optim.AdamW(peft_model.parameters(), lr=lr)

lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs)
    )

In [None]:
peft_model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        peft_model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
    )

In [None]:
if getattr(accelerator.state, "fsdp_plugin", None) is not None:
        accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(model)

In [None]:
from tqdm import tqdm
for epoch in range(num_epochs):
    peft_model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        # print(step)
        outputs = peft_model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        # print(loss)
        accelerator.backward(loss)
        
        if step % gradient_accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            peft_model.zero_grad()

#         capture_batch_analytics(epoch, 'train', step, loss.detach().float(), total_loss, batch["input_ids"], batch["labels"])

#     peft_model.eval()
#     eval_loss = 0
#     for step, batch in enumerate(tqdm(eval_dataloader)):
#         with torch.no_grad():
#             outputs = model(**batch)
#         loss = outputs.loss
#         eval_loss += loss.detach().float()
#         capture_batch_analytics(epoch, 'eval', step, loss.detach().float(), eval_loss, batch["input_ids"], batch["labels"])

# #     model.save_pretrained(f"trained_model-{epoch}")

In [None]:
model

In [None]:
# import torch
# device_count = torch.cuda.device_count()
# if device_count > 0:
# #     logger.debug("Select GPU device")
#     device = torch.device("cuda")
# else:
# #     logger.debug("Select CPU device")
#     device = torch.device("cpu")

# transformers Trainer API

In [None]:
from transformers import Trainer, TrainingArguments
# DataCollatorForLanguageModeling

In [None]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=30,
  output_dir='out',
  # Batch size for training
  per_device_train_batch_size=1,
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, 
  logging_steps=1,
  gradient_accumulation_steps = 4,
  ddp_find_unused_parameters=False,
)

In [None]:
trainer = Trainer(
    model=peft_model,
    train_dataset=tokenized_dataset,
#     eval_dataset=test_dataset,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer,mlm=False)
    
)

In [None]:
result = trainer.train()

In [None]:

peft_model.save_pretrained("./llama_7b_peft", save_adapter=True, save_config=True)

# model_to_merge = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(base_model).to("cuda"), "./llama_7b_peft")

# merged_model = model_to_merge.merge_and_unload()
# merged_model.save_pretrained(merged_model)

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload", 
)


In [None]:
model = PeftModel.from_pretrained(
    model, 
    "./llama_7b_peft", 
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload", 
)

In [None]:
model = model.merge_and_unload()

In [None]:
model.save_pretrained("./finetuned_llama2-7b")

In [None]:
######  TEST THIS ##########
trainer.generate()

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    # Tokenize
    input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
    )

    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
    )

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

# Candidate topics for next session(TBD):