# **Installation and Setup**

In [1]:
# Install and enable ipython-autotime; once loaded, every cell's output ends
# with a "time: ..." line reporting how long the cell took.
!pip install -q ipython-autotime
%load_ext autotime


time: 322 µs (started: 2024-03-10 16:56:16 +00:00)


In [2]:
# Run the capability check inside the kernel rather than in a `!python -c`
# subprocess: a failed assert in a child process only prints a traceback and
# lets "Run All" carry on, whereas an in-kernel AssertionError stops the run.
import torch

# The first element of the device capability tuple must be at least 8.
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

time: 2.51 s (started: 2024-03-10 16:56:16 +00:00)


In [3]:
# Install the training stack. Only transformers is pinned to an exact version.
# NOTE(review): datasets, peft, accelerate and trl are installed from the tip of
# their GitHub repositories, so a fresh run may pick up different code than the
# run recorded in this notebook — consider pinning released versions.
# NOTE(review): wandb is installed here and again in a later cell; one of the
# two installs is redundant.
!pip install -qqq transformers==4.37
!pip install -qqq git+https://github.com/huggingface/datasets
!pip install -qqq git+https://github.com/huggingface/peft
!pip install -qqq git+https://github.com/huggingface/accelerate
!pip install -qqq --upgrade bitsandbytes
!pip install -qqq git+https://github.com/huggingface/trl
!pip install -qqq --upgrade safetensors
!pip install -qqq -U flash-attn
!pip install -qqq evaluate
!pip install -qqq gputil psutil
!pip install -qqq wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
time: 1min 40s (started: 2024-03-10 16:56:19 +00:00)


In [4]:
# NOTE(review): optimum is installed here but is not imported by any cell
# visible in this notebook — confirm it is actually needed.
!pip install optimum

time: 5.83 s (started: 2024-03-10 16:58:00 +00:00)


In [5]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import randrange

# Import datasets and related modules
import datasets
from datasets import load_dataset, Dataset, DatasetDict

# Import PyTorch and Hugging Face Transformers
import torch
import transformers
from transformers import  AutoTokenizer

# Import training and evaluation components
from transformers import TrainingArguments, Trainer, set_seed

# Import Evaluate
import evaluate

time: 8.93 s (started: 2024-03-10 16:58:06 +00:00)


In [6]:
# Seed the random number generators through transformers' set_seed helper.
set_seed(seed=42)

time: 3.08 ms (started: 2024-03-10 16:58:15 +00:00)


In [7]:
# Version string of each core dependency, keyed by its display name.
library_versions = dict(
    [
        ("Python", sys.version.split()[0]),
        ("NumPy", np.__version__),
        ("Pandas", pd.__version__),
        ("Datasets", datasets.__version__),
        ("Transformers", transformers.__version__),
        ("Torch", torch.__version__),
        ("Evaluate", evaluate.__version__),
    ]
)

# One "name : version" row per library, names right-aligned to 15 columns.
for name, version in library_versions.items():
    print(f"{name:>15} : {version}")

         Python : 3.10.12
          NumPy : 1.25.2
         Pandas : 1.5.3
       Datasets : 2.18.1.dev0
   Transformers : 4.37.0
          Torch : 2.1.0+cu121
       Evaluate : 0.4.1
time: 2.13 ms (started: 2024-03-10 16:58:15 +00:00)


# **Data Preparation and Analysis**

In [8]:
from datasets import load_dataset

time: 328 µs (started: 2024-03-10 16:58:15 +00:00)


In [9]:
from random import randrange

# Fetch the "train" split of the Bitext customer-support dataset from the Hub.
dataset = load_dataset(
    "bitext/Bitext-customer-support-llm-chatbot-training-dataset",
    split="train",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


time: 3.97 s (started: 2024-03-10 16:58:15 +00:00)


In [10]:
# Show the dataset's column names and row count.
print(dataset)

Dataset({
    features: ['flags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 26872
})
time: 448 µs (started: 2024-03-10 16:58:19 +00:00)


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the checkpoint whose tokenizer is loaded below.
model_id="google/flan-t5-large"

# Load the tokenizer for the checkpoint named by model_id (FLAN-T5 large)
tokenizer = AutoTokenizer.from_pretrained(model_id)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

time: 1.62 s (started: 2024-03-10 16:58:19 +00:00)


In [12]:
def preprocess_function(sample, padding="max_length", max_source_length=2048, max_target_length=2048):
    """Tokenize a batch of instruction/response pairs for seq2seq training.

    Args:
        sample: Batch mapping with an "instruction" list (model inputs) and a
            "response" list (targets).
        padding: Padding strategy forwarded to the tokenizer. When it is
            "max_length", pad token ids in the labels are replaced by -100 so
            that padding is ignored in the loss.
        max_source_length: ``max_length`` used when tokenizing the inputs.
            Defaults to 2048, the value that was previously hard-coded.
        max_target_length: ``max_length`` used when tokenizing the targets.
            Defaults to 2048, the value that was previously hard-coded.

    Returns:
        The tokenizer output for the prefixed inputs, with an extra "labels"
        entry holding the target token ids.
    """
    # add prefix to the input for t5
    prefixed_inputs = ["answer: " + text for text in sample["instruction"]]

    # tokenize inputs
    model_inputs = tokenizer(prefixed_inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["response"], max_length=max_target_length, padding=padding, truncation=True)

    # Only fixed-length padding is masked here; other padding strategies leave
    # the label ids exactly as the tokenizer produced them.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(-100 if token_id == pad_id else token_id) for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the whole dataset in batches and drop every original column so only
# the tokenizer outputs and labels remain. dataset.column_names replaces the
# hand-copied column list, so the call stays in sync with the dataset schema.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
)




Map:   0%|          | 0/26872 [00:00<?, ? examples/s]

time: 1min 34s (started: 2024-03-10 16:58:20 +00:00)


In [13]:
# Display the tokenized dataset's columns and row count.
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 26872
})

time: 2.43 ms (started: 2024-03-10 16:59:54 +00:00)


In [14]:
from transformers import AutoModelForSeq2SeqLM

# Hugging Face Hub model id (same value as assigned in the tokenizer cell).
model_id = "google/flan-t5-large"

# Disabled 8-bit load, kept for reference only; the model is actually loaded
# with a 4-bit BitsAndBytesConfig in a later cell.
#model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit=True, device_map="auto")


time: 467 µs (started: 2024-03-10 16:59:54 +00:00)


In [15]:
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSeq2SeqLM

# bitsandbytes 4-bit quantization settings: NF4 quantization type, double
# quantization enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

time: 1.99 ms (started: 2024-03-10 16:59:54 +00:00)


In [16]:
# Load the checkpoint named by model_id with the 4-bit quantization config;
# device placement is delegated to device_map="auto" and Flash Attention 2 is
# explicitly switched off.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    use_flash_attention_2=False,
)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

time: 19.7 s (started: 2024-03-10 16:59:54 +00:00)


In [17]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

time: 331 µs (started: 2024-03-10 17:00:14 +00:00)


In [18]:
# Pass the quantized model through PEFT's k-bit training preparation helper.
# NOTE(review): this rebinds `model`, so re-running the cell applies the
# preparation to an already-prepared model.
model = prepare_model_for_kbit_training(model)

time: 29.8 ms (started: 2024-03-10 17:00:14 +00:00)


In [19]:
# Three-line banner (rule, title, rule) emitted with a single print call.
print(
    "\n====================================================================\n",
    "\t\t\tMODEL CONFIG UPDATED",
    "\n====================================================================\n",
    sep="\n",
)



			MODEL CONFIG UPDATED


time: 586 µs (started: 2024-03-10 17:00:14 +00:00)


In [20]:
# Print the module tree of the quantized model (its linear layers are shown as Linear4bit).
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (k): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (v): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (o): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear4bit(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear4bit(in_features=1024, out_features=2

In [21]:
# Print the model configuration, including its quantization_config section.
print(model.config)

T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "quantization_config": {
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "relative

In [22]:
# NOTE(review): prepare_model_for_int8_training is imported but not used by any
# visible cell (prepare_model_for_kbit_training is what gets called) — confirm
# that the peft build installed from GitHub still exports it.
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

time: 405 µs (started: 2024-03-10 17:00:14 +00:00)


In [23]:
# LoRA settings for a seq2seq LM task: rank 64, alpha 64, no dropout,
# bias="none", adapters selected with the "all-linear" target shorthand.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=64,
    lora_alpha=64,
    lora_dropout=0.0,
    bias="none",
    target_modules="all-linear",
)


time: 516 µs (started: 2024-03-10 17:00:14 +00:00)


In [24]:
# Wrap the prepared base model with the LoRA adapters described by peft_config.
model = get_peft_model(model, peft_config)

time: 1.28 s (started: 2024-03-10 17:00:14 +00:00)


In [25]:
# Banner followed by the adapter-wrapped model and its config, then a closing
# rule — all written by one print call with newline separators.
print(
    "\n====================================================================\n",
    "\t\t\tPREPARED MODEL FOR FINETUNING",
    model,
    model.config,
    "\n====================================================================\n",
    sep="\n",
)



			PREPARED MODEL FOR FINETUNING
PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Identity()
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=64, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=64, out_features=1024, bias=False)
                    )
                    (lora_embedding_A):

In [26]:
# print_trainable_parameters() writes the summary itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
model.print_trainable_parameters()

trainable params: 73,138,176 || all params: 856,288,256 || trainable%: 8.541303175364348
None
time: 14.8 ms (started: 2024-03-10 17:00:15 +00:00)


In [27]:
# Three-line banner (rule, title, rule) emitted with a single print call.
print(
    "\n====================================================================\n",
    "\t\t\tPREPARED FOR FINETUNING",
    "\n====================================================================\n",
    sep="\n",
)



			PREPARED FOR FINETUNING


time: 482 µs (started: 2024-03-10 17:00:15 +00:00)


In [28]:
from transformers import DataCollatorForSeq2Seq

# Label id used for padded label positions so that padding is ignored in the loss.
label_pad_token_id = -100

# Seq2seq batch collator; padded lengths are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
    model=model,
)


time: 499 µs (started: 2024-03-10 17:00:15 +00:00)


# **Configure the Seq2Seq Training Arguments**

In [29]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

time: 7.25 ms (started: 2024-03-10 17:00:15 +00:00)


In [30]:


# Hyper-parameters and bookkeeping options for the LoRA fine-tuning run.
args = Seq2SeqTrainingArguments(
    output_dir="flan-t5-large-lora-bitext-customer-support",
    # optimisation
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    optim="adamw_torch_fused",
    bf16=True,
    # logging and checkpointing are both configured per epoch
    logging_strategy="epoch",
    save_strategy="epoch",
    save_safetensors=True,
    report_to="wandb",
    # push model to hub
    push_to_hub=True,
    seed=42,
)

time: 1.78 ms (started: 2024-03-10 17:00:15 +00:00)


# **Logging in to the Hub and Initializing the Seq2SeqTrainer for Model Training**

In [31]:
# Show the interactive Hugging Face Hub login widget.
# NOTE(review): presumably required because the training arguments set
# push_to_hub=True — confirm. The widget needs manual input, so an unattended
# "Run All" is not authenticated by this cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

time: 17.8 ms (started: 2024-03-10 17:00:15 +00:00)


In [32]:
# Build the trainer from the LoRA-wrapped model, the training arguments, the
# seq2seq collator and the tokenized training set (no evaluation set is passed).
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# The former `dataloader_config = DataLoaderConfiguration(...)` statement was
# removed: DataLoaderConfiguration is never imported in this notebook, so the
# line raised NameError on a fresh kernel, and its result was not used by any
# visible cell.


time: 561 ms (started: 2024-03-10 17:00:15 +00:00)


time: 567 ms (started: 2024-03-10 17:00:15 +00:00)


In [33]:
# NOTE(review): wandb is already installed by the main dependency cell near the
# top of the notebook; this repeated install looks redundant.
!pip install -qqq wandb

time: 5.42 s (started: 2024-03-10 17:00:16 +00:00)


In [34]:
import wandb

time: 375 µs (started: 2024-03-10 17:00:21 +00:00)


In [35]:
# Start the Weights & Biases run under the given project, entity and group.
wandb.init(
    project="zephyr_vs_t5_vs_gemma_bitext_customer_support",
    entity="drishtisharma96505",
    group="t5_no_flash_atn",
)


[34m[1mwandb[0m: Currently logged in as: [33mdrishtisharma96505[0m. Use [1m`wandb login --relogin`[0m to force relogin


time: 7.51 s (started: 2024-03-10 17:00:21 +00:00)


In [36]:
import time


time: 729 µs (started: 2024-03-10 17:00:29 +00:00)


In [37]:
# Report the GPU name, its total memory and the peak memory reserved so far.
# Byte counts are divided by 1024**3 and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
1.646 GB of memory reserved.
time: 2.15 ms (started: 2024-03-10 17:00:29 +00:00)


In [None]:
# Measure wall-clock time around the training run.
start_time = time.time()  # Start timer

# The value returned by train() is stored in trainer_stats.
trainer_stats = trainer.train()

end_time = time.time()  # End timer
# Convert the elapsed seconds to minutes.
training_duration = (end_time - start_time)/60

print(f"Training Time: {training_duration} minutes")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


In [None]:
# Print the model's module tree again, this time after the training cell.
print(model)