<a href="https://colab.research.google.com/github/11kartheek/ERA-v2/blob/main/qlora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q -U datasets bitsandbytes

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [56]:
# !git clone https://github.com/11kartheek/Qlora_peft.git


In [57]:
%cd /content/Qlora_peft

/content/Qlora_peft


In [58]:
import warnings
warnings.filterwarnings("ignore")

In [59]:
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel

In [60]:
import pandas as pd
df = pd.read_csv("train_data_filtered.csv")


In [61]:
df = df.sort_values(by = 'input_length')

In [62]:
df.columns

Index(['msg_id', 'input', 'output', 'role', 'input_length'], dtype='object')

In [63]:
df.rename(columns={'input': 'prompt', 'output':'response'}, inplace=True)

In [64]:

# Remove duplicates
df = df.drop_duplicates()

print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')


There are 24002 successfully-generated examples. Here are the first few:


In [65]:
df = df[['prompt','response']]

In [66]:
# Split the data into train and test sets, with 90% in the train set
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(train_df.index)

# Save the dataframes to .jsonl files
train_df.to_json('train.jsonl', orient='records', lines=True)
test_df.to_json('test.jsonl', orient='records', lines=True)

In [67]:


import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [68]:
model_name = "microsoft/Phi-3.5-mini-instruct"
dataset_name = "/content/Qlora_peft/train.jsonl"


In [69]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

max_seq_length = 4000
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                          max_seq_length=max_seq_length,
                                         )
tokenizer.pad_token = tokenizer.eos_token



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [70]:
model = prepare_model_for_kbit_training(model)

In [71]:
tokenizer.padding_side = 'right'

In [72]:
# Load datasets
train_dataset = load_dataset('json', data_files='/content/Qlora_peft/train.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='/content/Qlora_peft/test.jsonl', split="train")

# Preprocess datasets
train_dataset_mapped = train_dataset.map(lambda examples: {'text': [prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)
valid_dataset_mapped = valid_dataset.map(lambda examples: {'text': [prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/21602 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

In [73]:
#use bf16 and FlashAttention if supported
if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
  attn_implementation = 'flash_attention_2'
else:
  compute_dtype = torch.float16
  attn_implementation = 'sdpa'

print(attn_implementation)
print(compute_dtype)

flash_attention_2
torch.bfloat16


In [74]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_fe

In [30]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['qkv_proj', 'o_proj','lm_head'],
    lora_dropout=0.00,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_8bit",
    save_steps=0,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="linear",
    report_to="none",
    evaluation_strategy="epoch"
)



In [31]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped,
    eval_dataset=valid_dataset_mapped,  # Pass validation dataset here
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    args=training_arguments,
    packing=False,
)

Map:   0%|          | 0/21602 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

In [32]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("phi3-qlora")

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.91 GiB. GPU 0 has a total capacity of 14.75 GiB of which 525.06 MiB is free. Process 115376 has 14.23 GiB memory in use. Of the allocated memory 13.21 GiB is allocated by PyTorch, and 916.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [35]:
# del model
# del optimizer

# Clear GPU cache
torch.cuda.empty_cache()

In [37]:
del tokenizer

In [53]:
import gc
# Run garbage collector
gc.collect()

# Clear GPU cache again
torch.cuda.empty_cache()

In [54]:
!nvidia-smi


Fri Sep  6 16:22:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0              30W /  70W |   6075MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [52]:
# Load datasets
train_dataset = load_dataset('json', data_files='/content/Qlora_peft/train.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='/content/Qlora_peft/test.jsonl', split="train")

# Preprocess datasets
train_dataset_mapped = train_dataset.map(lambda examples: {'text': [prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)
valid_dataset_mapped = valid_dataset.map(lambda examples: {'text': [prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)


In [None]:

# Cell 4: Test the model
logging.set_verbosity(logging.CRITICAL)
prompt = f"[INST] Write a function that reverses a string. [/INST]" # replace the command here with something relevant to your task
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])