## LoRA (Low Rank Adaptation)

##### Build the Dataset

In [None]:
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json

We will use the python questions dataset from [Kaggle](https://www.kaggle.com/datasets/stackoverflow/pythonquestions). To train a Transformer from [Bigcode](https://huggingface.co/bigcode/starcoderbase-1b) 

In [None]:
# ! kaggle datasets download stackoverflow/pythonquestions
# ! unzip pythonquestions.zip -d ./pythonquestions
# ! pip install torch-summary

In [1]:
import pandas as pd
from datasets import Dataset

In [2]:
pythonquestions = "./pythonquestions"
questions_df = pd.read_csv(f"{pythonquestions}/Questions.csv", encoding = 'ISO-8859-1')
answers_df = pd.read_csv(f"{pythonquestions}/Answers.csv", encoding = 'ISO-8859-1')

In [3]:
questions_df.rename(columns={"Id":"QuestionId"}, inplace=True)
answers_df.rename(columns={"ParentId":"QuestionId"}, inplace=True)

QnA_df = pd.merge(questions_df, answers_df, on = ["QuestionId"], suffixes=("_q", "_a"))
QnA_df = QnA_df[["QuestionId", "Body_q", "Body_a"]]
QnA_df.rename(columns={"Body_q":"Question", "Body_a":"Answer"}, inplace=True)

QnA_df.head()

Unnamed: 0,QuestionId,Question,Answer
0,469,<p>I am using the Photoshop's javascript API t...,<p>open up a terminal (Applications-&gt;Utilit...
1,469,<p>I am using the Photoshop's javascript API t...,<p>I haven't been able to find anything that d...
2,469,<p>I am using the Photoshop's javascript API t...,<p>Unfortunately the only API that isn't depre...
3,469,<p>I am using the Photoshop's javascript API t...,<p>There must be a method in Cocoa to get a li...
4,502,<p>I have a cross-platform (Python) applicatio...,<p>You can use ImageMagick's convert utility f...


In [12]:
dataset = Dataset.from_pandas(QnA_df).train_test_split(test_size=0.3)

  if _pandas_api.is_sparse(col):


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoderbase-1b"
device = "cuda" 
token="<HF_TOKEN>"
cache_dir="./model_cache/"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=token, cache_dir=cache_dir)


In [2]:
tokenizer.pad_token = tokenizer.eos_token

In [14]:
from bs4 import BeautifulSoup
def process_html_text(batch):
    
    q = []
    a = []
    
    for b in batch["Question"]:   
        q.append(BeautifulSoup(b, "html").text)
    
    for b in batch["Answer"]:   
        a.append(BeautifulSoup(b, "html").text)
    
    model_inputs = tokenizer(q, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(text_target=a, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs


In [5]:

t_dataset = dataset["test"].map(process_html_text, batched=True)

In [16]:
t_dataset = t_dataset.remove_columns(["QuestionId","Answer","Question"])
t_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 296137
})

**Store the processed data**

In [None]:
t_dataset.save_to_disk("./pythonquestions/processed_ids.parquet")

##### Load processed data, input_ids and Model 

In [17]:
from datasets import Dataset
t_dataset = Dataset.load_from_disk("./pythonquestions/processed_ids.parquet")
t_dataset

Saving the dataset (0/4 shards):   0%|          | 0/296137 [00:00<?, ? examples/s]

##### Model 

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoderbase-1b"
device = "cuda" 
token="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"
cache_dir="./model_cache/"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True, cache_dir=cache_dir)

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("rouge", "f1")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [7]:
from torchsummary import summary
summary(model)

Layer (type:depth-idx)                             Param #
├─GPTBigCodeModel: 1-1                             --
|    └─Embedding: 2-1                              100,663,296
|    └─Embedding: 2-2                              16,777,216
|    └─Dropout: 2-3                                --
|    └─ModuleList: 2-4                             --
|    |    └─GPTBigCodeBlock: 3-1                   42,490,112
|    |    └─GPTBigCodeBlock: 3-2                   42,490,112
|    |    └─GPTBigCodeBlock: 3-3                   42,490,112
|    |    └─GPTBigCodeBlock: 3-4                   42,490,112
|    |    └─GPTBigCodeBlock: 3-5                   42,490,112
|    |    └─GPTBigCodeBlock: 3-6                   42,490,112
|    |    └─GPTBigCodeBlock: 3-7                   42,490,112
|    |    └─GPTBigCodeBlock: 3-8                   42,490,112
|    |    └─GPTBigCodeBlock: 3-9                   42,490,112
|    |    └─GPTBigCodeBlock: 3-10                  42,490,112
|    |    └─GPTBigCodeBlock: 3-11 

Layer (type:depth-idx)                             Param #
├─GPTBigCodeModel: 1-1                             --
|    └─Embedding: 2-1                              100,663,296
|    └─Embedding: 2-2                              16,777,216
|    └─Dropout: 2-3                                --
|    └─ModuleList: 2-4                             --
|    |    └─GPTBigCodeBlock: 3-1                   42,490,112
|    |    └─GPTBigCodeBlock: 3-2                   42,490,112
|    |    └─GPTBigCodeBlock: 3-3                   42,490,112
|    |    └─GPTBigCodeBlock: 3-4                   42,490,112
|    |    └─GPTBigCodeBlock: 3-5                   42,490,112
|    |    └─GPTBigCodeBlock: 3-6                   42,490,112
|    |    └─GPTBigCodeBlock: 3-7                   42,490,112
|    |    └─GPTBigCodeBlock: 3-8                   42,490,112
|    |    └─GPTBigCodeBlock: 3-9                   42,490,112
|    |    └─GPTBigCodeBlock: 3-10                  42,490,112
|    |    └─GPTBigCodeBlock: 3-11 

In [None]:
from transformers import Seq2SeqTrainingArguments, Trainer 

training_args = Seq2SeqTrainingArguments(
    output_dir="stackoverflowpython",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=t_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

**OOM Error** Cannot load the model into GPU Memory.  

Let's use LoRA to reduce the number of trainable parameters

In [9]:
model 

GPTBigCodeForCausalLM(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49152, 2048)
    (wpe): Embedding(8192, 2048)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPTBigCodeBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeAttention(
          (c_attn): Linear(in_features=2048, out_features=2304, bias=True)
          (c_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
          (c_proj): Linear(in_features=8192, out_features=2048, bias=True)
          (act): PytorchGELUTanh()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e

##### LoRA Model

**c_attn, c_proj, c_fc, wte, wpe** are the modules that have lot parameters that can be reduced to just **r** parameters 

In [None]:
from peft import LoraConfig, get_peft_model
import copy

config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["c_attn", "c_proj", "c_fc", "wte", "wpe"],
    lora_dropout=0.1,
    bias="none",
)
lora_model = get_peft_model(copy.deepcopy(model), config)

In [12]:
summary(lora_model)

Layer (type:depth-idx)                   Param #
├─LoraModel: 1-1                         --
|    └─GPTBigCodeForCausalLM: 2-1        --
|    |    └─GPTBigCodeModel: 3-1         (1,149,298,688)
|    |    └─Linear: 3-2                  (100,663,296)
Total params: 1,249,961,984
Trainable params: 0
Non-trainable params: 1,249,961,984


Layer (type:depth-idx)                   Param #
├─LoraModel: 1-1                         --
|    └─GPTBigCodeForCausalLM: 2-1        --
|    |    └─GPTBigCodeModel: 3-1         (1,149,298,688)
|    |    └─Linear: 3-2                  (100,663,296)
Total params: 1,249,961,984
Trainable params: 0
Non-trainable params: 1,249,961,984

In [13]:
lora_model

PeftModel(
  (base_model): LoraModel(
    (model): GPTBigCodeForCausalLM(
      (transformer): GPTBigCodeModel(
        (wte): lora.Embedding(
          (base_layer): Embedding(49152, 2048)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 16x49152])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 2048x16])
        )
        (wpe): lora.Embedding(
          (base_layer): Embedding(8192, 2048)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.FloatTensor of size 16x8192])
          (lora_embedding_B): Parame

In [None]:
from transformers import Seq2SeqTrainingArguments, Trainer 

training_args = Seq2SeqTrainingArguments(
    output_dir="stackoverflowpython",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    remove_unused_columns=False

)

trainer = Trainer(
    lora_model,
    training_args,
    train_dataset=t_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoderbase-1b"
device = "cuda" 
token="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"
cache_dir="./model_cache/"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True, cache_dir=cache_dir)

In [2]:
! nvidia-smi

Wed Feb  7 18:36:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3080        Off | 00000000:01:00.0 Off |                  N/A |
|  0%   36C    P8              26W / 340W |     23MiB / 10240MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
model.to("cuda")

GPTBigCodeForCausalLM(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49152, 2048)
    (wpe): Embedding(8192, 2048)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPTBigCodeBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeAttention(
          (c_attn): Linear(in_features=2048, out_features=2304, bias=True)
          (c_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
          (c_proj): Linear(in_features=8192, out_features=2048, bias=True)
          (act): PytorchGELUTanh()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e

In [4]:
! nvidia-smi

Wed Feb  7 18:37:36 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3080        Off | 00000000:01:00.0 Off |                  N/A |
|  0%   40C    P2              95W / 340W |   4646MiB / 10240MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [7]:
get_ipython().kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoderbase-1b"
device = "cuda" 
token="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"
cache_dir="./model_cache/"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True, cache_dir=cache_dir)

In [4]:
from peft import LoraConfig, get_peft_model
import copy

config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["c_attn", "c_proj", "c_fc", "wte", "wpe"],
    lora_dropout=0.1,
    bias="none",
)
lora_model = get_peft_model(copy.deepcopy(model), config)

In [5]:
lora_model.to("cuda")

PeftModel(
  (base_model): LoraModel(
    (model): GPTBigCodeForCausalLM(
      (transformer): GPTBigCodeModel(
        (wte): lora.Embedding(
          (base_layer): Embedding(49152, 2048)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 16x49152 (cuda:0)])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 2048x16 (cuda:0)])
        )
        (wpe): lora.Embedding(
          (base_layer): Embedding(8192, 2048)
          (lora_dropout): ModuleDict(
            (default): Dropout(p=0.1, inplace=False)
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 16x8192 (cuda

In [6]:
! nvidia-smi

Wed Feb  7 18:43:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3080        Off | 00000000:01:00.0 Off |                  N/A |
|  0%   40C    P2              95W / 340W |   4692MiB / 10240MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
from torchsummary import summary
summary(model)

Layer (type:depth-idx)                             Param #
├─GPTBigCodeModel: 1-1                             --
|    └─Embedding: 2-1                              100,663,296
|    └─Embedding: 2-2                              16,777,216
|    └─Dropout: 2-3                                --
|    └─ModuleList: 2-4                             --
|    |    └─GPTBigCodeBlock: 3-1                   42,490,112
|    |    └─GPTBigCodeBlock: 3-2                   42,490,112
|    |    └─GPTBigCodeBlock: 3-3                   42,490,112
|    |    └─GPTBigCodeBlock: 3-4                   42,490,112
|    |    └─GPTBigCodeBlock: 3-5                   42,490,112
|    |    └─GPTBigCodeBlock: 3-6                   42,490,112
|    |    └─GPTBigCodeBlock: 3-7                   42,490,112
|    |    └─GPTBigCodeBlock: 3-8                   42,490,112
|    |    └─GPTBigCodeBlock: 3-9                   42,490,112
|    |    └─GPTBigCodeBlock: 3-10                  42,490,112
|    |    └─GPTBigCodeBlock: 3-11 

Layer (type:depth-idx)                             Param #
├─GPTBigCodeModel: 1-1                             --
|    └─Embedding: 2-1                              100,663,296
|    └─Embedding: 2-2                              16,777,216
|    └─Dropout: 2-3                                --
|    └─ModuleList: 2-4                             --
|    |    └─GPTBigCodeBlock: 3-1                   42,490,112
|    |    └─GPTBigCodeBlock: 3-2                   42,490,112
|    |    └─GPTBigCodeBlock: 3-3                   42,490,112
|    |    └─GPTBigCodeBlock: 3-4                   42,490,112
|    |    └─GPTBigCodeBlock: 3-5                   42,490,112
|    |    └─GPTBigCodeBlock: 3-6                   42,490,112
|    |    └─GPTBigCodeBlock: 3-7                   42,490,112
|    |    └─GPTBigCodeBlock: 3-8                   42,490,112
|    |    └─GPTBigCodeBlock: 3-9                   42,490,112
|    |    └─GPTBigCodeBlock: 3-10                  42,490,112
|    |    └─GPTBigCodeBlock: 3-11 

In [10]:

summary(lora_model)

Layer (type:depth-idx)                   Param #
├─LoraModel: 1-1                         --
|    └─GPTBigCodeForCausalLM: 2-1        --
|    |    └─GPTBigCodeModel: 3-1         (1,149,298,688)
|    |    └─Linear: 3-2                  (100,663,296)
Total params: 1,249,961,984
Trainable params: 0
Non-trainable params: 1,249,961,984


Layer (type:depth-idx)                   Param #
├─LoraModel: 1-1                         --
|    └─GPTBigCodeForCausalLM: 2-1        --
|    |    └─GPTBigCodeModel: 3-1         (1,149,298,688)
|    |    └─Linear: 3-2                  (100,663,296)
Total params: 1,249,961,984
Trainable params: 0
Non-trainable params: 1,249,961,984