In [1]:
# Install Pytorch & other libraries
!pip install "torch==2.1.2" tensorboard

# Install Hugging Face libraries
!pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  # "trl==0.7.10" # \
  # "peft==0.7.1" \

# install peft & trl from github
!pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade


Collecting git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
  Cloning https://github.com/huggingface/trl (to revision a3c5b7178ac4f65569975efadc97db2f3749c65e) to /tmp/pip-req-build-zrywclpc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl /tmp/pip-req-build-zrywclpc
  Running command git rev-parse -q --verify 'sha^a3c5b7178ac4f65569975efadc97db2f3749c65e'
  Running command git fetch -q https://github.com/huggingface/trl a3c5b7178ac4f65569975efadc97db2f3749c65e
  Running command git checkout -q a3c5b7178ac4f65569975efadc97db2f3749c65e
  Resolved https://github.com/huggingface/trl to commit a3c5b7178ac4f65569975efadc97db2f3749c65e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f
  Cloning https://github.co

In [2]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'
# install flash-attn
!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation




In [3]:
from datasets import load_dataset

# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }




In [4]:
# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
print(dataset)

Dataset({
    features: ['question', 'answer', 'context'],
    num_rows: 78577
})


In [6]:
dataset = dataset.shuffle().select(range(25000))


dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
#dataset = dataset.train_test_split(test_size=3000/20000)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [7]:
# Assuming `dataset` is your preprocessed and selected dataset
total_size = len(dataset)
test_size_exact = 1000  # The exact number of samples you want in your test set
test_size_proportion = test_size_exact / total_size

# First, split the dataset into training+validation and test sets
train_val_test_split = dataset.train_test_split(test_size=test_size_proportion)

# Now, split the training+validation set into training and validation sets
# Adjust the test_size in train_test_split to get your desired validation proportion of the remaining data
train_validation_split = train_val_test_split['train'].train_test_split(test_size=0.20)

# Organize the final splits with the specific test set size
dataset = {
    'train': train_validation_split['train'],
    'validation': train_validation_split['test'],
    'test': train_val_test_split['test']  # This will have exactly 1000 samples
}

In [8]:
dataset

{'train': Dataset({
     features: ['messages'],
     num_rows: 19200
 }),
 'validation': Dataset({
     features: ['messages'],
     num_rows: 4800
 }),
 'test': Dataset({
     features: ['messages'],
     num_rows: 1000
 })}

In [9]:
print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["validation"].to_json("validation_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_25216791_3 (january_15_16 VARCHAR, march_27_29 VARCHAR)', 'role': 'system'}, {'content': 'What was the data on January 15-16 if March 27-29 is March 29, 2006?', 'role': 'user'}, {'content': 'SELECT january_15_16 FROM table_25216791_3 WHERE march_27_29 = "March 29, 2006"', 'role': 'assistant'}]


Creating json from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

478051

In [10]:
from datasets import load_dataset

# Load jsonl data from disk
dataset_train = load_dataset("json", data_files="train_dataset.json", split="train")
dataset_validation = load_dataset("json", data_files="validation_dataset.json", split="train")
dataset_test = load_dataset("json", data_files="test_dataset.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

In [12]:
# Hugging Face model id
model_id = "codellama/CodeLlama-13b-hf" # or `mistralai/Mistral-7B-v0.1`

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_id)


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [14]:
print(tokenizer)

CodeLlamaTokenizerFast(name_or_path='codellama/CodeLlama-13b-hf', vocab_size=32016, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'additional_special_tokens': ['▁<PRE>', '▁<MID>', '▁<SUF>', '▁<EOT>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32007: AddedToken("▁<PRE>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32008: AddedToken("▁<SUF>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32009: AddedToken("▁<MID>", rstrip=False, lstrip=False, single_wor

In [15]:
tokenizer.pad_token = tokenizer.eos_token

In [16]:
tokenizer.padding_side = 'right' # to prevent warnings

# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

In [17]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=64,
        lora_dropout=0.0,
        r=64,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)


In [18]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=7d4004f649694b6035244d89a8946ab41866f61770fa2d302cc052c99e1ebbb1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [19]:
from datasets import Dataset, load_dataset, load_metric

In [20]:
rouge_score = load_metric("rouge")

  rouge_score = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [21]:
import numpy as np
from transformers import TrainingArguments
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# Import evaluate from py-rouge package
import evaluate

# Initialize ROUGE metric using py-rouge
rouge_metric = evaluate.load("rouge")

def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def compute_metrics(eval_preds, tokenizer):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in the preds as we can't decode them
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)

    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(pred.strip()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip()) for label in decoded_labels]

    # Calculate ROUGE scores using py-rouge
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results and convert to percentage
    result = {key: value for key, value in result.items()}

    #smoothing function
    smoothing_function = SmoothingFunction().method4

    # Calculate BLEU score for n-grams (1 to 4)
    bleu_scores = []
    for n in range(1, 5):
        weights = (1/n,) * n
        bleu_scores_n = [corpus_bleu([[ref]], [pred], weights=weights, smoothing_function=smoothing_function) for ref, pred in zip(decoded_labels, decoded_preds)]
        bleu_scores.append(sum(bleu_scores_n) / len(bleu_scores_n))

    # Calculate mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    gen_len = np.mean(prediction_lens)

    # You can process the rouge_scores, bleu_scores, and gen_len as needed
    return {
        "rouge_scores": result,
        "bleu_scores": bleu_scores,
        "gen_len": gen_len
    }


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [22]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.16.3-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.40.6-py2.py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.5/258.5 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [23]:
import wandb

In [24]:
wandb.init(project="codellama_7b_vs_13b_vs_mistral", entity="drishtisharma96505", group='codellama13b')



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [25]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="codellama-13b-text-to-sql", # directory to save and repository id
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=2,          # batch size per device during training
    per_device_eval_batch_size=2,          # batch size per device during evaluation
    do_eval = True,
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    evaluation_strategy="epoch",
    optim="adamw_torch_fused",              # use fused adamw optimizer
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    seed = 42
)


In [27]:
from trl import SFTTrainer

In [28]:
# Declare trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_validation,
    peft_config=peft_config,
    packing = False,
    max_seq_length=4000,
    compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer),  # Pass the tokenizer
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

Map:   0%|          | 0/19200 [00:00<?, ? examples/s]


No chat template is defined for this tokenizer - using the default template for the CodeLlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



Map:   0%|          | 0/4800 [00:00<?, ? examples/s]



In [29]:
# Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
9.574 GB of memory reserved.


In [30]:
import time

In [None]:
start_time = time.time()  # Start timer

trainer_stats = trainer.train()

end_time = time.time()  # End timer
training_duration = (end_time - start_time)/60

print(f"Training Time: {training_duration} minutes")

You're using a CodeLlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Epoch,Training Loss,Validation Loss,Rouge Scores,Bleu Scores,Gen Len
1,0.4214,0.409771,"{'rouge1': 0.9519591223983631, 'rouge2': 0.8968276191077902, 'rougeL': 0.9161136796862779, 'rougeLsum': 0.9519593077583659}","[0.9649379955152158, 0.9558757473284976, 0.945003312865454, 0.933620373361328]",138.919167
2,0.3243,0.395048,"{'rouge1': 0.9527187528771346, 'rouge2': 0.8987525809723083, 'rougeL': 0.9177597737425829, 'rougeLsum': 0.9527243508262155}","[0.9660381294737971, 0.957357920008107, 0.9468585268500406, 0.9358486188913737]",138.919167


Trainer is attempting to log a value of "{'rouge1': 0.9519591223983631, 'rouge2': 0.8968276191077902, 'rougeL': 0.9161136796862779, 'rougeLsum': 0.9519593077583659}" of type <class 'dict'> for key "eval/rouge_scores" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.9649379955152158, 0.9558757473284976, 0.945003312865454, 0.933620373361328]" of type <class 'list'> for key "eval/bleu_scores" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'rouge1': 0.9527187528771346, 'rouge2': 0.8987525809723083, 'rougeL': 0.9177597737425829, 'rougeLsum': 0.9527243508262155}" of type <class 'dict'> for key "eval/rouge_scores" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.9660381294737971, 0.9573

In [None]:
trainer.push_to_hub()

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")