In [47]:
%%capture
!pip install datasets peft

In [48]:
%%capture
!pip install bitsandbytes

In [1]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Load the Dataset
dataset = load_dataset("iamtarun/code_instructions_120k_alpaca")

In [3]:
# Step 2: Prepare the data in prompt-response format
def format_example(example):
    instruction = example['instruction']
    input_text = example['input']
    response = example['output']

    if input_text.strip():
        prompt = (
            f"Below is an instruction that describes a task.\n"
            f"### Instruction: {instruction}\n"
            f"### Input: {input_text}\n"
            f"### Response:\n"
        )
    else:
        prompt = (
            f"Below is an instruction that describes a task.\n"
            f"### Instruction: {instruction}\n"
            f"### Response:\n"
        )

    return {'prompt': prompt, 'response': response}

In [4]:
# Apply formatting to the dataset
formatted_dataset = dataset.map(format_example)

In [5]:
formatted_dataset['train'][2]

{'instruction': 'Design a data structure in C++ to store information of a grocery store inventory.',
 'input': '',
 'output': '#include <map>\n#include <string>\n\nclass GroceryStoreInventory {\nprivate:\n    std::map<std::string, int> items;\npublic:\n    void addItems(std::string item, int quantity) {\n        items[item] = quantity;\n    }\n    int getQuantity(std::string item) {\n        if (items.find(item) != items.end()) {\n            return items[item];\n        }\n        return 0;\n    }\n};',
 'prompt': 'Below is an instruction that describes a task.\n### Instruction: Design a data structure in C++ to store information of a grocery store inventory.\n### Response:\n',
 'response': '#include <map>\n#include <string>\n\nclass GroceryStoreInventory {\nprivate:\n    std::map<std::string, int> items;\npublic:\n    void addItems(std::string item, int quantity) {\n        items[item] = quantity;\n    }\n    int getQuantity(std::string item) {\n        if (items.find(item) != items.

In [6]:
formatted_dataset['train'][2]['response']

'#include <map>\n#include <string>\n\nclass GroceryStoreInventory {\nprivate:\n    std::map<std::string, int> items;\npublic:\n    void addItems(std::string item, int quantity) {\n        items[item] = quantity;\n    }\n    int getQuantity(std::string item) {\n        if (items.find(item) != items.end()) {\n            return items[item];\n        }\n        return 0;\n    }\n};'

In [7]:
# Step 1: Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf", use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

In [8]:
# Tokenize the data
def tokenize_function(examples):
    # Tokenize prompt and response separately
    inputs = tokenizer(examples['prompt'], padding="max_length", truncation=True, max_length=512)
    outputs = tokenizer(examples['response'], padding="max_length", truncation=True, max_length=512)

    inputs['labels'] = outputs['input_ids']
    return inputs

# Tokenize the dataset
tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True, remove_columns=['instruction', 'input', 'output', 'prompt', 'response'])

##### How to Analyze Sequence Lengths

In [57]:
tokenized_dataset['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 121959
})

In [58]:
# Calculate lengths
def calculate_lengths(example):
    prompt_length = len(tokenizer(example['prompt'])['input_ids'])
    response_length = len(tokenizer(example['response'])['input_ids'])
    combined_length = prompt_length + response_length

    return {
        'prompt_length': prompt_length,
        'response_length': response_length,
        'combined_length': combined_length
    }

In [59]:
# Apply the function to calculate lengths for each example
lengths_dataset = formatted_dataset.map(calculate_lengths)

In [60]:
lengths_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'prompt', 'response', 'prompt_length', 'response_length', 'combined_length'],
        num_rows: 121959
    })
})

In [61]:
# Get statistics
prompt_lengths = lengths_dataset['train']['prompt_length']
response_lengths = lengths_dataset['train']['response_length']
combined_lengths = lengths_dataset['train']['combined_length']

max_prompt_length = max(prompt_lengths)
max_response_length = max(response_lengths)
max_combined_length = max(combined_lengths)

print(f"Max prompt length: {max_prompt_length}")
print(f"Max response length: {max_response_length}")
print(f"Max combined length: {max_combined_length}")

Max prompt length: 2240
Max response length: 5677
Max combined length: 5720


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [10]:
base_model = "NousResearch/Llama-2-7b-hf"

In [11]:
# Load the model in 4-bit precision with device map
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [12]:
# Step 2: Load the Quantized LLaMA Model with QLoRA
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto"
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.05s/it]


In [13]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [14]:
from peft import LoraConfig, get_peft_model

# Define QLoRA configuration
lora_config = LoraConfig(
    r=4,            # Low-rank dimension
    lora_alpha=32,   # Scaling factor
    lora_dropout=0.1,
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
    task_type="CAUSAL_LM"
)

In [15]:
from transformers import DataCollatorForLanguageModeling
import torch

class CustomDataCollator(DataCollatorForLanguageModeling):
    def __call__(self, features):
        # Extract input_ids, attention_mask, and labels from the features
        input_ids = [feature['input_ids'] for feature in features]
        attention_mask = [feature['attention_mask'] for feature in features]
        labels = [feature['labels'] for feature in features]

        # Convert to tensors
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
        labels = torch.tensor(labels, dtype=torch.long)

        # Ensure labels are masked correctly (e.g., -100 for prompt parts)
        for i, label in enumerate(labels):
            prompt_length = sum([1 for l in label if l == -100])
            labels[i, :prompt_length] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [16]:
# Apply QLoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 9,994,240 || all params: 6,748,409,856 || trainable%: 0.1481


In [17]:
# Function to count model parameters
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params

# Apply QLoRA to the model
#model = get_peft_model(model, lora_config)

# Count parameters
total_params, trainable_params = count_parameters(model)

print(f"Total Model Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")

Total Model Parameters: 3,510,407,168
Trainable Parameters: 9,994,240


In [18]:
model.config.use_cache = False

In [19]:
print(model.config)

LlamaConfig {
  "_name_or_path": "NousResearch/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_metho

In [20]:
# Step 3: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-llama",
    per_device_train_batch_size=2,
    max_steps=500,
    gradient_accumulation_steps=1,
    # num_train_epochs=1,
    learning_rate=5e-5,
    logging_steps=50,
    save_steps=500,
    save_total_limit=1,
    fp16=True,
    report_to="none",
    remove_unused_columns=True
)


In [21]:
from transformers import DataCollatorWithPadding

# Initialize a data collator that dynamically pads sequences
data_collator = DataCollatorWithPadding(tokenizer)

In [22]:
# Step 4: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    #data_collator=data_collator,
    #tokenizer=tokenizer,
)

max_steps is given, it will override any value given in num_train_epochs


In [23]:
# Step 5: Fine-tuning
trainer.train()

  return fn(*args, **kwargs)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 10%|█         | 50/500 [02:44<24:59,  3.33s/it] 

{'loss': 12.3818, 'grad_norm': 50.509132385253906, 'learning_rate': 4.5600000000000004e-05, 'epoch': 0.0}


 20%|██        | 100/500 [05:34<22:51,  3.43s/it]

{'loss': 6.5781, 'grad_norm': 34.021507263183594, 'learning_rate': 4.0600000000000004e-05, 'epoch': 0.0}


 30%|███       | 150/500 [08:26<20:12,  3.46s/it]

{'loss': 5.6918, 'grad_norm': 9.601886749267578, 'learning_rate': 3.56e-05, 'epoch': 0.0}


 40%|████      | 200/500 [11:23<17:25,  3.48s/it]

{'loss': 5.5742, 'grad_norm': 5.90173864364624, 'learning_rate': 3.06e-05, 'epoch': 0.0}


 50%|█████     | 250/500 [14:17<14:27,  3.47s/it]

{'loss': 5.6438, 'grad_norm': 5.829595565795898, 'learning_rate': 2.5600000000000002e-05, 'epoch': 0.0}


 60%|██████    | 300/500 [17:10<11:32,  3.46s/it]

{'loss': 5.5537, 'grad_norm': 5.038314342498779, 'learning_rate': 2.06e-05, 'epoch': 0.0}


 70%|███████   | 350/500 [20:03<08:38,  3.46s/it]

{'loss': 5.5656, 'grad_norm': 15.42793083190918, 'learning_rate': 1.56e-05, 'epoch': 0.01}


 80%|████████  | 400/500 [22:56<05:44,  3.44s/it]

{'loss': 5.3776, 'grad_norm': 3.747535228729248, 'learning_rate': 1.06e-05, 'epoch': 0.01}


 90%|█████████ | 450/500 [25:49<02:52,  3.45s/it]

{'loss': 5.3461, 'grad_norm': 10.700518608093262, 'learning_rate': 5.600000000000001e-06, 'epoch': 0.01}


100%|██████████| 500/500 [28:41<00:00,  3.44s/it]

{'loss': 5.603, 'grad_norm': 8.135442733764648, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.01}


100%|██████████| 500/500 [28:43<00:00,  3.45s/it]

{'train_runtime': 1723.1403, 'train_samples_per_second': 0.58, 'train_steps_per_second': 0.29, 'train_loss': 6.33155615234375, 'epoch': 0.01}





TrainOutput(global_step=500, training_loss=6.33155615234375, metrics={'train_runtime': 1723.1403, 'train_samples_per_second': 0.58, 'train_steps_per_second': 0.29, 'total_flos': 2.0328461893632e+16, 'train_loss': 6.33155615234375, 'epoch': 0.008199409642505739})

In [24]:
# Step 8: Save the Fine-Tuned Model
trainer.save_model("./fine-tuned-llama")
tokenizer.save_pretrained("./fine-tuned-llama")

('./fine-tuned-llama\\tokenizer_config.json',
 './fine-tuned-llama\\special_tokens_map.json',
 './fine-tuned-llama\\tokenizer.model',
 './fine-tuned-llama\\added_tokens.json')

In [1]:
from codebleu import calc_codebleu

In [3]:
prediction = "def add ( a , b ) :\n return a + b"
reference = "def sum ( first , second ) :\n return second + first"

In [None]:
result = calc_codebleu([reference], [prediction], lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)
print(result)

#### Inference

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_path = 'fine-tuned-llama'

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
#model = AutoModelForCausalLM.from_pretrained(model_path)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16  # Load model in FP16
).to(device)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  7.92it/s]


In [5]:
def create_prompt(instruction, input_text=""):
    if input_text.strip():
        prompt = (
            f"Below is an instruction that describes a task.\n"
            f"### Instruction: {instruction}\n"
            f"### Input: {input_text}\n"
            f"### Response:\n"
        )
    else:
        prompt = (
            f"Below is an instruction that describes a task.\n"
            f"### Instruction: {instruction}\n"
            f"### Response:\n"
        )
    return prompt



In [6]:
# Example prompt
instruction = "Create a Python function to reverse a string"
input_text = "Hello World"
prompt = create_prompt(instruction, input_text)

In [7]:
# Tokenize input prompt
inputs = tokenizer(prompt, return_tensors='pt').to(device)

In [8]:
inputs

{'input_ids': tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
            13,  2277, 29937,  2799,  4080, 29901,  6204,   263,  5132,   740,
           304, 11837,   263,  1347,    13,  2277, 29937, 10567, 29901, 15043,
          2787,    13,  2277, 29937, 13291, 29901,    13]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [30]:
import torch
# Clear unused memory
torch.cuda.empty_cache()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [31]:
inputs

{'input_ids': tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
             13,  2277, 29937,  2799,  4080, 29901,  6204,   263,  5132,   740,
            304, 11837,   263,  1347,    13,  2277, 29937, 13291, 29901,    13]],
        device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [13]:
# Generate response
output = model.generate(
    **inputs,
    max_length=128,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.2,
    eos_token_id=tokenizer.eos_token_id
)

In [14]:
output

tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
            13,  2277, 29937,  2799,  4080, 29901,  6204,   263,  5132,   740,
           304, 11837,   263,  1347,    13,  2277, 29937, 10567, 29901, 15043,
          2787,    13,  2277, 29937, 13291, 29901,    13,   353, 29898, 29897,
         29879,  2158, 18764, 29871, 29892, 24244, 29871, 24244,   710,  2314,
           851, 29962, 29900, 24244, 29871,   710, 29918, 18764,   851, 29918,
            13, 29896,    13,   736,  1125, 29898,   353, 29871,  1678, 29906,
         29896, 29892,   736,   703,  1678,    13,  1678,   736, 29889,   474,
          4706,    13, 29936,    13,  2435, 29898, 29918,    13, 29875,   851,
           543, 24244, 29898,  1596, 29889, 29898,    13, 24244,   353, 24244,
         29961, 29871,   718, 29941,   363,    13, 29871, 29900,   297,  7431,
         29898, 29896,   353, 29898, 24244, 29871, 11837,   353,    13, 29901,
         29918, 24244, 29958, 29871,   297, 29871,  

In [15]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Model response:\n{generated_text}")

Model response:
Below is an instruction that describes a task.
### Instruction: Create a Python function to reverse a string
### Input: Hello World
### Response:
 =()sprint revers ,reverse reversestr]) str]0reverse str_ revers str_
1
 return):( =    21, return("   
    return. i       
;
len(_
i str="reverse( print.(
reverse =reverse[  +3 for
 0 in len(1 =(reverse  reverse =
:_reverse>  in  =len


In [64]:
instruction = "Write a Python function to reverse a string"
input_text = ""

# Create prompt with stronger response marker
prompt = create_prompt(instruction, input_text)

# Tokenize and move to device
inputs = tokenizer(prompt, return_tensors='pt').to(device)

# Generate response with adjusted parameters
output = model.generate(
    **inputs,
    max_length=2000,
    temperature=1.2,  # Increase temperature for more randomness
    top_k=50,         # Add top-k sampling
    top_p=0.95,       # Increase top-p for more diverse generation
    do_sample=True,
    repetition_penalty=1.2,
    eos_token_id=tokenizer.eos_token_id
)

# Decode and display the response
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Model response:\n{generated_text}")


Model response:
Below is an instruction that describes a task.
### Instruction: Write a Python function to reverse a string
### Response:



In [65]:
output

tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
            13,  2277, 29937,  2799,  4080, 29901, 14350,   263,  5132,   740,
           304, 11837,   263,  1347,    13,  2277, 29937, 13291, 29901,    13,
             2]], device='cuda:0')