In [None]:
# Install the notebook's dependencies (-q: quiet output, -U: upgrade if already installed).
# bitsandbytes provides the 4-bit quantization used via BitsAndBytesConfig below.
!pip install -q -U bitsandbytes
# transformers, peft and accelerate are installed from the tip of their GitHub
# repositories, so the exact versions depend on when this cell is run.
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# datasets supplies load_dataset / DatasetDict; trl supplies SFTTrainer.
!pip install -q datasets
!pip install -q -U trl
# flash-attn is built without build isolation (--no-build-isolation).
!pip install flash-attn --no-build-isolation

#### Import needed modules (GPU):

In [5]:
import os
import random
import torch
from datasets import load_dataset

from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer

In [None]:
# Hub identifier of the instruction-tuning dataset used throughout the notebook
DATASET_ID = "iamtarun/python_code_instructions_18k_alpaca"

# Download (or reuse the cached copy of) the dataset
raw_dataset = load_dataset(DATASET_ID)

# Print the available splits, columns and row counts
print(raw_dataset)

In [None]:
# Take the training split and look at its first record
train_data = raw_dataset['train']
first_example = train_data[0]

# Print each field of that record with a human-readable label
for label, field in (
    ("Instruction", "instruction"),
    ("Input", "input"),
    ("Output", "output"),
    ("Prompt", "prompt"),
):
    print(f"{label}:", first_example[field])

In [None]:
# Split the original training data 90/10 into train and test sets.
# The split is seeded: without a fixed seed every kernel restart shuffles the
# rows differently, so the "first 1000 / first 100" debug subsets taken later
# (and the single test sample used to compare the base and fine-tuned model)
# would change from run to run. 42 matches the seed passed to TrainingArguments.
split_dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)

# Keep handles on both halves and wrap them back into a DatasetDict
train_data = split_dataset["train"]
test_data = split_dataset["test"]

new_dataset_dict = DatasetDict({
    "train": train_data,
    "test": test_data
})

# Sanity check: the two splits together must cover every original example
assert len(train_data) + len(test_data) == len(raw_dataset["train"])

# Display the resulting splits
new_dataset_dict

In [None]:
# Cut small fixed-size subsets (first 1000 train rows, first 100 test rows)
# so that a training run finishes quickly.
debug_train_data_1k, debug_test_data_1k = (
    new_dataset_dict[split_name].select(range(row_count))
    for split_name, row_count in (("train", 1000), ("test", 100))
)

# Bundle both subsets into their own DatasetDict for the debug runs
debug_dataset_dict_1k = DatasetDict(
    {"train": debug_train_data_1k, "test": debug_test_data_1k}
)

# Show the resulting debug DatasetDict
debug_dataset_dict_1k


#### Inference with Base Model:

In [None]:
# Whether to load the model with FlashAttention-2 kernels.
# The flag used to be ignored (the loader hard-coded use_flash_attention_2=True);
# it is now the single switch and is set to True to keep the behavior the
# notebook actually had. Set it to False on GPUs without flash-attn support.
use_flash_attention = True

# Hugging Face checkpoint to load (sharded Llama-2 7B published as safetensors).
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"
# Alternatives that were tried:
# model_name = "meta-llama/Llama-2-7b"
# model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "tiiuae/falcon-7b"

# BitsAndBytes configuration for 4-bit (NF4) quantization with double
# quantization; matrix multiplications are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Keyword arguments for from_pretrained. `attn_implementation` replaces the
# deprecated `use_flash_attention_2` argument and is only passed when requested,
# so the library default attention is used otherwise.
model_load_kwargs = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,  # NOTE: executes code shipped with the checkpoint repo
    use_safetensors=True,
)
if use_flash_attention:
    model_load_kwargs["attn_implementation"] = "flash_attention_2"

# Load the quantized model with safetensors weights
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

model.config.pretraining_tp = 1

print("Model loaded successfully!")

# The tokenizer has no dedicated padding token configured here, so EOS is
# reused for padding; sequences are padded on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Tokenizer loaded successfully!")


In [22]:
def format_instruction(sample):
    """Render one dataset record as a complete training example.

    Args:
        sample: Mapping with at least the keys ``'instruction'`` (the task
            description) and ``'output'`` (the reference solution).

    Returns:
        The prompt text: a fixed system message, the instruction under
        ``### Input:`` and the reference answer under ``### RESPONSE:``,
        each section terminated with the literal ``</s>`` marker.
    """
    task_text = sample['instruction']
    answer_text = sample['output']

    # One list entry per output line; empty entries are the blank separator
    # lines, and the trailing one yields the final newline.
    prompt_lines = [
        "### INSTRUCTION:",
        "You are an AI coding assistant specialized in generating Python code from user instructions. Your task is to return only the code that directly fulfills the given instruction.</s>",
        "",
        "### Input:",
        f"{task_text}</s>",
        "",
        "### RESPONSE:",
        f"{answer_text}</s>",
        "",
    ]
    return "\n".join(prompt_lines)

In [None]:
def generate_prompt(user_input):
    """Build the inference prompt for a single user instruction.

    The text is the training template produced by ``format_instruction`` with
    the answer left open: the same single-line system message, the instruction
    under ``### Input:`` and an empty ``### RESPONSE:`` section for the model
    to complete. The system message is kept on one line so the prompt seen at
    inference time matches the fine-tuning data exactly (it used to be broken
    across two lines here only).

    Args:
        user_input: The instruction the model should turn into Python code.

    Returns:
        The prompt string, ending right after the ``### RESPONSE:`` header line.
    """
    system_message = (
        "You are an AI coding assistant specialized in generating Python code from user instructions. "
        "Your task is to return only the code that directly fulfills the given instruction."
    )
    return (
        "### INSTRUCTION:\n"
        f"{system_message}</s>\n"
        "\n"
        "### Input:\n"
        f"{user_input}</s>\n"
        "\n"
        "### RESPONSE:\n"
    )

In [None]:
# Only show critical log messages from transformers, and enable the KV cache
# on the model for generation.
logging.set_verbosity(logging.CRITICAL)
model.config.use_cache = True

# Build the prompt from the first instruction of the debug test split
sample_instruction = debug_dataset_dict_1k["test"][0]["instruction"]
prompt = generate_prompt(sample_instruction)

# Generate with the currently loaded model (max_length caps the total length)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
result = pipe(prompt)
print(result[0]['generated_text'])

#### Train Adapter:

In [None]:
import gc
import torch

def clear_cache_and_collect(names=("model", "trainer", "tokenizer", "pipe"), namespace=None):
    """Drop large notebook variables, garbage-collect, and empty the CUDA cache.

    The previous implementation used ``del model`` inside the function body.
    That makes ``model`` a *local* name, so the statement raised
    ``UnboundLocalError`` (a ``NameError`` subclass) which was silently
    swallowed, and the notebook-level variables were never removed. The names
    are now removed from the namespace that actually holds them.

    Args:
        names: Variable names to remove if they exist. Besides the model,
            trainer and tokenizer this includes ``pipe`` by default, because
            the text-generation pipeline created in this notebook keeps its
            own reference to the model and tokenizer.
        namespace: Mapping to remove the names from. Defaults to this
            function's global namespace, which is the notebook's namespace
            when the function is defined in a notebook cell.

    Returns:
        None. Prints a CUDA memory summary (when CUDA is available) and a
        completion message.
    """
    if namespace is None:
        namespace = globals()

    # Remove each name if present; missing names are not an error.
    for name in names:
        namespace.pop(name, None)

    cuda_available = torch.cuda.is_available()

    # Repeat until a collection pass finds nothing more to free; objects
    # released in one pass can make further objects unreachable.
    while True:
        gc_collected = gc.collect()
        if cuda_available:
            torch.cuda.empty_cache()
        if gc_collected == 0:
            break

    if cuda_available:
        # Final cache release, then report what is still allocated on device 0.
        torch.cuda.empty_cache()
        print(torch.cuda.memory_summary(device=0, abbreviated=True))
    else:
        print("CUDA is not available; skipping the CUDA memory summary.")

    print("Cache clearing, garbage collection, and variable deletion are complete.")


In [None]:
# Call the cleanup helper before the model is loaded again for training
clear_cache_and_collect()

In [26]:
import time

def timed_training(trainer):
    """Run ``trainer.train()`` and report how long it took.

    Args:
        trainer: Any object exposing a ``train()`` method.

    Returns:
        Whatever ``trainer.train()`` returns, so the training result is not
        lost (previously the result was discarded and ``None`` was returned).

    Raises:
        Any exception raised by ``trainer.train()`` propagates unchanged; no
        duration is printed in that case.
    """
    # perf_counter is monotonic, so the measurement cannot be distorted by
    # system clock adjustments during a long training run.
    start = time.perf_counter()
    train_output = trainer.train()
    elapsed_seconds = int(time.perf_counter() - start)

    # Break the whole-second duration into hours, minutes and seconds
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    print(f"Training completed in {hours} hours, {minutes} minutes, and {seconds} seconds.")
    return train_output

In [None]:
# Whether to load the model with FlashAttention-2 kernels.
# The flag used to be ignored (the loader hard-coded use_flash_attention_2=True);
# it is now the single switch and is set to True to keep the behavior the
# notebook actually had. Set it to False on GPUs without flash-attn support.
use_flash_attention = True

# Hugging Face checkpoint to fine-tune
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

# BitsAndBytes configuration for 4-bit (NF4) quantization with double
# quantization; matrix multiplications are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Keyword arguments for from_pretrained. `attn_implementation` replaces the
# deprecated `use_flash_attention_2` argument and is only passed when requested,
# so the library default attention is used otherwise.
model_load_kwargs = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,  # NOTE: executes code shipped with the checkpoint repo
    use_safetensors=True,
)
if use_flash_attention:
    model_load_kwargs["attn_implementation"] = "flash_attention_2"

# Load the quantized model with safetensors weights
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Additional configuration
model.config.pretraining_tp = 1

# Print a success message
print("Model loaded successfully!")

# The tokenizer has no dedicated padding token configured here, so EOS is
# reused for padding; sequences are padded on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Tokenizer loaded successfully!")


In [18]:
# LoRA adapter settings: rank-64 adapters (alpha 16, dropout 0.05, no bias
# training) attached to the attention and MLP projection layers listed below.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "up_proj",
        "o_proj",
        "k_proj",
        "down_proj",
        "gate_proj",
        "v_proj",
    ],
)

# Enable gradient checkpointing, prepare the quantized model for k-bit
# training, then wrap it with the LoRA adapters.
# NOTE: this cell rebinds `model`; re-running it wraps the model again.
model.gradient_checkpointing_enable()
model = get_peft_model(prepare_model_for_kbit_training(model), config)

In [30]:
# Training hyperparameters, grouped by concern
training_arguments = TrainingArguments(
    # --- output / reproducibility ---
    output_dir="./results",
    seed=42,
    # --- schedule and batch sizes ---
    num_train_epochs=3,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    # --- optimization ---
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): warmup_ratio is combined with a plain "constant" schedule;
    # confirm whether "constant_with_warmup" was intended.
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    bf16=True,
    # --- evaluation, checkpointing, logging ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,  # Only keep the best model
    logging_strategy="steps",
    logging_steps=1,
)

In [None]:
import transformers

from transformers import EarlyStoppingCallback

# Reload the tokenizer: EOS doubles as the padding token, padding on the right
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Build the supervised fine-tuning trainer.
# `model` has already been wrapped with the LoRA adapters via get_peft_model,
# so no `peft_config` is passed here: handing an already-wrapped PEFT model
# plus a peft_config to the trainer is redundant at best and is rejected by
# newer TRL releases.
# NOTE(review): evaluation runs once per epoch and only 3 epochs are
# configured, so an early-stopping patience of 10 evaluations cannot trigger;
# confirm the intended patience.
trainer = SFTTrainer(
    model=model,
    train_dataset=debug_dataset_dict_1k["train"],
    eval_dataset=debug_dataset_dict_1k["test"],
    formatting_func=format_instruction,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=True,
    max_seq_length=512,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10, early_stopping_threshold=0.001)]
)


# Turn the KV cache off for training
model.config.use_cache = False

In [None]:
# Run the fine-tuning job and print how long it took
timed_training(trainer)

In [33]:
# Directory the fine-tuned model is written to; a later cell loads it from
# this same path on top of the base model.
new_model_name = "early_stopping_3_epoch_fine_tuned_laama"
# Save the model held by the trainer to that directory
trainer.model.save_pretrained(new_model_name)

In [None]:
# Call the cleanup helper again before the base model is reloaded for merging
clear_cache_and_collect()

In [None]:
# PeftModel is used below but was never imported anywhere in the notebook,
# which made this cell fail with a NameError.
from peft import PeftModel

# Reload the base model in FP16 (no quantization) so the adapter weights can
# be merged into it.
device_map = "auto"
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Reload tokenizer (EOS doubles as the padding token, padding on the right)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Attach the saved adapter from `new_model_name` to the base model, then merge
# it into the base weights and drop the PEFT wrapper.
model = PeftModel.from_pretrained(base_model, new_model_name)
model = model.merge_and_unload()

In [None]:
# Reuse the first instruction of the debug test split so the output can be
# compared with the base-model generation from earlier in the notebook.
sample_instruction = debug_dataset_dict_1k["test"][0]["instruction"]
prompt = generate_prompt(sample_instruction)

# Generate with the merged fine-tuned model (max_length caps the total length)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
result = pipe(prompt)
print(result[0]['generated_text'])