TASK SPECIFIC FINETUNING 

In [1]:
# Installing dependancies
import os
import torch
import warnings
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    logging, 
    pipeline,
    EvalPrediction
)
from datasets import Dataset, DatasetDict, load_dataset
from docx import Document
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader


In [2]:
# Clear GPU cache
torch.cuda.empty_cache()

# Checking for GPU statistics
def check_cuda_availability():
    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
        print(f"Number of GPUs available: {num_gpus}")
        for i in range(num_gpus):
            gpu_stats = torch.cuda.get_device_properties(i)
            # Calculate starting GPU memory usage (in GB)
            start_gpu_memory = round(torch.cuda.max_memory_reserved(i) / 1024 / 1024 / 1024, 3)
            # Calculate maximum GPU memory (in GB)
            max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
            print(f"\nGPU {i}: {gpu_stats.name}")
            print(f"Starting GPU Memory Reserved: {start_gpu_memory} GB")
            print(f"Total GPU Memory: {max_memory} GB")
    else:
        print("CUDA is not available on this system.")

In [4]:
# Model
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Token 
token=" "

In [5]:
# Quantisation configuration setup
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Lora configuration setup
config = LoraConfig(
    r=4, 
    lora_alpha=32, 
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

# Define function to load the tokenizer and model
def load_tokenizer_and_model(model_id):
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": "cuda"}, token=token)
    model.config.use_cache = False
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast = True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"    
    return tokenizer, model

# Load the tokenizer and model 
tokenizer, model = load_tokenizer_and_model(model_id)

# Enabling gradient checkpoint
model.gradient_checkpointing_enable()

# Prepare the model for kbit training
model = prepare_model_for_kbit_training(model)

# Apply LoRA to the model
model = get_peft_model(model, config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Prints trainable parameters %
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.4f}%"
    )
print_trainable_parameters(model)

trainable params: 4194304 || all params: 3504607232 || trainable%: 0.1197%


In [15]:
# Load the dataset 
dataset_dict = load_from_disk(" ")

# Tokenization 
tokenized_dataset = dataset_dict.map(
    lambda samples: tokenizer(samples["text"], truncation=True, padding=True, max_length=1200), 
    batched=True, 
    remove_columns=['text'])
def convert_to_tensors(batch):
    return {
        'input_ids': torch.tensor(batch['input_ids']),
        'attention_mask': torch.tensor(batch['attention_mask'])}

# Apply to the dataset
tokenized_torch_dataset = tokenized_dataset.map(convert_to_tensors, batched=True)
tokenized_train_dataset = tokenized_torch_dataset["train"]
tokenized_test_dataset = tokenized_torch_dataset["test"]

In [19]:
# Ensure that the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [21]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir=" ",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",     
    eval_steps=5,                  
    logging_steps=5,     
    max_steps=75,
    learning_rate=2e-4,
    warmup_steps=2,    
    save_steps=15,                  
    weight_decay=0.01,    
    logging_dir="./logs",
    optim="paged_adamw_8bit",
    remove_unused_columns=False,
    report_to="tensorboard",
    lr_scheduler_type="linear",    
)

# Define a data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [23]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    args=training_args,
) 

max_steps is given, it will override any value given in num_train_epochs


In [None]:
from torch.cuda.amp import GradScaler
import os

# Mixed precision training setup
scaler = GradScaler()

In [27]:
# training
trainer.train()

Step,Training Loss,Validation Loss
5,1.9743,1.919829
10,1.8581,1.75226
15,1.5766,1.592502
20,1.4205,1.452209
25,1.3769,1.359746
30,1.1971,1.299819
35,1.1704,1.24771
40,1.128,1.206051
45,1.0483,1.164921
50,1.017,1.130786


TrainOutput(global_step=75, training_loss=1.202188622156779, metrics={'train_runtime': 10931.6848, 'train_samples_per_second': 0.055, 'train_steps_per_second': 0.007, 'total_flos': 2.85618438144e+16, 'train_loss': 1.202188622156779, 'epoch': 12.244897959183673})

In [31]:
# Save the fine-tuned model locally
fine_tuned_model_directory = "Llama-2-7b-chat-hf-ft-config-playbooks"
trainer.model.save_pretrained(fine_tuned_model_directory)
tokenizer.save_pretrained(fine_tuned_model_directory)

('Llama-2-7b-chat-hf-ft-config-playbooks\\tokenizer_config.json',
 'Llama-2-7b-chat-hf-ft-config-playbooks\\special_tokens_map.json',
 'Llama-2-7b-chat-hf-ft-config-playbooks\\tokenizer.json')

In [39]:
# Call the function to check CUDA availability
check_cuda_availability()

Number of GPUs available: 2

GPU 0: Quadro RTX 4000
Starting GPU Memory Reserved: 6.096 GB
Total GPU Memory: 8.0 GB

GPU 1: Quadro RTX 4000
Starting GPU Memory Reserved: 0.0 GB
Total GPU Memory: 8.0 GB
