<a href="https://colab.research.google.com/github/B-IJoe1/UniChat/blob/main/Fine_Tuning_UniChat_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Code below checks library versions (great to determine stability)

In [None]:
import transformers
import trl
import peft

print("TRL version:", trl.__version__)
print("PEFT version:", peft.__version__)
print("Transformers:", transformers.__version__) #Newest version is 4.50 best to use 4.39 for
                                                #stability when using trl and peft

####Disable wandb to prevent needing an API key prior to training: It's automatically called through trl

In [None]:
#!pip uninstall -y wandb


Found existing installation: wandb 0.19.8
Uninstalling wandb-0.19.8:
  Successfully uninstalled wandb-0.19.8


In [None]:
#Install dependencies
!pip install -q fsspec==2025.3.0 transformers peft accelerate tensorboard evaluate trl


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m86.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#import os
#os._exit(00)
#This restarts runtime if you need or you could disconnect runtime on the top right

# Importing the modules for training

In [None]:
# Prompt for your token without echoing it in the output
import os
import getpass

hf_token = getpass.getpass("Enter your Hugging Face token: ")
os.environ["HF_TOKEN"] = hf_token

Enter your Hugging Face token: ··········


In [None]:
#AutoModelForCausalLM is used to import the Llama 2-7B Causal model
from transformers import AutoModelForCausalLM, AutoTokenizer   #AutoTokenizer used for formatting & tokenizing datasets & tokenizing models

#identify the model you'll use
model_name = "meta-llama/Llama-2-7b-chat-hf"

#Load the Tokenizer first to add special tokens prior to freezing & loading the model to resize it later!

In [None]:
import torch
#Load the model
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map = "auto",
                                              trust_remote_code=True,
                                             torch_dtype=torch.bfloat16,
)

#Adding the Special tokens for prompt to indicate the model beggining of a new sentence (bos)
#end of sentence (eos) and pad token to move to the next token in a sentence (<PAD>).
#Use "[INST] [/INST]" for instruction delimeter for a user's start and end of a sentence.


model.config.use_cache = False #Gradient checkpointing; uses a lot of memory if true the point is to save storage
model.config.pretraining_tp = 1 #disable tensor parallelism best for TPU but not for multi GPU's

#Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=hf_token)
tokenizer.pad_token = "<|PAD|>"
tokenizer.padding_side = "left"  # for causal models


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

#Freezing all layers except for specific attention heads that affects tone & style (Base Model)

In [None]:
#Print layer names
for name, param in model.named_parameters():
    print(name, param.shape)

In [None]:
for name, param in model.named_parameters():

   #Only focus on trainable layers for tone and style
    if any(substring in name for substring in ["q_proj", "v_proj", "o_proj"]):
        param.requires_grad = True

      #Freeze everything else
    else:
        param.requires_grad = False


# Format and prepare dataset for training before Lora!

### Loading & Formatting the Dataset

In [None]:
import pandas as pd
from datasets import Dataset, Features, Value


#Upload the file path
file_path = "/content/Admissions Email QA.csv" # Loaded to working directory


# Load the CSV using pandas
df = pd.read_csv(file_path)

# Convert to Hugging Face dataset
features = Features({
    "Questions": Value("string"),
    "Answers": Value("string")
})
print("CSV shape:", df.shape)


#This bypasses the HuggingFace file reading logic - no caching issues!
dataset = Dataset.from_pandas(df, features=features)
dataset = dataset.shuffle(seed=42)

#sanity check
print(df.head())

# Function to convert a row into a messages list
def row_to_messages(example):
    return {
        "messages": [
        {"role": "system", "content": "You are a helpful university admissions assistant."}, #User query
        {"role": "user", "content": example["Questions"]},
        {"role": "assistant", "content": example["Answers"]} #Giving the prompt examples on how to respond
    ]
    }


#tokenizing my prompt template from row_to_messages and returning a text
def format_prompt(example):
    prompt = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": prompt}



#Split your dataset prior to uploading it into rows to prevent data leakage
dataset = dataset.train_test_split(test_size=0.2, seed=42)

#Check to see the number of rows
print(f"Dataset size: {len(dataset)}")

#Uploading my dataset's format to HuggingFace
if len(dataset) > 0:
    dataset = dataset.map(row_to_messages)
else:
    print("Dataset is empty. Check your CSV or earlier processing steps.")


#Uploading the prompt to HuggingFace
dataset = dataset.map(format_prompt)



CSV shape: (50, 2)
                                           Questions  \
0  Good morning, I talked with a counselor a coup...   
1  I am considering applying to Salem State’s Acc...   
2  I hope you’re well! I have one more question a...   
3  Did 2.3 gpa will accept for bachelor admission...   
4  Did 2.3 gpa will accept for bachelor admission...   

                                             Answers  
0  Good morning, Thank you for reaching out! We n...  
1  Thank you for reaching out! We do require that...  
2  Thank you for reaching out! You can look at ou...  
3  Thank you for reaching out! Some of our progra...  
4  According to our counselors, a 2.3 should be a...  
Dataset size: 2


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
#Load a tokenizer to use its chat template if needed; already created a template called "messages"

#template_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Train your model on Lora adapters

###Lora Configurations

In [None]:
from peft import LoraConfig, get_peft_model

#Prepare LoRA Configuration
peft_config = LoraConfig  #Remove this if you want to have full instruction tuning without QLoRA


In [None]:
peft_config = LoraConfig(
    lora_alpha=32, #LoRA Scaling; good rule of thumb always make lora_alpha two times more than r (rank)
    lora_dropout=0.1, #Dropout for LoRA Layers
    r = 16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "v_proj",
        "o_proj",
    ]
)

#Prepare model for training
model = get_peft_model(model, peft_config)

#Training the models with Training Arguments then Fine-tuning it for Training

In [None]:
!pip install tf-keras


###Training Arguments

In [None]:
# LoRA-specific trainer
from transformers import TrainingArguments
import torch

# Recommended for NVIDIA Ampere / Hopper GPUs (A100, H100, RTX 30xx, 40xx) enables faster LLM training
torch.backends.cuda.matmul.allow_tf32 = True
print("TF32 enabled is enabled!")


output_dir = "./llama_results" #Better to track where your outputs are

#Training arguments
training_arguments = TrainingArguments(
    output_dir=output_dir,
    logging_dir="./logs",
    logging_steps= 5,
    per_device_train_batch_size=2, #Best for small dataset and TPU
    gradient_accumulation_steps=1, #same here as above
    learning_rate = 2e-4,
    optim="adamw_torch_fused",  #recommended for TPU v28
    lr_scheduler_type="cosine",
    num_train_epochs = 1,
    bf16= True, #needed for TPU
    gradient_checkpointing=False, #Avoid for smaller datasets
    save_strategy="epoch", #Best practive for TPU to save overhead
    eval_strategy="epoch",
    report_to="none" #testing evaluation at the end of each epoch

    )


TF32 enabled is enabled!


###Use Tensorboard to plot f1 score and metrics

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./logs

# Compute Metrics

In [None]:
import numpy as np
import evaluate

# Load F1 metric properly
load_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute F1 Score"""
    logits, labels = eval_pred

    # Convert logits to class predictions
    predictions = np.where(logits[:, :, 1] > logits[:, :, 0], 1, 0) # Assuming 0 and 1 are your classes

    # Apply a mask for padding tokens, assuming -100 represents padding
    mask = labels != -100  # Ensures F1 computation is done on valid (non-padding) labels (special token used by HuggingFAce to indicate padding tokens)
    valid_predictions = predictions[mask] #uses the mase to select only the predictions w/o padding tokens
    valid_references = labels[mask] #uses the mask to select only ground truth labels w/o padding

    # Calculate F1 score after removing padding
    f1 = load_f1.compute(predictions=valid_predictions.astype(np.int32),
                         references=valid_references.astype(np.int32),
                         average = 'micro')["f1"]
    return {"f1": f1}

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

###Training

###Use only if meta device doesn't have tensors yet to train or if CUDA is OOM

In [None]:
import os
# Set PYTORCH_CUDA_ALLOC_CONF to expand segment sizes and avoid fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Before calling trainer.train(), move the model to the device
if training_arguments.device != torch.device("cpu"):
    #Added if statement here in case there is insufficient GPU memory
    if not torch.cuda.is_available():
        model = model.to(torch.device("cpu"))
        print("Model moved to CPU due to unavailability of CUDA.")
    else:
        try:
            #Attempt move to GPU and proceed with training if successful
            model.to_empty(device=training_arguments.device)
            model = model.to(training_arguments.device)
            print("CUDA is available. Model will remain on GPU.")
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                #In case of insufficient GPU memory, move to CPU and issue a warning
                print("Insufficient GPU memory. Moving model to CPU.")
                model = model.to(torch.device("cpu")) #Move to CPU
                print("Model moved to CPU. Training will proceed on CPU, which will be slower.")
            else:
                #Raise the error if the cause was not OOM
                raise e

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


###Disable entering wandb API key for training in case report_to = "none" doesnt work

In [None]:
#import os
#os.environ["WANDB_DISABLED"] = "true"

In [None]:
import gc
# Free up memory before training
gc.collect()
torch.cuda.empty_cache()

In [None]:
from trl import SFTTrainer


#Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_arguments,
    compute_metrics=compute_metrics,
    dataset_text_field="text",  # Specify the text field directly
    max_seq_length=64,
    #Leave this out for regular SFT
    peft_config=peft_config,

)

#Train model
trainer.train()



# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
#Save LoRA weights
trainer.model.save_pretrained("Llama-2-7B-lora")

#trainer.push_to_hub() - push to HuggingFace if you want


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,F1
1,1.9077,1.84421,0.03125


#Finally Merging  Lora Weights with original model weights

###Merge Weights

In [None]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained("Llama-2-7B-lora",
                                                 low_cpu_mem_usage=True,
                                                 device_map="auto",
                                                 torch_dtype=torch.bfloat16,
                                                 )

#Merge QLoRA and base model
merged_model = model.merge_and_unload()

#Save the merged model
merged_model.save_pretrained("Llama-2-7B-merged-lora&base")

#trainer.push_to_hub() - push to HuggingFace if you want

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import pipeline
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Enable detailed error reporting

#Use our predefined prompt template for Llama
prompt = """ What is a University Provost? """




pipe = pipeline(task="text-generation", model=merged_model, tokenizer=tokenizer, max_length=50)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result)


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': '<s>[INST]  What is a University Provost?  [/INST]  A provost is a senior academic and administrative leader at a university or college. \n\nThe provost is the chief academic officer of the university,'}]
