<a href="https://colab.research.google.com/github/B-IJoe1/UniChat/blob/main/Fine_Tuning_UniChat_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Check installed library versions (useful for diagnosing compatibility problems)

In [None]:
import transformers
import trl
import peft
import accelerate
import bitsandbytes
import sentencepiece
import evaluate
import torch

# Report the installed version of each library this notebook relies on.
# Author's note on transformers: 4.50 was the newest release at the time; 4.39
# was considered the more stable pairing with trl and peft.
_version_report = (
    ("Transformers:", transformers),
    ("PEFT version:", peft),
    ("Accelerate version:", accelerate),
    ("BitsandBytes version:", bitsandbytes),
    ("Sentencepiece version:", sentencepiece),
    ("Evaluate version:", evaluate),
    ("TRL version:", trl),
    ("Torch version:", torch),
)
for label, module in _version_report:
    print(label, module.__version__)



"""
As of 4/2/25

 Transformers: 4.50.2
PEFT version: 0.14.0
Accelerate version: 1.5.2
BitsandBytes version: 0.45.4
Sentencepiece version: 0.2.0
Evaluate version: 0.4.3
TRL version: 0.11.4
Torch version: 2.6.0+cu124"""

Transformers: 4.50.2
PEFT version: 0.14.0
Accelerate version: 1.5.2
BitsandBytes version: 0.45.4
Sentencepiece version: 0.2.0
Evaluate version: 0.4.3
TRL version: 0.11.4
Torch version: 2.6.0+cu124


'\nAs of 4/2/25\n\n Transformers: 4.50.2\nPEFT version: 0.14.0\nAccelerate version: 1.5.2\nBitsandBytes version: 0.45.4\nSentencepiece version: 0.2.0\nEvaluate version: 0.4.3\nTRL version: 0.11.4\nTorch version: 2.6.0+cu124'

#### Disable wandb so that training does not ask for an API key (trl calls wandb automatically)

In [None]:
#!pip uninstall -y wandb


Found existing installation: wandb 0.19.8
Uninstalling wandb-0.19.8:
  Successfully uninstalled wandb-0.19.8


In [None]:
#Install dependencies
!pip install -q fsspec==2025.3.0 transformers==4.50.2 peft accelerate tensorboard bitsandbytes sentencepiece evaluate trl


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#import os
#os._exit(00)
#This restarts runtime if you need or you could disconnect runtime on the top right

# Importing the modules for training

In [None]:
# Ask for the Hugging Face access token interactively; getpass keeps the typed
# value out of the cell output.
import getpass
import os

# Keep the token in `hf_token` for explicit `token=` arguments and also export
# it as HF_TOKEN (presumably so Hub downloads that are not given a token
# explicitly can still authenticate).
os.environ["HF_TOKEN"] = hf_token = getpass.getpass("Enter your Hugging Face token: ")

Enter your Hugging Face token: ··········


In [None]:
#AutoModelForCausalLM is used to import the Llama 2-7B Causal model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  #AutoTokenizer used for formatting & tokenizing datasets & tokenizing models

# Hub repository id of the base checkpoint. The model and tokenizer
# from_pretrained calls in the next cell both read this name.
model_name = "meta-llama/Llama-2-7b-chat-hf"

#Load the model & tokenizer first prior to freezing the layers

In [None]:
import torch

# 4-bit quantization configuration - the "Q" in QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load the base weights in 4-bit precision
    bnb_4bit_quant_type="nf4",  # NormalFloat4 quantization data type
    # Dtype used for the matmuls once weights are de-quantized. Kept identical
    # to torch_dtype below and to bf16=True in the training arguments; the
    # previous "float16" mixed two different half-precision formats.
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # Nested quantization: also quantize the quantization constants
)

# Load the (quantized) base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # Let accelerate decide where each layer is placed
    # Permits custom modelling code shipped in the Hub repo to be executed.
    # NOTE(review): presumably not required for Llama-2, which ships with
    # transformers - consider dropping it.
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,  # Leave this out for regular (non-quantized) SFT
    token=hf_token,  # Same credential the tokenizer load below uses
)

# The generation KV cache is of no use during training, so switch it off.
model.config.use_cache = False
# 1 = no tensor-parallel emulation of the pretraining setup in the linear layers.
model.config.pretraining_tp = 1

# Load the tokenizer and give it a padding token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=hf_token)
# Llama-2 ships without a pad token. "<|PAD|>" is not part of the base
# vocabulary (embed_tokens has 32000 rows) and no cell in this notebook resizes
# the embeddings, so reuse an existing token instead of inventing a new one.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"  # for causal models

# Save it in the same folder that later receives the merged QLoRA model
tokenizer.save_pretrained("./Llama2-7B-merged-qlora&base")

# Optionally push it to the repository used for the merged QLoRA model
#tokenizer.push_to_hub("Jsevere/llama2-7b-admissions-qa-merged")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

('./Llama2-7B-merged-qlora&base/tokenizer_config.json',
 './Llama2-7B-merged-qlora&base/special_tokens_map.json',
 './Llama2-7B-merged-qlora&base/tokenizer.model',
 './Llama2-7B-merged-qlora&base/added_tokens.json',
 './Llama2-7B-merged-qlora&base/tokenizer.json')

# Format and prepare dataset for training before Lora!

### Loading & Formatting the Dataset

In [None]:
import pandas as pd
from datasets import Dataset, Features, Value


#Upload the file path
# Location of the admissions Q&A export in the Colab working directory
file_path = "/content/Admissions Email QA.csv"

# Schema for the Hugging Face dataset: both CSV columns are plain strings
features = Features({"Questions": Value("string"), "Answers": Value("string")})

# Read the CSV with pandas and report its size
df = pd.read_csv(file_path)
print("CSV shape:", df.shape)

# Build the dataset from the in-memory frame instead of a file loader
# (author's note: this avoids caching issues), then shuffle with a fixed seed.
dataset = Dataset.from_pandas(df, features=features).shuffle(seed=42)

# Sanity check: first rows of the raw frame
print(df.head())

def row_to_messages(example):
    """Wrap one CSV row in a three-turn chat transcript.

    Args:
        example: mapping with a "Questions" and an "Answers" entry.

    Returns:
        ``{"messages": [...]}`` holding, in order, a fixed system instruction,
        the question as the user turn and the answer as the assistant turn.
    """
    system_turn = {"role": "system", "content": "You are a helpful university admissions assistant."}
    user_turn = {"role": "user", "content": example["Questions"]}  # the incoming query
    assistant_turn = {"role": "assistant", "content": example["Answers"]}  # the reply to imitate
    return {"messages": [system_turn, user_turn, assistant_turn]}


def format_prompt(example):
    """Render an example's chat transcript into one training string.

    Applies the tokenizer's chat template to ``example["messages"]`` with
    ``tokenize=False``, so the result is text rather than token ids.

    Returns:
        ``{"text": <rendered prompt>}``
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return dict(text=rendered)



# Stop early, with a clear message, if there is nothing to train on. This has to
# happen before the split: afterwards len(dataset) counts the splits (always 2),
# not the rows, so it can never detect an empty dataset.
if len(dataset) == 0:
    raise ValueError("Dataset is empty. Check your CSV or earlier processing steps.")

# Split before building prompts so the train and test rows stay separate
dataset = dataset.train_test_split(test_size=0.30, seed=42)

# Report the number of rows in each split
split_sizes = {split_name: split.num_rows for split_name, split in dataset.items()}
print(f"Dataset size: {split_sizes}")

# Add the chat-style "messages" column to every split
dataset = dataset.map(row_to_messages)

# Render "messages" into the "text" column
dataset = dataset.map(format_prompt)



CSV shape: (50, 2)
                                           Questions  \
0  Good morning, I talked with a counselor a coup...   
1  I am considering applying to Salem State’s Acc...   
2  I hope you’re well! I have one more question a...   
3  Did 2.3 gpa will accept for bachelor admission...   
4  Did 2.3 gpa will accept for bachelor admission...   

                                             Answers  
0  Good morning, Thank you for reaching out! We n...  
1  Thank you for reaching out! We do require that...  
2  Thank you for reaching out! You can look at ou...  
3  Thank you for reaching out! Some of our progra...  
4  According to our counselors, a 2.3 should be a...  
Dataset size: 2


Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [None]:
#Load a tokenizer to use its chat template if needed; already created a template called "messages"

#template_tokenizer = AutoTokenizer.from_pretrained(model_name)

# Train your model on Lora adapters while the model is quantized

###Lora Configurations

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model



In [None]:
# LoRA adapter configuration.
# (For full instruction tuning without QLoRA, skip this configuration and do
# not pass peft_config to the trainer.)
peft_config = LoraConfig(
    r=4,  # Rank of the low-rank update matrices; a small rank keeps the adapter small
    lora_alpha=8,  # Scaling factor: the adapter update is scaled by lora_alpha / r (2.0 here)
    lora_dropout=0.10,  # Dropout on the LoRA branch during training, to reduce overfitting
    bias="none",  # Bias terms are not trained
    task_type="CAUSAL_LM",
    # Attention projections that receive an adapter
    target_modules=["q_proj", "v_proj", "o_proj"],
)

# Prepare the quantized model for k-bit (here 4-bit) training; see the PEFT
# documentation for the exact set of adjustments this helper makes.
model = prepare_model_for_kbit_training(model)


#Freezing all layers except for specific attention heads that affects tone & style (Base Model)

In [None]:
# List every parameter tensor of the model together with its shape
for layer_name, weights in model.named_parameters():
    print(layer_name, weights.shape)

model.embed_tokens.weight torch.Size([32000, 4096])
model.layers.0.self_attn.q_proj.weight torch.Size([8388608, 1])
model.layers.0.self_attn.k_proj.weight torch.Size([8388608, 1])
model.layers.0.self_attn.v_proj.weight torch.Size([8388608, 1])
model.layers.0.self_attn.o_proj.weight torch.Size([8388608, 1])
model.layers.0.mlp.gate_proj.weight torch.Size([22544384, 1])
model.layers.0.mlp.up_proj.weight torch.Size([22544384, 1])
model.layers.0.mlp.down_proj.weight torch.Size([22544384, 1])
model.layers.0.input_layernorm.weight torch.Size([4096])
model.layers.0.post_attention_layernorm.weight torch.Size([4096])
model.layers.1.self_attn.q_proj.weight torch.Size([8388608, 1])
model.layers.1.self_attn.k_proj.weight torch.Size([8388608, 1])
model.layers.1.self_attn.v_proj.weight torch.Size([8388608, 1])
model.layers.1.self_attn.o_proj.weight torch.Size([8388608, 1])
model.layers.1.mlp.gate_proj.weight torch.Size([22544384, 1])
model.layers.1.mlp.up_proj.weight torch.Size([22544384, 1])
model.l

##### We're also going to dequantize the frozen base-model layers so that they can later be merged with the QLoRA adapter weights. This makes sure that quantizing the base model doesn't distort the weights when merging.

In [None]:
#for name, param in model.named_parameters():

   #Only focus on trainable layers for tone and style
#if any(substring in name for substring in ["q_proj", "v_proj", "o_proj"]):
 #     if param.dtype == torch.float16: # or torch.float16 or torch.complex64 or similar:
  #          param.requires_grad = True # Only set requires_grad for floating-point parameters

      #Freeze everything else

    #  else:
     #   param.requires_grad = False
# ______________________________
# Before Training Loop - trainer.train()
#if not param.requires_grad:  # If the layer is frozen
        # Dequantize if possible
        #try:
            #if hasattr(param, 'dequantize'):  # Check for the dequantize method safely
                #param.dequantize() # Dequantize without searching for the parent module.
                #print(f"Successfully dequantized layer: {name}")
            #else:
               # print(f"Layer {name} has no dequantize method. Skipping.")
        #except AttributeError:
            #print(f"Layer {name} could not be dequantized (likely already in FP16). Skipping.")
        #except Exception as e:
            #print(f"Error dequantizing layer {name}: {e}")

### Preparing the model for training

In [None]:
# Prepare the model for training: wrap the k-bit-prepared model with the LoRA
# adapters described by peft_config. The returned PEFT model replaces `model`
# for every later cell.
model = get_peft_model(model, peft_config)

# Compute Metrics

#Training the models with Training Arguments then Fine-tuning it for Training

###Use Tensorboard to plot f1 score and val_loss for metrics

In [None]:
#%load_ext tensorboard
#%tensorboard --logdir ./logs

### To Display the notebook without tensorboard to avoid 'Invalid Notebook' error when uploading to Github

###Compute F1

In [None]:
import numpy as np
import evaluate

# Load F1 metric properly
load_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Token-level micro-F1 of the model's next-token predictions.

    The previous version compared the logits of vocabulary ids 0 and 1 and
    scored that binary choice against the label token ids, which says nothing
    about a language model. Here the predicted token is the argmax over the
    vocabulary, and predictions are aligned with the token they are meant to
    predict (logits at position t score the token at position t + 1).

    With exactly one predicted class per token, micro-averaged F1 equals plain
    accuracy, so it is computed directly with NumPy.

    Args:
        eval_pred: pair ``(logits, labels)``. Expects logits shaped
            (batch, seq, vocab) and labels shaped (batch, seq), with -100
            marking positions to ignore (padding / masked tokens).

    Returns:
        ``{"f1": float}``. The value is 0.0 when there is no token to score.
    """
    logits, labels = eval_pred

    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Nothing to align when labels are missing or sequences have a single token.
    if logits.ndim != 3 or labels.ndim != 2 or labels.shape[1] < 2:
        return {"f1": 0.0}

    # Most likely token id at every position
    predictions = logits.argmax(axis=-1)

    # Shift for next-token prediction: drop the last prediction and the first label
    predictions = predictions[:, :-1]
    references = labels[:, 1:]

    # Keep only positions that carry a real target (-100 marks ignored tokens)
    mask = references != -100
    valid_predictions = predictions[mask]
    valid_references = references[mask]

    if valid_references.size == 0:
        return {"f1": 0.0}

    f1 = float(np.mean(valid_predictions == valid_references))
    return {"f1": f1}

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

### Notify (through TensorBoard) when validation loss or F1 score stops improving over consecutive evaluations, to help prevent overfitting

In [None]:
from transformers import TrainerCallback
import torch

class SmartNotifyCallback(TrainerCallback):
    """Report stalled validation metrics after each evaluation, and optionally stop.

    After every evaluation, ``eval_loss`` and ``eval_f1`` are compared with the
    best values seen so far. If either one improves, the wait counter resets
    and a checkpoint save is requested. Otherwise the counter grows, and once
    it reaches ``patience`` an early-stopping notice is emitted.

    Args:
        tb_writer: optional TensorBoard ``SummaryWriter``; when given, every
            status message is also written under the "Status" tag.
        patience: number of consecutive evaluations without improvement that
            are tolerated before the early-stopping notice (default 10).
        stop_training: when True, training is actually halted once patience is
            exhausted. The default (False) only notifies, as before.
    """

    def __init__(self, tb_writer=None, patience=10, stop_training=False):
        self.patience = patience
        self.wait = 0  # consecutive evaluations without any improvement
        self.best_val_loss = float("inf")  # lowest eval_loss seen so far
        self.best_f1 = 0.0  # highest eval_f1 seen so far
        self.tb_writer = tb_writer
        self.stop_training = stop_training

    def _log_status(self, message, global_step):
        """Mirror a status message to TensorBoard when a writer was supplied."""
        if self.tb_writer is not None:
            self.tb_writer.add_text("Status", message, global_step)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the best-so-far values and the wait counter from ``metrics``."""
        if not metrics:
            return

        val_loss = metrics.get("eval_loss")
        f1 = metrics.get("eval_f1")

        # Both metrics are needed for the comparison below
        if val_loss is None or f1 is None:
            return

        # Label used in messages; falls back when no step counter is available
        step = state.global_step if state.global_step is not None else "Unknown"

        # Either metric improving counts as progress (use `and` to require both)
        improved = (val_loss < self.best_val_loss) or (f1 > self.best_f1)
        if improved:
            print(f" [Step {step}] New best model found! Saving checkpoint.")
            self._log_status(f"New best model found at step {step}", state.global_step)
            self.best_val_loss = min(val_loss, self.best_val_loss)
            self.best_f1 = max(f1, self.best_f1)
            self.wait = 0
            # Request a checkpoint now, independent of the configured save schedule
            control.should_save = True
            return

        self.wait += 1
        print(f"[Step {step}] No improvement. Wait {self.wait}/{self.patience}")
        self._log_status(
            f"No improvement detected at step {step} (wait={self.wait})", state.global_step
        )

        if self.wait >= self.patience:
            print(f"[Step {step}] Early stopping triggered - Training should stop here.")
            self._log_status(
                f"Early stopping should be triggered at step {step}", state.global_step
            )
            if self.stop_training:
                control.should_training_stop = True

###Training Arguments

In [None]:
import torch
from transformers import TrainingArguments
from torch.utils.tensorboard import SummaryWriter


# TensorBoard writer that is handed to the notification callback later on
tb_writer = SummaryWriter(log_dir="./runs/finetune")

# Directory the trainer writes its outputs to
output_dir = "./llama2-7b-admissions-qa-merged"

# Optimisation: how the weights are updated
_optimisation = dict(
    per_device_train_batch_size=1,  # samples per forward/backward pass
    gradient_accumulation_steps=2,  # passes accumulated per optimizer update (1 x 2 = 2 samples per update)
    learning_rate=3e-4,  # peak learning rate
    warmup_ratio=0.05,  # fraction of the training steps spent ramping the learning rate up
    weight_decay=0.10,
    optim="adamw_torch_fused",
    lr_scheduler_type="cosine",  # cosine decay after the warm-up
    num_train_epochs=3,  # passes over the training split
)

# Precision and memory
_memory = dict(
    bf16=True,
    gradient_checkpointing=True,  # recompute activations in the backward pass instead of storing them
)

# Logging, evaluation and checkpoint selection
_bookkeeping = dict(
    logging_dir="./logs",
    logging_steps=2,  # log every 2 steps
    save_strategy="steps",
    eval_strategy="steps",
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model="eval_f1",  # "best" is judged by F1 ...
    greater_is_better=True,  # ... where higher is better
    report_to=["tensorboard"],  # TensorBoard only, no wandb
)

# Training arguments
training_arguments = TrainingArguments(
    output_dir=output_dir,
    **_optimisation,
    **_memory,
    **_bookkeeping,
)







### How the training loop works



 1. Model does a forward pass using 'gradient_checkpointing=True'.

 2. Model computes the loss (how far its predictions are from the targets). The "learning_rate" controls how large each weight update is (a lower lr is safer for a small dataset).

 3. Model accumulates gradients over 2 batches before each update when 'gradient_accumulation_steps=2' (with 'per_device_train_batch_size=1' as configured above, that is 2 samples per update; with a batch size of 2 it would be 4).
    Gradients = d(loss)/d(weight); they help correct mistakes by adjusting weights based on the loss.

 4. Optimizer updates every weight accordingly using optim="adamw_torch_fused".

 5. Repeat until N steps (Based on number of Epochs too)!


### Understanding the forward pass and computing gradients

In [None]:
"""
I have 50 rows of data to finetune &:
per_device_train_batch_size = 2
gradient_accumulation_steps = 2
num_train_epochs = 3
____________________________________________________________________

Compute steps_per_epoch:
steps_per_epoch = ceil(50 / 2) = 25 forward steps per epoch (dividing by / batch_size)
Good rule of thumb - You could divide batch_size by the # of num_devices to compute it evenly across GPU's


Compute optimizer steps per epoch:
optimizer_steps_per_epoch = ceil(25 / 2) = 13 optimizer steps per epoch (computing gradients every two passes)


Compute total steps for the entire training:
total_optimizer_steps = 13 * 3 = 39 optimizer steps (total across 3 epochs)


total_forward_steps = 25 * 3 = 75 forward passes


Warmup_ratio only decides:
"How many of those optimizer steps will be used to gradually get to the learning rate (3e-4 in the training arguments above)?"

warmup_steps = 0.05 * 39 = ~2 steps (the learning rate ramps up from 0 to its target over these first ~2 optimizer steps; the remaining ~37 optimizer steps follow the cosine decay schedule)
______________________________________________________________________
Concept	Value: (#'s may change if I have more # of devices or GPU's to share across computes)
Steps per epoch (forward passes) - 25
Optimizer steps per epoch	- 13
Total optimizer steps	- 39
Total forward passes - 75

"""

'\nI have 50 rows of data to finetune &:\nper_device_train_batch_size = 2\ngradient_accumulation_steps = 2\nnum_train_epochs = 3\n____________________________________________________________________\n\nCompute steps_per_epoch:\nsteps_per_epoch = ceil(50 / 2) = 25 forward steps per epoch (dividing by / batch_size)\nGood rule of thumb - You could divide batch_size by the # of num_devices to compute it evenly across GPU\'s\n\n\nCompute optimizer steps per epoch:\noptimizer_steps_per_epoch = ceil(25 / 2) = 13 optimizer steps per epoch (computing gradients every two passes)\n\n\nCompute total steps for the entire training:\ntotal_optimizer_steps = 13 * 3 = 39 optimizer steps (total across 3 epochs)\n\n\ntotal_forward_steps = 25 * 3 = 75 forward passes\n\n\nWarmup_ratio only decides:\n"How many of those optimizer steps will be used to gradually get to the learning rate (2e-4)?"\n\nwarmup_steps = 0.05 * 39 = ~2 steps (Warm up ratio increases and updates learning rate every 2 steps-Does this a

#Preventing Errors if CUDA is OOM (If needed)

###Use only if meta device doesn't have tensors yet to train or if CUDA is OOM

In [None]:
import os
# Set PYTORCH_CUDA_ALLOC_CONF to expand segment sizes and avoid fragmentation
# NOTE(review): presumably the CUDA allocator only reads this setting when it is
# first initialised. The model was already loaded onto the GPU in an earlier
# cell, so confirm this is picked up here; otherwise set it before loading.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
# Before calling trainer.train(), check where the model lives.
# The model was loaded with device_map="auto", which already placed the weights,
# so nothing is moved onto the GPU here. In particular, model.to_empty() must
# not be called on a loaded model: it re-allocates every parameter and buffer
# as uninitialized memory, discarding the pretrained and adapter weights.
if training_arguments.device != torch.device("cpu"):
    if not torch.cuda.is_available():
        # A non-CPU training device without CUDA: fall back to the CPU
        model = model.to(torch.device("cpu"))
        print("Model moved to CPU due to unavailability of CUDA.")
    else:
        # Report any parameters that did not end up on the GPU
        off_gpu = sorted({p.device.type for p in model.parameters()} - {"cuda"})
        if off_gpu:
            print(f"Warning: some parameters are on {off_gpu} instead of the GPU; "
                  "training may be slow or fail.")
        else:
            print("CUDA is available. Model will remain on GPU.")

CUDA is available. Model will remain on GPU.


# Training with Supervised Fine Tuning

###Disable entering wandb API key for training in case report_to = "none" doesnt work

In [None]:
#import os
#os.environ["WANDB_DISABLED"] = "true"

In [None]:
import gc
# Free up memory before training
# Run Python's garbage collector first so unreachable objects are dropped ...
gc.collect()
# ... then ask PyTorch to release cached CUDA memory it is no longer using.
torch.cuda.empty_cache()

In [None]:
from trl import SFTTrainer


# Maximum number of TOKENS (not characters) kept per training example. The
# previous value of 3 cut every example down to its first three tokens, so the
# model never saw a question or an answer. 512 leaves room for the system
# prompt, the question and the answer; raise it if examples get truncated.
MAX_SEQ_LENGTH = 512

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    args=training_arguments,
    compute_metrics=compute_metrics,
    dataset_text_field="text",  # column holding the rendered chat prompt
    max_seq_length=MAX_SEQ_LENGTH,
    # Leave this out for regular SFT
    peft_config=peft_config,
)

# Notifies whether F1 or validation loss improved after each evaluation
trainer.add_callback(SmartNotifyCallback(tb_writer=tb_writer))

# Train model
trainer.train()

# Save the QLoRA adapter weights; the merge cell reloads them from this folder
trainer.model.save_pretrained("./Llama2-7B-qlora")

# No need to push the adapter to the Hub: the merged model already contains it

#No need to push QLoRA on HF since merged model already contains QLora adaptors


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,F1
2,7.8289,11.710692,0.0
4,7.75,11.154095,0.0
6,7.1961,9.333426,0.666667
8,5.4978,3.070704,0.666667
10,2.8793,8.187501,0.666667
12,5.1771,5.343827,0.666667
14,3.063,1.847193,0.666667
16,2.7862,6.792724,0.666667
18,4.082,3.899029,0.666667
20,2.8069,4.930801,0.666667


 [Step 2] New best model found! Saving checkpoint.
 [Step 4] New best model found! Saving checkpoint.
 [Step 6] New best model found! Saving checkpoint.
 [Step 8] New best model found! Saving checkpoint.
[Step 10] No improvement. Wait 1/10
[Step 12] No improvement. Wait 2/10
 [Step 14] New best model found! Saving checkpoint.
[Step 16] No improvement. Wait 1/10
[Step 18] No improvement. Wait 2/10
[Step 20] No improvement. Wait 3/10
[Step 22] No improvement. Wait 4/10
[Step 24] No improvement. Wait 5/10
 [Step 26] New best model found! Saving checkpoint.
 [Step 28] New best model found! Saving checkpoint.
 [Step 30] New best model found! Saving checkpoint.
 [Step 32] New best model found! Saving checkpoint.
 [Step 34] New best model found! Saving checkpoint.
 [Step 36] New best model found! Saving checkpoint.
 [Step 38] New best model found! Saving checkpoint.
 [Step 40] New best model found! Saving checkpoint.
[Step 42] No improvement. Wait 1/10
[Step 44] No improvement. Wait 2/10
 [St

# Finally, merge the LoRA weights with the original model weights

###Merge Weights

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the adapter saved by the training cell in bfloat16, with automatic
# device placement and reduced CPU memory use while loading.
adapter_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True,
)
model = AutoPeftModelForCausalLM.from_pretrained("Llama2-7B-qlora", **adapter_load_kwargs)

# Merge the QLoRA adapter and the base model
merged_model = model.merge_and_unload()

# Write the merged model into the folder that already holds the saved tokenizer
merged_model.save_pretrained("./Llama2-7B-merged-qlora&base")

# Optionally push to the repository used for the tokenizer
#trainer.push_to_hub("Jsevere/llama2-7b-admissions-qa-merged")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import pipeline
import os

# Ask CUDA to run kernels synchronously so errors point at the failing call.
# NOTE(review): presumably only effective if set before CUDA is initialised.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Enable detailed error reporting

# Sample admissions question
prompt = """ Good evening, I was formerly enrolled at Salem State University as a business major. I am inquiring about how I would get a copy of my transcript and transferable credits. Any assistance is greatly appreciated. """

# Use the same system/user layout the model was fine-tuned on, and let the
# pipeline apply the tokenizer's chat template. The earlier hand-written
# "<s>[INST] ... [/INST]" string left out the system prompt and spelled out
# "<s>" even though the tokenizer presumably adds its own BOS token.
messages = [
    {"role": "system", "content": "You are a helpful university admissions assistant."},
    {"role": "user", "content": prompt.strip()},
]

pipe = pipeline(task="text-generation", model=merged_model, tokenizer=tokenizer)
result = pipe(messages)
print(result)


Device set to use cuda:0


[{'generated_text': '<s>[INST]  Good evening, I was formerly enrolled at Salem State University as a business major. I am inquiring about how I would get a copy of my transcript and transferable credits. Any assistance is greatly appreciated.  [/INST]  Good evening! I\'m happy to help you with your inquiry. To obtain a copy of your transcript and transferable credits from Salem State University, you can follow these steps:\n\n1. Log in to your SSU Student Portal: Go to the SSU website and log in to your student portal using your SSU ID and password.\n2. Access the Transcript Request Form: Once you are logged in, click on the "Registrar" tab and select "Transcript Request" from the drop-down menu. This will take you to the transcript request form.\n3. Fill out the Transcript Request Form: Complete the form with your personal information, the type of transcript you want (official or unofficial), and the recipient\'s address. Make sure to specify the term(s) you want the transcript to inc