## **FineTuning the LLM**

### 1. Installing and Importing required libraries

In [15]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

* We are making use of LLaMa 3.2-1B Instruct, which gives the best of model capabilities and resource efficiency.

In [16]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# fourbit_models = [
#     "unsloth/mistral-7b-bnb-4bit",
#     "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
#     "unsloth/llama-2-7b-bnb-4bit",
#     "unsloth/gemma-7b-bnb-4bit",
#     "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
#     "unsloth/gemma-2b-bnb-4bit",
#     "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
#     "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
# ] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


* Adding LoRA adapters so we only need to update 1 to 10% of all parameters

In [17]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

### 2. Importing Data

In [18]:
import pandas as pd
filtered_ca_plates = pd.read_csv("final_plates.csv")

In [19]:
type(filtered_ca_plates)

In [20]:
filtered_ca_plates["output"] = "Status: "+filtered_ca_plates["status"]+"\n"+"Category: "+filtered_ca_plates["category"]+"\n"+"Description: "+filtered_ca_plates["reviewer_comments"]

In [21]:
from datasets import Dataset
dataset = Dataset.from_pandas(filtered_ca_plates)

In [22]:
dataset

Dataset({
    features: ['plate', 'status', 'category', 'reviewer_comments', 'review_reason_code', 'customer_meaning', 'output'],
    num_rows: 30006
})

In [23]:
type(dataset)

### 3. Defining the prompt and its component structure (Prompt, Input and Output)

In [24]:
vanity_prompt = """ You are a vanity license plate reviewer. Determine whether the given vanity license plate would be accepted or rejected by the DMV. Look out for any vulgar, sexual and inappropriate content for public display. If it is inapproriate and falls into one of the below categories, then reject the plate and provide a reason for the same. Here are the categories:

1. Sexual connotation.
2. Vulgar or degrading term.
3. Swear word or profane.
4. Negative connotation to group.
5. Misrepresents law enforcement.
6. Deleted from series.
7. Foreign, slang, or phonetic.
8. Not Applicable (if status is Accepted)

Give the response in the following format->
Status: Accepted or Rejected
Category:
Reason:

Example (1):

Input: ZEROFCK

Output:

Status: Rejected
Category: Swear word or profane
Reason: It implies Zero Fucks, which is a swear phrase.

Example(2):

Input: LOVEALL

Output:

Status: Accepted
Category: Not Applicable
Reason: No inappropriate content found<|eot_id|><|start_header_id|>user<|end_header_id|>

### Input:
{}

### Response:
{}


"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(filtered_ca_plates):
    #instructions = examples["instruction"]
    inputs       = filtered_ca_plates["plate"]
    outputs      = filtered_ca_plates["output"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = vanity_prompt.format(input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/30006 [00:00<?, ? examples/s]

In [25]:
dataset.features

{'plate': Value(dtype='string', id=None),
 'status': Value(dtype='string', id=None),
 'category': Value(dtype='string', id=None),
 'reviewer_comments': Value(dtype='string', id=None),
 'review_reason_code': Value(dtype='float64', id=None),
 'customer_meaning': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

### 4. Training the Model / Finetuning

* We are using Huggingface `SFTTrainer` from it's [Transformer Reinforcement Learning](https://huggingface.co/docs/trl/sft_trainer) Library
* We are running it for 120 steps

In [26]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 120,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adafactor",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/30006 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [27]:
# code to view the memory and GPU running statistics
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
2.281 GB of memory reserved.


In [28]:
#actual training is done here
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 30,006 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 120
 "-____-"     Number of trainable parameters = 11,272,192


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,3.054
2,3.0374
3,2.9724
4,2.8138
5,2.5021
6,2.1327
7,1.6618
8,1.2004
9,1.0586
10,0.9304


In [29]:
#@title Final Memory and Time Statistics
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

283.6593 seconds used for training.
4.73 minutes used for training.
Peak reserved memory = 3.225 GB.
Peak reserved memory for training = 0.944 GB.
Peak reserved memory % of max memory = 21.867 %.
Peak reserved memory for training % of max memory = 6.401 %.


### 5. Model execution for some sample inputs

In [30]:
# Some examples to test the model
# PUSYWGN
# BADAAZ
# RUBMYDUB
# 2CHARM

In [31]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    vanity_prompt.format(
        "2CHARM", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|> You are a vanity license plate reviewer. Determine whether the given vanity license plate would be accepted or rejected by the DMV. Look out for any vulgar, sexual and inappropriate content for public display. If it is inapproriate and falls into one of the below categories, then reject the plate and provide a reason for the same. Here are the categories:\n\n1. Sexual connotation.\n2. Vulgar or degrading term.\n3. Swear word or profane.\n4. Negative connotation to group.\n5. Misrepresents law enforcement.\n6. Deleted from series.\n7. Foreign, slang, or phonetic.\n8. Not Applicable (if status is Accepted)\n\nGive the response in the following format->\nStatus: Accepted or Rejected\nCategory:\nReason:\n\nExample (1):\n\nInput: ZEROFCK\n\nOutput:\n\nStatus: Rejected\nCategory: Swear word or profane\nReason: It implies Zero Fucks, which is a swear phrase.\n\nExample(2):\n\nInput: LOVEALL\n\nOutput:\n\nStatus: Accepted\nCategory: Not Applicable\nReason: No inappropriate 

* Using a `TextStreamer` for continuous inference, so that we can check the results and model response token by token (like how ChatGPT generates responses)

In [32]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    vanity_prompt.format(
        "SWTSHIT", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|> You are a vanity license plate reviewer. Determine whether the given vanity license plate would be accepted or rejected by the DMV. Look out for any vulgar, sexual and inappropriate content for public display. If it is inapproriate and falls into one of the below categories, then reject the plate and provide a reason for the same. Here are the categories:

1. Sexual connotation.
2. Vulgar or degrading term.
3. Swear word or profane.
4. Negative connotation to group.
5. Misrepresents law enforcement.
6. Deleted from series.
7. Foreign, slang, or phonetic.
8. Not Applicable (if status is Accepted)

Give the response in the following format->
Status: Accepted or Rejected
Category:
Reason:

Example (1):

Input: ZEROFCK

Output:

Status: Rejected
Category: Swear word or profane
Reason: It implies Zero Fucks, which is a swear phrase.

Example(2):

Input: LOVEALL

Output:

Status: Accepted
Category: Not Applicable
Reason: No inappropriate content found<|eot_id|><|start_heade

### 6. Testing the model with some pre-labeled data derived from New York plates dataset

In [33]:


modified_vanity_prompt = """ You are a vanity license plate reviewer. Determine whether the given vanity license plate would be accepted or rejected by the DMV. Look out for any vulgar, sexual and inappropriate content for public display. If it is inapproriate and falls into one of the below categories, then reject the plate and provide a reason for the same. Here are the categories:

1. Sexual connotation.
2. Vulgar or degrading term.
3. Swear word or profane.
4. Negative connotation to group.
5. Misrepresents law enforcement.
6. Deleted from series.
7. Foreign, slang, or phonetic.
8. Not Applicable (if status is Accepted)

Give the response in the following format->
Status: Accepted or Rejected
Category:
Reason:

Example (1):

Input: ZEROFCK

Output:

Status: Rejected
Category: Swear word or profane
Reason: It implies Zero Fucks, which is a swear phrase.

Example(2):

Input: LOVEALL

Output:

Status: Accepted
Category: Not Applicable
Reason: No inappropriate content found<|eot_id|><|start_header_id|>user<|end_header_id|>"""




In [34]:
test_plates = pd.read_csv("test_plates.csv")

In [35]:
accepted_plates = test_plates[test_plates["status"] == "Accepted"]
rejected_plates = test_plates[test_plates["status"] == "Rejected"]

# Take a random sample of 100 rows from each (adjust size if fewer rows are available)
sampled_accepted = accepted_plates.sample(n=50, random_state=42)
sampled_rejected = rejected_plates.sample(n=50, random_state=42)

# Combine the two samples into a single DataFrame
filtered_test_plates = pd.concat([sampled_accepted, sampled_rejected])

# Shuffle the combined DataFrame for randomness
filtered_test_plates = filtered_test_plates.sample(frac=1, random_state=42).reset_index(drop=True)

In [36]:
def classify_vanity_plate(plate):
    # Generate input using the prompt
    input_text = modified_vanity_prompt + f"\nInput: {plate}\n\n### Response:"
    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

    # Generate output
    text_streamer = TextStreamer(tokenizer)
    output = model.generate(**inputs, max_new_tokens=128, streamer=text_streamer)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the status from the model's output
    if "Status: Accepted" in decoded_output:
        return "Accepted"
    elif "Status: Rejected" in decoded_output:
        return "Rejected"
    else:
        return "Unknown"  # Handle unexpected cases

In [37]:
#debugging
import re

def classify_vanity_plate(plate):
    # Generate input
    input_text = modified_vanity_prompt + f"\nInput: {plate}\n\n### Response:"
    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

    # Generate output
    output = model.generate(**inputs, max_new_tokens=128)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the status using regex
    match = re.search(r"Status:\s*(Accepted|Rejected)", decoded_output)
    if match:
        return match.group(1)
    return "Unknown"  # Handle unexpected cases


In [38]:
filtered_test_plates["predicted_status"] = filtered_test_plates["plate"].apply(classify_vanity_plate)

In [40]:
from sklearn.metrics import accuracy_score

In [55]:
# Calculate the number of correct predictions
correct_predictions = (filtered_test_plates['status'] == filtered_test_plates['predicted_status']).sum()

# Calculate the total number of predictions
total_predictions = len(filtered_test_plates)

# Calculate the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy:.2%}")


Accuracy: 71.00%


In [45]:
from sklearn.metrics import accuracy_score

In [56]:
accuracy = accuracy_score(filtered_test_plates["status"], filtered_test_plates["predicted_status"])
print(f"Model Accuracy: {accuracy:.2%}")

Model Accuracy: 71.00%
