This is my first time posting code. If there's anything I can improve, feel free to share your feedback!

Downloading the dependencies

In [None]:
!pip install --upgrade datasets
!pip install --upgrade transformers
!pip install --upgrade pert
!pip install --upgrade trl
!pip install accelerate
!pip install bitsandbytes
!pip install tensorboard

Connect to Hugging Face 

In [None]:
# Authenticate with the Hugging Face Hub so the gated Gemma weights can be downloaded.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the token from Kaggle's secret store instead of hardcoding it in the notebook.
# On Google Colab, fetch it with:
#     from google.colab import userdata; hf_token = userdata.get("HF_TOKEN")
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

# Register the token that was actually retrieved above (not a placeholder string).
login(token=hf_token)

After accepting the Gemma model license on Hugging Face, download the model.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Base checkpoint to fine-tune. Gemma is a gated repository on the Hub.
model_name = "google/gemma-2b"

# Pass the access token explicitly, as the quantized reload later in this
# notebook does, so the download does not depend on a previously cached login.
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Print the module tree to confirm the architecture that was loaded.
print(model)

Test the downloaded model 

In [None]:
# Quick sanity check of the downloaded model with a short prompt.
input_text = "what is anime?"

# Tokenize to PyTorch tensors, generate (max_length counts the prompt tokens
# too), then decode the first returned sequence.
encoded_prompt = tokenizer(input_text, return_tensors='pt')
generated_ids = model.generate(**encoded_prompt, max_length=128)
print(tokenizer.decode(generated_ids[0]))

Import your dataset from a JSON file.  
**NOTE: make sure your dataset is a labeled dataset (i.e., it contains both the independent values (inputs) and the dependent values (targets)).**

In [None]:
import json
# Read the entire training set from the Kaggle input directory into memory.
file_path = "/kaggle/input/promogen-001-dataset/PromoGen_001.json"
with open(file_path, mode="r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)


Check that the correct dataset was loaded.

In [None]:
# Show the first record to confirm the expected fields are present.
print(data[0])

In [None]:
# Number of records loaded from the JSON file.
print(len(data))

Convert your dataset into the prompt format that you want the model to learn. In my case, the task was advertisement generation based on the company name, product name, product description, ad script, and CTA. You can design your own prompt.

In [None]:
from datasets import Dataset
# Convert JSON to Hugging Face dataset using the instruction-based template
def format_data(example):
    """Render one dataset record as a single instruction-style training string.

    The instruction lines carry no trailing whitespace before their newlines,
    so the prompt header seen during training is identical to the prompt
    built at inference time later in this notebook.

    Args:
        example: Mapping with the keys "Company Name", "Product Name",
            "Product Description", "Ad Script" and "CTA".

    Returns:
        A dict with a single "text" key: the instruction and product fields,
        followed by the target ad script and call to action.

    Raises:
        KeyError: If any of the expected keys is missing from ``example``.
    """
    return {
        "text": (
            "### Instruction: Write a highly engaging advertisement script and a strong call to action for the product below.\n"
            "The ad script should creatively describe the product's features and benefits, and the call to action should motivate the audience to take action immediately.\n\n"
            f"Company Name: {example['Company Name']}\n"
            f"Product Name: {example['Product Name']}\n"
            f"Product Description: {example['Product Description']}\n\n"
            "Ad Script:\n"
            f"{example['Ad Script']}\n\n"
            "CTA:\n"
            f"{example['CTA']}"
        )
    }

# Apply the prompt template to every record, then wrap the results in a
# Hugging Face Dataset.
formatted_data = list(map(format_data, data))
dataset = Dataset.from_list(formatted_data)

In [None]:
# Inspect the first formatted example to verify the prompt template.
print(dataset[0])

**Tokenizing the dataset**  
This is where we convert the text into tokens.  
truncation: ensures that text longer than the max length is cut off.  
padding: pads every tokenized output to the same length.  
max length: the fixed length of each tokenized sequence.

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of formatted prompts to a fixed length of 128 tokens.

    Args:
        examples: Batch handed over by ``dataset.map``; its "text" entry is
            passed to the notebook-level ``tokenizer``.

    Returns:
        Whatever the tokenizer returns for the batch, with truncation enabled
        and padding applied up to ``max_length``.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **fixed_length_options)


# Run the tokenizer over the whole dataset, one batch at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Import the dependencies for the model 

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# --- Model identifiers ------------------------------------------------------
model_name = "google/gemma-2b"  # base checkpoint to fine-tune
new_model = "gemma-ft"          # name under which the fine-tuned model is saved

# --- LoRA adapter -----------------------------------------------------------
lora_r = 4          # low-rank dimension; smaller values use less memory but may hurt quality
lora_alpha = 16     # scaling factor applied to the LoRA updates
lora_dropout = 0.1  # dropout on the LoRA layers to limit overfitting

# --- 4-bit quantization -----------------------------------------------------
use_4bit = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"  # resolved to a torch dtype by name later on
bnb_4bit_use_double_quant = True

# --- Training: output and schedule -------------------------------------------
output_dir = "./promogen_001"
num_train_epochs = 4
max_steps = -1
save_steps = 25
logging_steps = 25

# --- Training: precision and batching ----------------------------------------
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
group_by_length = True

# --- Training: optimizer ------------------------------------------------------
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 0.001
max_grad_norm = 0.3
# NOTE(review): warmup_ratio is set alongside a "constant" scheduler; confirm
# whether "constant_with_warmup" was intended.
lr_scheduler_type = "constant"
warmup_ratio = 0.03

# --- Sequence handling and device placement ----------------------------------
max_seq_length = 128
packing = False
device_map = "auto"

Here we configure 4-bit quantization for the model.

In [None]:
# Turn the dtype name from the config cell into the matching torch attribute.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Gather the quantization options first, then build the config from them.
quantization_options = {
    "load_in_4bit": use_4bit,
    "bnb_4bit_quant_type": bnb_4bit_quant_type,
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

Here we load the pre-trained model with 4-bit quantization, along with its tokenizer.

In [None]:
# Tokenizer for the base checkpoint: pad on the right and reuse the
# end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
    trust_remote_code=True,
)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Base model loaded with the quantization config built above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    token=hf_token,
)
model.config.pretraining_tp = 1
model.config.use_cache = False  # the generation cache is switched off for training

Configure the LoRA settings for the Gemma model using PEFT.

In [None]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"]

# LoRA setup for causal language modelling, driven by the config-cell values.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    target_modules=lora_target_modules,
)

Configure the training arguments, which control how the model is fine-tuned.

In [None]:
# Build the training configuration from the values defined in the config cell.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Forward the gradient_checkpointing flag from the config cell so that it
    # actually takes effect instead of being silently ignored.
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)
# Display the resolved arguments as the cell output.
training_arguments


Here we initialize the SFTTrainer (Supervised Fine-Tuning Trainer) from Hugging Face TRL (Transformer Reinforcement Learning).

In [None]:

# Wire the quantized model, the tokenized dataset, the LoRA config and the
# training arguments into one supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=tokenized_datasets,
)


In [None]:
# Sanity check: number of examples in the tokenized dataset handed to the trainer.
print(f"Tokenized Dataset Size Before Training: {len(tokenized_datasets)}")


In [None]:
# Sanity check: number of examples the trainer holds; compare with the size printed above.
print(f"Trainer Dataset Size: {len(trainer.train_dataset)}")


Here we start training the model on the dataset.

In [None]:
# Run the fine-tuning loop configured by training_arguments.
trainer.train()

Then we save the model

In [None]:
# Save the trained model under the name held in new_model; the merge step below loads it from there.
trainer.save_model(new_model)

Here we load the base model, merge the fine-tuned LoRA weights into it, and save the merged model.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Reload the base model in float16 (without 4-bit quantization) for merging.
# The access token is passed explicitly, as in the quantized load earlier in
# this notebook, because the Gemma repository is gated.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Attach the fine-tuned LoRA adapter saved under `new_model`.
model = PeftModel.from_pretrained(base_model, new_model)

# Merge the LoRA weights into the base model.
model = model.merge_and_unload()

# Save the merged model for deployment.
model.save_pretrained("promogen_final_model")

# Reload the tokenizer with the same padding setup used before, then save it
# next to the merged weights.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained("promogen_final_model")


Here we test the saved model 

In [None]:

import re
# Collect the product details interactively.
company_name = input("Enter Company Name: ")
product_name = input("Enter Product Name: ")
product_description = input("Enter Product Description: ")

# Build the prompt. It stops right after "Ad Script:" so that the model
# continues with the script and the CTA. Keep this text in sync with the
# template used in format_data.
input_text = (
    "### Instruction: Write a highly engaging advertisement script and a strong call to action for the product below.\n"
    "The ad script should creatively describe the product's features and benefits, and the call to action should motivate the audience to take action immediately.\n\n"
    f"Company Name: {company_name}\n"
    f"Product Name: {product_name}\n"
    f"Product Description: {product_description}\n\n"
    "Ad Script:\n"
)

# Tokenize and move the tensors to the device the model lives on, rather than
# assuming a CUDA device is present.
inputs = tokenizer(input_text, return_tensors="pt")
inputs = {key: val.to(model.device) for key, val in inputs.items()}

# Sample a continuation. max_new_tokens bounds only the generated part, so a
# long product description cannot use up the generation budget.
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

# Decode the first returned sequence (prompt + continuation) into a string.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract ONLY the first Ad Script and CTA, stopping before a repeated
# "Company Name:" line or at the end of the text.
pattern = r"Ad Script:\s*(.*?)\s*CTA:\s*(.*?)(?:\nCompany Name:|\Z)"
match = re.search(pattern, generated_text, re.DOTALL)

if match:
    ad_script = match.group(1).strip()
    cta = match.group(2).strip()
    print("\n Ad Script:\n", ad_script)
    print("\n Call to Action:\n", cta)
else:
    # Fall back to showing the raw generation so the failure can be inspected.
    print("\n Couldn't find Ad Script and CTA properly.\n")
    print(generated_text)


In [None]:
# Archive the merged model directory into a single zip file for download.
# Alternative kept for reference (archives the ./promogen_001 training output instead):
# !zip -r /kaggle/working/promogen_001.zip /kaggle/working/promogen_001
!zip -r promogen_final_model.zip promogen_final_model/


In [None]:
from IPython.display import FileLink
# Render a link to the archive as the cell output.
FileLink(r'promogen_final_model.zip')