In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

In [None]:
# @title For environments other than Google Colab
# Installs unsloth and vllm together with their dependencies via the %pip magic.
# NOTE(review): the Colab install cell above already runs `pip install unsloth vllm`
# when no COLAB_* environment variable is present, so this cell looks redundant
# after that one; do not run it on Colab, where that cell deliberately installs
# with --no-deps.
%pip install unsloth vllm

In [None]:
# @title For Loading Checkpoints from Google Drive (on colab)
# Mounts Google Drive at /content/drive so that files under
# /content/drive/MyDrive can be read by later cells.
# Colab only: this imports from the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title Clone the Bengali poem dataset (Kaggle)
# Clones the dataset repository into the current directory, then switches into
# the clone so the next cell can read its relative `dataset/` folder.
# Assumes the notebook's working directory is /kaggle/working when this runs.
!git clone https://github.com/shuhanmirza/Bengali-Poem-Dataset.git
%cd /kaggle/working/Bengali-Poem-Dataset

In [None]:
# @title For setting up dataset in Kaggle
import os
import pandas as pd

dataset_dir = 'dataset'

data = []

for poet in os.listdir(dataset_dir):
    poet_dir = os.path.join(dataset_dir, poet)
    if os.path.isdir(poet_dir):
        for poem in os.listdir(poet_dir):
            poem_dir = os.path.join(poet_dir, poem)
            if os.path.isdir(poem_dir):
                class_text = None
                class_file_path = os.path.join(poem_dir, 'CLASS.txt')
                if os.path.exists(class_file_path):
                    with open(class_file_path, 'r', encoding='utf-8') as class_file:
                        class_text = class_file.read().strip()

                for file in os.listdir(poem_dir):
                    if file.endswith('.txt') and file not in ['CLASS.txt', 'SOURCE.txt']:
                        file_path = os.path.join(poem_dir, file)
                        with open(file_path, 'r', encoding='utf-8') as f:
                            poem_text = f.read()
                            title = os.path.splitext(file)[0]
                            data.append({
                                'poet': poet,
                                'category': class_text,
                                'title': title,
                                'poem': poem_text
                            })

df = pd.DataFrame(data)

df.to_csv('poems_dataset.csv', index=False, encoding='utf-8')
!mv /kaggle/working/Bengali-Poem-Dataset/poems_dataset.csv /kaggle/working/
%cd /kaggle/working/
df.head()


In [None]:
# @title Loading the last checkpoint from Google drive
!cp /content/drive/MyDrive/adapter_model.safetensors /content/outputs/checkpoint-1500/

In [None]:
# @title Loading the Gemma3 1 billion Parameters using unsloth
from unsloth import FastModel
import torch

# Catalogue of alternative checkpoints, kept for reference only: nothing in
# this cell reads the list. More models at https://huggingface.co/unsloth
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit", "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit", "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
    # Other popular models!
    "unsloth/Llama-3.1-8B", "unsloth/Llama-3.2-3B", "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3", "unsloth/Phi-4",
]

# Load the instruction-tuned 1B Gemma 3 checkpoint in 4-bit; full fine-tuning
# is switched off. Pass token="hf_..." as well when using gated models.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-1b-it",
    max_seq_length=2048,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
)

In [None]:
# @title To apply Parameter-Efficient Fine-Tuning (PEFT) to the pre-trained Gemma-3-1b model (unsloth provided)
# Wraps the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
model = FastModel.get_peft_model(
    model,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

In [None]:
# @title Prompt and Dataset Loading (upload the poems_dataset.csv) to Google colab
# Training template with three positional slots, filled in this order:
# title, category, poem. The inference cell further down defines the same
# template and leaves the poem slot empty.
prompt = """You are tasked with writing a poem related to the title in the style mentioned below. The poem should fit the specified category.

### Title:
{}
### Category:
{}

### Poem:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; it is appended to
# every formatted training example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples, template = None, eos_token = None):
    """Turn a batch of dataset rows into training texts.

    Args:
        examples: batch mapping with equally long "title", "category" and
            "poem" sequences (the shape passed by ``dataset.map(batched=True)``).
        template: format string with three ``{}`` slots (title, category,
            poem). Defaults to the module-level ``prompt``.
        eos_token: string appended to every text. Defaults to the
            module-level ``EOS_TOKEN``.

    Returns:
        ``{"text": [...]}`` with one formatted string per input row.

    Missing values (``None``, e.g. a poem folder without CLASS.txt) are
    rendered as an empty string instead of the literal text "None".
    """
    if template is None:
        template = prompt
    if eos_token is None:
        eos_token = EOS_TOKEN

    def _blank_if_missing(value):
        # str.format would otherwise print a null field as "None".
        return "" if value is None else value

    rows = zip(examples["title"], examples["category"], examples["poem"])
    texts = [
        template.format(*(_blank_if_missing(field) for field in row)) + eos_token
        for row in rows
    ]
    return { "text" : texts, }

from datasets import load_dataset
import os

# On Colab the CSV is uploaded to /content; on Kaggle the dataset-setup cell
# moves it to /kaggle/working. Use whichever exists, and fall back to the Colab
# path so a missing file is still reported against the original location.
_poems_csv_candidates = ["/content/poems_dataset.csv", "/kaggle/working/poems_dataset.csv"]
poems_csv_path = next(
    (path for path in _poems_csv_candidates if os.path.exists(path)),
    _poems_csv_candidates[0],
)

dataset = load_dataset("csv", data_files=poems_csv_path, split="train")
# Adds a "text" column holding the fully formatted prompt for every row.
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [None]:
# @title Set up a training pipeline using the UnslothTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

max_seq_length = 2048

# Pick the mixed-precision mode once: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings; metrics are reported to Weights & Biases
# and checkpoints are written under "outputs".
training_args = UnslothTrainingArguments(
    output_dir = "outputs",
    report_to = "wandb",
    logging_steps = 1,
    seed = 3407,
    # batching / schedule
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    num_train_epochs = 3,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    # optimiser
    optim = "adamw_8bit",
    learning_rate = 5e-5,
    embedding_learning_rate = 1e-5,
    weight_decay = 0.01,
    # precision
    bf16 = use_bf16,
    fp16 = not use_bf16,
)

# Trainer over the "text" column produced by formatting_prompts_func.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = training_args,
)

In [None]:
# @title Start training. Use trainer.train() to start from Zero
# Runs fine-tuning with the `trainer` configured in the previous cell.
# NOTE(review): called without arguments, as here, this presumably starts from
# scratch and ignores the adapter copied into outputs/checkpoint-1500 earlier;
# resuming would need the trainer's resume_from_checkpoint option — confirm.
trainer.train()

In [None]:
# @title wandb login setup (needs your API key)
# Logs in to Weights & Biases through its CLI and initialises a run.
# NOTE(review): the trainer is created with report_to = "wandb", yet this cell
# sits after the training cell; it presumably has to be run before training
# starts — confirm and consider moving it up.
!wandb login
import wandb
wandb.init()

In [None]:
# Write the fine-tuned model and its tokenizer side by side into one local
# folder, which the upload cell later publishes.
adapter_dir = "gemma-poetry-bn"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

In [None]:
# @title Huggingface login
# Authenticates this session with the Hugging Face Hub.
# NOTE(review): login() is called without a token, so it presumably prompts
# for one interactively — confirm; never paste a token into the notebook itself.
from huggingface_hub import login
login()

In [None]:
# @title Publish to Huggingface
# Uploads the local gemma-poetry-bn folder (written by the save_pretrained
# cell) to the Hub repository Ankita-Porel/gemma3-1b-v1, the same repository
# id the inference cell loads from.
! huggingface-cli upload Ankita-Porel/gemma3-1b-v1 gemma-poetry-bn

In [None]:
# @title Load the model from huggingface for inference
from unsloth import FastModel
import torch

# Same loading options as for the base model, but pointing at the published
# fine-tuned repository. Add token="hf_..." to the options if the repo is private.
inference_load_options = dict(
    model_name = "Ankita-Porel/gemma3-1b-v1",
    max_seq_length = 2048,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,
)
model, tokenizer = FastModel.from_pretrained(**inference_load_options)

In [None]:
# @title Run the tests
# Switch the loaded model to unsloth's inference mode.
model = FastModel.for_inference(model)

# Same template as used for training; the poem slot is left empty below so the
# model continues the text after "### Poem:".
prompt = """You are tasked with writing a poem related to the title in the style mentioned below. The poem should fit the specified category.

### Title:
{}
### Category:
{}

### Poem:
{}"""

inputs = tokenizer(
[
    prompt.format(
        "বাংলার বায়ু, বাংলার ফল- পূণ্য হউক, পূণ্য হউক,",
        "কীর্তন",
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, repetition_penalty = 2.0, use_cache = True)

generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Single source of truth for the output file, so the message below always
# names the file that was actually written.
output_path = "generated_output.txt"
with open(output_path, "w", encoding="utf-8") as file:
    file.write(generated_text)

print(f"Output saved to '{output_path}'")
# Cell output: the raw decoded sequence, special tokens included.
tokenizer.batch_decode(outputs)