In [1]:
# %%capture
# (Uncomment the line above to hide the installer output printed by this cell.)
# Install the required libraries.
# pip is upgraded first and then used to install `uv`, which performs the
# actual package installation below.
%pip install --upgrade pip
%pip install uv
# `--system` targets the system Python environment instead of a virtualenv, and
# `--index` adds the PyTorch wheel index for CUDA 12.4 (cu124).
# NOTE(review): no versions are pinned, so a later re-run may resolve different
# package versions than the ones recorded in this notebook's output.
!uv pip install --system --no-progress --link-mode=symlink --index=https://download.pytorch.org/whl/cu124 accelerate unsloth vllm sentencepiece protobuf datasets wandb huggingface_hub kagglehub[hf-datasets] hf_transfer

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Note: you may need to restart the kernel to use updated packages.
Collecting uv
  Downloading uv-0.7.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.7.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m110.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.7.2
Note: you may need to restart the kernel to use updated packages.
[2m

In [2]:
# Download the Bengali poem corpus. The clone lands in ./Bengali-Poem-Dataset,
# relative to the notebook's working directory.
!git clone https://github.com/shuhanmirza/Bengali-Poem-Dataset.git

Cloning into 'Bengali-Poem-Dataset'...
remote: Enumerating objects: 18637, done.[K
remote: Counting objects: 100% (18637/18637), done.[K
remote: Compressing objects: 100% (18269/18269), done.[K
remote: Total 18637 (delta 70), reused 18632 (delta 69), pack-reused 0 (from 0)[K
Receiving objects: 100% (18637/18637), 6.11 MiB | 16.37 MiB/s, done.
Resolving deltas: 100% (70/70), done.


In [3]:
import os
import random
import re
import json


def normalize_text(text):
    """Strip numbering and invisible characters from a piece of poem text.

    The following substitutions are applied in order, each replacing the
    matched run with a single space:

    1. parenthesised groups made up only of Bengali digits and whitespace,
       such as "(১২)";
    2. any remaining run of Bengali digits (U+09E6 to U+09EF);
    3. runs of no-break, zero-width and ideographic space characters;
    4. runs of spaces, tabs, carriage returns, form feeds and vertical tabs.

    Newlines inside the text are kept; leading and trailing whitespace
    (newlines included) is removed from the final result.
    """
    patterns_in_order = (
        r"\([\s\u09E6-\u09EF]+\)",
        r"[\u09E6-\u09EF]+",
        r"[\u00A0\u200B\u200C\u200D\u2060\u3000]+",
        r"[ \t\r\f\v]+",
    )
    cleaned = text
    for pattern in patterns_in_order:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()


def make_datasets(
    parent_dir="/kaggle/working/Bengali-Poem-Dataset/dataset",
    output_dir="/content",
    seed=None,
):
    """Build the two instruction datasets from the cloned poem corpus.

    The corpus is read as ``parent_dir/<poet>/<poem>/``. Every ``*.txt`` file
    in a poem directory other than ``CLASS.txt`` and ``SOURCE.txt`` is treated
    as a poem; ``CLASS.txt``, when present, supplies the poem's topic.

    Two JSON files are written into ``output_dir``:

    * ``classes.json``: one record per poem file whose directory has a
      non-empty ``CLASS.txt`` ("write a poem about <topic>").
    * ``poems.json``: one record per poem file with at least two non-empty
      lines ("write a full poem from these two consecutive lines").

    Each record has the keys ``"Instructions"``, ``"Input"`` and ``"Output"``.

    Args:
        parent_dir: Root ``dataset`` directory of the cloned repository. The
            default matches a clone made from ``/kaggle/working``.
        output_dir: Directory that receives ``poems.json`` and
            ``classes.json``; it is created if it does not exist.
        seed: Optional seed for choosing the two prompt lines. ``None`` (the
            default) keeps the choice random from run to run.

    Returns:
        None. A one-line summary is printed after the files are written.

    Notes:
        A poem directory whose ``CLASS.txt`` cannot be read is skipped
        entirely; a poem file that cannot be read is skipped on its own. Both
        cases print a message instead of raising.
    """
    # A private generator makes the line choice reproducible when `seed` is
    # given, without touching the global `random` state.
    rng = random.Random(seed)

    # The line-prompt instruction is identical for every poem. The two prompt
    # lines travel in "Input"; the full poem must NOT be embedded here,
    # otherwise the target text leaks into the prompt.
    line_prompt_instruction = "নিচের দুটি লাইন ব্যবহার করে একটি সম্পূর্ণ বাংলা কবিতা লেখো। কবিতায় নতুন লাইনের জন্য '\n' এবং নতুন স্তবকের জন্য '\n\n\n' ব্যবহার করো।"

    poems = []
    classes = []
    missing_class_count = 0

    # os.listdir order is arbitrary; sorting keeps the record order (and so
    # the seeded line picks) stable between runs and machines.
    for poet in sorted(os.listdir(parent_dir)):
        poet_dir = os.path.join(parent_dir, poet)
        if not os.path.isdir(poet_dir):
            continue

        for poem in sorted(os.listdir(poet_dir)):
            poem_dir = os.path.join(poet_dir, poem)
            if not os.path.isdir(poem_dir):
                continue

            class_text = ""
            class_path = os.path.join(poem_dir, "CLASS.txt")
            if os.path.exists(class_path):
                try:
                    with open(class_path, "r", encoding="utf-8") as f:
                        class_text = normalize_text(f.read())
                except Exception as e:
                    print(f"Error reading CLASS.txt in {poem_dir}: {e}")
                    continue

            for file in sorted(os.listdir(poem_dir)):
                file_path = os.path.join(poem_dir, file)
                if (
                    not file.endswith(".txt")
                    or file in ("CLASS.txt", "SOURCE.txt")
                    or not os.path.isfile(file_path)
                ):
                    continue

                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        raw_text = f.read()

                    # Normalise line by line and drop lines that end up empty.
                    poem_lines = [
                        line
                        for line in map(normalize_text, raw_text.split("\n"))
                        if line
                    ]
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    continue

                # A file with no usable lines would only yield a record with
                # an empty "Output", so it is skipped.
                if not poem_lines:
                    continue

                # NOTE(review): blank lines are dropped above, so "Output"
                # never contains the stanza break the instructions mention;
                # confirm this is intended.
                poem_text = "\n".join(poem_lines)

                if class_text:
                    classes.append(
                        {
                            "Instructions": f"একটি বাংলা কবিতা লেখো যার বিষয় হলো {class_text}। কবিতায় নতুন লাইনের জন্য '\n' এবং নতুন স্তবকের জন্য '\n\n\n' ব্যবহার করো।",
                            "Input": class_text,
                            "Output": poem_text,
                        }
                    )
                else:
                    missing_class_count += 1

                if len(poem_lines) >= 2:
                    # Pick two consecutive lines as the prompt.
                    start = rng.randint(0, len(poem_lines) - 2)
                    line1 = poem_lines[start]
                    line2 = poem_lines[start + 1]

                    poems.append(
                        {
                            "Instructions": line_prompt_instruction,
                            "Input": f"{line1}\n{line2}",
                            "Output": poem_text,
                        }
                    )

    # The output directory may not exist yet on a fresh machine.
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, "poems.json"), "w", encoding="utf-8") as f:
        json.dump(poems, f, ensure_ascii=False, indent=4)
    with open(os.path.join(output_dir, "classes.json"), "w", encoding="utf-8") as f:
        json.dump(classes, f, ensure_ascii=False, indent=4)

    print(
        f"Wrote {len(poems)} line-prompt records and {len(classes)} topic records; "
        f"{missing_class_count} poem files had no topic (missing or empty CLASS.txt)."
    )


# Build the datasets when this code runs as the main module, which includes
# running the cell in the notebook kernel.
if __name__ == "__main__":
    make_datasets()

In [4]:
import json
import os
import torch
import random


def split_dataset(
    input_file, train_output_file, test_output_file, test_size=0.2, random_state=42
):
    """Randomly partition a JSON list into a training file and a testing file.

    The input must be a JSON array. Its records are permuted with
    ``torch.randperm`` after seeding, and the last ``int(len * test_size)``
    records of the permutation become the test split.

    Args:
        input_file (str): Path of the JSON file to read.
        train_output_file (str): Path that receives the training records.
        test_output_file (str): Path that receives the testing records.
        test_size (float): Fraction of records assigned to the test split.
        random_state (int): Seed applied to both the ``torch`` and ``random``
            global generators before the permutation is drawn.
    """
    # Seed the global generators so the permutation below is repeatable.
    torch.manual_seed(random_state)
    random.seed(random_state)

    with open(input_file, "r", encoding="utf-8") as source:
        records = json.load(source)

    total = len(records)
    n_test = int(total * test_size)
    n_train = total - n_test

    # One random ordering of all indices, cut at the train/test boundary.
    order = torch.randperm(total).tolist()
    train_split = [records[idx] for idx in order[:n_train]]
    test_split = [records[idx] for idx in order[n_train:]]

    # Training data is written first, then testing data.
    for path, split in (
        (train_output_file, train_split),
        (test_output_file, test_split),
    ):
        with open(path, "w", encoding="utf-8") as sink:
            json.dump(split, sink, ensure_ascii=False, indent=4)

    print(f"Total samples: {total}")
    print(f"Training samples: {len(train_split)}")
    print(f"Testing samples: {len(test_split)}")


def main():
    """Split ``/content/poems.json`` into ``/content/train.json`` and ``/content/test.json``.

    The split uses ``split_dataset``'s default test fraction and seed.
    """
    base_dir = "/content"
    source_path, train_path, test_path = (
        os.path.join(base_dir, name)
        for name in ("poems.json", "train.json", "test.json")
    )

    # Make sure the output directory exists before anything is written.
    os.makedirs(os.path.dirname(train_path), exist_ok=True)

    split_dataset(source_path, train_path, test_path)


# Run the split when this code runs as the main module, which includes running
# the cell in the notebook kernel.
if __name__ == "__main__":
    main()

Total samples: 6055
Training samples: 4844
Testing samples: 1211


In [5]:
import json
import random

# Pool the topic-prompt and line-prompt records into one list.
with open("/content/classes.json", "r", encoding="utf-8") as f:
    classes_data = json.load(f)
with open("/content/poems.json", "r", encoding="utf-8") as f:
    poems_data = json.load(f)

# NOTE(review): both files can carry the same poem as "Output", so one poem may
# end up in more than one split; confirm this overlap is acceptable.
merged_data = classes_data + poems_data

# Seed explicitly so the shuffle does not depend on another cell having seeded
# `random` beforehand. 42 is the value the earlier split cell uses, so a
# top-to-bottom run produces the same order as before.
random.seed(42)
random.shuffle(merged_data)

# 80% train, 12% validation, and the remainder (about 8%) test.
total_size = len(merged_data)
train_size = int(0.8 * total_size)
val_size = int(0.12 * total_size)
test_size = total_size - train_size - val_size

train_data = merged_data[:train_size]
val_data = merged_data[train_size : train_size + val_size]
test_data = merged_data[train_size + val_size :]

# The three slices must cover every record exactly once.
assert len(train_data) + len(val_data) + len(test_data) == total_size
assert len(test_data) == test_size


def write_jsonl(data, filename):
    """Write ``data`` to ``filename`` in JSON Lines form.

    Each item is serialised on its own line as UTF-8 JSON with non-ASCII
    characters left unescaped. An existing file is overwritten.
    """
    with open(filename, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )


# Persist every split. The validation split has to be written as well: the
# summary printed below reports val.jsonl as created.
write_jsonl(train_data, "/content/train.jsonl")
write_jsonl(val_data, "/content/val.jsonl")
write_jsonl(test_data, "/content/test.jsonl")

print(
    f"Files created: train.jsonl ({len(train_data)} entries), "
    f"val.jsonl ({len(val_data)} entries), test.jsonl ({len(test_data)} entries)"
)

Files created: train.jsonl (9708 entries), val.jsonl (1456 entries), test.jsonl (971 entries)


In [6]:
import os
import logging
from unsloth import FastModel
from datasets import load_dataset


# Timestamped INFO-level logging shared by the evaluation cells.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Evaluation input (the JSON Lines test split) and where the metric summary is saved.
DATA_PATH = "/content/test.jsonl"
OUTS_DIR = "out"  # relative path: resolved against the kernel's working directory
OUTPUT_FILE = os.path.join(OUTS_DIR, "combined_metrics_results.txt")


# Create the results directory up front so the final write cannot fail on it.
os.makedirs(OUTS_DIR, exist_ok=True)

# Load the fine-tuned poem model and its tokenizer, requesting 4-bit loading.
model, tokenizer = FastModel.from_pretrained(
    model_name="Ankita-Porel/gemma3-4b-bn-chat-poem-ft",
    max_seq_length=2048,
    load_in_4bit=True,
    load_in_8bit = False,
    full_finetuning = False,
    # token="hf_..."
    # NOTE(review): presumably only needed for gated or private repos; read it
    # from the environment rather than hardcoding it here.
)

# Switch the model to inference use through Unsloth's helper.
model = FastModel.for_inference(model)

# Load test dataset
logger.info("Loading test dataset...")
# The file is exposed under the "train" split key even though it holds the
# test split (the cell output shows "Generating train split").
test_dataset = load_dataset("json", data_files=DATA_PATH)["train"]

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-04 11:54:53.147461: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746359693.560707      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746359693.675107      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-04 11:55:18 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 05-04 11:55:18 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.7: Fast Gemma3 patching. Transformers: 4.51.1. vLLM: 0.8.5.post1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.35G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Install the metric libraries used by the evaluation cell below.
# NOTE(review): `-U` also replaced packages that were already installed (see
# the uninstall/install list in the output), after the model was loaded in the
# previous cell; consider moving this into the first install cell.
!uv pip install --system --no-progress --link-mode=symlink -U evaluate rouge_score sacrebleu

[2mUsing Python 3.11.11 environment at: /usr[0m
[2mResolved [1m43 packages[0m [2min 3.57s[0m[0m
   [36m[1mBuilding[0m[39m rouge-score[2m==0.1.2[0m
[36m[1mDownloading[0m[39m lxml [2m(4.7MiB)[0m
      [32m[1mBuilt[0m[39m rouge-score[2m==0.1.2[0m
[36m[1mDownloading[0m[39m numpy [2m(15.7MiB)[0m
[36m[1mDownloading[0m[39m aiohttp [2m(1.6MiB)[0m
[36m[1mDownloading[0m[39m pyarrow [2m(40.3MiB)[0m
 [32m[1mDownloaded[0m[39m aiohttp
 [32m[1mDownloaded[0m[39m lxml
 [32m[1mDownloaded[0m[39m pyarrow
 [32m[1mDownloaded[0m[39m numpy
[2mPrepared [1m20 packages[0m [2min 2.35s[0m[0m
[2mUninstalled [1m16 packages[0m [2min 389ms[0m[0m
[2mInstalled [1m20 packages[0m [2min 136ms[0m[0m
 [31m-[39m [1mabsl-py[0m[2m==1.4.0[0m
 [32m+[39m [1mabsl-py[0m[2m==2.2.2[0m
 [31m-[39m [1maiohttp[0m[2m==3.11.16[0m
 [32m+[39m [1maiohttp[0m[2m==3.11.18[0m
 [31m-[39m [1mcertifi[0m[2m==2025.1.31[0m
 [3

In [8]:
import evaluate
from tqdm import tqdm

# Tokenization function
def tokenize_function(example):
    """Tokenize one record's prompt and target with the module-level ``tokenizer``.

    The prompt is ``Instructions`` and ``Input`` joined by a single space; the
    target is ``Output``. Both are truncated and padded to 1024 tokens.

    Returns:
        dict: ``input_ids`` and ``attention_mask`` for the prompt, and
        ``labels`` holding the target's token ids (first row of each encoding).
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 1024,
        "return_tensors": "pt",
    }

    prompt = example["Instructions"] + " " + example["Input"]
    prompt_encoding = tokenizer(prompt, **encode_options)
    target_encoding = tokenizer(example["Output"], **encode_options)

    # return_tensors="pt" yields a batch dimension; keep only the single row.
    return {
        "input_ids": prompt_encoding.input_ids[0],
        "attention_mask": prompt_encoding.attention_mask[0],
        "labels": target_encoding.input_ids[0],
    }


# Add `input_ids`, `attention_mask` and `labels` columns to every test record.
# NOTE(review): generate_predictions() below tokenizes the raw text itself and
# reads only "Instructions", "Input" and "Output", so these padded columns look
# unused in this cell; confirm before removing the map.
test_dataset = test_dataset.map(tokenize_function)

# Load all metrics
logger.info("Loading metrics...")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
chrf = evaluate.load("chrf")


# Function to generate predictions
def generate_predictions(
    model, tokenizer, dataset, max_new_tokens=200, max_input_length=1024
):
    """Generate one completion per record and collect it with its reference.

    For each record the prompt is ``Instructions`` and ``Input`` joined by a
    single space. Only the newly generated tokens are decoded, so the prompt
    is not part of the prediction that the metrics later score.

    Uses the module-level ``logger``, ``tqdm`` and ``torch``.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
            Assumed to be a decoder-only model whose ``generate()`` output
            starts with the prompt tokens.
        tokenizer: Callable tokenizer exposing ``decode()`` and ``pad_token_id``.
        dataset: Iterable of records with "Instructions", "Input" and
            "Output" keys.
        max_new_tokens: Upper bound on generated tokens per record.
        max_input_length: Prompt truncation length in tokens.

    Returns:
        tuple: ``(predictions, references, references_for_bleu)`` where
        ``references_for_bleu`` wraps each reference in a one-element list.
    """
    model.eval()
    predictions = []
    references = []
    references_for_bleu = []  # BLEU expects a list of references per prediction

    logger.info("Generating predictions...")
    for example in tqdm(dataset, desc="Processing", unit="sample"):
        input_text = example["Instructions"] + " " + example["Input"]
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_length,
        ).to(model.device)
        prompt_length = inputs.input_ids.shape[1]

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.pad_token_id,
            )

        # generate() returns prompt + continuation; decoding the whole
        # sequence would put the prompt into every prediction and distort
        # the metrics, so keep only the tokens after the prompt.
        completion_ids = output_ids[0][prompt_length:]
        generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True)
        predictions.append(generated_text)
        references.append(example["Output"])
        references_for_bleu.append([example["Output"]])  # Wrap in list for BLEU

    return predictions, references, references_for_bleu


# Generate outputs (do this only once for all metrics)
logger.info("Generating model outputs...")
preds, refs, refs_for_bleu = generate_predictions(model, tokenizer, test_dataset)

# Compute all metrics
logger.info("Computing metrics...")
bleu_scores = bleu.compute(predictions=preds, references=refs_for_bleu)
# ROUGE's default tokenizer keeps only ASCII letters and digits, which throws
# away Bengali text; split on whitespace so the poems are actually compared.
rouge_scores = rouge.compute(predictions=preds, references=refs, tokenizer=str.split)
chrf_scores = chrf.compute(predictions=preds, references=refs)

# Combine results
results = {
    "BLEU": bleu_scores["bleu"],
    "ROUGE-1": rouge_scores["rouge1"],
    "ROUGE-2": rouge_scores["rouge2"],
    "ROUGE-L": rouge_scores["rougeL"],
    "ChRF": chrf_scores["score"],
}

# Log and save results
logger.info("\nFinal Test Metrics:")
for key, value in results.items():
    logger.info(f"{key}: {value:.4f}")

# Explicit encoding keeps the results file independent of the platform default.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.write("Final Test Metrics:\n")
    for key, value in results.items():
        f.write(f"{key}: {value:.4f}\n")

logger.info(f"\nResults saved in: {OUTPUT_FILE}")

Map:   0%|          | 0/971 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Processing: 100%|██████████| 971/971 [8:32:16<00:00, 31.65s/sample]
