In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
# Install logging, metric and fine-tuning dependencies (this cell's output is captured).
!pip install wandb -q
!pip install rouge-score bert_score evaluate
import torch
# CUDA compute capability of the current GPU; only the major part is used to pick a branch below.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
from datasets import load_dataset
import pandas as pd
from trl import SFTTrainer
from transformers import TrainingArguments
import wandb
from datasets import DatasetDict
# Initialise wandb in "disabled" mode for this session.
wandb.init(mode="disabled")
# NOTE(review): pandas is imported a second time here; the repeat is harmless but redundant.
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from unsloth import FastLanguageModel
import torch

# Hub identifier of the summarisation corpus
dataset_name = "csebuetnlp/xlsum"

# Fetch the 'bengali' configuration of that corpus
dataset = load_dataset(dataset_name, name='bengali')

# Show the available splits and their columns
print(dataset)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Materialise the train / test / validation splits as pandas DataFrames
train_df, test_df, validation_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "test", "validation")
)

In [None]:
# Keep only the article body and its reference summary.
# .copy() gives each frame its own data: the following cells assign new values
# to these columns, which on a column subset can raise pandas'
# SettingWithCopyWarning.
train_df = train_df[["text", "summary"]].copy()
test_df = test_df[["text", "summary"]].copy()
validation_df = validation_df[["text", "summary"]].copy()
train_df

In [None]:
import re
def replace_strings(text):
    """Remove symbol/emoji code-point ranges and ASCII alphanumerics from ``text``.

    Two substitutions run in sequence: first every run of characters that
    falls in one of the Unicode ranges listed below is deleted, then every
    run matching ``[a-zA-Z0-9]`` (compiled case-insensitively) is deleted.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; characters outside both patterns are kept as-is.
    """
    symbol_regex = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\u00C0-\u017F"          # latin
        u"\u2000-\u206F"          # general punctuation
        "]+",
        flags=re.UNICODE,
    )
    alnum_regex = re.compile('[a-zA-Z0-9]+', flags=re.I)

    without_symbols = symbol_regex.sub(r'', text)
    return alnum_regex.sub(r'', without_symbols)

def remove_punctuations(my_str):
    """Return ``my_str`` with every character listed in ``punctuations`` removed.

    Besides punctuation marks, the removal set also contains the ASCII digits,
    the Bengali digits, the letters E, R, O, e, r, o, a backslash and a few
    emoji/replacement characters, exactly as spelled in the literal below.

    Args:
        my_str: The string to filter.

    Returns:
        A new string made of the characters of ``my_str`` that are not in the set.
    """
    # define punctuation
    # The backslash is written as an escaped pair so the literal holds the same
    # characters as before without relying on an invalid escape sequence.
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\\’,<>./?@#$%^&*_~‘—॥”‰🤣⚽️✌�￰৷￰'''

    # Build the result in one pass instead of growing a string per character.
    return "".join(char for char in my_str if char not in punctuations)

def preprocessing(text):
    """Clean ``text`` by running ``replace_strings`` and then ``remove_punctuations``."""
    stripped = replace_strings(text)
    return remove_punctuations(stripped)

In [None]:
# Run the cleaning pipeline over both text columns of every split.
for split_df in (train_df, test_df, validation_df):
    for column_name in ("text", "summary"):
        split_df[column_name] = split_df[column_name].apply(lambda x: preprocessing(str(x)))

In [None]:
# Read the stop-word spreadsheet and take its 'words' column as a plain list.
data1 = pd.read_excel(io='/content/stopwords_bangla.xlsx')
stop = data1.loc[:, 'words'].tolist()

In [None]:
def stopwordRemoval(text):
    """Drop every whitespace-separated token of ``text`` that appears in ``stop``.

    ``stop`` is the module-level stop-word list. It is turned into a set once
    per call so each token is checked with a hash lookup instead of a scan of
    the whole list.

    Args:
        text: Value to filter; it is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = set(stop)
    kept_tokens = [token for token in str(text).split() if token not in stop_words]
    return ' '.join(kept_tokens)

In [None]:
# Strip stop words from both text columns of every split.
for split_df in (train_df, test_df, validation_df):
    for column_name in ("text", "summary"):
        split_df[column_name] = split_df[column_name].apply(lambda x: stopwordRemoval(str(x)))

In [None]:
# Discard rows whose summary has fewer than 6 whitespace-separated words.
train, test, validation = (
    frame[~frame['summary'].apply(lambda x: len(x.split()) < 6)]
    for frame in (train_df, test_df, validation_df)
)

In [None]:
# Preview the first rows of the filtered training frame.
train.head()

In [None]:
# Rename 'text' to 'article'. The frames come from boolean filtering, so they
# are rebound to the renamed result instead of being modified with
# inplace=True, which can raise SettingWithCopyWarning on such frames.
train = train.rename(columns={'text': 'article'})
test = test.rename(columns={'text': 'article'})
validation = validation.rename(columns={'text': 'article'})

In [None]:
# Keep only the leading rows of each split: 300 train, 50 test, 90 validation.
train = train.head(300)
test = test.head(50)
validation = validation.head(90)

In [None]:
# Preview the training frame again after the rename and the row cut.
train.head()

In [None]:
# Wrap each pandas frame in a `datasets.Dataset`
train, validation, test = (
    Dataset.from_dict(frame) for frame in (train, validation, test)
)

# Bundle the three splits under their usual names
dataset = DatasetDict({'validation': validation, 'test': test, 'train': train})


import os

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


# Load the base model and its tokenizer with the options defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Llama-3.2-3B-Instruct",
    # model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # Take the Hugging Face access token from the HF_TOKEN environment variable
    # rather than hard-coding it in the notebook; when the variable is unset the
    # value is the same empty string that was used before.
    token = os.environ.get("HF_TOKEN", ""),
)

# LoRA adapter settings, collected in one place before they are applied.
lora_settings = dict(
    r = 16,  # adapter rank; any value > 0 (8, 16, 32, 64, 128 are suggested)
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,  # any value is supported, 0 is the optimized path
    bias = "none",     # any value is supported, "none" is the optimized path
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,   # rank-stabilized LoRA switched off
    loftq_config = None,  # LoftQ not used
)

model = FastLanguageModel.get_peft_model(model, **lora_settings)

# Prompt template with three positional {} slots, filled in order with the
# instruction, the input text and the response. The literal begins with a newline.
alpaca_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string of the loaded tokenizer; appended to every training prompt.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(example):
    """Build the training prompt for a single dataset row.

    Fills ``alpaca_prompt`` with a fixed summarisation instruction, the row's
    ``"article"`` and its ``"summary"``, then appends ``EOS_TOKEN``.

    Args:
        example: Mapping with ``"article"`` and ``"summary"`` entries.

    Returns:
        A dict with one key, ``"text"``, holding the formatted prompt.
    """
    filled_prompt = alpaca_prompt.format(
        "Please provide a summary of the following article",
        example["article"],
        example["summary"],
    )
    return {"text": filled_prompt + EOS_TOKEN}

# Add a "text" column with the fully formatted prompt to every row of every split.
dataset = dataset.map(formatting_prompts_func)

# Choose the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 3,
    warmup_steps = 5,
    num_train_epochs = 5,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    evaluation_strategy = 'epoch',
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "output3",
)

# Supervised fine-tuning on the "text" column; validation is the evaluation set.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    eval_dataset = dataset["validation"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

trainer_stats = trainer.train()



In [None]:
# Show the first training row, including the generated "text" prompt column.
dataset["train"][0]

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options, repeated here with the same values as in the training cell.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# NOTE(review): the training cell writes to output_dir "output3", while this path
# points at "output1/checkpoint-80" - confirm which run this checkpoint belongs to.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/output1/checkpoint-80", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Fill the prompt template with the instruction and an article; the response slot stays empty.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Please provide a summary of the following article", # instruction
        "অ্যান্ড্রয়েড ফোন নির্মাতারা নিজ ডিভাইসে অপারেটিং সিস্টেম হিসেবে ওএস ললিপপের পুরানো সংস্করণ কিটক্যাটই", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
# Generate with a TextStreamer attached; the value returned by generate() is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)

In [None]:
from unsloth import FastLanguageModel
from transformers import TextStreamer
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score
# Calculating METEOR (requires nltk)
import nltk
nltk.download('wordnet')
from nltk.translate.meteor_score import meteor_score

# Calculating ROUGE scores
from unsloth import FastLanguageModel
from transformers import TextStreamer
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score


# Load your model and tokenizer
# NOTE(review): this is the same checkpoint path, with the same options, that the
# preceding inference cell already loads; max_seq_length, dtype and load_in_4bit
# must have been defined by an earlier cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/output1/checkpoint-80",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable faster inference


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('wordnet')  # Required for METEOR

# Define inputs and outputs
# NOTE(review): `generated_text` is not assigned anywhere in the visible notebook;
# it has to hold the model's decoded summary before this cell is run.
references = ["কাঠবাদাম খান ভুঁড়ি কমান"]
predictions = [generated_text]

# Whitespace tokenisation shared by BLEU and METEOR.
tokenized_references = [[ref.split()] for ref in references]  # one list of reference token lists per example
tokenized_predictions = [pred.split() for pred in predictions]

# BLEU Score - computed on word tokens. Passing raw strings would make NLTK
# treat every character as a token.
bleu_scores = [sentence_bleu(refs, pred) for refs, pred in zip(tokenized_references, tokenized_predictions)]
avg_bleu_score = sum(bleu_scores) / len(bleu_scores)


class _WhitespaceTokenizer:
    """Tokenizer for rouge_score that splits on whitespace.

    rouge_score's default tokenizer keeps only ASCII letters and digits, which
    removes Bengali text entirely before scoring.
    """

    def tokenize(self, text):
        return text.split()


# ROUGE Score - no stemmer, whitespace tokens
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False,
                                  tokenizer=_WhitespaceTokenizer())
rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
avg_rouge_scores = {k: sum([score[k].fmeasure for score in rouge_scores]) / len(rouge_scores) for k in rouge_scores[0].keys()}

# METEOR Score
meteor_scores = [meteor_score(ref, pred) for ref, pred in zip(tokenized_references, tokenized_predictions)]
avg_meteor_score = sum(meteor_scores) / len(meteor_scores)

# BERTScore, requested for language code "bn"
P, R, F1 = bert_score(predictions, references, lang="bn")
avg_bert_score = F1.mean().item()

# Print results
print("Average BLEU Score:", avg_bleu_score)
print("Average ROUGE Scores:", avg_rouge_scores)
print("Average METEOR Score:", avg_meteor_score)
print("Average BERTScore F1:", avg_bert_score)


In [None]:
# Show the first row of the test split.
dataset["test"][0]

In [None]:
from datasets import load_dataset
from evaluate import load

# Load the ROUGE metric through the `evaluate` package.
rouge_metric = load('rouge')

In [None]:
# !pip install tqdm
from tqdm import tqdm # Import tqdm

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements``, each at most ``batch_size`` long.

    The slices are produced in order and together cover the whole input; only
    the last one can be shorter than ``batch_size``.
    """
    total = len(list_of_elements)
    for start in range(0, total, batch_size):
        end = start + batch_size
        yield list_of_elements[start:end]

In [None]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16,
                               column_text="article",
                               column_summary="highlights"):
    """Generate a summary for every row of ``dataset`` and score it with ``metric``.

    The texts in ``dataset[column_text]`` are tokenized exactly as stored (no
    prompt template is applied here), generated from in batches, decoded and
    added to ``metric`` together with the references in
    ``dataset[column_summary]``.

    When ``model.config.is_encoder_decoder`` is not true, the model is treated
    as decoder-only: the tokenizer pads on the left for the duration of the
    call, and the prompt tokens that ``generate`` returns in front of the
    continuation are cut off, so only newly generated text is scored.
    Encoder-decoder models are handled as before.

    Args:
        dataset: Object indexable by column name, giving a sliceable sequence.
        metric: Object providing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model providing ``generate``.
        tokenizer: Tokenizer used for encoding the inputs and decoding the outputs.
        batch_size: Number of rows per generation batch.
        column_text: Name of the input text column.
        column_summary: Name of the reference summary column.

    Returns:
        Whatever ``metric.compute()`` returns.
    """
    # Define the device (CPU or GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    is_encoder_decoder = bool(
        getattr(getattr(model, "config", None), "is_encoder_decoder", False))

    # Decoder-only generation needs the prompt to end right before the new tokens,
    # so pad on the left; the previous setting is restored afterwards.
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    use_left_padding = (not is_encoder_decoder) and previous_padding_side is not None
    if use_left_padding:
        tokenizer.padding_side = "left"

    try:
        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)

            summaries = model.generate(input_ids=input_ids,
                                       attention_mask=attention_mask,
                                       length_penalty=0.8, num_beams=1,
                                       max_new_tokens=128)

            if not is_encoder_decoder:
                # Output rows are prompt + continuation; keep the continuation only.
                summaries = summaries[:, input_ids.shape[1]:]

            # Decode the generated ids and replace the "<n>" marker with a space.
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                                  clean_up_tokenization_spaces=True)
                                 for s in summaries]
            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        if use_left_padding:
            tokenizer.padding_side = previous_padding_side

    # Finally compute and return the scores.
    score = metric.compute()
    return score

rouge_metric = load('rouge')
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Score the generated summaries of the test split against its reference summaries.
score = calculate_metric_on_test_ds(
    test, rouge_metric, model, tokenizer,
    batch_size=4, column_text='article', column_summary='summary',
)

# Keep only the four ROUGE variants named above and show them as a one-row table.
# NOTE(review): the row label says 'pegasus' although the notebook loads a Llama
# checkpoint - confirm the intended label.
rouge_dict = {rouge_name: score[rouge_name] for rouge_name in rouge_names}
pd.DataFrame(rouge_dict, index=['pegasus'])

In [None]:
from unsloth import FastLanguageModel
# NOTE(review): no visible cell saves anything under the name "lora_model";
# confirm that this directory or repo exists before running.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

# Fill the prompt template with a question-answering instruction and a question;
# the response slot stays empty.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Please provide a detailed answer to the following question", # instruction
        "সাধারণ পরিষদের সভা কোথায় অনুষ্ঠিত হয়?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate up to 2048 new tokens and decode every returned sequence to text.
outputs = model.generate(**inputs, max_new_tokens = 2048, use_cache = True)
tokenizer.batch_decode(outputs)

You can also use Hugging Face's `AutoPeftModelForCausalLM`. Only use this if you do not have Unsloth installed: it can be very slow, because downloading 4-bit models is not supported, and Unsloth's inference is 2x faster.