use this enviroment to prevent crash the model when it's training

In [49]:
import time
import torch
import datasets
import evaluate
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    GenerationConfig,
)
from peft import LoraConfig, PeftConfig, get_peft_model, TaskType
from transformers import DataCollatorForSeq2Seq

dataset = datasets.load_dataset("knkarthick/dialogsum")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [50]:
import torch
# try a simple test to check autocast with bfloat16
try:
    with torch.cuda.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        x = torch.randn(1, device="cuda")
    print("bfloat16 autocast ممکن است پشتیبانی شود")
except Exception as e:
    print("bfloat16 autocast پشتیبانی نمی‌شود:", e)

bfloat16 autocast پشتیبانی نمی‌شود: autocast.__init__() got an unexpected keyword argument 'device_type'


  with torch.cuda.amp.autocast(device_type="cuda", dtype=torch.bfloat16):


In [51]:
model_name='google/flan-t5-base'

origin_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [52]:
def NumOfTrainableParams(model):
    total = origin_model.num_parameters()
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    percent = round(trainable/total*100, 2)
    print(f"Number of All parameters: {total}\tNumber of Trainable: {trainable}\nPercentage: {percent}%")

NumOfTrainableParams(origin_model)

Number of All parameters: 247577856	Number of Trainable: 247577856
Percentage: 100.0%


# Pre-Test the FLAN-T5 model

In [53]:
_tdial = dataset["test"][421]["dialogue"]
_tsumm = dataset["test"][421]["summary"]
_tprompt = (
    f""""Write a short summary for this text: {_tdial}"""
)
_tinput = tokenizer(_tprompt, return_tensors="pt")["input_ids"]
_tanswer = tokenizer.decode(
    origin_model.generate(_tinput, max_new_tokens=50)[0], skip_special_tokens=True
)

print(_tprompt)
print(50*"=")
print(f"Human Summary: {_tsumm}\n\nModel Summary: {_tanswer}")

"Write a short summary for this text: #Person1#: Honey, of course I forgive you! I love you so much! I've really missed you. I was wrong to get upset over nothing. 
#Person2#: I'm sorry I haven't called or anything, but right after you decided you wanted a break, I was called up north to put out some major forest fires! I was in the middle of nowhere, working day and night, trying to prevent the blaze from spreading! It was pretty intense. 
#Person1#: Oh, honey, I'm glad you're okay! But I have some exciting news. . . I think I'm pregnant! 
#Person2#: Really? Wow, that's amazing! This is great news! I've always wanted to be a father! We'll go to the doctor first thing in the morning! 
#Person3#: We have your test results back and, indeed, you are pregnant. Let's see here. . . everything seems to be in order. Your approximate due date is October twenty-seventh two thousand and nine, so that means that the baby was conceived on February third, two thousand and nine. 
#Person2#: Are you s

# Fullfiled Fine-Tuning

In [54]:
def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    # return lists (not tensors) so datasets.map handles them properly
    inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=512,
    )["input_ids"]

    targets = tokenizer(
        example["summary"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )["input_ids"]

    # replace pad token id in labels with -100 so loss ignores padding
    pad_id = tokenizer.pad_token_id or 0
    labels = [[(tok if tok != pad_id else -100) for tok in seq] for seq in targets]

    return {"input_ids": inputs, "labels": labels}


ds_tokenized = dataset.map(tokenize_function, batched=True)
ds_tokenized = ds_tokenized.remove_columns(["id", "dialogue", "summary", "topic"])

In [55]:
print(f"shapes of dataset\n", 50*'=')
print(f'Train: {ds_tokenized['train'].shape}')
print(f'Test: {ds_tokenized['test'].shape}')
print(f'Validation: {ds_tokenized['validation'].shape}')

print(f'tokenized dataset:\n{ds_tokenized}')
print(f'\n\ntwo samples:\n {ds_tokenized['train'][:2]['labels']}')

shapes of dataset
Train: (12460, 2)
Test: (1500, 2)
Validation: (500, 2)
tokenized dataset:
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})


two samples:
 [[1363, 5, 3931, 31, 7, 652, 3, 9, 691, 18, 413, 6, 11, 7582, 12833, 77, 7, 7786, 7, 376, 12, 43, 80, 334, 215, 5, 12833, 77, 7, 31, 195, 428, 128, 251, 81, 70, 2287, 11, 11208, 12, 199, 1363, 5, 3931, 10399, 10257, 5, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -

## Fine Tune Section

In [56]:
# ensure training mode settings
origin_model.config.use_cache = False
origin_model.config.pad_token_id = tokenizer.pad_token_id  # صریحاً تنظیم کن

In [57]:
# torch.cuda.empty_cache()

# recreate lora model after changing config (اگر قبلاً ساخته شده، دوباره بساز)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=32,
    lora_alpha=16,
    target_modules=['v', 'q'],
    bias='none',
    lora_dropout=0.05,
)
lora_model = get_peft_model(origin_model, lora_config)

NumOfTrainableParams(lora_model)

Number of All parameters: 251116800	Number of Trainable: 3538944
Percentage: 1.41%


In [58]:
# torch.cuda.empty_cache()
output_dir = f'./saved_models/mtrain-{str(int(time.time()))}'

data_collator = DataCollatorForSeq2Seq(tokenizer, model=origin_model, pad_to_multiple_of=None)

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=2,
    # نتابع کردن max_steps یا تنظیم آن مناسب نیاز است؛ max_steps=1 فقط برای تست است
    max_steps=10,
    bf16=False,
    fp16=False,
    per_device_train_batch_size=4,
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=ds_tokenized["train"],
    eval_dataset=ds_tokenized["validation"],
    data_collator=data_collator,
)

# less memory using but it's slower
# origin_model.gradient_checkpointing_enable()

In [59]:
import numpy as np

# چاپ نمونه‌ی برچسب‌ها و تعداد توکن‌های غیر -100
for i in range(3):
    lbl = ds_tokenized['train'][i]['labels']
    non_ignored = [tok for tok in lbl if tok != -100]
    print(f"sample {i}: total_len={len(lbl)}, non-ignored={len(non_ignored)}")
    print("decoded target (non-ignored):", tokenizer.decode(non_ignored, skip_special_tokens=True))
    print("-"*40)

# آماری روی چند نمونه — اصلاح شده
lbls = ds_tokenized['train'][:200]['labels']   # این یک لیست از توکن‌های برچسب هر مثال است
counts = [np.sum(np.array(lbl) != -100) for lbl in lbls]
print("median non-ignored tokens in first 200:", np.median(counts))

# بررسی pad_token_id
print("tokenizer.pad_token_id:", tokenizer.pad_token_id)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token or "<pad>"
    origin_model.config.pad_token_id = tokenizer.pad_token_id
    print("Set pad_token_id to", tokenizer.pad_token_id)

sample 0: total_len=128, non-ignored=48
decoded target (non-ignored): Mr. Smith's getting a check-up, and Doctor Hawkins advises him to have one every year. Hawkins'll give some information about their classes and medications to help Mr. Smith quit smoking.
----------------------------------------
sample 1: total_len=128, non-ignored=27
decoded target (non-ignored): Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vaccine.
----------------------------------------
sample 2: total_len=128, non-ignored=31
decoded target (non-ignored): #Person1#'s looking for a set of keys and asks for #Person2#'s help to find them.
----------------------------------------
median non-ignored tokens in first 200: 34.0
tokenizer.pad_token_id: 0


In [60]:
# یک قدم آموزشی دستی برای تایید
from torch.utils.data import DataLoader
dl = DataLoader(ds_tokenized['train'].select(range(2)), batch_size=1, collate_fn=data_collator)
batch = next(iter(dl))
lora_model.train()
batch_cuda = {k: v.cuda() for k,v in batch.items()}
out = lora_model(**batch_cuda)
print("manual train-step loss:", out.loss.item())

manual train-step loss: 2.6186940670013428


In [61]:
# torch.cuda.empty_cache()
trainer.train()

Step,Training Loss
2,0.0
4,0.0
6,0.0
8,0.0
10,0.0


TrainOutput(global_step=10, training_loss=0.0, metrics={'train_runtime': 2.9309, 'train_samples_per_second': 13.648, 'train_steps_per_second': 3.412, 'total_flos': 27825159536640.0, 'train_loss': 0.0, 'epoch': 0.0032102728731942215})