In [1]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
from src.RAG_Calculater import RAG
from src.Prompt_Factory import prompt_factory
from peft import LoraConfig, get_peft_model, TaskType
from datasets import DatasetDict, Dataset

In [2]:
dataset_name = "elife"

data_train = pd.read_json(f'src/dataset/clean/{dataset_name}/train.json')
data_val = pd.read_json(f'src/dataset/clean/{dataset_name}/validation.json')
data_test = pd.read_json(f'src/dataset/clean/{dataset_name}/test.json')

In [3]:
data_train['rag_sentences'] = data_train['sentences_similarity'].apply(RAG)
data_val['rag_sentences'] = data_val['sentences_similarity'].apply(RAG)
data_test['rag_sentences'] = data_test['sentences_similarity'].apply(RAG)

In [4]:
len(data_train.loc[0, 'rag_sentences'])

10

In [5]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(data_train),
    "validation": Dataset.from_pandas(data_val),
    "test": Dataset.from_pandas(data_test)
})

In [6]:
lora_config = LoraConfig(
    r=8,  # Reduced rank
    lora_alpha=8,  # Lower scaling factor
    #target_modules=["q"],  # Update fewer modules (e.g., only query weights)
    lora_dropout=0.1,  # Increased dropout for better regularization
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM  # Task type remains the same
)

In [7]:
model_name='google/flan-t5-small'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [9]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


In [10]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


In [11]:
peft_model = get_peft_model(model, lora_config)

In [12]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 344064
all model parameters: 77305216
percentage of trainable model parameters: 0.45%


In [13]:
for x in data_train['summary']:
    print(len(" ".join(x).split()))

357
392
437
385
349


In [14]:
def tokenize_function(example):
    # Lay summary için optimize edilmiş prompt
    prompt = prompt_factory(1, example)
    summary = ' '.join(map(str, example['summary']))

    # Girdileri tokenlaştırma
    example['input_ids'] = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=1024,  # T5-small için maksimum girdi boyutu
        return_tensors="pt"
    ).input_ids.squeeze(0)

    # Çıkışları (lay summary) tokenlaştırma
    example['labels'] = tokenizer(
        summary,
        padding="max_length",
        truncation=True,
        max_length=512,  # Lay summary genellikle kısa tutulur
        return_tensors="pt"
    ).input_ids.squeeze(0)

    return example


In [15]:
tokenized_datasets = dataset.map(tokenize_function)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'year', 'title', 'sections', 'headings', 'abstract', 'summary', 'keywords', 'sentences_similarity', 'rag_sentences'])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [16]:
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")
print(tokenized_datasets)

Shapes of the datasets:
Training: (5, 2)
Validation: (2, 2)
Test: (3, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 3
    })
})


In [17]:
output_dir = f'./results/BiOzU_{dataset_name}-training'
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    #auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1
    )
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
    )

In [18]:
peft_trainer.train()

Step,Training Loss
1,6.0


TrainOutput(global_step=1, training_loss=6.0, metrics={'train_runtime': 110.0275, 'train_samples_per_second': 0.045, 'train_steps_per_second': 0.009, 'total_flos': 1869474693120.0, 'train_loss': 6.0, 'epoch': 1.0})

In [19]:
peft_model_path=f"./results/BiOzU_{dataset_name}-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./results/BiOzU_elife-checkpoint-local/tokenizer_config.json',
 './results/BiOzU_elife-checkpoint-local/special_tokens_map.json',
 './results/BiOzU_elife-checkpoint-local/tokenizer.json')