In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import DatasetDict, Dataset
import pandas as pd
import torch

from src.RAG_Calculater import RAG
from src.Prompt_Factory import prompt_factory
from src.Case_Builder import (device,
                              bert_version,
                              bert_model_name,
                              genai_model_name,
                              prompt_strategy_used,
                              dataset_name
                              )

In [None]:
# Read the cleaned train / validation / test splits for the configured dataset
# and BERT variant. Each split is stored in its own JSON file.
data_train, data_val, data_test = map(
    pd.read_json,
    (
        f'src/dataset/clean/{dataset_name}/{bert_version}_train.json',
        f'src/dataset/clean/{dataset_name}/{bert_version}_validation.json',
        f'src/dataset/clean/{dataset_name}/{bert_version}_test.json',
    ),
)

In [None]:
# Add a 'rag_sentences' column to every split by applying RAG (imported from
# src.RAG_Calculater) to each row's 'sentences_similarity' value.
for split_frame in (data_train, data_val, data_test):
    split_frame['rag_sentences'] = split_frame['sentences_similarity'].apply(RAG)

In [None]:
# Wrap the three pandas frames in a single DatasetDict keyed by split name.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (
        ("train", data_train),
        ("validation", data_val),
        ("test", data_test),
    )
})

In [None]:
# LoRA adapter settings for a sequence-to-sequence language model.
# target_modules is not passed, so PEFT's own default selection applies; a
# query-only setting (target_modules=["q"]) is noted here for reference.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,               # LoRA rank
    lora_alpha=8,      # LoRA scaling factor
    lora_dropout=0.1,  # dropout used for regularization
    bias="none",
)

In [None]:
# The tokenizer and the base seq2seq model are both loaded from the checkpoint
# named by genai_model_name; the model is requested in bfloat16.
tokenizer = AutoTokenizer.from_pretrained(genai_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    genai_model_name,
    torch_dtype=torch.bfloat16,
)

In [None]:
# Set two flags on the loaded model's config before it is wrapped for training.
# NOTE(review): pretraining_tp is presumably only read by some architectures -
# confirm it has any effect for the seq2seq checkpoint used here.
model.config.pretraining_tp = 1
model.config.use_cache = False

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Build a short report of how many of a model's parameters are trainable.

    Despite its name this function does not print anything; it returns the
    report so the caller can decide how to display it.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` attribute.

    Returns:
        str: Three lines giving the trainable parameter count, the total
        parameter count, and the trainable share as a percentage with two
        decimals. A model without any parameters reports ``0.00%``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard the ratio: a model with zero parameters used to raise
    # ZeroDivisionError here instead of producing a report.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage:.2f}%"


In [None]:
# Show the parameter report for `model` as loaded, before it is passed to get_peft_model.
print(print_number_of_trainable_model_parameters(model))

In [None]:
# Wrap the base model with the LoRA configuration defined in `lora_config`.
peft_model = get_peft_model(model, lora_config)

In [None]:
# Show the same parameter report for the PEFT-wrapped model.
print(print_number_of_trainable_model_parameters(peft_model))

In [None]:
# Print the whitespace-separated word count of every training summary, one per
# line. Each summary is a sequence of strings that is joined with spaces first.
for summary_parts in data_train['summary']:
    joined_summary = " ".join(summary_parts)
    print(len(joined_summary.split()))

In [None]:
def tokenize_function(example):
    """Tokenize one dataset row into model inputs and training labels.

    The prompt is built by ``prompt_factory`` for the configured prompt
    strategy, and the target text is the row's ``summary`` items joined with
    single spaces.

    Args:
        example: Mapping for a single row. It must contain ``summary`` (an
            iterable of items convertible to ``str``) plus whatever fields
            ``prompt_factory`` reads.

    Returns:
        The same ``example`` mapping with three keys added:
        ``input_ids`` and ``attention_mask`` (prompt, padded/truncated to 1024
        tokens) and ``labels`` (summary, padded/truncated to 512 tokens, with
        padding positions set to -100).
    """
    # Prompt tailored to lay-summary generation.
    prompt = prompt_factory(prompt_strategy_used, example)
    summary = ' '.join(map(str, example['summary']))

    # Tokenize the input prompt. Plain Python lists are returned because
    # Dataset.map stores the values as lists regardless of tensor type.
    model_inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=1024,  # fixed input length used for every example
    )
    example['input_ids'] = model_inputs['input_ids']
    # Keep the mask so the model can ignore the padded input positions;
    # without it the padding is treated as real input.
    example['attention_mask'] = model_inputs['attention_mask']

    # Tokenize the target (lay summary), which is generally kept short.
    target_ids = tokenizer(
        summary,
        padding="max_length",
        truncation=True,
        max_length=512,
    )['input_ids']

    # Replace padding ids with -100 so the loss skips those positions;
    # leaving the pad id in place trains the model to emit padding.
    pad_token_id = tokenizer.pad_token_id
    example['labels'] = [
        -100 if token_id == pad_token_id else token_id
        for token_id in target_ids
    ]

    return example


In [None]:
# Tokenize every split row by row, then drop the listed source columns so the
# columns added by tokenize_function are what remains.
tokenized_datasets = dataset.map(tokenize_function).remove_columns(
    [
        'id',
        'year',
        'title',
        'sections',
        'headings',
        'abstract',
        'summary',
        'keywords',
        'sentences_similarity',
        'rag_sentences',
    ]
)

In [None]:
# Optional subsampling (keeps every 100th example); left disabled:
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

# Report the shape of each tokenized split, then the DatasetDict itself.
print("Shapes of the datasets:")
for split_label, split_key in (
    ("Training", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{split_label}: {tokenized_datasets[split_key].shape}")
print(tokenized_datasets)

In [None]:
# Trainer output directory, named after the BERT variant, dataset and prompt
# strategy of this run.
output_dir = f'./results/BiOzU_{bert_version}_{dataset_name}_{prompt_strategy_used}-training'

# One epoch, logging at every step. auto_find_batch_size=True is left
# commented out, so the library's default batch size handling is used.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    learning_rate=1e-3,
    logging_steps=1,
)

# Train the PEFT-wrapped model on the train split; the validation split is
# registered as the evaluation set.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

In [None]:
# Run the training loop configured by `peft_training_args`.
peft_trainer.train()

In [None]:
# Save the trained model held by the trainer, together with the tokenizer,
# into one local directory named after this run's configuration.
peft_model_path = (
    f"./results/BiOzU_{bert_version}_{dataset_name}_{prompt_strategy_used}-checkpoint-local"
)
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)