In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer, SFTConfig
from datasets import DatasetDict, Dataset
import pandas as pd
import torch

from src.RAG_Calculater import RAG
from src.Prompt_Factory import prompt_factory
from src.Case_Builder import (device,
                              bert_version,
                              bert_model_name,
                              genai_model_name,
                              prompt_strategy_used,
                              dataset_name
                              )

In [2]:
import evaluate
from rouge_score import rouge_scorer
from transformers import BartTokenizer, BartForConditionalGeneration
import numpy as np
import torch

# BARTScore için model ve tokenizer
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# ROUGE metriğini yükle
rouge_metric = evaluate.load('rouge')

def compute_metrics(pred):
    """
    Computes metrics for summarization tasks using ROUGE, FKGL, DCRS, and BARTScore.

    Parameters:
    - pred: A named tuple containing predictions and labels from HuggingFace Trainer.

    Returns:
    A dictionary containing evaluation metrics.
    """
    # Tahmin ve etiketleri ayıkla
    predictions = pred.predictions
    labels = pred.label_ids

    # Tokenizer ile tahmin ve etiketleri metne dönüştür
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE Skorları
    rouge_results = rouge_metric.compute(predictions=decoded_preds, 
                                         references=decoded_labels,
                                         use_aggregator=True,
                                         use_stemmer=True,
                                        )
    rouge1 = rouge_results["rouge1"].mid.fmeasure
    rouge2 = rouge_results["rouge2"].mid.fmeasure
    rougel = rouge_results["rougeL"].mid.fmeasure
    
    # 
    # # FKGL Hesaplama
    # def fkgl_score(text):
    #     words = text.split()
    #     sentences = text.count('.') + text.count('!') + text.count('?')
    #     syllables = sum([sum(1 for char in word if char.lower() in "aeiou") for word in words])
    #     if len(words) == 0 or sentences == 0:
    #         return np.nan  # Bölüm sıfır hatasını önle
    #     return 0.39 * (len(words) / sentences) + 11.8 * (syllables / len(words)) - 15.59
    # 
    # fkgl_scores = [fkgl_score(pred) for pred in decoded_preds]
    # fkgl = np.mean(fkgl_scores)
    # 
    # # DCRS Hesaplama
    # def dcrs(pred_text, ref_text):
    #     scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    #     scores = scorer.score(ref_text, pred_text)
    #     return (scores["rouge1"].fmeasure + scores["rouge2"].fmeasure + scores["rougeL"].fmeasure) / 3
    # 
    # dcrs_scores = [dcrs(pred, ref) for pred, ref in zip(decoded_preds, decoded_labels)]
    # dcrs = np.mean(dcrs_scores)
    # 
    # # BARTScore Hesaplama
    # def bartscore(pred_text, ref_text):
    #     inputs = bart_tokenizer(ref_text, return_tensors="pt", truncation=True, max_length=1024)
    #     with torch.no_grad():
    #         outputs = bart_model(**inputs, labels=bart_tokenizer(pred_text, return_tensors="pt").input_ids)
    #     return -outputs.loss.item()  # Negatif loss, yüksek skor daha iyi
    # 
    # bart_scores = [bartscore(pred, ref) for pred, ref in zip(decoded_preds, decoded_labels)]
    # bartscore_mean = np.mean(bart_scores)
    # 
    return {
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougel,
        # "fkgl": fkgl,
        # "dcrs": dcrs,
        # "bartscore": bartscore_mean,
    }


In [3]:
data_train = pd.read_json(f'src/dataset/clean/{dataset_name}/{bert_version}_train.json')
data_val = pd.read_json(f'src/dataset/clean/{dataset_name}/{bert_version}_validation.json')
data_test = pd.read_json(f'src/dataset/clean/{dataset_name}/{bert_version}_test.json')

In [4]:
print(len(data_train), len(data_val), len(data_test))
data_train, data_val, data_test = data_val, data_test, data_train
print(len(data_train), len(data_val), len(data_test))

5 241 3
241 3 5


data_val = data_val.iloc[:2]
print(len(data_train), len(data_val), len(data_test))

In [5]:
data_train['rag_sentences'] = data_train['sentences_similarity'].apply(RAG)
data_val['rag_sentences'] = data_val['sentences_similarity'].apply(RAG)
data_test['rag_sentences'] = data_test['sentences_similarity'].apply(RAG)

In [6]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(data_train),
    "validation": Dataset.from_pandas(data_val),
    "test": Dataset.from_pandas(data_test)
})

In [7]:
lora_config = LoraConfig(
    r=8,  # Reduced rank
    lora_alpha=8,  # Lower scaling factor
    target_modules=["q"],  # Update fewer modules (e.g., only query weights)
    lora_dropout=0.1,  # Increased dropout for better regularization
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM  # Task type remains the same
)

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(genai_model_name, 
                                              torch_dtype=torch.bfloat16,
                                              low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(genai_model_name)

In [9]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [10]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


In [11]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 76961152
all model parameters: 76961152
percentage of trainable model parameters: 100.00%


In [12]:
peft_model = get_peft_model(model, lora_config)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [13]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 172032
all model parameters: 77133184
percentage of trainable model parameters: 0.22%


In [14]:
sum([len(" ".join(x).split()) for x in data_train['summary']]) / len(data_train['summary'])

389.8755186721992

In [15]:
total_x = []
def tokenize_function(example):
    global total_x
    # Lay summary için optimize edilmiş prompt
    prompt = prompt_factory(prompt_strategy_used, example)
    summary = ' '.join(map(str, example['summary']))

    # Girdileri tokenlaştırma
    example['input_ids'] = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=1024,  # T5-small için maksimum girdi boyutu
        return_tensors="pt"
    ).input_ids.squeeze(0)

    # Çıkışları (lay summary) tokenlaştırma
    example['labels'] = tokenizer(
        summary,
        padding="max_length",
        truncation=True,
        max_length=512,  # Lay summary genellikle kısa tutulur
        return_tensors="pt"
    ).input_ids.squeeze(0)
    
    total_x.append(len(prompt.split()))
    
    return example


In [16]:
tokenized_datasets = dataset.map(tokenize_function)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'year', 'title', 'sections', 'headings', 'abstract', 'summary', 'keywords', 'sentences_similarity', 'rag_sentences'])

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [17]:
for x in tokenized_datasets['validation']['input_ids']:
    print(len(x))

1024
1024
1024


In [18]:
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")
print(tokenized_datasets)

Shapes of the datasets:
Training: (241, 2)
Validation: (3, 2)
Test: (5, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 241
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 3
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
})


In [20]:
output_dir = f'./results/BiOzU_{bert_version}_{dataset_name}_{prompt_strategy_used}-training'


peft_training_args = SFTConfig(
    output_dir=output_dir,
    max_seq_length=1024,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # per_device_eval_batch_size=3,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    # eval_strategy="epoch",
    optim="adamw_hf",
    #optim="adamw_8bit",
    bf16=True,
    )
peft_trainer = SFTTrainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    
    
    )

In [None]:
peft_trainer.train()



In [None]:
peft_model_path=f"./results/BiOzU_{bert_version}_{dataset_name}_{prompt_strategy_used}-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)