In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
from peft import PeftModel, PeftConfig
from datasets import load_dataset
import evaluate
import pandas as pd
import torch
from tqdm import tqdm
from datasets import DatasetDict, Dataset
from src.RAG_Calculater import RAG
from src.Prompt_Factory import prompt_factory

In [2]:
# Name of the corpus to evaluate; it selects both the input folder and,
# further down, the checkpoint and result file names.
dataset_name = "elife"

# Read the three cleaned JSON splits from the per-dataset folder.
clean_dir = f'src/dataset/clean/{dataset_name}'
data_train, data_val, data_test = (
    pd.read_json(f'{clean_dir}/{split_file}.json')
    for split_file in ('train', 'validation', 'test')
)

In [3]:
# Derive the 'rag_sentences' column for every split by passing each row's
# 'sentences_similarity' value through RAG (imported from src.RAG_Calculater).
for split_frame in (data_train, data_val, data_test):
    split_frame['rag_sentences'] = split_frame['sentences_similarity'].apply(RAG)

In [4]:
# Wrap the pandas frames into a single DatasetDict keyed by split name.
frames_by_split = {
    "train": data_train,
    "validation": data_val,
    "test": data_test,
}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in frames_by_split.items()
})

In [5]:
# Local directory holding the PEFT adapter checkpoint for the selected dataset.
peft_model_path = f"./results/BiOzU_{dataset_name}-checkpoint-local"

In [6]:
# Base checkpoint shared by the tokenizer and the model the adapter sits on.
base_checkpoint = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(
    base_checkpoint,
    torch_dtype=torch.bfloat16,
)

# Attach the saved adapter to the base model; is_trainable=False because this
# notebook only runs generation.
peft_model = PeftModel.from_pretrained(
    peft_model_base,
    peft_model_path,
    torch_dtype=torch.bfloat16,
    is_trainable=False,
)

In [7]:
def tokenize_function(example):
    """Add tokenized 'input_ids' and 'labels' to a single dataset example.

    The prompt comes from ``prompt_factory(1, example)`` (presumably ``1``
    selects a prompt template -- confirm in src.Prompt_Factory). The target
    text is the entries of ``example['summary']`` joined with single spaces.

    Both texts are padded/truncated to a fixed length (1024 tokens for the
    prompt, 512 for the summary). Because the tokenizer is called on a single
    string with ``return_tensors="pt"``, each stored value keeps a leading
    batch dimension of size 1.

    Args:
        example: One dataset row; must contain a 'summary' entry plus whatever
            ``prompt_factory`` reads.

    Returns:
        The same ``example`` mapping with 'input_ids' and 'labels' set.
    """
    # Prompt tuned for lay-summary generation.
    source_text = prompt_factory(1, example)
    target_text = ' '.join(str(piece) for piece in example['summary'])

    # Tokenize the model input (1024 is used as the maximum input size).
    encoded_source = tokenizer(
        source_text,
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    )
    example['input_ids'] = encoded_source.input_ids

    # Tokenize the target lay summary (lay summaries are usually kept short).
    encoded_target = tokenizer(
        target_text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    example['labels'] = encoded_target.input_ids

    return example


In [16]:
# Raw source columns that are no longer needed once each example has been
# tokenized; only 'summary', 'input_ids' and 'labels' remain afterwards.
source_only_columns = [
    'id',
    'year',
    'title',
    'sections',
    'headings',
    'abstract',
    'keywords',
    'sentences_similarity',
    'rag_sentences',
]

# Tokenize every split example by example, then drop the raw columns.
mapped_splits = dataset.map(tokenize_function)
tokenized_datasets = mapped_splits.remove_columns(source_only_columns)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [17]:
# Optional down-sampling (keep every 100th example) for quick runs; disabled.
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

# Report (rows, columns) for each split, then the full DatasetDict summary.
print("Shapes of the datasets:")
for split_label, split_key in (
    ("Training", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{split_label}: {tokenized_datasets[split_key].shape}")
print(tokenized_datasets)

Shapes of the datasets:
Training: (5, 3)
Validation: (2, 3)
Test: (3, 3)
DatasetDict({
    train: Dataset({
        features: ['summary', 'input_ids', 'labels'],
        num_rows: 5
    })
    validation: Dataset({
        features: ['summary', 'input_ids', 'labels'],
        num_rows: 2
    })
    test: Dataset({
        features: ['summary', 'input_ids', 'labels'],
        num_rows: 3
    })
})


In [24]:
# Generate one lay summary per test example with the PEFT model and collect
# the matching human-written reference texts.
input_ids_list = tokenized_datasets['test']['input_ids']

# Build the reference strings as a fresh list instead of assigning back into
# the object returned by the dataset: item assignment only works when that
# object is a plain list, while reading/iterating works for any sequence.
human_baseline_summaries = [
    ' '.join(map(str, summary_parts))
    for summary_parts in tokenized_datasets['test']['summary']
]
peft_model_summaries = []

# Loop-invariant setup, done once. Inputs are placed on whatever device the
# model's parameters live on, so generate() never sees mismatched devices.
device = next(peft_model.parameters()).device
generation_config = GenerationConfig(max_new_tokens=512, num_beams=1)

for idx in tqdm(range(len(input_ids_list))):
    # Each stored row already has a leading batch dimension of size 1
    # (tokenize_function calls the tokenizer with return_tensors="pt").
    input_ids = torch.tensor(input_ids_list[idx]).to(device)
    peft_model_outputs = peft_model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
    )
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)

100%|██████████| 3/3 [00:10<00:00,  3.42s/it]


In [25]:
# Pair every human reference with the model prediction at the same position.
model_results = pd.DataFrame(
    dict(
        reference=human_baseline_summaries,
        prediction=peft_model_summaries,
    )
)

In [26]:
# Persist the reference/prediction pairs (the DataFrame index is written too).
results_csv_path = f'results/peft_model_summaries_{dataset_name}.csv'
model_results.to_csv(results_csv_path)