In [1]:
import os

import google.generativeai as genai
import pandas as pd
from torch.utils.data import DataLoader
from tqdm import tqdm

from lib.dataloaders import PageRankDataset
from lib.gemini import GeminiFineTuner, GeminiTester
from lib.utility import CaseBuilder, ResultCalculator

[nltk_data] Downloading package punkt to /home/cagatay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the Gemini API key from the environment so the secret never lives in the notebook.
# SECURITY: a literal key was previously committed in this cell; treat it as leaked and
# revoke/rotate it before sharing or re-running this notebook.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before starting the kernel."
    )
genai.configure(api_key=gemini_api_key)

In [3]:
# Experiment configuration: dataset, retrieval (RAG) settings and loader batch size.
dataset_name, rag_strategy, rag_n = "elife", "pagerank", 10
batch_size = 4

# Echo the retrieval settings so they are recorded next to the run's outputs.
for label, value in (("RAG Strategy: ", rag_strategy), ("RAG N: ", rag_n)):
    print(label, value)

RAG Strategy:  pagerank
RAG N:  10


In [4]:
def collate_fn(batch):
    """Identity collate: return the batch exactly as it was received.

    Supplied to the DataLoaders as ``collate_fn`` so that it is used in
    place of PyTorch's default collation.
    """
    return batch

In [5]:
# Bundle the experiment settings chosen in the configuration cell.
case_builder = CaseBuilder(
    dataset_name=dataset_name,
    rag_strategy=rag_strategy,
    rag_n=rag_n,
    batch_size=batch_size,
)

# Fine-tuning and inference wrappers from lib.gemini, constructed with their defaults.
gemini_trainer = GeminiFineTuner()
gemini_tester = GeminiTester()

# Used at the end of the notebook via .evaluate(clean_answers, labels).
result_calculator = ResultCalculator()

In [None]:
# Disabled sanity check: loads the processed train/test JSON files and prints their shapes.
# print("Dataset Name: ", case_builder.dataset_name)
# df_train = pd.read_json(f'dataset/processed/{case_builder.dataset_name}/train.json').reset_index(drop=True)
# df_test = pd.read_json(f'dataset/processed/{case_builder.dataset_name}/test.json').reset_index(drop=True)
#
# print("Train Shape: ", df_train.shape)
# print("Test Shape: ", df_test.shape)

In [6]:
# Training split for the dataset named in the case builder.
dataset_train = PageRankDataset(
    source_name=case_builder.dataset_name,
    split_name='train',
)

In [7]:
# Test split for the dataset named in the case builder.
dataset_test = PageRankDataset(
    source_name=case_builder.dataset_name,
    split_name='test',
)

In [8]:
# Shuffled loader over the training split; collate_fn is the identity function,
# so each batch arrives as a plain list of samples.
train_loader = DataLoader(
    dataset=dataset_train,
    collate_fn=collate_fn,
    batch_size=batch_size,
    shuffle=True,
    # Worker count is tied to the batch size; __getitem__ calls run in parallel across workers.
    num_workers=batch_size,
    persistent_workers=True,
    # Speeds up transfers to the GPU.
    pin_memory=True,
)

In [9]:
# Ordered (unshuffled) loader over the test split; collate_fn is the identity function,
# so each batch arrives as a plain list of samples.
test_loader = DataLoader(
    dataset=dataset_test,
    collate_fn=collate_fn,
    batch_size=batch_size,
    shuffle=False,
    # Worker count is tied to the batch size; __getitem__ calls run in parallel across workers.
    num_workers=batch_size,
    persistent_workers=True,
    # Speeds up transfers to the GPU.
    pin_memory=True,
)

In [None]:
# Flatten every batch from the training loader into one list of messages,
# with a progress bar over the batches.
training_messages = [
    message
    for train_batch in tqdm(train_loader, desc="Training Process: ")
    for message in train_batch
]

In [10]:
# Flatten every batch from the test loader into one list of messages,
# with a progress bar over the batches.
testing_messages = [
    message
    for test_batch in tqdm(test_loader, desc="Testing Process: ")
    for message in test_batch
]

Testing Process: 100%|██████████| 61/61 [04:56<00:00,  4.86s/it]


In [None]:
# Configure the fine-tuner: epoch count and the flattened training messages.
# NOTE(review): 20 is a magic number — consider hoisting it into the configuration cell.
gemini_trainer.set_epoch_count(20)
gemini_trainer.set_training_data(training_messages)

In [None]:
# Run fine-tuning with the settings applied to gemini_trainer.
# NOTE(review): presumably this submits a tuning job to the Gemini API and may take a
# long time — confirm against lib.gemini.GeminiFineTuner.
gemini_trainer.fit()

In [None]:
# Ask the trainer for the name of the fine-tuned model; it is handed to the tester next.
fine_tuned_model_name = gemini_trainer.get_fine_tuned_model_name()

In [None]:
# Point the tester at the fine-tuned model.
# NOTE(review): update_genai_model() presumably rebuilds the tester's underlying model
# handle from the new source model — confirm in lib.gemini.GeminiTester.
gemini_tester.set_source_model(fine_tuned_model_name)
gemini_tester.update_genai_model()

In [None]:
# Run the tester over all collected test messages; the call returns a pair that is
# unpacked into the model's answers and the labels they are scored against.
clean_answers, labels = gemini_tester.predict_batch(testing_messages)

In [None]:
# Score the answers against the labels; which metrics are computed is defined in
# lib.utility.ResultCalculator (not visible here).
result_dict = result_calculator.evaluate(clean_answers, labels)

In [None]:
# Display the evaluation results as the cell output.
result_dict

In [2]:
# Scratch arithmetic: 30 scaled by a factor of 0.999 applied 200 times.
# NOTE(review): not used by the pipeline above and executed out of order (In [2]);
# looks like a leftover — consider removing or explaining it in a markdown cell.
30 * (0.999 ** 200)

24.559464884359066