In [1]:
from dask.array import optimize
from mlflow.models.cli import predict

from lib.utility import CaseBuilder, ResultCalculator, MessageFactory, CustomLossFunction
from lib.dataloaders import GraphXtractDataset
from lib.gemini import GeminiFineTuner, GeminiTester
from lib.rag_factories import RAG_Factory
from lib.selectors.models.gat_gcn_selector import GATGCNSelector


import torch
from torch.utils.data import DataLoader
from torch_geometric.data import Batch, ClusterData
from torch.utils.data._utils.collate import default_collate

import google.generativeai as genai
from google.generativeai import types

from tqdm import tqdm
import pandas as pd
import random
import numpy as np

[nltk_data] Downloading package punkt to /home/cagatay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import os

# SECURITY: the API key must never be stored in the notebook itself (it ends up
# in version control and in every shared copy). It is read from the environment
# instead. Any key that was previously committed in this cell should be treated
# as leaked and revoked in the Google AI console.
_google_api_key = os.environ.get("GOOGLE_API_KEY")
if not _google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment that starts "
        "the Jupyter kernel before running this notebook."
    )
genai.configure(api_key=_google_api_key)

In [3]:
# Experiment configuration: everything a reader is expected to tune lives here.
dataset_name, rag_strategy = "elife", "graphxtract"
rag_n = 10
batch_size = 16

# Echo the retrieval settings so they are recorded in the cell output.
for _label, _value in (("RAG Strategy: ", rag_strategy), ("RAG N: ", rag_n)):
    print(_label, _value)

RAG Strategy:  graphxtract
RAG N:  10


In [4]:
from torch_geometric.data import Batch
import pandas as pd

def collate_fn(batch):
    """Collate a list of ``(row, graph)`` samples into one mini-batch.

    Args:
        batch: sequence of 2-item samples; the first item is the row
            (annotated in the original code as a ``pd.Series``), the second
            is a graph object accepted by ``Batch.from_data_list``.

    Returns:
        A pair ``(rows, graph_batch)`` where ``rows`` is a tuple holding the
        first item of every sample and ``graph_batch`` is the result of
        ``Batch.from_data_list`` applied to the list of graphs.
    """
    unzipped = tuple(zip(*batch))
    # Exactly two parallel sequences are expected: rows and graphs.
    rows, graphs = unzipped
    graph_list = list(graphs)
    merged_graphs = Batch.from_data_list(graph_list)
    return rows, merged_graphs


In [5]:
# Build the objects used by the rest of the notebook. All of them are
# project-local classes imported from `lib`; only their usage in this notebook
# is described here.

# Experiment descriptor, created from the values chosen in the config cell.
# Later cells read `dataset_name` and `genai_model_name` from it.
case_builder = CaseBuilder(dataset_name=dataset_name,
                           rag_strategy=rag_strategy,
                           rag_n=rag_n,
                           batch_size=batch_size)

# Trainable selector network; its parameters are handed to the optimizer and
# it is called on batched graphs in the training cell.
selector_model = GATGCNSelector()

# Called in the training cell as rag_factory(rows, per_graph_logits).
rag_factory = RAG_Factory()
# rag_factory.factory.model = selector_model

# Called in the training cell with a DataFrame of rows and the selected
# sentences to build the messages sent to Gemini.
message_factory = MessageFactory()

# NOTE(review): gemini_trainer is only referenced from the commented-out cell
# further down; gemini_tester is the object used for predict_batch().
gemini_trainer = GeminiFineTuner()
gemini_tester = GeminiTester()

# NOTE(review): result_calculator is only referenced from commented-out code.
result_calculator = ResultCalculator()
# loss_function = CustomLossFunction(reward_smoothing_factor=1.0)

# Initial generative model name. NOTE(review): the training cell overwrites
# `source_model` with a hard-coded tuned-model name before using it.
source_model = case_builder.genai_model_name

In [6]:
# Build the train and test splits of the configured dataset (train first,
# then test, in the same order as before).
dataset_train, dataset_test = (
    GraphXtractDataset(source_name=case_builder.dataset_name, split_name=_split)
    for _split in ('train', 'test')
)

In [7]:
# Options shared by both loaders.
_loader_options = dict(
    batch_size=batch_size,
    # One worker process per sample in a batch, so __getitem__ calls run in
    # parallel. NOTE(review): worker count is tied to batch_size here; they are
    # unrelated quantities and could be configured separately.
    num_workers=batch_size,
    # Pinned memory would speed up host-to-GPU copies; it is disabled here.
    pin_memory=False,
    persistent_workers=True,
    collate_fn=collate_fn,
)

# Training data is reshuffled every pass; evaluation data keeps its order.
train_loader = DataLoader(dataset_train, shuffle=True, **_loader_options)
test_loader = DataLoader(dataset_test, shuffle=False, **_loader_options)

In [8]:
def devide_batch(graph_batch, logits_batch):
    """Split batched node logits back into one tensor per graph.

    Args:
        graph_batch: batched graph object exposing ``num_graphs`` and a
            ``batch`` vector that holds, for every node, the index of the
            graph it belongs to.
        logits_batch: logits for all nodes of the batch, indexable by a
            boolean mask over the node dimension.

    Returns:
        A list of length ``graph_batch.num_graphs``; entry ``i`` contains the
        rows of ``logits_batch`` whose node belongs to graph ``i``.
    """
    graph_count = graph_batch.num_graphs
    logits_per_graph = []
    for graph_index in range(graph_count):
        # Boolean mask selecting the nodes of the current graph.
        node_mask = graph_batch.batch == graph_index
        logits_per_graph.append(logits_batch[node_mask])
    return logits_per_graph

def answer_checker(clean_answers, labels):
    """Drop every (answer, label) pair whose answer is the empty string.

    Args:
        clean_answers: iterable of answers.
        labels: iterable of labels, paired positionally with ``clean_answers``
            (pairing stops at the shorter of the two).

    Returns:
        A pair of lists ``(answers, labels)`` containing only the pairs whose
        answer is not ``""``, in their original order.
    """
    surviving_pairs = [
        (candidate, target)
        for candidate, target in zip(clean_answers, labels)
        if candidate != ""
    ]
    kept_answers = [candidate for candidate, _ in surviving_pairs]
    kept_labels = [target for _, target in surviving_pairs]
    return kept_answers, kept_labels


In [9]:
from torch import nn

# Cross-entropy criterion with default settings.
loss_function = nn.CrossEntropyLoss()

# Adam over the selector network's parameters.
optimizer = torch.optim.Adam(params=selector_model.parameters(), lr=1e-4)

In [10]:
# for epoch in range(1, 6, 2):
#     # Gemini Fine-Tune
#     selector_model.eval()
#     train_messages = []
#     for train_row_batch, train_graph_batch in tqdm(test_loader, desc=f"Epoch: {epoch} | Gemini Training Process: "):
#         with torch.no_grad():
#             logits_all_batch = selector_model(train_graph_batch)
#             logits_batch = devide_batch(train_graph_batch, logits_all_batch)
#             selected_sentence_batch = rag_factory(train_row_batch, logits_batch)
#             message_batch = message_factory(pd.DataFrame(train_row_batch), selected_sentence_batch)
#             train_messages.extend(message_batch)
#     gemini_trainer.set_model(source_model)
#     gemini_trainer.set_training_data(train_messages)
#     gemini_trainer.set_epoch_count(epoch)
#     source_model = gemini_trainer.get_fine_tuned_model_name()
#
#     source_model = 'tunedModels/graphxtract10elife-1re6tcwq1ell'
#
#     gemini_tester.set_source_model(source_model)
#     gemini_tester.update_genai_model()
#
#     # Selector Train
#
#     selector_model.train()
#     for current_epoch in range(epoch):
#         for train_row_batch, train_graph_batch in tqdm(train_loader, desc=f"Epoch: {current_epoch+1} / {epoch} | Training Process: "):
#
#             optimizer.zero_grad()
#
#             logits_all_batch = selector_model(train_graph_batch)
#             logits_batch = devide_batch(train_graph_batch, logits_all_batch)
#
#             # Create prompt
#             rag_result = rag_factory(train_row_batch, logits_batch)
#             selected_sentence_batch, selected_log_probs_batch = zip(*rag_result)
#             message_batch = message_factory(pd.DataFrame(train_row_batch), selected_sentence_batch)
#
#             # Get answer from Gemini
#             clean_answers, labels = gemini_tester.predict_batch(message_batch)
#             clean_answers, labels = answer_checker(clean_answers, labels)
#
#             # Calculate performance metrics
#             reward = result_calculator.reward_function(clean_answers, labels)
#             loss = loss_function(selected_log_probs_batch, reward)
#             # Update Model
#             loss.backward()
#             optimizer.step()


In [None]:
# mlflow is already required by the notebook's first import cell, so a missing
# installation fails there; no unpinned `pip install` fallback is attempted here.
import mlflow
import mlflow.pytorch


def _total_log_prob(log_probs):
    """Sum the log-probabilities of one sample into a scalar tensor.

    Accepts either a tensor or a list/tuple of tensors and keeps the autograd
    graph intact (no detach / no conversion through Python floats).
    """
    if isinstance(log_probs, (list, tuple)):
        log_probs = torch.stack([torch.as_tensor(item) for item in log_probs])
    return torch.as_tensor(log_probs).sum()


# (1) Select the MLflow experiment.
mlflow.set_experiment("GraphXtract_Selector_Training")

# Stage `epoch` of the schedule runs `epoch` full passes over the training set.
epoch_schedule = list(range(1, 10, 2))
embedding_model = "models/text-embedding-004"

# (2) Open a run.
with mlflow.start_run(run_name="selector_finetune"):
    # Log hyper-parameters.
    mlflow.log_param("epoch_schedule", epoch_schedule)
    mlflow.log_param("optimizer", optimizer.__class__.__name__)
    mlflow.log_param("learning_rate", optimizer.defaults.get('lr'))

    global_step = 0      # counts optimizer updates
    sub_epoch_step = 0   # counts completed passes over train_loader

    # (3) Training loop with MLflow logging.
    for epoch in epoch_schedule:
        source_model = 'tunedModels/graphxtract10elife-1re6tcwq1ell'
        gemini_tester.set_source_model(source_model)
        gemini_tester.update_genai_model()

        selector_model.train()
        for current_epoch in range(epoch):
            epoch_loss_sum = 0.0
            epoch_reward_sum = 0.0
            batch_count = 0

            for train_row_batch, train_graph_batch in tqdm(
                    train_loader,
                    desc=f"[Epoch {epoch} | Sub-epoch {current_epoch+1}] Training"):

                optimizer.zero_grad()

                # Forward pass of the selector.
                logits_all_batch = selector_model(train_graph_batch)
                logits_batch = devide_batch(train_graph_batch, logits_all_batch)

                # Build the prompts from the selected sentences.
                rag_result = rag_factory(train_row_batch, logits_batch)
                selected_sentence_batch, selected_log_probs_batch = zip(*rag_result)
                message_batch = message_factory(
                    pd.DataFrame(train_row_batch),
                    selected_sentence_batch)

                # Query Gemini; on failure refresh the client and retry once.
                # `except Exception` (not a bare except) so Ctrl-C still stops
                # the run.
                try:
                    clean_answers, labels = gemini_tester.predict_batch(message_batch)
                except Exception:
                    gemini_tester.update_genai_model()
                    clean_answers, labels = gemini_tester.predict_batch(message_batch)

                # Rewards are matched to log-probs by position, so the answers
                # must line up one-to-one with the samples of the batch.
                # NOTE(review): assumes predict_batch returns one answer per
                # message, in order - confirm against GeminiTester.
                if len(clean_answers) != len(selected_log_probs_batch):
                    raise ValueError(
                        "predict_batch returned "
                        f"{len(clean_answers)} answers for "
                        f"{len(selected_log_probs_batch)} samples; cannot align "
                        "rewards with log-probabilities."
                    )

                # Remember which samples survive the empty-answer filter
                # (same rule as answer_checker) before filtering.
                kept_indices = [
                    index
                    for index, (answer, _) in enumerate(zip(clean_answers, labels))
                    if answer != ""
                ]
                clean_answers, labels = answer_checker(clean_answers, labels)
                if not clean_answers:
                    # Nothing to score; an empty embedding request would fail.
                    continue

                # Score every answer by the cosine similarity between its
                # embedding and the embedding of its label.
                prediction_embed = genai.embed_content(
                    model=embedding_model,
                    content=clean_answers)
                labels_embed = genai.embed_content(
                    model=embedding_model,
                    content=labels)
                prediction_embed = torch.tensor(prediction_embed['embedding'])
                labels_embed = torch.tensor(labels_embed['embedding'])
                reward = torch.nn.functional.cosine_similarity(
                    prediction_embed, labels_embed, dim=-1)

                # The embeddings come from an external API and carry no
                # gradient, so a loss computed directly on them can never
                # update the selector. Use the score as a constant reward on
                # the selector's own log-probabilities (REINFORCE), which is
                # the only differentiable path back to selector_model.
                # NOTE(review): assumes selected_log_probs_batch holds, per
                # sample, tensors still attached to the selector's graph.
                log_prob_sums = torch.stack(
                    [_total_log_prob(selected_log_probs_batch[index])
                     for index in kept_indices])
                reward = reward.to(device=log_prob_sums.device,
                                   dtype=log_prob_sums.dtype)
                loss = -(reward * log_prob_sums).mean()

                if not loss.requires_grad:
                    raise RuntimeError(
                        "Loss is not connected to selector_model: the "
                        "log-probabilities returned by rag_factory carry no "
                        "gradient, so training would be a silent no-op."
                    )

                # Backward pass and parameter update.
                loss.backward()
                optimizer.step()

                # (4) Per-batch metrics.
                global_step += 1
                batch_reward = reward.mean().item()
                mlflow.log_metric("train_loss", loss.item(), step=global_step)
                mlflow.log_metric("train_reward", batch_reward, step=global_step)

                epoch_loss_sum += loss.item()
                epoch_reward_sum += batch_reward
                batch_count += 1

            # (5) Per-pass averages. A dedicated counter is used as the step so
            # that the sub-epochs of one stage do not overwrite each other.
            sub_epoch_step += 1
            if batch_count:
                avg_epoch_loss = epoch_loss_sum / batch_count
                avg_epoch_reward = epoch_reward_sum / batch_count
                mlflow.log_metric("avg_epoch_loss", avg_epoch_loss, step=sub_epoch_step)
                mlflow.log_metric("avg_epoch_reward", avg_epoch_reward, step=sub_epoch_step)

    # (6) Version the model once, after training has finished. Doing this
    # inside the loops would register a new model version on every pass.
    mlflow.pytorch.log_model(
        selector_model,
        artifact_path="selector_model",
        registered_model_name="GraphXtractSelector"
    )

[Epoch 1 | Sub-epoch 1] Training:   6%|▌         | 16/289 [03:44<34:51,  7.66s/it]  

In [None]:
# NOTE(review): scratch cell. It embeds whatever `clean_answers` was left
# behind by the last iteration of the training loop, so it only works after
# that cell has run (hidden kernel state).
prediction_embed = genai.embed_content(model="models/text-embedding-004", content=clean_answers)