In [1]:
from lib.utility import CaseBuilder, ResultCalculator
from lib.processors import GraphGenerator, SimilarityPreprocessor
from lib.rag_factories import RAG_Factory

from torch_geometric.data import Data, Batch

from lib.dataloaders import GraphDataloader
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np

import time
import json
import sys

[nltk_data] Downloading package punkt to /home/cagatay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Get Parameters

The parameters for this run are set here: the dataset name, the RAG strategy, the number of sentences to select (`rag_n`), and the batch size.

In [2]:
# Run configuration: which dataset to read and how sentences are selected.
dataset_name, rag_strategy = "elife", "pagerank"
rag_n = 10  # number of sentences to select

# Number of rows per batch handed to the DataLoader.
batch_size = 8

# Echo the selection settings so they are recorded with the cell output.
print("RAG Strategy: ", rag_strategy)
print("RAG N: ", rag_n)

RAG Strategy:  pagerank
RAG N:  10


# Create the Required Objects for the Pipeline

The following objects are created:

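* case_builder: keeps all the information needed for the run in one object
* similarity_preprocessor: calculates cosine similarity row by row, once, at the beginning of the pipeline
* graph_generator: generates the graph representation of a row (batches are supported via threads)
* rag: a `RAG_Factory` instance, called on a batch to select sentences
* result_calculator: a `ResultCalculator` instance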

In [3]:
# Holds the run configuration (dataset name, RAG strategy, sentence count).
case_builder = CaseBuilder(
    dataset_name=dataset_name,
    rag_strategy=rag_strategy,
    rag_n=rag_n,
)

# Pipeline stages, each built with its default settings.
similarity_preprocessor = SimilarityPreprocessor()
graph_generator = GraphGenerator()

# Callable used below to select sentences for a batch.
# NOTE(review): built without arguments although the strategy is passed to
# CaseBuilder — confirm how the factory learns which strategy to use.
rag = RAG_Factory()

result_calculator = ResultCalculator()

In [4]:
print("Dataset Name: ", case_builder.dataset_name)

# Only the test split is read in this notebook; the train split is not loaded.
test_json_path = f'dataset/processed/{case_builder.dataset_name}/test.json'
df_test = pd.read_json(test_json_path).reset_index(drop=True)

print("Test Shape: ", df_test.shape)

Dataset Name:  elife
Test Shape:  (241, 11)


In [5]:
# Preview the first rows of the test split to check the loaded columns.
df_test.head()

Unnamed: 0,headings,title,abstract,keywords,sections,summary,heading_clusters,title_embedding,abstract_embedding,keywords_embedding,sections_embedding
0,"[Title, Abstract, Introduction, Results, Discu...",Cerebellar implementation of movement sequence...,"Most movements are not unitary , but are compr...",neuroscience,[[Cerebellar implementation of movement sequen...,Imagine a gymnastics competition in which part...,"[0.0, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125...","[-0.6115092039000001, 0.2919370532, 0.99965220...","[-0.6199356914, 0.5176450610000001, 0.99962067...","[-0.7181161642, 0.447173804, 0.9996859431, -0....","[[[-0.6115093231000001, 0.2919372618, 0.999652..."
1,"[Title, Abstract, Introduction, Results and di...",Architecture of the human mTORC2 core complex,The mammalian target of rapamycin ( mTOR ) is ...,"short report, structural biology and molecular...",[[Architecture of the human mTORC2 core comple...,"To grow and multiply , a living cell must take...","[0.0, 0.125, 0.125, 0.125, 0.125, 0.25, 0.25, ...","[-0.6833246946, 0.3678493798, 0.99959510560000...","[-0.7574325204, 0.563606143, 0.999870657900000...","[-0.6531850696, 0.37302213910000004, 0.9993952...","[[[-0.6833247542, 0.367849499, 0.9995951056000..."
2,"[Title, Abstract, Introduction, Results, Discu...",Motion along the mental number line reveals sh...,Perception of number and space are tightly int...,neuroscience,[[Motion along the mental number line reveals ...,Our sense of number is thought to have emerged...,"[0.0, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125...","[-0.7579232454, 0.4932712615, 0.99987405540000...","[-0.8007718921, 0.6892958879000001, 0.99995392...","[-0.7181161642, 0.447173804, 0.9996859431, -0....","[[[-0.7579234242, 0.4932712615, 0.999874055400..."
3,"[Title, Abstract, Introduction, Results, Discu...",Allosteric inhibition of a stem cell RNA-bindi...,Gene expression and metabolism are coupled at ...,"biochemistry and chemical biology, structural ...",[[Allosteric inhibition of a stem cell RNA-bin...,"When an embryo is developing , stem cells must...","[0.0, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125...","[-0.6241006255, 0.27451998, 0.9995617867000001...","[-0.6822834015, 0.6058595181, 0.99974066020000...","[-0.6644507647, 0.4038660824, 0.9988580346, -0...","[[[-0.6241004467, 0.27451995020000003, 0.99956..."
4,"[Title, Abstract, Introduction, Results, Discu...",Synaptotagmin 7 functions as a Ca2+-sensor for...,Synaptotagmin ( syt ) 7 is one of three syt is...,"cell biology, neuroscience",[[Synaptotagmin 7 functions as a Ca2+-sensor f...,Neurons communicate with one another at juncti...,"[0.0, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125...","[-0.4976378381, 0.2259566784, 0.9984002709, -0...","[-0.7635375261, 0.6162644029000001, 0.99991971...","[-0.6633251309, 0.500700891, 0.9995726347, -0....","[[[-0.4976381958, 0.2259570509, 0.9984002709, ..."


In [6]:
# Wrap the test DataFrame in the project's GraphDataloader; despite its name it
# is passed to torch's DataLoader as the dataset.
# Presumably each item is a (row, graph) pair — verify against GraphDataloader.__getitem__.
dataset_test = GraphDataloader(df_test)

In [7]:
def collate_fn(batch):
    """
    Merge a list of dataset items into one batch.

    batch: list of (row_dict, Data) tuples, one per sample.

    Returns a 2-tuple:
      * pandas.DataFrame built from the row parts, with a fresh 0..n-1 index
      * torch_geometric.data.Batch built from the graph parts
    """
    # Transpose the list of pairs into (all rows, all graphs).
    row_items, graph_items = zip(*batch)
    return (
        pd.DataFrame(row_items).reset_index(drop=True),
        Batch.from_data_list(list(graph_items)),
    )


In [8]:
# Evaluation loader over the test split. Shuffling is disabled so the batch
# order (and therefore the rows inspected in the cells below) is the same on
# every run.
test_loader = DataLoader(
    dataset_test,
    batch_size=batch_size,
    shuffle=False,             # evaluation data: keep a fixed, reproducible order
    num_workers=batch_size,    # each worker gets its own subset, so __getitem__ runs in parallel
    pin_memory=True,           # speeds up the transfer to the GPU
    persistent_workers=True,   # keep worker processes alive after the dataset has been consumed
    collate_fn=collate_fn,
)

In [9]:
# Placeholders that keep the first batch around for the inspection cells below.
temp_row_batch = temp_graph_batch = None

# Only the first batch is processed: the loop exits right after one rag call.
for row_batch, graph_batch in test_loader:
    temp_row_batch, temp_graph_batch = row_batch, graph_batch
    selected_sentences = rag(row_batch, graph_batch, batch_size)
    break

PageRankRAG Factory
PageRankRAG Factory
PageRankRAG Factory
PageRankRAG Factory
PageRankRAG Factory
PageRankRAG Factory
PageRankRAG Factory
PageRankRAG Factory


In [None]:
# Length of the innermost list in the first row's 'sections_embedding'
# (presumably the embedding dimension — TODO confirm).
len(temp_row_batch.loc[0, 'sections_embedding'][0][0])

In [None]:
# Trial run ("deneme" is Turkish for "trial") of rag on a single row and its graph.
# NOTE(review): this call passes two arguments, while the batched call above
# also passes batch_size — confirm that rag accepts a single row like this.
deneme = rag(temp_row_batch.loc[0], temp_graph_batch[0])

In [None]:
# Size of the single-row trial result.
len(deneme)

In [None]:
# Display the single-row trial result in full.
deneme

In [13]:
# Size of the selection at index 2 of the batch result; the recorded output (10)
# equals rag_n.
len(selected_sentences[2])

10