In [12]:
import os
from dotenv import load_dotenv
from neo4j import GraphDatabase
from datasets import load_from_disk
from retrieval.ner import NER
from retrieval.path_retriever import PathRetriever
import json

# Read connection settings and API credentials from the local `.env` file.
# `override=True` is passed so entries in `.env` take precedence over variables
# that are already set in the process environment.
load_dotenv('.env', override=True)

# Each value is None when the corresponding variable is not defined.
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Dataset to process; every data path in this notebook is built as f'{DATASET_NAME}-data/...'.
# To switch datasets, set DATASET_NAME = 'prime' instead.
DATASET_NAME = 'mag'

In [14]:
# Load the question-answering dataset for the selected DATASET_NAME from local disk.
# Later cells read each example's 'question' and 'answer_ids' fields.
qa = load_from_disk(f'{DATASET_NAME}-data/qa')

In [22]:
# Entity matching on all data: add a 'predicted_entities' field to every QA example
# and persist the result to f'{DATASET_NAME}-data/qa_with_ner'.

with open(f"{DATASET_NAME}-data/ner_instructions.json", 'r') as f:
    ner_instructions = json.load(f)

ner = NER(ner_instructions, openai_api_key=OPENAI_API_KEY)

# Neo4j Driver objects are thread-safe but must not be shared across processes,
# and `map(..., num_proc=8)` runs the mapper in worker processes. Each process
# therefore opens its own driver lazily (keyed by PID) instead of inheriting one
# that was created in the notebook process.
_ner_drivers_by_pid = {}


def add_predicted_entities(example):
    """Return `example` extended with the entities NER finds for its question.

    Args:
        example: one QA record; its 'question' field is passed to
            `ner.find_source_nodes`.

    Returns:
        `example` merged with {'predicted_entities': <result of find_source_nodes>}.
    """
    pid = os.getpid()
    if pid not in _ner_drivers_by_pid:
        _ner_drivers_by_pid[pid] = GraphDatabase.driver(
            NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
        )
    entities = ner.find_source_nodes(example['question'], driver=_ner_drivers_by_pid[pid])
    return example | {'predicted_entities': entities}


try:
    qa_with_ner = qa.map(add_predicted_entities, num_proc=8)
finally:
    # Only drivers opened in *this* process can be present here (e.g. if the
    # mapper ended up running in the notebook process); drivers opened inside
    # worker processes are released when those processes exit.
    for opened_driver in _ner_drivers_by_pid.values():
        opened_driver.close()
    _ner_drivers_by_pid.clear()

qa_with_ner.save_to_disk(f'{DATASET_NAME}-data/qa_with_ner')

Map (num_proc=8): 100%|██████████| 10/10 [00:06<00:00,  1.66 examples/s]
Map (num_proc=8): 100%|██████████| 10/10 [00:03<00:00,  2.59 examples/s]
Map (num_proc=8): 100%|██████████| 10/10 [00:03<00:00,  2.60 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'answer_ids', 'question', 'predicted_entities'],
        num_rows: 10
    })
    valid: Dataset({
        features: ['id', 'answer_ids', 'question', 'predicted_entities'],
        num_rows: 10
    })
    test: Dataset({
        features: ['id', 'answer_ids', 'question', 'predicted_entities'],
        num_rows: 10
    })
})

In [22]:
# Path retrieval: reload the NER-annotated examples saved by the previous step, merge
# the output of `PathRetriever.retrieve_paths` into every example, and persist the
# result to f'{DATASET_NAME}-data/qa_with_cypher_queries'.
qa_with_ner = load_from_disk(f'{DATASET_NAME}-data/qa_with_ner')

path_retriever = PathRetriever(dataset_name=DATASET_NAME)

# Neo4j Driver objects are thread-safe but must not be shared across processes,
# and `map(..., num_proc=8)` runs the mapper in worker processes. Each process
# therefore opens its own driver lazily (keyed by PID) instead of inheriting one
# that was created in the notebook process.
_path_drivers_by_pid = {}


def add_retrieved_paths(example):
    """Return `example` merged with the mapping produced by `retrieve_paths`.

    Args:
        example: one QA record; 'predicted_entities' is passed as `src_names`
            and 'answer_ids' as `tgt_ids`.

    Returns:
        `example | retrieve_paths(...)`; the added keys are whatever
        `PathRetriever.retrieve_paths` returns (not visible in this notebook).
    """
    pid = os.getpid()
    if pid not in _path_drivers_by_pid:
        _path_drivers_by_pid[pid] = GraphDatabase.driver(
            NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
        )
    retrieved = path_retriever.retrieve_paths(
        driver=_path_drivers_by_pid[pid],
        src_names=example['predicted_entities'],
        tgt_ids=example['answer_ids'],
    )
    return example | retrieved


try:
    qa_with_cypher_queries = qa_with_ner.map(add_retrieved_paths, num_proc=8)
finally:
    # Only drivers opened in *this* process can be present here (e.g. if the
    # mapper ended up running in the notebook process); drivers opened inside
    # worker processes are released when those processes exit.
    for opened_driver in _path_drivers_by_pid.values():
        opened_driver.close()
    _path_drivers_by_pid.clear()

qa_with_cypher_queries.save_to_disk(f'{DATASET_NAME}-data/qa_with_cypher_queries')

Map (num_proc=8):   0%|          | 0/2665 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2665 [00:00<?, ? examples/s]