### Multi Index Testing with graphrag library API

In [None]:
# Import query api from graphrag library
# Make sure to install graphrag version 0.3.0 to be able to get these packages
from graphrag.query.api import global_search, local_search

import os
import inspect
import yaml
from graphrag.config import create_graphrag_config
import pandas as pd 

In [None]:
# Load custom pipeline settings
# NOTE(review): `this_directory` is not read anywhere in the visible cells; it is
# kept only in case another cell relies on it. Inside a notebook the current
# frame's "file" is presumably a kernel-generated path, not the notebook's
# folder — confirm before using it to resolve paths.
this_directory = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
# Use a context manager so the settings file handle is closed after parsing.
with open("./pipeline-settings.yaml", encoding="utf-8") as settings_file:
    data = yaml.safe_load(settings_file)
# Layer the custom settings on top of the default configuration settings of graphrag.
# "." is passed as the second positional argument of create_graphrag_config.
parameters = create_graphrag_config(data, ".")

##### Iterate through the indexes, load each parquet file into a DataFrame, collect the DataFrames per table, then concatenate them to pass to graphrag.query.api

In [None]:
# Index folder names; each is expected to contain an "output" folder with one
# subfolder per indexing run.
index_names = ['./arizona', './alaska']

# Per-table lists of DataFrames, one entry per index that has the table.
nodes_dfs = []
community_dfs = []
entities_dfs = []
text_units_dfs = []
relationships_dfs = []
covariates_dfs = []

# links[table][combined_id] -> {"index_name": ..., "id": ...}
# Maps every re-numbered id in the combined tables back to the index it came
# from and the id it had inside that index.
links = {"nodes": {}, "community": {}, "entities": {}, "text_units": {}, "relationships": {}, "covariates": {}}
# Highest combined id handed out so far for each table (-1 = none yet). The
# next index's ids are shifted by max + 1 so ids never collide across indexes.
max_vals = {"nodes": -1, "community": -1, "entities": -1, "text_units": -1, "relationships": -1, "covariates": -1}

for index_name in index_names:
    # Construct the path to the output directory
    output_path = os.path.join(index_name, "output")
    # Get all subdirectories in the output folder
    subdirs = [
        os.path.join(output_path, d)
        for d in os.listdir(output_path)
        if os.path.isdir(os.path.join(output_path, d))
    ]
    if not subdirs:
        # Fail with a clear message instead of max()'s "empty sequence" error.
        raise FileNotFoundError(f"No index run folders found in {output_path}.")

    # Pick the most recently *modified* run folder (os.path.getmtime)
    most_recent_subdir = max(subdirs, key=os.path.getmtime)

    # Paths to the parquet tables of that run
    artifacts_dir = os.path.join(most_recent_subdir, "artifacts")
    nodes_file_path = os.path.join(artifacts_dir, "create_final_nodes.parquet")
    community_report_table_path = os.path.join(artifacts_dir, "create_final_community_reports.parquet")  # Adjust the filename/path as necessary
    entities_table_path = os.path.join(artifacts_dir, "create_final_entities.parquet")  # Adjust the filename/path as necessary
    text_units_table_path = os.path.join(artifacts_dir, "create_final_text_units.parquet")
    relationships_table_path = os.path.join(artifacts_dir, "create_final_relationships.parquet")
    covariates_table_path = os.path.join(artifacts_dir, "create_final_covariates.parquet")

    # Suffix appended to every name / text-unit id so values from different
    # indexes stay distinct after the tables are concatenated.
    suffix = f"-{index_name}"

    # How each table is rewritten before being appended:
    # - community reports: "community" (str) -> int, shifted by the running max, back to str
    # - entities: "human_readable_id" (int) shifted; index suffix added to "name" and
    #   to every element of "text_unit_ids"
    # - nodes: "human_readable_id" (int) shifted; index suffix added to "title" and to each
    #   comma-separated part of "source_id"; "community" shifted by the community max
    # - text units: index suffix added to "id"
    # - relationships: "human_readable_id" (str) -> int, shifted, back to str; index suffix
    #   added to "source", "target" and every element of "text_unit_ids"
    # - covariates: "human_readable_id" (str) -> int, shifted, back to str

    # Note: nodes must be processed before communities so that the community
    # offset applied to the nodes is still the max of the *previous* indexes.
    if os.path.exists(nodes_file_path):
        nodes_df = pd.read_parquet(nodes_file_path)
        node_offset = max_vals["nodes"] + 1
        community_offset = max_vals["community"] + 1
        for i in nodes_df["human_readable_id"]:
            links["nodes"][i + node_offset] = {"index_name": index_name, "id": i}
        if max_vals["nodes"] != -1:
            nodes_df["human_readable_id"] += node_offset
        # Leave missing communities (None / NaN / "") untouched; shift everything
        # else, including a legitimate community 0 (which a plain truthiness
        # check would skip).
        nodes_df["community"] = nodes_df["community"].apply(
            lambda x: x if (pd.isna(x) or x == "") else str(int(x) + community_offset)
        )
        nodes_df["title"] = nodes_df["title"].apply(lambda x: x + suffix)
        nodes_df["source_id"] = nodes_df["source_id"].apply(
            lambda x: ",".join([part + suffix for part in x.split(",")])
        )
        max_vals["nodes"] = nodes_df["human_readable_id"].max()
        nodes_dfs.append(nodes_df)
    else:
        print(f"File {nodes_file_path} does not exist.")

    if os.path.exists(community_report_table_path):
        community_df = pd.read_parquet(community_report_table_path)
        community_offset = max_vals["community"] + 1
        for i in community_df["community"].astype(int):
            links["community"][i + community_offset] = {"index_name": index_name, "id": str(i)}
        if max_vals["community"] != -1:
            col = community_df["community"].astype(int) + community_offset
            community_df["community"] = col.astype(str)
        max_vals["community"] = community_df["community"].astype(int).max()
        community_dfs.append(community_df)
    else:
        print(f"File {community_report_table_path} does not exist.")

    if os.path.exists(entities_table_path):
        entities_df = pd.read_parquet(entities_table_path)
        entity_offset = max_vals["entities"] + 1
        for i in entities_df["human_readable_id"]:
            links["entities"][i + entity_offset] = {"index_name": index_name, "id": i}
        if max_vals["entities"] != -1:
            entities_df["human_readable_id"] += entity_offset
        entities_df["name"] = entities_df["name"].apply(lambda x: x + suffix)
        entities_df["text_unit_ids"] = entities_df["text_unit_ids"].apply(
            lambda x: [unit_id + suffix for unit_id in x]
        )
        max_vals["entities"] = entities_df["human_readable_id"].max()
        entities_dfs.append(entities_df)
    else:
        print(f"File {entities_table_path} does not exist.")

    if os.path.exists(text_units_table_path):
        text_units_df = pd.read_parquet(text_units_table_path)
        text_units_df["id"] = text_units_df["id"].apply(lambda x: f"{x}-{index_name}")
        text_units_dfs.append(text_units_df)
    else:
        print(f"File {text_units_table_path} does not exist.")

    if os.path.exists(relationships_table_path):
        relationships_df = pd.read_parquet(relationships_table_path)
        relationship_offset = max_vals["relationships"] + 1
        for i in relationships_df["human_readable_id"].astype(int):
            links["relationships"][i + relationship_offset] = {"index_name": index_name, "id": i}
        if max_vals["relationships"] != -1:
            col = relationships_df["human_readable_id"].astype(int) + relationship_offset
            relationships_df["human_readable_id"] = col.astype(str)
        relationships_df["source"] = relationships_df["source"].apply(lambda x: x + suffix)
        relationships_df["target"] = relationships_df["target"].apply(lambda x: x + suffix)
        relationships_df["text_unit_ids"] = relationships_df["text_unit_ids"].apply(
            lambda x: [unit_id + suffix for unit_id in x]
        )
        max_vals["relationships"] = relationships_df["human_readable_id"].astype(int).max()
        relationships_dfs.append(relationships_df)
    else:
        print(f"File {relationships_table_path} does not exist.")

    if os.path.exists(covariates_table_path):
        covariates_df = pd.read_parquet(covariates_table_path)
        covariate_offset = max_vals["covariates"] + 1
        # Iterate over every covariate id (this was an `if i in ...` membership
        # test that reused a stale `i` and never recorded the links properly).
        for i in covariates_df["human_readable_id"].astype(int):
            links["covariates"][i + covariate_offset] = {"index_name": index_name, "id": i}
        if max_vals["covariates"] != -1:
            col = covariates_df["human_readable_id"].astype(int) + covariate_offset
            covariates_df["human_readable_id"] = col.astype(str)
        max_vals["covariates"] = covariates_df["human_readable_id"].astype(int).max()
        covariates_dfs.append(covariates_df)
    else:
        # Covariates are optional; keep the list intact so a later index that
        # does have covariates can still be appended.
        print(f"File {covariates_table_path} does not exist.")

# The cells below test `covariates_dfs is not None`, so expose "no covariates
# in any index" as None rather than as an empty list.
if not covariates_dfs:
    covariates_dfs = None

In [None]:
#context_keys = ['reports', 'entities', 'relationships', 'claims', 'sources']
def update_context(context, links):
    """Annotate search-context records with the index they originated from.

    Args:
        context: Mapping of context key ("reports", "entities",
            "relationships", "claims", "sources") to a list of dict records.
            Every record except those under "sources" must have an "id" that
            can be converted to ``int``.
        links: Mapping of table name to ``{combined_id: {"index_name": ...,
            "id": ...}}``. Reports are looked up under "community"; claims are
            looked up under "claims" when present, otherwise under
            "covariates".

    Returns:
        A new dict with the same keys as ``context``. Records under "reports",
        "entities", "relationships" and "claims" are copied and extended with
        "index_name" and "index_id"; for entities and relationships the
        ``-{index_name}`` suffix is also stripped from "entity", "source" and
        "target". "sources" is passed through unchanged. Any other key maps
        to an empty list.

    Raises:
        KeyError: If a record's id has no entry in the relevant ``links``
            table, or the table itself is missing.
    """

    def _origin(table, entry):
        # Look up where a combined id came from.
        link = table[int(entry["id"])]
        return {"index_name": link["index_name"], "index_id": link["id"]}

    def _strip_index_suffix(value, index_name):
        # Remove only the exact "-{index_name}" suffix, so names that contain
        # hyphens themselves (e.g. "SANTA-FE") are not truncated.
        suffix = f"-{index_name}"
        if value.endswith(suffix):
            return value[: -len(suffix)]
        return value

    updated_context = {}
    for key in context:
        updated_entry = []
        if key == "reports":
            updated_entry = [
                {**entry, **_origin(links["community"], entry)}
                for entry in context[key]
            ]
        elif key == "entities":
            for entry in context[key]:
                origin = _origin(links["entities"], entry)
                updated_entry.append(
                    {
                        **entry,
                        "entity": _strip_index_suffix(entry["entity"], origin["index_name"]),
                        **origin,
                    }
                )
        elif key == "relationships":
            for entry in context[key]:
                origin = _origin(links["relationships"], entry)
                updated_entry.append(
                    {
                        **entry,
                        "source": _strip_index_suffix(entry["source"], origin["index_name"]),
                        "target": _strip_index_suffix(entry["target"], origin["index_name"]),
                        **origin,
                    }
                )
        elif key == "claims":
            # The context calls them "claims" while the links may have been
            # recorded under "covariates"; accept either name.
            claim_links = links["claims"] if "claims" in links else links["covariates"]
            updated_entry = [
                {**entry, **_origin(claim_links, entry)}
                for entry in context[key]
            ]
        elif key == "sources":
            updated_entry = context[key]
        updated_context[key] = updated_entry
    return updated_context

In [None]:
# Stack the per-index DataFrames of each table into one DataFrame with a
# fresh 0..n-1 row index.
(
    nodes_combined,
    community_combined,
    entities_combined,
    text_units_combined,
    relationships_combined,
) = (
    pd.concat(frames, axis=0, ignore_index=True)
    for frames in (nodes_dfs, community_dfs, entities_dfs, text_units_dfs, relationships_dfs)
)

# Covariates are optional: the list is None when they are unavailable.
if covariates_dfs is None:
    covariates_combined = None
else:
    covariates_combined = pd.concat(covariates_dfs, axis=0, ignore_index=True)

In [None]:
result = await global_search(config=parameters,
    nodes=nodes_combined,
    entities=entities_combined,
    community_reports = community_combined,
    community_level = 1,
    response_type = "Multiple Paragraphs",
    query= "railroads in arizona"
)

In [None]:
# result[0] is used as the answer and result[1] as the context data; the
# context records are tagged with the index each one came from.
answer, context = result[0], update_context(result[1], links)
print(answer)

## Local Search

In [None]:
# Create the folder tree test5/output/00000000-000000 (no error if it already exists).
# NOTE(review): presumably needed so local_search finds an output run folder
# under root_dir "./test5" — confirm against the graphrag version in use.
!mkdir -p test5/output/00000000-000000

In [None]:
result = await local_search(
    root_dir = "./test5",
    config=parameters,
    nodes=nodes_combined,
    entities=entities_combined,
    community_reports = community_combined,
    text_units = text_units_combined,
    relationships = relationships_combined,
    covariates = covariates_combined,
    community_level = 1,
    response_type = "Multiple Paragraphs",
    query= "railroads in arizona"
)


In [None]:
# result[0] is used as the answer and result[1] as the context data; the
# context records are tagged with the index each one came from.
answer, context = result[0], update_context(result[1], links)
# Last expression of the cell: display the annotated community reports.
context["reports"]

In [None]:
# Print the answer text stored in `answer`.
print(answer)

# Basic Single Index Test

In [None]:
# Load the tables of one specific Arizona index run directly, without any of
# the multi-index id rewriting.
# To test another run (e.g. 20240820-192925), change the timestamp folder here.
single_index_artifacts = './arizona/output/20240820-192210/artifacts'

nodes_df = pd.read_parquet(f"{single_index_artifacts}/create_final_nodes.parquet")
entities_df = pd.read_parquet(f"{single_index_artifacts}/create_final_entities.parquet")
community_df = pd.read_parquet(f"{single_index_artifacts}/create_final_community_reports.parquet")
text_units_df = pd.read_parquet(f"{single_index_artifacts}/create_final_text_units.parquet")
relationships_df = pd.read_parquet(f"{single_index_artifacts}/create_final_relationships.parquet")
# No covariates table is loaded for the single-index test.
covariates_df = None

In [None]:
# Create the folder tree single/output/00000000-000000 (no error if it already exists).
# NOTE(review): presumably needed so local_search finds an output run folder
# under root_dir "./single/" — confirm against the graphrag version in use.
!mkdir -p single/output/00000000-000000

In [None]:
result = await local_search(
    root_dir = "./single/",
    config=parameters,
    nodes=nodes_df,
    entities=entities_df,
    community_reports = community_df,
    text_units = text_units_df,
    relationships = relationships_df,
    covariates = covariates_df,
    community_level = 1,
    response_type = "Multiple Paragraphs",
    query= "where is alaska?"
)
print(result[0])

In [None]:
result = await global_search(config=parameters,
    nodes=nodes_df,
    entities=entities_df,
    community_reports = community_df,
    community_level = 1,
    response_type = "Multiple Paragraphs",
    query= "where is alaska?"
)
print(result[0])