### Multi Index Testing with graphrag library API

In [103]:
# Import query api from graphrag library
# Make sure to install graphrag version 0.3.0 to be able to get these packages
from graphrag.query.api import global_search, local_search

import os
import inspect
import yaml
from graphrag.config import create_graphrag_config
import pandas as pd 

In [104]:
# Load custom pipeline settings
# NOTE(review): this_directory is not read by any visible cell; it is kept in case another cell relies on it.
this_directory = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe()))
        )
# Read the settings through a context manager so the file handle is closed once parsing is done
with open("./pipeline-settings.yaml") as settings_file:
    data = yaml.safe_load(settings_file)
# layer the custom settings on top of the default configuration settings of graphrag
parameters = create_graphrag_config(data, ".")

##### Iterate through the indexes, load each parquet file into a DataFrame, collect the DataFrames in per-table lists, then concatenate them to pass to graphrag.query.api

In [133]:
# Index folder names; each folder is expected to contain an "output" directory
# holding one subdirectory per indexing run

index_names = ['./arizona', './alaska']

# Per-table lists of DataFrames, one entry per index that provides the table
nodes_dfs = []
community_dfs = []
entities_dfs = []
text_units_dfs = []
relationships_dfs = []
covariates_dfs = []

# links[table][combined_id] -> {"index_name": source index, "id": id inside that index}
# Only "nodes", "community" and "entities" are filled in below; the other tables stay empty.
links = {"nodes": {}, "community": {}, "entities": {}, "text_units": {}, "relationships": {}, "covariates": {}}
# Largest combined id assigned so far for each table; -1 means no index has been loaded yet
max_vals = {"nodes": -1, "community": -1, "entities": -1, "text_units": -1, "relationships": -1, "covariates": -1}

for index_name in index_names:
    # Construct the path to the output directory
    output_path = os.path.join(index_name, "output")
    # Get all subdirectories in the output folder
    subdirs = [os.path.join(output_path, d) for d in os.listdir(output_path) if os.path.isdir(os.path.join(output_path, d))]
    if not subdirs:
        # max() on an empty list would only report "arg is an empty sequence"
        raise FileNotFoundError(f"No run subdirectories found in {output_path}.")

    # Pick the most recently modified subdirectory (os.path.getmtime is the modification time)
    most_recent_subdir = max(subdirs, key=os.path.getmtime)

    # Construct the path to the respective parquet files of that run's artifacts folder
    artifacts_dir = os.path.join(most_recent_subdir, "artifacts")
    nodes_file_path = os.path.join(artifacts_dir, "create_final_nodes.parquet")
    community_report_table_path = os.path.join(artifacts_dir, "create_final_community_reports.parquet")  # Adjust the filename/path as necessary
    entities_table_path = os.path.join(artifacts_dir, "create_final_entities.parquet")  # Adjust the filename/path as necessary
    text_units_table_path = os.path.join(artifacts_dir, "create_final_text_units.parquet")
    relationships_table_path = os.path.join(artifacts_dir, "create_final_relationships.parquet")
    covariates_table_path = os.path.join(artifacts_dir, "create_final_covariates.parquet")

    # Read each parquet file into a DataFrame and make its ids unique across indexes:
    #   - nodes:             "human_readable_id" (numeric) is shifted by the running offset
    #   - community reports: "community" (str) is cast to int, shifted, and cast back to str
    #   - entities:          "human_readable_id" (numeric) is shifted by the running offset
    #   - text units, relationships, covariates: appended unchanged
    # TODO: the id re-mapping originally planned for text units ("id" prefixed with the index
    # name) and for relationships/covariates ("human_readable_id" shifted) is not implemented.
    # NOTE(review): only the community ids of the reports are shifted; if the nodes tables also
    # carry community ids that the query API matches against the reports, they would need the
    # same shift - confirm against graphrag.query.
    if os.path.exists(nodes_file_path):
        nodes_df = pd.read_parquet(nodes_file_path)
        # Record where every combined id comes from before the column is shifted
        for i in nodes_df["human_readable_id"]:
            links["nodes"][i + max_vals["nodes"] + 1] = {"index_name": index_name, "id": i}
        if max_vals["nodes"] != -1:
            nodes_df["human_readable_id"] += max_vals["nodes"] + 1
        max_vals["nodes"] = nodes_df["human_readable_id"].max()
        nodes_dfs.append(nodes_df)
    else:
        print(f"File {nodes_file_path} does not exist.")

    if os.path.exists(community_report_table_path):
        community_df = pd.read_parquet(community_report_table_path)
        for i in community_df["community"].astype(int):
            links["community"][i + max_vals["community"] + 1] = {"index_name": index_name, "id": str(i)}
        if max_vals["community"] != -1:
            col = community_df["community"].astype(int) + max_vals["community"] + 1
            community_df["community"] = col.astype(str)
        max_vals["community"] = community_df["community"].astype(int).max()
        community_dfs.append(community_df)
    else:
        print(f"File {community_report_table_path} does not exist.")

    if os.path.exists(entities_table_path):
        entities_df = pd.read_parquet(entities_table_path)
        for i in entities_df["human_readable_id"]:
            links["entities"][i + max_vals["entities"] + 1] = {"index_name": index_name, "id": i}
        if max_vals["entities"] != -1:
            entities_df["human_readable_id"] += max_vals["entities"] + 1
        max_vals["entities"] = entities_df["human_readable_id"].max()
        entities_dfs.append(entities_df)
    else:
        print(f"File {entities_table_path} does not exist.")

    if os.path.exists(text_units_table_path):
        text_units_df = pd.read_parquet(text_units_table_path)
        text_units_dfs.append(text_units_df)
    else:
        print(f"File {text_units_table_path} does not exist.")

    if os.path.exists(relationships_table_path):
        relationships_df = pd.read_parquet(relationships_table_path)
        relationships_dfs.append(relationships_df)
    else:
        print(f"File {relationships_table_path} does not exist.")

    if os.path.exists(covariates_table_path):
        covariates_df = pd.read_parquet(covariates_table_path)
        covariates_dfs.append(covariates_df)
    else:
        # Keep the list intact here: resetting it to None inside the loop would drop covariates
        # already collected and make the next append fail for an index that does have them.
        print(f"File {covariates_table_path} does not exist.")

# Covariates are optional: expose None only when no index provided a covariates table
if not covariates_dfs:
    covariates_dfs = None

File ./arizona/output/20240820-192210/artifacts/create_final_covariates.parquet does not exist.
File ./alaska/output/20240820-192925/artifacts/create_final_covariates.parquet does not exist.


In [134]:
def _stack_rows(frames):
    """Concatenate a list of DataFrames row-wise and renumber the resulting index."""
    return pd.concat(frames, axis=0, ignore_index=True)


# One combined table per artifact type, built from the per-index lists
nodes_combined = _stack_rows(nodes_dfs)
community_combined = _stack_rows(community_dfs)
entities_combined = _stack_rows(entities_dfs)
text_units_combined = _stack_rows(text_units_dfs)
relationships_combined = _stack_rows(relationships_dfs)
# covariates_dfs may be None; in that case there is nothing to concatenate
covariates_combined = None if covariates_dfs is None else _stack_rows(covariates_dfs)

In [None]:
result = await global_search(config=parameters,
              nodes=nodes_combined,
              entities=entities_combined,
              community_reports = community_combined,
              community_level = 1,
              response_type = "Multiple Paragraphs",
              query= "where is alaska?"
              )

In [146]:
# Print the first element of the search result as plain text
print(result[0])

### Geographic Location of Alaska

Alaska is a state located in the northwest extremity of North America. It is bordered by Canada to the east, the Arctic Ocean to the north, and the Pacific Ocean to the south and west [Data: Reports (57, 59)]. Additionally, Alaska shares a maritime border with Russia in the Bering Strait [Data: Reports (54)].

### Key Geographical Features

Alaska is renowned for its vast wilderness and diverse wildlife. Significant geographical features include the Alexander Archipelago and the Tongass National Forest [Data: Reports (59)]. The Aleutian Islands, a chain of over 300 small volcanic islands, extend from the southern tip of the Alaska Peninsula into the Pacific Ocean [Data: Reports (57)].

In summary, Alaska's unique location and geographical features make it a distinctive state with a rich natural environment.


In [152]:
def _with_origin(report):
    """Return a copy of a context report extended with its source index and original community id."""
    origin = links["community"][int(report["id"])]
    return {**report, "index_name": origin["index_name"], "index_id": origin["id"]}


# Annotate every report of the search context with the index it was loaded from
update_context = [_with_origin(report) for report in result[1]["reports"]]

In [153]:
# Show only the annotated reports with these ids
cited_report_ids = ("54", "57", "59")
for report in update_context:
    if report["id"] not in cited_report_ids:
        continue
    print(report)

{'id': '59', 'title': 'Southeast Alaska Cities and Events', 'occurrence weight': 0.13636363636363635, 'content': "# Southeast Alaska Cities and Events\n\nThe community revolves around several key cities in Southeast Alaska, including Ketchikan, Sitka, and Juneau. These cities are interconnected through various relationships and events, such as festivals and historical significance. The community also includes notable geographical features like the Alexander Archipelago and the Tongass National Forest, as well as healthcare facilities operated by the Southeast Alaska Regional Health Consortium.\n\n## Ketchikan's Cultural and Economic Significance\n\nKetchikan is a city in southeastern Alaska known for its high precipitation and significant cultural events such as the Blueberry Festival and the Alaska Hummingbird Festival. The city's population fluctuates dramatically due to the arrival of cruise ships, which highlights its economic dependence on tourism. Ketchikan was once the largest c

In [147]:
# Scan every non-empty section of the context data for ids that occur more than once
seen = []   # ids in order of first appearance
title = []  # title recorded for the id at the same position in `seen`
for records in result[1].values():
    if len(records) == 0:
        continue
    for record in records:
        record_id = record['id']
        if record_id in seen:
            # Repeated id: print it with the first title recorded and the current title
            print(record_id, title[seen.index(record_id)], "-"*10, record['title'])
        else:
            seen.append(record_id)
            title.append(record['title'])
            

## Local Search

In [68]:
# Probe the local-search config for a root_dir attribute; the recorded output shows this raises AttributeError
parameters.local_search.root_dir

AttributeError: 'LocalSearchConfig' object has no attribute 'root_dir'

In [70]:
# Print the signature and docstring of local_search to check which arguments it expects
help(local_search)

Help on function local_search in module graphrag.query.api:

local_search(root_dir: str | None, config: graphrag.config.models.graph_rag_config.GraphRagConfig, nodes: pandas.core.frame.DataFrame, entities: pandas.core.frame.DataFrame, community_reports: pandas.core.frame.DataFrame, text_units: pandas.core.frame.DataFrame, relationships: pandas.core.frame.DataFrame, covariates: pandas.core.frame.DataFrame | None, community_level: int, response_type: str, query: str) -> tuple[str | dict[str, typing.Any] | list[dict[str, typing.Any]], str | list[pandas.core.frame.DataFrame] | dict[str, pandas.core.frame.DataFrame]]
    Perform a local search and return the context data and response.
    
    Parameters
    ----------
    - config (GraphRagConfig): A graphrag configuration (from settings.yaml)
    - nodes (pd.DataFrame): A DataFrame containing the final nodes (from create_final_nodes.parquet)
    - entities (pd.DataFrame): A DataFrame containing the final entities (from create_final_entiti

In [74]:
# Display the storage section of the loaded configuration
parameters.storage

StorageConfig(type="file", base_dir='output/${timestamp}/artifacts', connection_string=None, container_name=None, storage_account_blob_url=None)

In [75]:
result = await local_search(
              root_dir = None,
              config=parameters,
              nodes=nodes_combined,
              entities=entities_combined,
              community_reports = community_combined,
              text_units = text_units_combined,
              relationships = relationships_combined,
              covariates = covariates_combined,
              community_level = 1,
              response_type = "Multiple Paragraphs",
              query= "where is arizona?"
              )


INFO: Vector Store Args: {}


ValueError: Parent directory None/output does not exist or is not a directory.

# Basic Single Index Test

In [None]:
# Load the tables of one fixed run of the alaska index
single_index_artifacts = './alaska/output/20240820-192925/artifacts'
nodes_df = pd.read_parquet(f"{single_index_artifacts}/create_final_nodes.parquet")
entities_df = pd.read_parquet(f"{single_index_artifacts}/create_final_entities.parquet")
community_df = pd.read_parquet(f"{single_index_artifacts}/create_final_community_reports.parquet")
text_units_df = pd.read_parquet(f"{single_index_artifacts}/create_final_text_units.parquet")
relationships_df = pd.read_parquet(f"{single_index_artifacts}/create_final_relationships.parquet")
# No covariates table is loaded for this test
covariates_df = None

In [None]:
result = await local_search(config=parameters,
              nodes=nodes_df,
              entities=entities_df,
              community_reports = community_df,
              text_units = text_units_df,
              relationships = relationships_df,
              covariates = covariates_df,
              community_level = 1,
              response_type = "Multiple Paragraphs",
              query= "where is alaska?"
              )

In [None]:
result = await global_search(config=parameters,
              nodes=nodes_df,
              entities=entities_df,
              community_reports = community_df,
              community_level = 1,
              response_type = "Multiple Paragraphs",
              query= "where is alaska?"
              )