In [None]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [None]:
import os
from pathlib import Path

import pandas as pd

import graphrag.api as api
from graphrag.config.create_graphrag_config import create_graphrag_config
from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.config.models.vector_store_config import VectorStoreConfig


## Global Search example

The global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses to questions that require an understanding of the dataset as a whole (e.g., "What are the main themes that run through this dataset?").

### LLM setup

Set the `GRAPHRAG_API_KEY`, `GRAPHRAG_LLM_MODEL`, and `GRAPHRAG_EMBEDDING_MODEL` environment variables before running this notebook.

In [None]:
# Pull the credential and the two model names from the environment in one pass.
# Indexing os.environ raises KeyError (naming the variable) if one is not set.
api_key, llm_model, embedding_model = (
    os.environ[variable_name]
    for variable_name in (
        "GRAPHRAG_API_KEY",
        "GRAPHRAG_LLM_MODEL",
        "GRAPHRAG_EMBEDDING_MODEL",
    )
)


### Load data and configure GraphRAG

- Configure the chat and embedding models used for queries.
- Load the community, entity, and report tables that back global search.


In [None]:
# Directory holding the parquet tables produced by the indexing pipeline.
INPUT_DIR = "./inputs/operation dulce"

# Names of the tables read from INPUT_DIR (each is stored as "<name>.parquet").
ENTITY_TABLE = "entities"
COMMUNITY_TABLE = "communities"
COMMUNITY_REPORT_TABLE = "community_reports"

# Level of the Leiden community hierarchy whose community reports are used.
# A larger value selects finer-grained communities, which increases the
# amount of computation needed to answer a query.
COMMUNITY_LEVEL = 2


In [None]:
# The chat and embedding model definitions use the same key and retry budget.
shared_model_settings = {"api_key": api_key, "max_retries": 20}

chat_config = LanguageModelConfig(
    type=ModelType.OpenAIChat,
    model=llm_model,
    **shared_model_settings,
)
embedding_config = LanguageModelConfig(
    type=ModelType.OpenAIEmbedding,
    model=embedding_model,
    **shared_model_settings,
)

# LanceDB location: a "lancedb" folder under the resolved (absolute) input directory.
lancedb_uri = str(Path(INPUT_DIR).resolve() / "lancedb")
vector_store_config = VectorStoreConfig(
    type="lancedb",
    db_uri=lancedb_uri,
    container_name="default",
    overwrite=True,
)

# Assemble the GraphRAG settings; global search refers to the chat model by its id.
graphrag_settings = {
    "models": {
        "default_chat_model": chat_config,
        "default_embedding_model": embedding_config,
    },
    "global_search": {"chat_model_id": "default_chat_model"},
    "vector_store": {"default_vector_store": vector_store_config},
}
config = create_graphrag_config(graphrag_settings, root_dir=Path("."))

# Read the three index tables from INPUT_DIR, in the same order as the targets.
community_df, entity_df, report_df = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (COMMUNITY_TABLE, ENTITY_TABLE, COMMUNITY_REPORT_TABLE)
)

print(f"Total community report count: {len(report_df)}")
# Last expression of the cell: preview the community reports table.
report_df.head()


#### Run global search

In [None]:
response, context = await api.global_search(
    config=config,
    entities=entity_df,
    communities=community_df,
    community_reports=report_df,
    community_level=COMMUNITY_LEVEL,
    dynamic_community_selection=False,
    response_type="Multiple Paragraphs",
    query="What is operation dulce?",
)


#### Inspect response and context

In [None]:
# Bare final expression: the notebook displays the answer returned by api.global_search.
response


In [None]:
# Preview the "reports" entry of the context returned by api.global_search.
# NOTE(review): presumably a pandas DataFrame of the community reports used — confirm.
context["reports"].head()
