# Local Search Example

In [None]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [11]:

import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

# Load variables from a local .env file into the process environment so the
# os.getenv("GRAPHRAG_...") lookups in a later cell can find them.
from dotenv import load_dotenv
load_dotenv()

True

# LanceDB (default)

## Load tables

In [4]:
# Directory the parquet tables are read from; point this at your own output folder.
INPUT_DIR = "./output/20240824-194226/artifacts"
# LanceDB database location, kept next to the parquet tables.
LANCEDB_URI = INPUT_DIR + "/lancedb"

# Table names (without the ".parquet" suffix) looked up under INPUT_DIR.
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
# NOTE(review): the two names below are not referenced by any cell visible in
# this notebook — confirm whether they are still needed.
COMMUNITIES_REPORT_TABLE = "create_final_communities"
COVARIATE_TABLE = "create_final_covariates"

# Community level passed to the entity and community-report readers.
COMMUNITY_LEVEL = 2

## Read entities

In [5]:
# Two parquet inputs feed read_indexer_entities: the nodes table (community and
# degree data) and the entities table.
entity_df = pd.read_parquet(
    path=f"{INPUT_DIR}/{ENTITY_TABLE}.parquet",
)
entity_embedding_df = pd.read_parquet(
    path=f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet",
)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Vector store for the entity description embeddings, connected to the LanceDB
# database at LANCEDB_URI.
# To connect to a remote db, specify url and port values (per the original
# note — TODO confirm against the vector store API).
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings"
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    vectorstore=description_embedding_store,
    entities=entities,
)

# Report the number of rows in the nodes table, then preview it.
print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 39


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,SOKOBAN,OBJECT,"The ""SOKOBAN"" is an object that serves as the ...","47429b83e588f06c43d4cc792a75379f,5211048d32705...",1,9,0,b45241d70f0e43fca764df95b2b81f77,9,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,CRATE,OBJECT,"The ""CRATE"" is an object within the sokoban ga...","47429b83e588f06c43d4cc792a75379f,5211048d32705...",3,5,1,4119fd06010c494caa07f439b333f4c5,5,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,LOCATION X,LOCATION,Location x is a position in the game where the...,5211048d327058bc6a44c679ae895263,2,2,2,d3835bf3dda84ead99deadbeac5d0d7d,2,,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,LOCATION Y,LOCATION,Location y is a position in the game where the...,5211048d327058bc6a44c679ae895263,2,6,3,077d2820ae1845bcbb1803379a3d1eae,6,,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,LOCATION Z,LOCATION,Location z is a target position in the game wh...,5211048d327058bc6a44c679ae895263,2,4,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,4,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


## Read relationships

In [6]:
# Read the relationships table and convert it with read_indexer_relationships.
relationship_df = pd.read_parquet(
    path=f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet",
)
relationships = read_indexer_relationships(relationship_df)

# Row count of the raw table, followed by a preview of its first rows.
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 68


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,SOKOBAN,CRATE,1.0,The sokoban is the character that interacts wi...,[5211048d327058bc6a44c679ae895263],17ed1d92075643579a712cc6c29e8ddb,0,9,5,14
1,SOKOBAN,MOVELEFT,1.0,The action 'moveLeft' involves the sokoban as ...,[5211048d327058bc6a44c679ae895263],3ce7c210a21b4deebad7cc9308148d86,1,9,3,12
2,SOKOBAN,MOVERIGHT,1.0,The action 'moveRight' involves the sokoban as...,[5211048d327058bc6a44c679ae895263],d64ed762ea924caa95c8d06f072a9a96,2,9,3,12
3,SOKOBAN,MOVEUP,1.0,The action 'moveUp' involves the sokoban as th...,[5211048d327058bc6a44c679ae895263],adf4ee3fbe9b4d0381044838c4f889c8,3,9,3,12
4,SOKOBAN,MOVEDOWN,1.0,The action 'moveDown' involves the sokoban as ...,[5211048d327058bc6a44c679ae895263],32ee140946e5461f9275db664dc541a5,4,9,3,12


## Read community reports

In [8]:
# Read the community reports table; the reader also receives the nodes table
# (entity_df) and the community level.
report_df = pd.read_parquet(
    path=f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet",
)
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

# Row count of the raw table, followed by a preview of its first rows.
print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 4


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,0,# Sokoban Game Mechanics: Actions and Relation...,0,9.0,Sokoban Game Mechanics: Actions and Relationships,The rating reflects the significant impact of ...,The community surrounding the Sokoban game mec...,[{'explanation': 'The actions PUSHLEFT and PUS...,"{\n ""title"": ""Sokoban Game Mechanics: Actio...",8a6f6a40-3b3b-438a-8c3c-a6c1dc25727a
1,1,# Sokoban Game Dynamics: Actions and Spatial R...,0,9.0,Sokoban Game Dynamics: Actions and Spatial Rel...,The rating reflects the significant role of ga...,The community surrounding the Sokoban game is ...,[{'explanation': 'The sokoban character is the...,"{\n ""title"": ""Sokoban Game Dynamics: Action...",304a21a2-5d96-4e9d-b1c9-8d64654eef4c
2,2,# Sokoban Game Dynamics: Actions and Locations...,0,9.0,Sokoban Game Dynamics: Actions and Locations,The rating reflects the significant impact of ...,The community surrounding the Sokoban game is ...,[{'explanation': 'The actions 'moveLeft' and '...,"{\n ""title"": ""Sokoban Game Dynamics: Action...",30bdac31-275d-454a-911c-aad57e7d3390
3,3,# Sokoban Game Dynamics: Crates and Actions\n\...,0,9.5,Sokoban Game Dynamics: Crates and Actions,The rating reflects the high significance of u...,The community surrounding the Sokoban game is ...,[{'explanation': 'The sokoban character is the...,"{\n ""title"": ""Sokoban Game Dynamics: Crates...",82d6d7f8-fb34-4aa4-a47d-58958f43dbc0


## Read text units

In [9]:
# Read the text units table and convert it with read_indexer_text_units.
text_unit_df = pd.read_parquet(
    path=f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet",
)
text_units = read_indexer_text_units(text_unit_df)

# Row count of the raw table, followed by a preview of its first rows.
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 3


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,5211048d327058bc6a44c679ae895263,(define (domain sokoban)\n\t(:requirements :st...,1200,[2e348e3a4352ddebb8cfd4e189d93a4f],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[17ed1d92075643579a712cc6c29e8ddb, 3ce7c210a21..."
1,47429b83e588f06c43d4cc792a75379f,\t(:action pushUp\n\t\t:parameters (?sokoban ?...,1200,[2e348e3a4352ddebb8cfd4e189d93a4f],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[f1c6eed066f24cbdb376b910fce29ed4, 83a6cb03df6..."
2,7e7d1504bd08a8b76fdc1586682a1c4b,l10)\n\t\t (at sokoban2 l16)\n\t\t (at cr...,125,[2e348e3a4352ddebb8cfd4e189d93a4f],"[1943f245ee4243bdbfbd2fd619ae824a, 273daeec8ca...","[fc01e9baa80e417c9206f941bb279407, 7c49f2710e8..."


In [12]:
# The API key and both model names come from the environment; os.getenv yields
# None for any variable that is not set.
api_key = os.getenv("GRAPHRAG_API_KEY")
llm_model = os.getenv("GRAPHRAG_LLM_MODEL")
embedding_model = os.getenv("GRAPHRAG_EMBEDDING_MODEL")

# Chat model handed to the search engine and the question generator.
llm = ChatOpenAI(
    model=llm_model,
    api_key=api_key,
    # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    api_type=OpenaiApiType.OpenAI,
    max_retries=20,
)

# Token encoder shared by the context builder, search engine and question generator.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding client; the same name is passed as both model and deployment_name.
text_embedder = OpenAIEmbedding(
    model=embedding_model,
    deployment_name=embedding_model,
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    max_retries=20,
)

## Create local search context builder

In [13]:
# Combine the loaded entities, relationships, text units and community reports
# with the description-embedding store, embedder and token encoder.
context_builder = LocalSearchMixedContext(
    entities=entities,
    relationships=relationships,
    text_units=text_units,
    community_reports=reports,
    entity_text_embeddings=description_embedding_store,
    # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

## Create local search engine

In [None]:
# Settings passed to the search engine and question generator as
# context_builder_params.
# The sum of text_unit_prop and community_prop should be <= 1; the remaining
# proportion of the context window is dedicated to entities and relationships.
local_context_params = {
    # proportion of context window dedicated to related text units
    "text_unit_prop": 0.5,
    # proportion of context window dedicated to community reports
    "community_prop": 0.1,
    # maximum number of turns to include in the conversation history
    "conversation_history_max_turns": 5,
    # if True, only include user queries in the conversation history
    "conversation_history_user_turns_only": True,
    # number of related entities to retrieve from the entity description embedding store
    "top_k_mapped_entities": 10,
    # controls the number of out-of-network relationships to pull into the context window
    "top_k_relationships": 10,
    # if True, include the entity rank in the entity table in the context window
    # (default entity rank = node degree)
    "include_entity_rank": True,
    # if True, include the relationship weight in the context window
    "include_relationship_weight": True,
    # if True, include the community rank in the context window
    "include_community_rank": False,
    # if True, return a set of dataframes containing all candidate
    # entity/relationship/covariate records that could be relevant; not all of
    # these records will be included in the context window, and the "in_context"
    # column in these dataframes indicates whether a record is included
    "return_candidate_context": False,
    # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # maximum number of tokens to use for the context window; change this based
    # on the token limit of your model (with an 8k limit, a good setting could be 5000)
    "max_tokens": 12_000,
}

# Generation settings passed to the search engine and question generator as llm_params.
llm_params = dict(
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 1000-1500)
    max_tokens=2_000,
    temperature=0.0,
)

In [None]:
# Wire the LLM, the context builder, the token encoder and both parameter dicts
# into the local search engine.
search_engine = LocalSearch(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
    # free form text describing the response type and format, can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    response_type="multiple paragraphs",
)

## Run local search on sample queries

In [None]:
# Run one query through the search engine (awaited at cell top level) and print
# the text of the response.
result = await search_engine.asearch("Tell me about the top theme")
print(result.response)
# 8.9s  (presumably the elapsed time of an earlier run — TODO confirm)

In [None]:
# NOTE(review): the tables previewed earlier in this notebook describe a Sokoban
# domain, so this question may be a leftover from a different dataset — confirm.
question = "Tell me about ADC Technology"
# The cells that follow read result.context_data from this `result`.
result = await search_engine.asearch(question)
print(result.response)

## Inspecting the context data used to generate the response

In [None]:
# First rows of the "entities" entry in the context data of the latest result.
result.context_data["entities"].head()

In [None]:
# First rows of the "relationships" entry in the context data of the latest result.
result.context_data["relationships"].head()

In [None]:
# First rows of the "reports" entry in the context data of the latest result.
result.context_data["reports"].head()

In [None]:
# First rows of the "sources" entry in the context data of the latest result.
result.context_data["sources"].head()

In [None]:
# The "claims" entry is only printed when it is present in the context data.
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

## Question Generation
This function takes a list of user queries and generates the next candidate questions.

In [None]:
# Built from the same LLM, context builder, token encoder and parameter dicts
# that the search engine receives.
question_generator = LocalQuestionGen(
    llm=llm,
    llm_params=llm_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
)

In [None]:
# Previous user questions supplied as history for generating follow-up questions.
# NOTE(review): the second entry mentions "Delta Sigma ADC", while the tables
# previewed earlier in this notebook describe a Sokoban domain — confirm it is intended.
question_history = [
    "Tell me about the top theme",
    "What are the advantages in Delta Sigma ADC?",
]
# Request 5 candidate questions; context_data=None means no context data is passed in.
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)