# Import Libraries

In [2]:
import os

import pandas as pd
import lancedb
import tiktoken
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import LanceDB
from langchain_openai import ChatOpenAI

from query.structured_search.local_search.combine_context import LocalSearchMixedContext
from query.structured_search.local_search.search import LocalSearch
from query.system_prompt_builder.entity_extraction import EntityVectorStoreKey
from vectorstore import store_entity_semantic_embeddings
from query.inputs.loader.indexer_adapters import read_indexer_covariates, read_indexer_entities, read_indexer_relationships, read_indexer_reports, read_indexer_text_units

In [16]:
# Keep a key that is already exported in the environment: assigning "" here
# unconditionally would overwrite a valid key and make every OpenAI call fail.
# When no key is set, the variable still ends up as "" exactly as before, so
# supply the real key through the shell (or fill it in here) before running.
os.environ.setdefault("OPENAI_API_KEY", "")

# Setup Config

In [17]:
# Directory that holds the CSV tables produced by ingestion
INPUT_DIR = "outputs"

# CSV file names inside INPUT_DIR, one per table
ENTITY_TABLE = "node.csv"
ENTITY_EMBEDDING_TABLE = "entity.csv"
RELATIONSHIP_TABLE = "relationship.csv"
COVARIATE_TABLE = "claims.csv"
TEXT_UNIT_TABLE = "text_unit.csv"
COMMUNITY_REPORT_TABLE = "community_report.csv"

# LanceDB database location and the table that stores the entity embeddings
TABLE_NAME = "multimodal_test"
TABLE_PATH = "/home/hungquan/build_kg/lancedb_store"

# Community level handed to the entity and report readers
COMMUNITY_LEVEL = 2

# Options forwarded to the local-search context builder
# (passed to LocalSearch as context_builder_params)
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    # Use EntityVectorStoreKey.TITLE instead if the vectorstore uses the
    # entity title as ids.
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # Adjust to the token limit of your model (for a model with an 8k limit,
    # a good setting could be 5000).
    "max_tokens": 12_000,
}


# Keyword arguments unpacked into the ChatOpenAI constructor
llm_params = {
    # Adjust to the token limit of your model (for a model with an 8k limit,
    # a good setting could be 1000-1500).
    "max_tokens": 2000,
    "temperature": 0.0,
}



# Embedding

In [18]:
# Embedding client shared by the vector store and the context builder
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# LLM Model

In [19]:
# Chat model that writes the final answer; llm_params supplies
# max_tokens and temperature.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    **llm_params,
)

# Token Encoder

In [20]:
# Token counts should be measured with the tokenizer of the chat model in use.
# gpt-4o-mini does not use "cl100k_base", so resolve the encoding from the
# model name (keep this name in sync with the ChatOpenAI model). tiktoken
# raises KeyError for model names it does not know (e.g. an older tiktoken
# release); in that case fall back to the previous encoding.
try:
    token_encoder = tiktoken.encoding_for_model("gpt-4o-mini")
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Load Entity

In [21]:
entity_df = pd.read_csv(f"{INPUT_DIR}/{ENTITY_TABLE}")
entity_embedding_df = pd.read_csv(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}")

# Empty CSV cells are read back as NaN; turn missing descriptions into "".
entity_embedding_df["description"] = entity_embedding_df["description"].fillna("")

# Turn the comma-separated id string into a list. A missing cell is NaN
# (a float, which has no .split), so map any non-string value to an empty
# list instead of raising AttributeError.
entity_embedding_df["text_unit_ids"] = entity_embedding_df["text_unit_ids"].apply(
    lambda ids: ids.split(",") if isinstance(ids, str) else []
)

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Load Relationship

In [22]:
relationship_df = pd.read_csv(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}")

# Turn the comma-separated id string into a list. A missing cell is NaN
# (a float, which has no .split), so map any non-string value to an empty
# list instead of raising AttributeError.
relationship_df["text_unit_ids"] = relationship_df["text_unit_ids"].apply(
    lambda ids: ids.split(",") if isinstance(ids, str) else []
)

relationships = read_indexer_relationships(relationship_df)

# Load Covariate

In [23]:
# Read the claims table from INPUT_DIR
covariate_df = pd.read_csv(f"{INPUT_DIR}/{COVARIATE_TABLE}")

# Convert the table with the indexer adapter and file the result under the
# "claims" key; this mapping is what the context builder receives as
# `covariates`.
covariates = {"claims": read_indexer_covariates(covariate_df)}
claims = covariates["claims"]

# Load CommunityReport

In [24]:
# Read the community report table, then build the report objects from it
# together with the entity table and the configured community level.
report_df = pd.read_csv(
    f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}"
)
reports = read_indexer_reports(
    report_df,
    entity_df,
    COMMUNITY_LEVEL,
)


# Load TextUnit

In [25]:
# Read the text unit table and convert it with the indexer adapter
text_unit_df = pd.read_csv(
    f"{INPUT_DIR}/{TEXT_UNIT_TABLE}"
)
text_units = read_indexer_text_units(text_unit_df)

# Load VectorStore

In [26]:
# Connect through TABLE_PATH rather than a second hard-coded copy of the path,
# so the existence check and the store created below always target the same
# database.
connection = lancedb.connect(TABLE_PATH)

if TABLE_NAME not in connection.table_names():
    # Table missing: create the store, then hand it to
    # store_entity_semantic_embeddings together with the loaded entities
    # (presumably this embeds and inserts them; verify in the vectorstore module).
    db = LanceDB(table_name=TABLE_NAME, embedding=embeddings, uri=TABLE_PATH)
    db = store_entity_semantic_embeddings(entities=entities, vectorstore=db)
else:
    # Table already present: open it through the existing connection.
    db = LanceDB(connection=connection, embedding=embeddings, table_name=TABLE_NAME)



# Local Search Context Builder

In [27]:
# Combine every loaded table with the entity vector store and the embedding
# client into the context builder used by local search.
context_builder = LocalSearchMixedContext(
    entities=entities,
    relationships=relationships,
    text_units=text_units,
    community_reports=reports,
    # Set covariates to None when they are not used.
    covariates=covariates,
    entity_text_embeddings=db,
    text_embedder=embeddings,
)

# Search Engine

In [28]:
# Wire the chat model, context builder and token encoder into the search engine.
search_engine = LocalSearch(
    llm=llm,
    token_encoder=token_encoder,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    # Free-form text describing the response type and format; can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs,
    # multiple-page report.
    response_type="multiple paragraphs",
)

# Run local search query

In [29]:
# Run one local-search query and print the generated answer.
# Top-level `await` only works where the interpreter allows it (e.g. a
# Jupyter/IPython cell); in a plain script wrap the call in asyncio.run().
# NOTE(review): the output recorded for this cell is a ValidationError saying
# Document.page_content is None — presumably a row returned by the vector
# store has no text; verify what was written to the LanceDB table.
result = await search_engine.asearch("Tell me about Agent Mercer")
print(result.response)

ValidationError: 1 validation error for Document
page_content
  none is not an allowed value (type=type_error.none.not_allowed)