In [1]:
from IPython.display import display, Markdown

def add_section_title(title, color="black", size="24px"):
    """Render ``title`` as a styled ``<h2>`` heading in the cell output.

    Args:
        title: Heading text. It is inserted into the markup as-is (not escaped).
        color: CSS colour used for the heading text.
        size: CSS font size used for the heading text.
    """
    inline_style = f"color:{color}; font-size:{size};"
    heading_markup = f"<h2 style='{inline_style}'>{title}</h2>"
    display(Markdown(heading_markup))


In [2]:
add_section_title("Step 1: Setup postgres connection", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 1: Setup postgres connection</h2>

In [3]:
import psycopg2
import os

def get_connection():
    """Open a new psycopg2 connection configured from environment variables.

    When ``USE_LOCAL_AGE`` is exactly ``"true"`` the ``AGE_*`` variables are
    read; in every other case the ``POSTGRES_*`` variables are read. A variable
    that is not set is passed to ``psycopg2.connect`` as ``None``.

    Returns:
        Whatever ``psycopg2.connect`` returns. Closing it is up to the caller.
    """
    use_local_age = os.getenv("USE_LOCAL_AGE") == "true"

    # connect() keyword -> (variable for local AGE, variable for Postgres)
    env_names = {
        "host": ("AGE_HOST", "POSTGRES_HOST"),
        "port": ("AGE_PORT", "POSTGRES_PORT"),
        "user": ("AGE_USER", "POSTGRES_USER"),
        "password": ("AGE_PASSWORD", "POSTGRES_PASSWORD"),
        "dbname": ("AGE_DB", "POSTGRES_DB"),
    }
    connect_kwargs = {
        keyword: os.getenv(age_name if use_local_age else postgres_name)
        for keyword, (age_name, postgres_name) in env_names.items()
    }
    return psycopg2.connect(**connect_kwargs)
"""     
conn = get_connection()
cur = conn.cursor()
cur.close()
conn.close()
"""

'     \nconn = get_connection()\ncur = conn.cursor()\ncur.close()\nconn.close()\n'

In [51]:
add_section_title("Step 2: Check Nodes, Relationship count", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 2: Check Nodes, Relationship count</h2>

In [2]:
# Count nodes per label set in the 'graphRAG' graph.
node_count_sql = """
    SELECT * FROM cypher('graphRAG', $$
        MATCH (n) RETURN labels(n), count(n)
    $$) AS (labels agtype, count agtype);
"""

conn = get_connection()

# The cursor is used as a context manager so it is closed once the rows are printed.
# `conn` itself is deliberately left open: other cells may still reference it.
with conn.cursor() as cur:
    # Load the AGE extension and put ag_catalog on the search path before calling cypher().
    cur.execute("LOAD 'age';")
    cur.execute('SET search_path = ag_catalog, "$user", public;')
    cur.execute(node_count_sql)

    # Print results
    for row in cur.fetchall():
        print(row)


('["Entity"]', '449')
('["Document"]', '10')


In [4]:
# Count relationships per type in the 'graphRAG' graph.
relationship_count_sql = """
    SELECT * FROM cypher('graphRAG', $$
        MATCH ()-[r]->() RETURN type(r), count(r)
    $$) AS (relationship agtype, count agtype);
"""

# Open a dedicated connection so this cell does not depend on a `conn`
# left behind by an earlier cell, and always release it afterwards.
conn = get_connection()
try:
    with conn.cursor() as cur:
        # Load the AGE extension and put ag_catalog on the search path before calling cypher().
        cur.execute("LOAD 'age';")
        cur.execute('SET search_path = ag_catalog, "$user", public;')
        cur.execute(relationship_count_sql)

        # Print results
        for row in cur.fetchall():
            print(row)
finally:
    conn.close()


('"RELATED_TO"', '690')


In [63]:
add_section_title("Step 3: Show entity, relationship properties using Cypher query", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 3: Show entity, relationship properties using Cypher query</h2>

In [5]:
# List Document Node Properties
document_keys_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH (d:Document)
    RETURN keys(d) AS document_properties
    LIMIT 1
$$) AS (document_properties agtype);
"""

# List Entity Node Properties
entity_keys_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH (e:Entity)
    RETURN keys(e) AS entity_properties
    LIMIT 1
$$) AS (entity_properties agtype);
"""

# List Relationship Properties
relationship_keys_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH ()-[r:RELATED_TO]->()
    RETURN DISTINCT keys(r) AS relationship_keys
$$) AS (relationship_keys agtype);
"""

# List a few relationships, see the properties
relationship_sample_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH ()-[r:RELATED_TO]->()
    WHERE exists(r.source) OR exists(r.target)
    RETURN r.source AS source, r.target AS target, r.description AS description
    LIMIT 5
$$) AS (source text, target text, description text);
"""

# The queries run in this order on one cursor; the rows of each are printed
# before the next query starts, so the output order matches the list order.
inspection_queries = (
    document_keys_sql,
    entity_keys_sql,
    relationship_keys_sql,
    relationship_sample_sql,
)

conn = get_connection()
try:
    with conn.cursor() as cur:
        # Load the AGE extension and put ag_catalog on the search path before calling cypher().
        cur.execute("LOAD 'age';")
        cur.execute('SET search_path = ag_catalog, "$user", public;')

        for inspection_sql in inspection_queries:
            cur.execute(inspection_sql)
            for row in cur.fetchall():
                print(row)
finally:
    # Release the connection even if one of the queries fails.
    conn.close()


('["id", "name", "text", "title", "metadata", "creation_date", "text_unit_ids", "human_readable_id"]',)
('["x", "y", "id", "name", "type", "title", "degree", "frequency", "description", "text_unit_ids", "human_readable_id"]',)
('["id", "source", "target", "weight", "description", "text_unit_ids", "combined_degree", "human_readable_id"]',)
('KEVIN SCOTT', 'DIO GONZALEZ', "Kevin Scott and Dio Gonzalez participated in a conversation on the Behind the Tech podcast, where they explored a variety of topics, including work/life balance, cycling, and the potential of mixed reality. Their discussion delved into the applications and possibilities of mixed reality, as well as virtual reality technologies. Both individuals shared their personal experiences of being drawn to computers through their visual and creative aspects, highlighting a shared connection to technology's artistic and innovative dimensions.")
('DIO GONZALEZ', 'KEVIN SCOTT', 'Dio Gonzalez and Kevin Scott have engaged in a series 

In [6]:
# First 20 Entity -> Entity relationships with the endpoint titles and edge properties.
relationship_listing_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH (a:Entity)-[r]->(b:Entity)
    RETURN 
        r.human_readable_id AS id,
        a.title AS source,
        b.title AS target,
        r.description AS description,
        r.weight AS weight,
        r.combined_degree AS combined_degree
    LIMIT 20
$$) AS (
    id agtype, source agtype, target agtype,
    description agtype, weight agtype, combined_degree agtype
);

"""

conn = get_connection()
try:
    with conn.cursor() as cur:
        # Load the AGE extension and put ag_catalog on the search path before calling cypher().
        cur.execute("LOAD 'age';")
        cur.execute('SET search_path = ag_catalog, "$user", public;')
        cur.execute(relationship_listing_sql)
        for row in cur.fetchall():
            print(row)
finally:
    # Release the connection even if the query fails.
    conn.close()


('"0"', '"DIO GONZALEZ"', '"MICROSOFT"', '"Dio Gonzalez has been a part of Microsoft since December 2017, where he joined as a principal software architect in Central Engineering. He is inspired by Microsoft\'s advancements in mixed reality innovations, reflecting his engagement with the company\'s cutting-edge technologies."', '"18.0"', '"53"')
('"1"', '"DIO GONZALEZ"', '"PURDUE UNIVERSITY"', '"Dio Gonzalez is an alumnus of Purdue University, where he pursued his master\'s degree with a focus on virtual reality. During his time at Purdue, he played a pivotal role in establishing the university\'s first virtual reality facility, contributing significantly to the advancement of immersive technology on campus."', '"17.0"', '"29"')
('"2"', '"DIO GONZALEZ"', '"PIXAR"', '"Dio Gonzalez is a professional who worked at Pixar earlier in her career as a character animation engineer. During her tenure at Pixar, she contributed significantly to the fields of computer graphics and animation, includ

In [7]:
# First 20 Entity nodes with the properties selected in the RETURN clause.
entity_listing_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH (n:Entity)
    RETURN 
        n.human_readable_id AS id,
        n.title AS title,
        n.type AS type,
        n.description AS description,
        n.frequency AS frequency,
        n.degree AS degree,
        n.x AS x,
        n.y AS y
    LIMIT 20
$$) AS (
    id agtype, title agtype, type agtype, description agtype,
    frequency agtype, degree agtype, x agtype, y agtype
);


"""

conn = get_connection()
try:
    with conn.cursor() as cur:
        # Load the AGE extension and put ag_catalog on the search path before calling cypher().
        cur.execute("LOAD 'age';")
        cur.execute('SET search_path = ag_catalog, "$user", public;')
        cur.execute(entity_listing_sql)
        for row in cur.fetchall():
            print(row)
finally:
    # Release the connection even if the query fails.
    conn.close()


('"0"', '"DIO GONZALEZ"', '"PERSON"', '"Dio Gonzalez is a Latina, immigrant, and non-native English speaker originally from Venezuela, who has overcome significant challenges, including poor standardized test results and bullying, to pursue her passion for programming, computer engineering, and technology. She is a minority in the technology field and a passionate advocate for increasing diversity in engineering and technology, particularly for women and underrepresented communities. Dio actively promotes education, mentorship, and role models to support minorities in technology.\\n\\nProfessionally, Dio Gonzalez is a highly accomplished computer engineer, character animation expert, and researcher specializing in virtual and mixed reality technologies, computer graphics, and animation. She has worked in academia, teaching games programming in Singapore, and contributed to the establishment of Purdue University\'s first virtual reality facility. Her industry experience includes roles a

In [53]:
add_section_title("Step 4: Multi-hop Cypher query", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 4: Multi-hop Cypher query</h2>

In [8]:
"""
A multi-hop Cypher query

Original .txt input:
“Next time on Behind the Tech, we'll hear from Alice Steinglass who heads Code.org. We'll hear about how she's working to make sure every kid has the opportunity to take computer science classes in school.”

multi-hop query example:

Kevin mentioned Alice,
Alice is connected to Code.org,
And the relationship description gives a rich summary of what Code.org does and Alice’s role in it.
"""

# Two-hop pattern: KEVIN SCOTT -> ALICE STEINGLASS -> any Entity, keeping only
# second hops whose relationship description contains 'code.org'.
multi_hop_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH (kevin:Entity {title: 'KEVIN SCOTT'})-[:RELATED_TO]->(alice:Entity {title: 'ALICE STEINGLASS'})
          -[r:RELATED_TO]->(org:Entity)
    WHERE toLower(r.description) CONTAINS 'code.org'
    RETURN kevin.title AS source,
           alice.title AS intermediate,
           org.title AS target,
           r.description AS relationship_description
$$) AS (source text, intermediate text, target text, relationship_description text);
"""

conn = get_connection()
try:
    with conn.cursor() as cur:
        # Set search path and run a Cypher query
        cur.execute("LOAD 'age';")
        cur.execute('SET search_path = ag_catalog, "$user", public;')
        cur.execute(multi_hop_sql)

        for row in cur.fetchall():
            print(row)
finally:
    # Release the connection even if the query fails.
    conn.close()



('KEVIN SCOTT', 'ALICE STEINGLASS', 'CODE.ORG', "Alice Steinglass is the President of Code.org, an organization dedicated to promoting and expanding computer science education globally. As the head of Code.org, she leads its mission to make computer science education accessible to children and students worldwide. Steinglass has expressed admiration for Code.org's efforts to teach computer science to young learners and actively works on initiatives to ensure that this education reaches a broader audience. Her leadership is central to Code.org's goal of advancing computer science education on a global scale.")
('KEVIN SCOTT', 'ALICE STEINGLASS', 'KEVIN SCOTT', "Alice Steinglass and Kevin Scott are actively engaged in discussions surrounding gender stereotypes and strategies to improve gender balance in computer science education. Kevin Scott interviewed Alice Steinglass on the Behind the Tech podcast, where they explored her career, her journey into technology, and her impactful work at 

In [9]:
"""
A multi-hop Cypher query

Original .txt input:
“Next time on Behind the Tech, we'll hear from Alice Steinglass who heads Code.org. We'll hear about how she's working to make sure every kid has the opportunity to take computer science classes in school.”

Run a query:
"Who did KEVIN SCOTT mention that leads a company helping kids learn computer science?”
"""
# Two-hop pattern: KEVIN SCOTT -> any Entity -> any Entity, keeping second hops
# whose relationship description contains one of the three keywords below.
mentioned_person_sql = """
SELECT * FROM cypher('graphRAG', $$
    MATCH (kevin:Entity {title: 'KEVIN SCOTT'})-[:RELATED_TO]->(person:Entity)
          -[r:RELATED_TO]->(company:Entity)
    WHERE toLower(r.description) CONTAINS 'computer science'
       OR toLower(r.description) CONTAINS 'education'
       OR toLower(r.description) CONTAINS 'kids'
    RETURN person.title AS mentioned_person,
           company.title AS organization,
           r.description AS relationship_description
$$) AS (mentioned_person text, organization text, relationship_description text);
"""

conn = get_connection()
try:
    with conn.cursor() as cur:
        # Set search path and run a Cypher query
        cur.execute("LOAD 'age';")
        cur.execute('SET search_path = ag_catalog, "$user", public;')
        cur.execute(mentioned_person_sql)

        for row in cur.fetchall():
            print(row)
finally:
    # Release the connection even if the query fails.
    conn.close()
"""    
This query will return:

The person Kevin mentioned.
The company they’re connected to.
A description of that connection that includes educational or youth-oriented language.
"""


('ALICE STEINGLASS', 'KEVIN SCOTT', "Alice Steinglass and Kevin Scott are actively engaged in discussions surrounding gender stereotypes and strategies to improve gender balance in computer science education. Kevin Scott interviewed Alice Steinglass on the Behind the Tech podcast, where they explored her career, her journey into technology, and her impactful work at Code.org. The conversation highlighted Alice Steinglass's contributions to advancing computer science education and her efforts to address gender disparities in the field.")
('DANIELLE FEINBERG', 'KEVIN SCOTT', "Kevin Scott interviewed Danielle Feinberg on the *Behind the Tech* podcast, where they discussed her journey in programming and computer science, as well as her experiences studying the field and overcoming challenges. The conversation delved into Danielle Feinberg's early experiences with art and programming, highlighting how these passions shaped her career. They also explored her technical and creative contributi

'    \nThis query will return:\n\nThe person Kevin mentioned.\nThe company they’re connected to.\nA description of that connection that includes educational or youth-oriented language.\n'

In [55]:
add_section_title("Step 5: graphRAG output checking", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 5: graphRAG output checking</h2>

In [11]:
import pandas as pd
import xml.etree.ElementTree as ET

# graph.graphml: number of <node> elements in the GraphML namespace
graphml_path = "/app/graphrag-folder/output/graph.graphml"
tree = ET.parse(graphml_path)
root = tree.getroot()
graphml_node_xpath = ".//{http://graphml.graphdrawing.org/xmlns}node"
graphml_nodes = len(root.findall(graphml_node_xpath))
print("Number of nodes in graph.graphml:", graphml_nodes)

# text_units.parquet: number of rows
text_units_path = "/app/graphrag-folder/output/text_units.parquet"
text_units_df = pd.read_parquet(text_units_path)
print("Number of rows in text_units.parquet:", len(text_units_df))

# entities.parquet: number of rows
entities_path = "/app/graphrag-folder/output/entities.parquet"
entities_df = pd.read_parquet(entities_path)
print("Number of rows in entities.parquet:", len(entities_df))


Number of nodes in graph.graphml: 448
Number of rows in text_units.parquet: 119
Number of rows in entities.parquet: 449


In [12]:
import pandas as pd

# Parquet tables to check in the GraphRAG output folder.
files = [
    "communities.parquet",
    "community_reports.parquet",
    "documents.parquet",
    "embeddings.community.full_content.parquet",
    "embeddings.entity.description.parquet",
    "embeddings.text_unit.text.parquet",
    "relationships.parquet",
    "entities.parquet",
]

# Report the row count of each table; a file that cannot be read is reported
# and skipped so the remaining files are still checked.
for file in files:
    parquet_path = f"/app/graphrag-folder/output/{file}"
    try:
        df = pd.read_parquet(parquet_path)
    except Exception as e:
        print(f"Error reading {file}: {e}")
    else:
        print(f"{file}: {len(df)} rows")


communities.parquet: 102 rows
community_reports.parquet: 102 rows
documents.parquet: 10 rows
embeddings.community.full_content.parquet: 102 rows
embeddings.entity.description.parquet: 449 rows
embeddings.text_unit.text.parquet: 119 rows
relationships.parquet: 690 rows
entities.parquet: 449 rows


In [13]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

In [14]:
# Folder the parquet tables are read from; LANCEDB_URI is derived from it.
INPUT_DIR = "/app/graphrag-folder/output/"
# INPUT_DIR ends with "/", so strip it before appending to avoid ".../output//lancedb".
LANCEDB_URI = f"{INPUT_DIR.rstrip('/')}/lancedb"

# Base names of the parquet tables (".parquet" is appended where they are read).
COMMUNITY_REPORT_TABLE = "community_reports"
ENTITY_TABLE = "entities"
COMMUNITY_TABLE = "communities"
RELATIONSHIP_TABLE = "relationships"
TEXT_UNIT_TABLE = "text_units"
# Community level passed to read_indexer_entities / read_indexer_reports.
COMMUNITY_LEVEL = 2


In [15]:
# Read the entity and community tables written by the indexer.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")

# Build the entity objects consumed by the search context builder,
# using COMMUNITY_LEVEL as the community level argument.
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Connect to the LanceDB store at LANCEDB_URI and use its
# "default-entity-description" collection.
# NOTE(review): presumably this collection holds the entity-description
# embeddings written by the indexer -- confirm against the indexing config.
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)

# Print the row count, then show the first rows as the cell output.
print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 449


Unnamed: 0,id,human_readable_id,title,type,description,text_unit_ids,frequency,degree,x,y
0,97c9d929-b6e3-4aa2-880b-4e945785ea11,0,DIO GONZALEZ,PERSON,"Dio Gonzalez is a Latina, immigrant, and non-n...",[5c537c55a260fcdf05e097fa59681363520e45ab59f39...,11,28,0.0,0.0
1,3bcbf067-9f32-4a89-8200-07ef7b8d357d,1,KEVIN SCOTT,PERSON,Kevin Scott is the Chief Technology Officer fo...,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...,112,71,0.0,0.0
2,52169f80-bc06-4198-b521-2c563c0e95fd,2,CHRISTINA WARREN,PERSON,Christina Warren is a Senior Cloud Developer A...,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...,27,10,0.0,0.0
3,4d488bfa-9c85-485b-992f-0b9b6de3005c,3,MICROSOFT,ORGANIZATION,Microsoft is a leading global technology compa...,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...,26,25,0.0,0.0
4,bc85ef71-c28d-4afc-9570-57497865cb39,4,PURDUE UNIVERSITY,ORGANIZATION,Purdue University is a prominent educational i...,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...,2,1,0.0,0.0


In [16]:
# Read the relationship table and convert it to the relationship objects
# that are later passed to LocalSearchMixedContext.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

# Print the row count, then show the first rows as the cell output.
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 690


Unnamed: 0,id,human_readable_id,source,target,description,weight,combined_degree,text_unit_ids
0,bcde0186-be20-4301-a2b1-e16482548d75,0,DIO GONZALEZ,MICROSOFT,Dio Gonzalez has been a part of Microsoft sinc...,18.0,53,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...
1,b66dad79-9d45-45e7-b3cd-1937b6e990cd,1,DIO GONZALEZ,PURDUE UNIVERSITY,Dio Gonzalez is an alumnus of Purdue Universit...,17.0,29,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...
2,4b7ccd4b-a5c7-4cf6-9d5f-bdeb06715d51,2,DIO GONZALEZ,PIXAR,Dio Gonzalez is a professional who worked at P...,22.0,45,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...
3,4b8bd3b9-1769-437b-92d5-6a8af525bedf,3,DIO GONZALEZ,DREAMWORKS ANIMATION,Dio Gonzalez is a professional who previously ...,16.0,31,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...
4,f4481ea6-326d-4889-aa13-80365a59939e,4,DIO GONZALEZ,UNITY LABS,Dio Gonzalez is a professional who previously ...,17.0,30,[5c537c55a260fcdf05e097fa59681363520e45ab59f39...


In [17]:
# Read the community report table and build report objects from it.
# Relies on `community_df`, which is loaded in the entity-loading cell.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)

# Print the row count, then show the first rows as the cell output.
print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 102


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,f5ae82bff5f9455dbffa2fc43526201f,98,98,3,75,[],Danielle Feinberg and Pixar Studios Community,This community centers around Danielle Feinber...,# Danielle Feinberg and Pixar Studios Communit...,8.5,The impact severity rating is high due to Dani...,[{'explanation': 'Danielle Feinberg serves as ...,"{\n ""title"": ""Danielle Feinberg and Pixar S...",2025-05-27,8
1,3d93506838d84ff499d250ab7b052516,99,99,3,75,[],Danielle Feinberg's Academic Journey at Harvard,This community centers around Danielle Feinber...,# Danielle Feinberg's Academic Journey at Harv...,7.5,The impact severity rating reflects the import...,[{'explanation': 'Linear Algebra was a pivotal...,"{\n ""title"": ""Danielle Feinberg's Academic ...",2025-05-27,2
2,63a71426e42c4254a09d3105d7fd6545,100,100,3,92,[],Turbo Pascal and Anders Hejlsberg's Pascal Eco...,"The community centers around Turbo Pascal, a r...",# Turbo Pascal and Anders Hejlsberg's Pascal E...,8.5,The impact severity rating is high due to Turb...,"[{'explanation': 'Turbo Pascal, developed by A...","{\n ""title"": ""Turbo Pascal and Anders Hejls...",2025-05-27,4
3,b66a6cf6bf324aedbb075606fcb3ab49,101,101,3,92,[],Anders Hejlsberg and Programming Language Inno...,"The community centers around Anders Hejlsberg,...",# Anders Hejlsberg and Programming Language In...,8.5,The impact severity rating is high due to Ande...,[{'explanation': 'Anders Hejlsberg is a pivota...,"{\n ""title"": ""Anders Hejlsberg and Programm...",2025-05-27,10
4,33be08d16f4a4a59b5770e6629550ce3,72,72,2,13,[],Pixar Animated Films: Finding Nemo and A Bug's...,The community centers around two key Pixar ani...,# Pixar Animated Films: Finding Nemo and A Bug...,7.5,The impact severity rating reflects the signif...,[{'explanation': 'Finding Nemo is a Pixar anim...,"{\n ""title"": ""Pixar Animated Films: Finding...",2025-05-27,2


In [18]:
# Read the text unit table and convert it to the text unit objects
# that are later passed to LocalSearchMixedContext.
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

# Print the row count, then show the first rows as the cell output.
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 119


Unnamed: 0,id,human_readable_id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,5c537c55a260fcdf05e097fa59681363520e45ab59f392...,1,DIO GONZALEZ: (VOICEOVER) And so they made us ...,1200,[25590c2303379ea1ba9fbcab0dc8664f2f943864091ec...,"[97c9d929-b6e3-4aa2-880b-4e945785ea11, 3bcbf06...","[bcde0186-be20-4301-a2b1-e16482548d75, b66dad7...",[]
1,678f6d9f75e36a229f89e8d1b4bc035fd5acc38d0d8099...,2,"they explained us how to do it. Like, you fi...",1200,[25590c2303379ea1ba9fbcab0dc8664f2f943864091ec...,"[97c9d929-b6e3-4aa2-880b-4e945785ea11, 3bcbf06...","[b66dad79-9d45-45e7-b3cd-1937b6e990cd, ef3036c...",[]
2,2b3ea036c5954b0fa0198abe95da83efedf0d689a8b6a6...,3,"thesis. At that time, there was no Oculus, r...",1200,[25590c2303379ea1ba9fbcab0dc8664f2f943864091ec...,"[97c9d929-b6e3-4aa2-880b-4e945785ea11, 3bcbf06...","[4b7ccd4b-a5c7-4cf6-9d5f-bdeb06715d51, ef3036c...",[]
3,e2b1c2c2daafc008f6cf43844a96b7fc7f5ed1eb3920b5...,4,"hey, grand mommy. (Laughter.) \n\nKEVIN SCOT...",1200,[25590c2303379ea1ba9fbcab0dc8664f2f943864091ec...,"[97c9d929-b6e3-4aa2-880b-4e945785ea11, 3bcbf06...","[4b7ccd4b-a5c7-4cf6-9d5f-bdeb06715d51, 4b8bd3b...",[]
4,d07370b5bb40e78843e93c1aacb8520e75138bd0ab9fed...,5,"the engineer and I know how it was done, like...",1200,[25590c2303379ea1ba9fbcab0dc8664f2f943864091ec...,"[97c9d929-b6e3-4aa2-880b-4e945785ea11, 3bcbf06...","[f4481ea6-326d-4889-aa13-80365a59939e, f0faa01...",[]


In [56]:
add_section_title("Step 6: graphRAG LocalSearch", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 6: graphRAG LocalSearch</h2>

In [19]:
import os
from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager

# Optional: load .env if not already loaded
from dotenv import load_dotenv
load_dotenv()  # only needed if you're running outside Docker or .env isn't auto-loaded

# Read from environment. The API versions are stripped of surrounding double
# quotes in case the values were stored quoted.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
llm_api_version = os.environ["GRAPHRAG_LLM_API_VERSION"].strip('"')
llm_deployment = os.environ["GRAPHRAG_LLM_DEPLOYMENT"]

embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
embedding_api_version = os.environ["GRAPHRAG_EMBEDDING_API_VERSION"].strip('"')
embedding_deployment = os.environ["GRAPHRAG_EMBEDDING_DEPLOYMENT"]

# Shared AOAI base. Prefer GRAPHRAG_API_BASE (the variable the GlobalSearch
# setup reads) so the endpoint is configurable; the previous hard-coded
# endpoint is kept as the fallback when the variable is not set.
api_base = os.environ.get(
    "GRAPHRAG_API_BASE", "https://graphrag-eastus2.openai.azure.com"
).strip('"')

# Chat model config
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.AzureOpenAIChat,
    model=llm_model,
    api_base=api_base,
    api_version=llm_api_version,
    deployment_name=llm_deployment,
    max_retries=5,
)

# Tokenizer looked up from the chat model name; used by the context builder
# and the search engine.
token_encoder = tiktoken.encoding_for_model(llm_model)

# Embedding model config
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.AzureOpenAIEmbedding,
    model=embedding_model,
    api_base=api_base,
    api_version=embedding_api_version,
    deployment_name=embedding_deployment,
    max_retries=5,
)

# Initialize models
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=chat_config.type,
    config=chat_config,
)

text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=embedding_config.type,
    config=embedding_config,
)


In [20]:
# Assemble the LocalSearch context builder from the objects loaded in the
# previous cells (reports, text units, entities, relationships), the entity
# description vector store, the embedding model and the tokenizer.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    covariates=None,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [21]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


# Baseline settings for the context builder: half of the context window for
# text units, a tenth for community reports.
local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

# Parameters passed to LocalSearch as `model_params`.
model_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}

In [22]:
# Build the LocalSearch engine from the chat model, the context builder and
# the two parameter dictionaries defined in the previous cell.
search_engine = LocalSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
    model_params=model_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [3]:
add_section_title("Note: include_community_rank=False, community_prop=0.1", color="blue", size="14px")

<h2 style='color:blue; font-size:14px;'>Note: include_community_rank=False, community_prop=0.1</h2>

In [24]:
question = "Who is Alice Seinglass?"
result = await search_engine.search(question)
print(result.response)



### Overview of Alice Steinglass

Alice Steinglass is a prominent computer scientist, educator, and advocate for computer science education. Her career spans significant contributions to technology, entrepreneurship, and education. She is currently the President of **Code.org**, a global organization dedicated to expanding access to computer science education for K-12 students. Under her leadership, Code.org has developed curriculum, tools, and software for computer science education, provided professional development for teachers, and launched initiatives like the **Hour of Code**, which has reached tens of millions of students in over 180 countries [Data: Entities (88); Relationships (142, 154, 678)].

### Career Highlights

Alice's journey into technology began atypically, as she did not initially envision herself in the field. She majored in Computer Science in college and started her career at **Microsoft** in 2001, where she contributed to the development of the first **Xbox** an

In [25]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,88,ALICE STEINGLASS,Alice Steinglass is a prominent computer scien...,19,True
1,428,SETH,"Seth is Alice Steinglass's younger brother, wh...",1,True
2,430,DYNAMICFEEDBACK.COM,Dynamicfeedback.com was a startup co-founded b...,1,True
3,434,FIRST XBOX LAUNCH,The launch of the first version of Xbox in 200...,2,True
4,435,FIRST XBOX LIVE LAUNCH,"The launch of the first version of Xbox Live, ...",2,True


In [26]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,links,in_context
0,645,ALICE STEINGLASS,KEVIN SCOTT,Alice Steinglass and Kevin Scott are actively ...,29.0,1,True
1,140,KEVIN SCOTT,ALICE STEINGLASS,Kevin Scott and Alice Steinglass are actively ...,41.0,1,True
2,651,ALICE STEINGLASS,MICROSOFT,Alice Steinglass has a longstanding connection...,21.0,1,True
3,142,ALICE STEINGLASS,CODE.ORG,"Alice Steinglass is the President of Code.org,...",51.0,1,True
4,154,CODE.ORG,ALICE STEINGLASS,"Alice Steinglass is the president of Code.org,...",17.0,1,True


In [27]:
question = "Who did KEVIN SCOTT mention that leads a company helping kids learn computer science?"
result = await search_engine.search(question)
print(result.response)



Kevin Scott mentioned **Alice Steinglass**, who is the President of **Code.org**, a global nonprofit organization dedicated to expanding access to computer science education for K-12 students. Under her leadership, Code.org develops curriculum, tools, and software for computer science education, provides professional development for teachers, and runs initiatives like the **Hour of Code**, which has reached tens of millions of students in over 180 countries [Data: Entities (88, 89); Sources (118)].

Alice Steinglass is a passionate advocate for diversity in the tech workforce and emphasizes the importance of early computer science education, particularly in elementary schools, as a means to address gender stereotypes and support diversity in STEM fields. Kevin Scott highlighted her atypical journey into technology and her impactful work during their conversation on the *Behind the Tech* podcast [Data: Entities (88); Sources (118)].


In [4]:
add_section_title("Note: include_community_rank=True, community_prop=0.8", color="blue", size="14px")

<h2 style='color:blue; font-size:14px;'>Note: include_community_rank=True, community_prop=0.8</h2>

In [29]:
# Second parameter set: shifts the context window towards community reports.
# NOTE(review): text_unit_prop + community_prop = 1.0 here, so by the parameter
# notes above the baseline settings no share of the context window would be
# left for entities and relationships -- confirm this is intended.
local_context_params = {
    "text_unit_prop": 0.2,
    "community_prop": 0.8,   # increased
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 15,  # Increased
    "top_k_relationships": 15,  # Increased
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": True,    # updated to True
    "return_candidate_context": True,  # updated to True
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

# Same values as the baseline `model_params`.
model_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}

In [30]:
# Rebuild the LocalSearch engine so it picks up the `local_context_params`
# and `model_params` redefined in the previous cell.
search_engine = LocalSearch(
    model=chat_model,
    context_builder=context_builder,
    token_encoder=token_encoder,
    model_params=model_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [31]:
question = "Who did KEVIN SCOTT mention that leads a company helping kids learn computer science?"
result = await search_engine.search(question)
print(result.response)

Kevin Scott mentioned **Alice Steinglass**, who leads **Code.org**, a global nonprofit organization dedicated to making computer science education accessible to students worldwide. As the president of Code.org, Alice Steinglass plays a critical role in advancing the organization's mission by developing curriculum, advocating for diversity in STEM fields, and forming partnerships with schools and companies to integrate computer science into K-12 education [Data: Reports (2, 20, 21)].

Code.org is particularly known for its flagship initiative, the **Hour of Code**, which introduces students to computer science through engaging, one-hour coding activities. This initiative has reached millions of students globally and has been supported by high-profile figures such as Steph Curry, Barack Obama, and Justin Trudeau, further amplifying its impact [Data: Reports (21)]. Alice Steinglass's leadership reflects her commitment to fostering inclusivity and innovation in technology education.


In [5]:
add_section_title("Note: changed the word 'leads' to 'related to'", color="blue", size="14px")

<h2 style='color:blue; font-size:14px;'>Note: changed the word 'leads' to 'related to'</h2>

In [33]:
question = "Who did KEVIN SCOTT mention that is related to computer science, education, or kids?"
result = await search_engine.search(question)
print(result.response)

Kevin Scott mentioned several individuals and initiatives related to computer science, education, and children. Below is a detailed breakdown:

---

### **Alice Steinglass and Code.org**
Alice Steinglass, the president of Code.org, is a prominent figure in computer science education. Code.org is a nonprofit organization dedicated to making computer science accessible to K-12 students. It runs initiatives like the Hour of Code, which has reached millions of children worldwide, and develops curriculum and tools to promote diversity in STEM fields. Alice Steinglass advocates for early computer science education to address gender stereotypes and increase participation among underrepresented groups. Code.org's partnership with Microsoft, including the Minecraft Hour of Code activity, highlights the shared commitment to democratizing technology education [Data: Reports (2, 20); Sources (33)].

---

### **Kevin Scott's Advocacy for Education**
Kevin Scott himself is deeply committed to promot

In [6]:
add_section_title("Note: check max community level", color="blue", size="14px")

<h2 style='color:blue; font-size:14px;'>Note: check max community level</h2>

In [34]:
# Know what's the max community level
print(community_df["level"].max())


3


In [57]:
add_section_title("Step 7: graphRAG GlobalSearch", color="blue", size="28px")


<h2 style='color:blue; font-size:28px;'>Step 7: graphRAG GlobalSearch</h2>

In [36]:
import os

import pandas as pd
import tiktoken

from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_entities,
    read_indexer_reports,
)
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

In [44]:

# Azure OpenAI connection settings for the global-search chat model,
# all read from environment variables (a missing variable raises KeyError).
api_key = os.environ["GRAPHRAG_API_KEY"]
api_base = os.environ["GRAPHRAG_API_BASE"]
# Strip surrounding double quotes from the version string, matching how the
# other cells in this notebook read GRAPHRAG_LLM_API_VERSION.
api_version = os.environ["GRAPHRAG_LLM_API_VERSION"].strip('"')
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
llm_deployment = os.environ["GRAPHRAG_LLM_DEPLOYMENT"]

# Chat model configuration used by GlobalSearch.
config = LanguageModelConfig(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    deployment_name=llm_deployment,
    type=ModelType.AzureOpenAIChat,
    model=llm_model,
    max_retries=5,
)
# Register (or reuse) the chat model under the name "global_search".
model = ModelManager().get_or_create_chat_model(
    name="global_search",
    model_type=ModelType.AzureOpenAIChat,
    config=config,
)

# Token encoder selected from the chat model name; used later for token counting.
token_encoder = tiktoken.encoding_for_model(llm_model)

In [45]:
from dotenv import load_dotenv
load_dotenv()  # only needed if you're running outside Docker or .env isn't auto-loaded

# Read from environment
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
# The version strings may carry literal double quotes, so strip them.
llm_api_version = os.environ["GRAPHRAG_LLM_API_VERSION"].strip('"')
llm_deployment = os.environ["GRAPHRAG_LLM_DEPLOYMENT"]

embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
embedding_api_version = os.environ["GRAPHRAG_EMBEDDING_API_VERSION"].strip('"')
embedding_deployment = os.environ["GRAPHRAG_EMBEDDING_DEPLOYMENT"]

# Shared AOAI base: prefer GRAPHRAG_API_BASE (the variable the other cells
# read) and fall back to the previously hard-coded endpoint when it is unset.
api_base = os.environ.get(
    "GRAPHRAG_API_BASE", "https://graphrag-eastus2.openai.azure.com"
)

# Chat model config
chat_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.AzureOpenAIChat,
    model=llm_model,
    api_base=api_base,
    api_version=llm_api_version,
    deployment_name=llm_deployment,
    max_retries=5,
)

# Token encoder selected from the chat model name.
token_encoder = tiktoken.encoding_for_model(llm_model)

# Embedding model config (same key and endpoint, embedding-specific version/deployment)
embedding_config = LanguageModelConfig(
    api_key=api_key,
    type=ModelType.AzureOpenAIEmbedding,
    model=embedding_model,
    api_base=api_base,
    api_version=embedding_api_version,
    deployment_name=embedding_deployment,
    max_retries=5,
)

# Initialize models; each is registered with the ModelManager under its own name.
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=chat_config.type,
    config=chat_config,
)

text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=embedding_config.type,
    config=embedding_config,
)


In [46]:
# Folder containing the GraphRAG output parquet tables.
INPUT_DIR = "/app/graphrag-folder/output/"
# INPUT_DIR ends with "/", so trim it before appending the sub-folder;
# otherwise the URI would contain a doubled slash ("output//lancedb").
LANCEDB_URI = f"{INPUT_DIR.rstrip('/')}/lancedb"

# Base names of the parquet tables (".parquet" is appended when they are read).
COMMUNITY_REPORT_TABLE = "community_reports"
ENTITY_TABLE = "entities"
COMMUNITY_TABLE = "communities"
RELATIONSHIP_TABLE = "relationships"
TEXT_UNIT_TABLE = "text_units"
# Community level passed to read_indexer_reports / read_indexer_entities.
COMMUNITY_LEVEL = 3

In [47]:
def _read_table(table_name):
    """Read ``{INPUT_DIR}/{table_name}.parquet`` into a DataFrame."""
    return pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")


# Raw tables.
community_df = _read_table(COMMUNITY_TABLE)
entity_df = _read_table(ENTITY_TABLE)
report_df = _read_table(COMMUNITY_REPORT_TABLE)

# Query-side objects built from the raw tables; reports and entities are
# read with COMMUNITY_LEVEL.
communities = read_indexer_communities(community_df, report_df)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Compare the raw report count with the count after the level filter.
print(f"Total report count: {len(report_df)}")
print(f"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}")

# Last expression of the cell: preview the first rows of the report table.
report_df.head()

Total report count: 102
Report count after filtering by community level 3: 102


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,f5ae82bff5f9455dbffa2fc43526201f,98,98,3,75,[],Danielle Feinberg and Pixar Studios Community,This community centers around Danielle Feinber...,# Danielle Feinberg and Pixar Studios Communit...,8.5,The impact severity rating is high due to Dani...,[{'explanation': 'Danielle Feinberg serves as ...,"{\n ""title"": ""Danielle Feinberg and Pixar S...",2025-05-27,8
1,3d93506838d84ff499d250ab7b052516,99,99,3,75,[],Danielle Feinberg's Academic Journey at Harvard,This community centers around Danielle Feinber...,# Danielle Feinberg's Academic Journey at Harv...,7.5,The impact severity rating reflects the import...,[{'explanation': 'Linear Algebra was a pivotal...,"{\n ""title"": ""Danielle Feinberg's Academic ...",2025-05-27,2
2,63a71426e42c4254a09d3105d7fd6545,100,100,3,92,[],Turbo Pascal and Anders Hejlsberg's Pascal Eco...,"The community centers around Turbo Pascal, a r...",# Turbo Pascal and Anders Hejlsberg's Pascal E...,8.5,The impact severity rating is high due to Turb...,"[{'explanation': 'Turbo Pascal, developed by A...","{\n ""title"": ""Turbo Pascal and Anders Hejls...",2025-05-27,4
3,b66a6cf6bf324aedbb075606fcb3ab49,101,101,3,92,[],Anders Hejlsberg and Programming Language Inno...,"The community centers around Anders Hejlsberg,...",# Anders Hejlsberg and Programming Language In...,8.5,The impact severity rating is high due to Ande...,[{'explanation': 'Anders Hejlsberg is a pivota...,"{\n ""title"": ""Anders Hejlsberg and Programm...",2025-05-27,10
4,33be08d16f4a4a59b5770e6629550ce3,72,72,2,13,[],Pixar Animated Films: Finding Nemo and A Bug's...,The community centers around two key Pixar ani...,# Pixar Animated Films: Finding Nemo and A Bug...,7.5,The impact severity rating reflects the signif...,[{'explanation': 'Finding Nemo is a Pixar anim...,"{\n ""title"": ""Pixar Animated Films: Finding...",2025-05-27,2


In [48]:
# Context builder for global search. It receives the community reports,
# communities and entities loaded in the previous cell, plus the token encoder.
context_builder = GlobalCommunityContext(
    community_reports=reports,
    communities=communities,
    entities=entities,  # default to None if you don't want to use community weights for ranking
    token_encoder=token_encoder,
)

In [49]:
# Settings later handed to GlobalSearch as `context_builder_params`.
context_builder_params = dict(
    use_community_summary=False,  # False -> full community reports; True -> community short summaries
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    max_tokens=12_000,  # tune to your model's token limit (for an 8k-limit model, 5000 is a good setting)
    context_name="Reports",
)

# LLM settings later handed to GlobalSearch as `map_llm_params`.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format=dict(type="json_object"),
)

# LLM settings later handed to GlobalSearch as `reduce_llm_params`.
reduce_llm_params = dict(
    max_tokens=2000,  # tune to your model's token limit (for an 8k-limit model, 1000-1500 is a good setting)
    temperature=0.0,
)

In [50]:
# Assemble the global-search engine from the chat model, the community context
# builder, the token encoder and the parameter dicts defined in the cells above.
search_engine = GlobalSearch(
    model=model,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=False,  # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
    json_mode=True,  # set this to False if your LLM model does not support JSON mode.
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [51]:
# Ask the global-search engine the question; `search` is awaited directly at the top level of the cell.
result = await search_engine.search("Who did KEVIN SCOTT mention that is related to computer science, education, or kids?")

# Only the generated answer text (`result.response`) is printed.
print(result.response)

### Kevin Scott's Mentions Related to Computer Science, Education, and Kids

Kevin Scott has mentioned several individuals and organizations that are closely tied to computer science, education, and initiatives aimed at benefiting children. Below is a detailed summary of these mentions:

---

#### **Alice Steinglass and Code.org**
Kevin Scott discussed Alice Steinglass, the President of Code.org, a nonprofit organization dedicated to expanding access to computer science education globally. Code.org runs initiatives like the Hour of Code, which has reached millions of students worldwide, and develops curriculum and tools to promote diversity in STEM fields. Scott also engaged in conversations with Steinglass about addressing gender stereotypes and improving gender balance in computer science education. These discussions highlight the importance of early exposure to computer science for underrepresented groups [Data: Reports (2, 82, +more)].

Additionally, Kevin Scott emphasized Code.org

In [58]:
# Section heading for an alternative way of running global search.
add_section_title("Another way of graphRAG GlobalSearch", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Another way of graphRAG GlobalSearch</h2>

In [5]:
# Small note heading: the next cell runs global search through the command line.
add_section_title("Note: run globalSearch in CLI", color="blue", size="14px")

<h2 style='color:blue; font-size:14px;'>Note: run globalSearch in CLI</h2>

In [53]:
import subprocess

# Question handed to the `graphrag` command-line tool.
query_text = "Who did KEVIN SCOTT mention that leads a company helping kids learn computer science?"

# Build the argument vector piece by piece:
#   graphrag query --root /app/graphrag-folder --method global --query <query_text>
command = ["graphrag", "query"]
command += ["--root", "/app/graphrag-folder"]
command += ["--method", "global"]
command += ["--query", query_text]

# Execute the CLI, collecting both output streams as text.
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# Show what the CLI wrote to each stream.
print("STDOUT:\n", result.stdout)
print("STDERR:\n", result.stderr)


STDOUT:
 


SUCCESS: Global Search Response:
### Alice Steinglass and Code.org

Kevin Scott mentioned Alice Steinglass as the leader of Code.org, a global nonprofit organization dedicated to expanding access to computer science education for K-12 students. Alice Steinglass serves as the President of Code.org and has played a pivotal role in advancing its mission to democratize computer science education. Under her leadership, Code.org has launched impactful initiatives such as the Hour of Code, which has reached tens of millions of students in over 180 countries [Data: Reports (19, 21, 2, 82, +more)].

### Code.org's Mission and Impact

Code.org focuses on making computer science education accessible to all, with a particular emphasis on promoting diversity in STEM fields. The organization develops curriculum and tools to support early computer science education and inspire students from diverse backgrounds to pursue careers in technology. Its flagship initiative, the Hour of Code, has

In [59]:
# Section heading for the vector-search walkthrough.
add_section_title("Step 8: Vector search", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 8: Vector search</h2>

In [55]:
import pandas as pd
import os

# Folder whose parquet files are inspected.
directory_path = "/app/graphrag-folder/output"
# Keep only the entries whose name ends in ".parquet".
files = list(filter(lambda name: name.endswith(".parquet"), os.listdir(directory_path)))

# Print the column names of every parquet file; a file that cannot be read
# is reported on its own line instead of stopping the loop.
for file in files:
    parquet_path = os.path.join(directory_path, file)
    try:
        df = pd.read_parquet(parquet_path)
        print(f"{file}: {df.columns.tolist()}")
    except Exception as e:
        print(f"{file}: Error - {e}")


embeddings.entity.description.parquet: ['id', 'embedding']
communities.parquet: ['id', 'human_readable_id', 'community', 'level', 'parent', 'children', 'title', 'entity_ids', 'relationship_ids', 'text_unit_ids', 'period', 'size']
documents.parquet: ['id', 'human_readable_id', 'title', 'text', 'text_unit_ids', 'creation_date', 'metadata']
embeddings.text_unit.text.parquet: ['id', 'embedding']
text_units.parquet: ['id', 'human_readable_id', 'text', 'n_tokens', 'document_ids', 'entity_ids', 'relationship_ids', 'covariate_ids']
community_reports.parquet: ['id', 'human_readable_id', 'community', 'level', 'parent', 'children', 'title', 'summary', 'full_content', 'rank', 'rating_explanation', 'findings', 'full_content_json', 'period', 'size']
entities.parquet: ['id', 'human_readable_id', 'title', 'type', 'description', 'text_unit_ids', 'frequency', 'degree', 'x', 'y']
embeddings.community.full_content.parquet: ['id', 'embedding']
relationships.parquet: ['id', 'human_readable_id', 'source', 't

In [56]:
import pandas as pd
import openai
from openai import AzureOpenAI
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Azure OpenAI settings, read from the environment.
api_base = os.getenv("GRAPHRAG_API_BASE")
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
llm_api_version = os.environ["GRAPHRAG_LLM_API_VERSION"].strip('"')
llm_deployment = os.environ["GRAPHRAG_LLM_DEPLOYMENT"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
# This cell only calls the embeddings endpoint, so it uses the
# embedding-specific API version and deployment rather than the chat ones.
embedding_api_version = os.environ["GRAPHRAG_EMBEDDING_API_VERSION"].strip('"')
embedding_deployment = os.environ["GRAPHRAG_EMBEDDING_DEPLOYMENT"]

# AOAI setup
client = AzureOpenAI(
    api_key=api_key,
    api_version=embedding_api_version,
    azure_endpoint=api_base,
)

# Load embeddings and text
embeddings_df = pd.read_parquet("/app/graphrag-folder/output/embeddings.text_unit.text.parquet")
text_units_df = pd.read_parquet("/app/graphrag-folder/output/text_units.parquet")

# Join on "id" so every embedding row carries its text-unit columns
# (left join: every embedding row is kept).
merged_df = pd.merge(embeddings_df, text_units_df, on="id", how="left")

# Generate embedding for the query
query = "Who did Kevin Scott mention that helps kids learn computer science?"
response = client.embeddings.create(
    input=[query],
    model=embedding_deployment,  # Azure OpenAI addresses a model by its deployment name
)
# Shape (1, dim): cosine_similarity expects 2-D inputs.
query_embedding = np.array(response.data[0].embedding).reshape(1, -1)

# Prepare stored embeddings: stack the per-row vectors into one 2-D array.
stored_embeddings = np.vstack(merged_df["embedding"].values)

# Compute cosine similarity between the query and every stored embedding.
similarities = cosine_similarity(query_embedding, stored_embeddings)[0]
merged_df["score"] = similarities

# Show top 5 results with the full, untruncated text. The display option is
# scoped to this print so the global pandas setting is left unchanged.
top_results = merged_df.sort_values(by="score", ascending=False).head(5)
with pd.option_context("display.max_colwidth", None):
    print(top_results[["text", "score"]])


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [4]:
# Small note heading explaining why the previous cell's output is so wide.
add_section_title("Note: above cell expands the top 5 results, to show the full text", color="blue", size="14px")

<h2 style='color:blue; font-size:14px;'>Note: above cell expands the top 5 results, to show the full text</h2>

In [57]:
# Show top 5 results again, this time with pandas' default column width
# restored so the long "text" column is truncated in the printout.
pd.reset_option('display.max_colwidth')
print(top_results[["text", "score"]])

                                                  text     score
116   child has the opportunity to learn computer s...  0.628696
113   it's their school doesn't teach it at all. So...  0.595626
117   back. It's doing less. It's not being enterta...  0.594327
112   across the board, which is really interesting...  0.592278
115  , whereas starting earlier you can maybe get t...  0.592008


In [61]:
# Section heading for the semantic-kernel exploration cells.
add_section_title("Step 9: Examine semantic-kernel, AI agent related modules", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 9: Examine semantic-kernel, AI agent related modules</h2>

In [2]:
import semantic_kernel
# Print the installed semantic-kernel version for reference.
print(semantic_kernel.__version__)


1.20.0


In [7]:
import pkgutil
import semantic_kernel

# Fully-qualified names of every module and sub-package found under semantic_kernel.
modules = [
    module_info.name
    for module_info in pkgutil.walk_packages(
        semantic_kernel.__path__, prefix=semantic_kernel.__name__ + "."
    )
]

# One name per line.
for m in modules:
    print(m)


semantic_kernel.agents
semantic_kernel.agents.agent
semantic_kernel.agents.channels
semantic_kernel.agents.channels.agent_channel
semantic_kernel.agents.channels.chat_history_channel
semantic_kernel.agents.channels.open_ai_assistant_channel
semantic_kernel.agents.chat_completion
semantic_kernel.agents.chat_completion.chat_completion_agent
semantic_kernel.agents.group_chat
semantic_kernel.agents.group_chat.agent_chat
semantic_kernel.agents.group_chat.agent_chat_utils
semantic_kernel.agents.group_chat.agent_group_chat
semantic_kernel.agents.group_chat.broadcast_queue
semantic_kernel.agents.open_ai
semantic_kernel.agents.open_ai.assistant_content_generation
semantic_kernel.agents.open_ai.azure_assistant_agent
semantic_kernel.agents.open_ai.function_action_result
semantic_kernel.agents.open_ai.open_ai_assistant_agent
semantic_kernel.agents.open_ai.open_ai_assistant_base
semantic_kernel.agents.open_ai.run_polling_options
semantic_kernel.agents.strategies
semantic_kernel.agents.strategies.se

In [15]:
import inspect

# Import the class explicitly so the cell also runs on a fresh kernel; the
# module path is the one shown in the package listing printed above.
from semantic_kernel.agents.open_ai.azure_assistant_agent import AzureAssistantAgent

# Print the constructor signature to see which keyword arguments it accepts.
print(inspect.signature(AzureAssistantAgent.__init__))

(self, kernel: 'Kernel | None' = None, arguments: 'KernelArguments | None' = None, service_id: str | None = None, deployment_name: str | None = None, api_key: str | None = None, endpoint: Optional[Annotated[pydantic.networks.AnyUrl, UrlConstraints(max_length=2083, allowed_schemes=['https'], host_required=None, default_host=None, default_port=None, default_path=None)]] = None, api_version: str | None = None, ad_token: str | None = None, ad_token_provider: collections.abc.Callable[[], str | collections.abc.Awaitable[str]] | None = None, client: openai.lib.azure.AsyncAzureOpenAI | None = None, default_headers: dict[str, str] | None = None, env_file_path: str | None = None, env_file_encoding: str | None = None, description: str | None = None, id: str | None = None, instructions: str | None = None, name: str | None = None, enable_code_interpreter: bool | None = None, enable_file_search: bool | None = None, enable_json_response: bool | None = None, file_ids: list[str] | None = None, temperat

In [19]:
from semantic_kernel.agents.chat_completion.chat_completion_agent import ChatCompletionAgent
# Print the constructor's signature and docstring to see which arguments the agent accepts.
help(ChatCompletionAgent.__init__)


Help on function __init__ in module semantic_kernel.agents.chat_completion.chat_completion_agent:

__init__(self, service_id: str | None = None, kernel: 'Kernel | None' = None, name: str | None = None, id: str | None = None, description: str | None = None, instructions: str | None = None, arguments: semantic_kernel.functions.kernel_arguments.KernelArguments | None = None, prompt_template_config: semantic_kernel.prompt_template.prompt_template_config.PromptTemplateConfig | None = None) -> None
    Initialize a new instance of ChatCompletionAgent.

    Args:
        service_id: The service id for the chat completion service. (optional) If not provided,
            the default service name `default` will be used.
        kernel: The kernel instance. (optional)
        name: The name of the agent. (optional)
        id: The unique identifier for the agent. (optional) If not provided,
            a unique GUID will be generated.
        description: The description of the agent. (optional)


In [62]:
# Section heading for the summarization-agent cells.
add_section_title("Step 10: AI agent for summarization service", color="blue", size="28px")

<h2 style='color:blue; font-size:28px;'>Step 10: AI agent for summarization service</h2>

In [45]:
import os
from semantic_kernel.contents.chat_history import ChatHistory
from semantic_kernel.contents.chat_message_content import ChatMessageContent, AuthorRole
from semantic_kernel.functions.kernel_arguments import KernelArguments
from semantic_kernel.agents.chat_completion.chat_completion_agent import ChatCompletionAgent

def _read_text_files(folder_path: str) -> str:
    """Return the contents of every ``.txt`` file in ``folder_path``, each followed by a newline.

    Files are read in sorted filename order so the combined text is the same
    on every run, and are decoded as UTF-8 regardless of the platform default.
    """
    parts = []
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
                parts.append(f.read() + "\n")
    return "".join(parts)


async def summarize_all_text(folder_path: str, agent: ChatCompletionAgent):
    """Summarize all ``.txt`` files in a folder with a chat-completion agent.

    The text of every ``.txt`` file directly inside ``folder_path`` is combined
    into one user message, sent to ``agent`` together with a system message,
    and each result yielded by the agent is printed.

    Args:
        folder_path: Directory containing the ``.txt`` files to summarize.
        agent: The agent whose ``invoke`` method produces the summary.

    Returns:
        None. The summary is printed, not returned.

    Raises:
        OSError: If ``folder_path`` or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Step 1: Combine all .txt files into one string
    combined_text = _read_text_files(folder_path)
    if not combined_text.strip():
        # Nothing to summarize: skip the model call rather than sending an empty prompt.
        print(f"No .txt content found in {folder_path!r}; nothing to summarize.")
        return

    # Step 2: Prepare arguments
    args = KernelArguments()

    # Step 3: Prepare chat history with system and user messages
    history = ChatHistory()
    history.add_message(ChatMessageContent(
        role=AuthorRole.SYSTEM,
        content="You are a helpful assistant that summarizes long documents into concise summaries."
    ))
    history.add_message(ChatMessageContent(
        role=AuthorRole.USER,
        content=f"Please summarize the following text:\n\n{combined_text}"
    ))

    # Step 4: Invoke the agent; the whole request is carried by `history`,
    # so the prompt argument is left empty.
    async for result in agent.invoke(prompt="", arguments=args, history=history):
        print("\n--- Combined Summary ---\n", result)


In [48]:
# Summarize every .txt file in the GraphRAG input folder.
# NOTE(review): `agent` is not created in the cells shown here — confirm it is defined in an earlier cell.
await summarize_all_text("/app/graphrag-folder/input", agent)



--- Combined Summary ---
 Sure, here's a concise summary of the lengthy text you provided:

---

**Summary:**

The podcast *Behind the Tech*, hosted by Microsoft CTO Kevin Scott, explores diverse stories about innovators in technology. In this series, Scott interviews influential figures such as Jaron Lanier, Reid Hoffman, Danielle Feinberg, Alice Steinglass, Andrew Ng, Anders Hejlsberg, Surya Ganguli, and others, showcasing their unique journeys into tech and their impact on the industry.

Common themes across episodes include:
1. **Career Journeys:** Interviewees shared personal anecdotes about how curiosity, chance, and determination shaped their paths—from early exposure to computing to tackling complex challenges in their careers, including machine learning, VR, programming languages, and animation.
2. **Intersection of Creativity and STEM:** Guests like Danielle Feinberg and Anders Hejlsberg describe blending artistic and technical craftsmanship in their work, whether creating v