# GraphRAG Basic Walkthrough Demo

## Prerequisites
Install 3rd party packages that are not part of the Python Standard Library

In [5]:
! pip install devtools pandas python-magic requests tqdm

Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Installing collected packages: python-magic
[31mERROR: Exception:
Traceback (most recent call last):
  File "/graphrag-accelerator/.venv/lib/python3.10/site-packages/pip/_internal/cli/base_command.py", line 180, in exc_logging_wrapper
    status = run_func(*args)
  File "/graphrag-accelerator/.venv/lib/python3.10/site-packages/pip/_internal/cli/req_command.py", line 245, in wrapper
    return func(self, options, args)
  File "/graphrag-accelerator/.venv/lib/python3.10/site-packages/pip/_internal/commands/install.py", line 452, in run
    installed = install_given_reqs(
  File "/graphrag-accelerator/.venv/lib/python3.10/site-packages/pip/_internal/req/__init__.py", line 72, in install_given_reqs
    requirement.install(
  File "/graphrag-accelerator/.venv/lib/python3.10/site-packages/pip/_internal/req/req_install.py", line 856, in

In [6]:
import getpass
import json
import sys
import time
from pathlib import Path

import magic
import pandas as pd
import requests
from devtools import pprint
from tqdm import tqdm

## Configuration required by User

#### Get API Key for API Management Service
For authentication, the API requires a *subscription key* to be passed in the header of all requests. To find this key, visit the Azure Portal. The API subscription key will be located under `<my_resource_group> --> <API Management service> --> <APIs> --> <Subscriptions> --> <Built-in all-access subscription> Primary Key`.

In [7]:
# Prompt for the APIM subscription key; getpass keeps the typed value off the screen.
ocp_apim_subscription_key = getpass.getpass(
    prompt="Enter the subscription key to the GraphRag APIM:"
)

#### Setup directories and API endpoint

The following parameters are required to access and use the GraphRAG solution accelerator API:
* file_directory
* storage_name
* index_name
* apim_url (the API endpoint)

For demonstration purposes, you may use the provided `get-wiki-articles.py` script to download a small set of Wikipedia articles, or you may provide your own data.

In [14]:
"""
These parameters must be defined by the user:

- file_directory: local directory where data files of interest are stored.
- storage_name: unique name for an Azure blob storage container where files will be uploaded.
- index_name: unique name for a single knowledge graph construction. Multiple indexes can be created from the same blob container of data.
- apim_url: the endpoint URL for GraphRAG service (this is the Gateway URL found in the APIM resource).
"""

# All four values are intentionally blank and must be filled in before continuing.
file_directory = ""  # local directory containing the data files to upload
storage_name = ""  # name of the blob storage container to upload into
index_name = ""  # name of the knowledge graph index to build and query
apim_url = ""  # base URL that every API request path is appended to

In [15]:
# Fail fast if any required parameter was left empty, and name the ones that
# are missing. A ValueError is raised (rather than a bare `assert`) so the
# message tells the user exactly what to fix.
_required_params = {
    "file_directory": file_directory,
    "storage_name": storage_name,
    "index_name": index_name,
    "apim_url": apim_url,
}
_missing_params = [name for name, value in _required_params.items() if value == ""]
if _missing_params:
    raise ValueError(
        "The following required parameters are not set: " + ", ".join(_missing_params)
    )

In [10]:
"""
"Ocp-Apim-Subscription-Key": 
    This is a custom HTTP header used by Azure API Management service (APIM) to 
    authenticate API requests. The value for this key should be set to the subscription 
    key provided by the Azure APIM instance in your GraphRAG resource group.
"""

# Shared request headers; the helper functions in this notebook pass this dict on every API call.
headers = {"Ocp-Apim-Subscription-Key": ocp_apim_subscription_key}

## Upload Files to Blob Storage

In [11]:
def upload_files(
    file_directory: str,
    storage_name: str,
    batch_size: int = 100,
    overwrite: bool = True,
    max_retries: int = 5,
) -> requests.Response:
    """
    Upload files to a blob storage container.

    Args:
    file_directory - a local directory of .txt files to upload. Files are read as utf-8;
                     bytes that cannot be decoded are dropped (errors="ignore").
    storage_name - a unique name for the Azure storage container.
    batch_size - the number of files to upload in a single batch.
    overwrite - whether or not to overwrite files if they already exist in the storage container.
    max_retries - the maximum number of times to try uploading a batch of files if the API is busy.
                  At least one attempt is always made.

    Returns:
    The response of the last batch that was sent. If a full batch fails (response not ok),
    that response is returned immediately and the remaining files are not uploaded.

    Raises:
    ValueError - if no file in file_directory passes validation, so nothing was uploaded.

    NOTE: Uploading files may sometimes fail if the blob container was recently deleted
    (i.e. a few seconds before). The solution "in practice" is to sleep a few seconds and try again.
    """
    url = apim_url + "/data"

    def upload_batch(
        files: list, storage_name: str, overwrite: bool, max_retries: int
    ) -> requests.Response:
        # always make at least one attempt so there is a response to return
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            # rewind every handle: a previous attempt has already read them to EOF,
            # so without this a retry would upload empty files
            for _, handle in files:
                handle.seek(0)
            response = requests.post(
                url=url,
                files=files,
                params={"storage_name": storage_name, "overwrite": overwrite},
                headers=headers,
            )
            if response.status_code != 500:
                break
            # API may be busy, retry (no point sleeping after the final attempt)
            if attempt < attempts - 1:
                print("API busy. Sleeping and will try again.")
                time.sleep(10)
        return response

    def close_batch(files: list) -> None:
        # release the open file handles of a batch and empty the batch list
        for _, handle in files:
            handle.close()
        files.clear()

    batch_files = []
    accepted_file_types = ["text/plain"]
    filepaths = list(Path(file_directory).iterdir())
    response = None
    try:
        for file in tqdm(filepaths):
            # validate that file is a file, has a .txt extension, and has an acceptable file type
            if (
                not file.is_file()
                or file.suffix != ".txt"
                or magic.from_file(str(file), mime=True) not in accepted_file_types
            ):
                print(f"Skipping invalid file: {file}")
                continue
            # open and decode file as utf-8, ignore bad characters
            batch_files.append(
                ("files", open(file=file, mode="r", encoding="utf-8", errors="ignore"))
            )
            # upload batch of files
            if len(batch_files) == batch_size:
                response = upload_batch(
                    batch_files, storage_name, overwrite, max_retries
                )
                # if response is not ok, return early
                if not response.ok:
                    return response
                close_batch(batch_files)
        # upload remaining files
        if batch_files:
            response = upload_batch(batch_files, storage_name, overwrite, max_retries)
    finally:
        # close whatever is still open, including on early return or exception
        close_batch(batch_files)
    if response is None:
        raise ValueError(f"No valid .txt files found to upload in: {file_directory}")
    return response

In [16]:
# Upload the local files in batches of 100, replacing any existing blobs.
response = upload_files(
    file_directory,
    storage_name,
    batch_size=100,
    overwrite=True,
)
# On success show the response object itself; on failure show the body text.
print(response if response.ok else response.text)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 49.41it/s]


<Response [200]>


List the existing storage containers (the newly created container should appear in the list):

In [18]:
def list_files() -> requests.Response:
    """Fetch the listing of data storage containers.

    Issues a GET request to the `/data` path under `apim_url`, sending the
    shared `headers`, and returns the raw response.
    """
    return requests.get(url=apim_url + "/data", headers=headers)

In [20]:
# Request the storage container listing and pretty-print the decoded JSON body.
response = list_files()

pprint(response.json())

{
    'storage_name': [
        'test-storage1',
    ],
}


## Create an Index

After data files have been uploaded, it is now possible to construct a knowledge graph by creating a search index. If an entity configuration is not provided, a default entity configuration will be used that has been shown to generally work well.

In [23]:
def build_index(
    storage_name: str,
    index_name: str,
) -> requests.Response:
    """Submit a request to create a search index.

    POSTs the storage container name and the index name as a JSON body to the
    `/index` path under `apim_url`. The service uses this to start a job that
    builds a knowledge graph (KG) index from the files in that container.

    Returns the raw response of the submission request.
    """
    payload = dict(storage_name=storage_name, index_name=index_name)
    return requests.post(apim_url + "/index", json=payload, headers=headers)

In [24]:
# Submit the indexing job and report whether the submission was accepted.
response = build_index(storage_name, index_name)
print(response)
if not response.ok:
    print(f"Failed to submit job.\nStatus: {response.text}")
else:
    print(response.text)

<Response [200]>
{"status":"indexing operation has been scheduled."}


Note: An indexing job may fail sometimes due to insufficient TPM quota of the GPT-4 turbo model. In this situation, an indexing job can be restarted by re-running the cell above with the same parameters. `graphrag` caches previous indexing results as a cost-savings measure so that restarting indexing jobs will "pick up" where the last job stopped.

#### Check the status of an indexing job

Please wait for your index to reach 100 percent complete before continuing on to the next section to run queries.

In [34]:
def index_status(index_name: str) -> requests.Response:
    """Query the status of an indexing job.

    Issues a GET request to `/index/status/<index_name>` under `apim_url`,
    sending the shared `headers`, and returns the raw response.
    """
    endpoint = f"/index/status/{index_name}"
    return requests.get(apim_url + endpoint, headers=headers)

In [39]:
# Fetch the current status of the indexing job and pretty-print the decoded JSON body.
response = index_status(index_name)

pprint(response.json())

{
    'status_code': 200,
    'index_name': 'cdifonzo-test-index1',
    'storage_name': 'test-storage1',
    'entity_config_name': None,
    'status': 'complete',
    'percent_complete': 100.0,
    'progress': '14 out of 14 workflows completed successfully.',
}


#### List indexes
To view a list of all indexes that exist in the GraphRAG service:

In [40]:
def list_indexes() -> list | requests.Response:
    """List all search indexes.

    Issues a GET request to the `/index` path under `apim_url`.

    Returns:
        The value stored under "index_name" in the JSON body on success.
        If the body is not valid JSON, is not a JSON object, or has no
        "index_name" key (e.g. an error payload), the body text is printed
        and the raw response is returned instead so the caller can inspect it.
    """
    url = apim_url + "/index"
    response = requests.get(url, headers=headers)
    try:
        indexes = json.loads(response.text)
        return indexes["index_name"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # KeyError/TypeError: valid JSON that does not have the expected shape
        print(response.text)
        return response

In [41]:
# Retrieve the index names known to the service and pretty-print them.
all_indexes = list_indexes()
pprint(all_indexes)

[
    'cdifonzo-test-index1',
]


## Query

After an indexing job has completed, the knowledge graph is ready to query. Two types of queries (global and local) are currently supported. In addition, you can issue a query over a single index or multiple indexes.

In [42]:
"""Needed helper function to parse out the clear result from the query response. """
def parse_query_response(
    response: requests.Response, return_context_data: bool = False
) -> requests.Response | dict[str, list[dict]]:
    """
    Prints response['result'] value and optionally
    returns associated context data.

    Args:
    response - the response returned by one of the query endpoints.
    return_context_data - when True and the request succeeded, return the
                          "context_data" entry of the JSON body.

    Returns:
    The "context_data" entry when the response is ok and return_context_data is True;
    otherwise the response object itself. For a failed request the reason and
    raw content are printed before the response is returned.
    """
    if not response.ok:
        print(response.reason)
        print(response.content)
        return response
    # decode the body once and reuse it for both the result and the context data
    payload = json.loads(response.text)
    print(payload["result"])
    if return_context_data:
        return payload["context_data"]
    return response

#### Global Search

Global search queries are resource-intensive, but give good responses to questions that require an understanding of the dataset as a whole.

In [43]:
def global_search(index_name: str | list[str], query: str) -> requests.Response:
    """Send a global query for one index name or a list of index names.

    POSTs the index name(s) and the query text as a JSON body to the
    `/query/global` path under `apim_url` and returns the raw response.
    """
    payload = dict(index_name=index_name, query=query)
    return requests.post(apim_url + "/query/global", json=payload, headers=headers)

In [44]:
%%time
# Pass a single index name as a string; to query across multiple indexes, pass a list instead, e.g. index_name=[myindex1, myindex2].
global_response = global_search(
    index_name=index_name, query="Summarize the main topics of this data"
)
# Print the result; on success the context data is returned, otherwise the raw response is returned.
global_response_data = parse_query_response(global_response, return_context_data=True)
# Bare expression as the last statement: its value is displayed as the cell output in a notebook.
global_response_data

# Overview of Main Topics

The dataset provides a multifaceted examination of Arizona and Alaska, covering historical, cultural, political, and economic aspects. Below is a synthesis of the main topics derived from the dataset.

## Historical Significance and Landmarks

Arizona and Alaska boast significant historical landmarks and events. In Arizona, the Japanese American internment camps at Mount Lemmon and the legacy of German POW camps are notable historical points. The Grand Canyon National Park is not only a natural wonder but also a site of historical importance. The Alaska Purchase is a pivotal historical event, with figures such as Alexander II and William H. Seward playing key roles [Data: Reports (4, 3, 7, 13)].

## Community and Cultural Dynamics

The dataset highlights the diverse communities within Arizona, including the Navajo Nation and the influence of religious organizations like the Roman Catholic Church and the Church of Jesus Christ of Latter-day Saints. In Alaska, 

{'reports': [{'id': '4',
   'title': 'Japanese American Internment at Mount Lemmon',
   'content': "# Japanese American Internment at Mount Lemmon\n\nThis report examines the historical community associated with the Japanese American internment camps during World War II, specifically focusing on the camp located at Mount Lemmon, Arizona. The entities involved include the state of Arizona and the specific location of Mount Lemmon.\n\n## Historical significance of Japanese American internment camps\n\nThe Japanese American internment camps represent a critical period in American history where persons of Japanese descent were forcibly relocated and interned due to wartime fears. These camps are a stark reminder of the consequences of wartime hysteria and racial prejudice, which led to the violation of civil liberties and rights of American citizens and residents. The internment has had a lasting impact on the Japanese American community and continues to be a subject of reflection and educ

An *experimental* API endpoint has been designed to support streaming back the graphrag response while executing a global query (useful in chatbot applications).

In [45]:
def global_search_streaming(
    index_name: str | list[str], query: str
) -> None:
    """Run a global query across one or more indexes and stream back the response.

    Each non-empty line of the streamed body is decoded as a JSON object with
    a "token" and a "context" entry. Tokens are printed as they arrive until
    the "<EOM>" marker; context entries that accompany "<EOM>" are collected
    and the first 10 are displayed as a DataFrame once the stream ends.

    Nothing is returned. `display` is not imported by this notebook; it is
    expected to be provided by the notebook environment.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    url = apim_url + "/experimental/query/global/streaming"
    request = {"index_name": index_name, "query": query}
    context_list = []
    with requests.post(url, json=request, headers=headers, stream=True) as r:
        r.raise_for_status()
        for chunk in r.iter_lines(chunk_size=256 * 1024, decode_unicode=True):
            if not chunk:
                # skip empty keep-alive lines instead of reporting them as bad JSON
                continue
            try:
                payload = json.loads(chunk)
            except json.JSONDecodeError:
                # dump details of the undecodable line to help debugging
                print(type(chunk), len(chunk), sys.getsizeof(chunk), chunk, end="\n")
                continue
            token = payload["token"]
            context = payload["context"]
            if token != "<EOM>":
                print(token, end="")
            elif not context:
                print("\n")  # transition from output message to context
            else:
                context_list.append(context)
    display(pd.DataFrame.from_dict(context_list).head(10))

In [46]:
# Stream the answer to the same global query; tokens are printed as they arrive.
global_search_streaming(
    index_name=index_name, query="Summarize the main topics of this data"
)

# Overview of Arizona and Alaska

The dataset provides a rich tapestry of historical, political, and cultural insights into the states of Arizona and Alaska. It captures the essence of significant events and entities that have left an indelible mark on the communities within these states.

## Arizona's Historical and Cultural Significance

### World War II Impact
Arizona's history is deeply intertwined with World War II. The internment of Japanese Americans is a stark reminder of the civil liberties challenges during the war, with Mount Lemmon serving as one of the internment sites [Data: Reports (4)]. Additionally, the state hosted German POW camps, one of which was later transformed into the Phoenix Zoo, indicating a complex legacy of the war [Data: Reports (3)].

### Natural Heritage
The Grand Canyon National Park stands as a testament to Arizona's natural heritage. Carved by the Colorado River, it was preserved due to the efforts of President Theodore Roosevelt, emphasizing its env

Unnamed: 0,id,title,content,rank,index_name,index_id
0,4,Japanese American Internment at Mount Lemmon,# Japanese American Internment at Mount Lemmon...,8.0,af20e88224165cf5ea4fe06f0aae63ca,4
1,3,Arizona's WWII German POW Camps Legacy,# Arizona's WWII German POW Camps Legacy\n\nTh...,3.0,af20e88224165cf5ea4fe06f0aae63ca,3
2,7,Grand Canyon National Park and Its Historical ...,# Grand Canyon National Park and Its Historica...,8.5,af20e88224165cf5ea4fe06f0aae63ca,7
3,0,"Arizona: A Mosaic of Communities, History, and...","# Arizona: A Mosaic of Communities, History, a...",7.5,af20e88224165cf5ea4fe06f0aae63ca,0
4,13,The Alaska Purchase: Alexander II and William ...,# The Alaska Purchase: Alexander II and Willia...,8.0,af20e88224165cf5ea4fe06f0aae63ca,13
5,10,Arizona's Immigration Legislation and Judicial...,# Arizona's Immigration Legislation and Judici...,7.5,af20e88224165cf5ea4fe06f0aae63ca,10
6,6,Flagstaff and Coconino County Community Report,# Flagstaff and Coconino County Community Repo...,6.5,af20e88224165cf5ea4fe06f0aae63ca,6
7,12,University of Alaska Fairbanks and Alaska Nati...,# University of Alaska Fairbanks and Alaska Na...,6.5,af20e88224165cf5ea4fe06f0aae63ca,12
8,8,Maricopa County and the Legacy of Voting Rights,# Maricopa County and the Legacy of Voting Rig...,7.5,af20e88224165cf5ea4fe06f0aae63ca,8
9,2,Anchorage: Resilience and Cultural Expansion,# Anchorage: Resilience and Cultural Expansion...,7.5,af20e88224165cf5ea4fe06f0aae63ca,2


#### Local Search

Local search queries are best suited for narrow-focused questions that require an understanding of specific entities mentioned in the documents (e.g. "What are the healing properties of chamomile?").

In [47]:
def local_search(index_name: str | list[str], query: str) -> requests.Response:
    """Send a local query for one index name or a list of index names.

    POSTs the index name(s) and the query text as a JSON body to the
    `/query/local` path under `apim_url` and returns the raw response.
    """
    payload = dict(index_name=index_name, query=query)
    return requests.post(apim_url + "/query/local", json=payload, headers=headers)

In [48]:
%%time
# Pass a single index name as a string; to query across multiple indexes, pass a list instead, e.g. index_name=[myindex1, myindex2].
local_response = local_search(
    index_name=index_name, query="Who are the primary actors in these communities?"
)
# Print the result; on success the context data is returned, otherwise the raw response is returned.
local_response_data = parse_query_response(local_response, return_context_data=True)
# Bare expression as the last statement: its value is displayed as the cell output in a notebook.
local_response_data

# Overview of Primary Actors in Arizona and Alaska Communities

## Arizona's Key Entities

### Educational Institutions
- **University of Arizona and Arizona State University**: These are central to Arizona's higher education, providing academic programs and contributing to intellectual and economic development [Data: Entities (11, 12); Relationships (2, 3)].
- **Northern Arizona University**: Serves additional communities within Arizona, complementing the state's educational landscape [Data: Relationships (62)].

### Native American Nations
- **Navajo Nation**: The largest Native American tribe in the U.S., with significant territory and cultural-political influence in Arizona [Data: Entities (15); Relationships (4)].

### Religious Organizations
- **Roman Catholic Church and Church of Jesus Christ of Latter-day Saints**: Reflect the state's religious diversity and potential influence on local communities and politics [Data: Entities (16, 17); Relationships (5, 6)].

### Political Fig

{'reports': [{'id': '0',
   'title': 'Arizona: A Mosaic of Communities, History, and Politics',
   'content': "# Arizona: A Mosaic of Communities, History, and Politics\n\nArizona is a state with a rich tapestry of communities, historical events, and political entities. Its entities range from major public universities and Native American nations to political figures and historical landmarks. The relationships between these entities highlight the state's diverse cultural, educational, and political landscape.\n\n## Arizona's Educational Institutions as Pillars of the Community\n\nArizona's major public universities, the University of Arizona and Arizona State University, are central to the state's higher education and research. They provide academic programs and opportunities to a large student body, contributing to the state's intellectual and economic development [Data: Entities (11, 12); Relationships (2, 3)]. Northern Arizona University further complements the state's educational l

## Sources

In a query response, citations will often appear that support GraphRAG's response. API endpoints are provided to enable retrieval of the sourced documents, entities, relationships, etc.

Multiple types of sources may be referenced in a query: Reports, Entities, Relationships, Claims, and Text Units. The API provides various endpoints to retrieve these sources for data provenance.

#### Get a Report

In [49]:
def get_report(index_name: str, report_id: str) -> requests.Response:
    """Fetch one GraphRAG-generated report from a specific index.

    Issues a GET request to `/source/report/<index_name>/<report_id>` under
    `apim_url`, sending the shared `headers`, and returns the raw response.
    """
    endpoint = f"/source/report/{index_name}/{report_id}"
    return requests.get(apim_url + endpoint, headers=headers)

In [50]:
# Fetch report 0 and print its text; on failure print the reason and raw body.
# Explicit if/else is used so the failure details are always printed rather
# than relying on the notebook displaying the cell's last expression.
report_response = get_report(index_name, 0)
if report_response.ok:
    print(report_response.json()["text"])
else:
    print(report_response.reason)
    print(report_response.content)

# Arizona: A Mosaic of Communities, History, and Politics

Arizona is a state with a rich tapestry of communities, historical events, and political entities. Its entities range from major public universities and Native American nations to political figures and historical landmarks. The relationships between these entities highlight the state's diverse cultural, educational, and political landscape.

## Arizona's Educational Institutions as Pillars of the Community

Arizona's major public universities, the University of Arizona and Arizona State University, are central to the state's higher education and research. They provide academic programs and opportunities to a large student body, contributing to the state's intellectual and economic development [Data: Entities (11, 12); Relationships (2, 3)]. Northern Arizona University further complements the state's educational landscape, serving additional communities within Arizona [Data: Relationships (62)].

## The Navajo Nation's Unique Cu

#### Get an Entity

In [51]:
def get_entity(index_name: str, entity_id: str) -> requests.Response:
    """Fetch one GraphRAG-generated entity from a specific index.

    Issues a GET request to `/source/entity/<index_name>/<entity_id>` under
    `apim_url`, sending the shared `headers`, and returns the raw response.
    """
    endpoint = f"/source/entity/{index_name}/{entity_id}"
    return requests.get(apim_url + endpoint, headers=headers)

In [52]:
# Fetch entity 0. The last line is a bare expression whose value (the JSON body on
# success, or a (reason, content) tuple on failure) is displayed as the cell output in a notebook.
entity_response = get_entity(index_name, 0)
entity_response.json() if entity_response.ok else (entity_response.reason, entity_response.content)

{'name': 'ARIZONA',
 'description': "Arizona is a U.S. state located in the Southwestern United States, known for its desert climate, as well as its forests, plateaus, and metropolitan areas that exhibit a variety of temperatures and weather patterns. It was admitted to the Union on February 14, 1912, as the 48th state and the last of the contiguous states to be incorporated. Arizona boasts a diverse economy and has a historical reliance on the 'five C's'. It is also home to the Grand Canyon National Park, one of the state's most prominent natural features. Following the 2010 census, Arizona gained a ninth seat in the House of Representatives due to redistricting, reflecting changes in its population.",
 'text_units': ['6d0038acddcd4295e1a1d61934522e36',
  '6f74f3aa6337f5d03debbaf8424f68f0',
  'a439d13bfd7279d36fb7e76363bf0699',
  'b641199b00c40165babb1bf98db5b9da',
  'd153b2ce9dee240a9106b3668aa275b6']}

#### Get a Relationship

In [53]:
def get_relationship(index_name: str, relationship_id: str) -> requests.Response:
    """Fetch one GraphRAG-generated relationship from a specific index.

    Issues a GET request to `/source/relationship/<index_name>/<relationship_id>`
    under `apim_url`, sending the shared `headers`, and returns the raw response.
    """
    endpoint = f"/source/relationship/{index_name}/{relationship_id}"
    return requests.get(apim_url + endpoint, headers=headers)

In [54]:
# Fetch relationship 1. The last line is a bare expression whose value (the JSON body on
# success, or a (reason, content) tuple on failure) is displayed as the cell output in a notebook.
relationship_response = get_relationship(index_name, 1)
relationship_response.json() if relationship_response.ok else (relationship_response.reason, relationship_response.content)

{'source': 'ARIZONA',
 'source_id': 0,
 'target': 'GRAND CANYON NATIONAL PARK',
 'target_id': 10,
 'description': 'The Grand Canyon National Park is a major natural feature and tourist attraction in Arizona',
 'text_units': ['d153b2ce9dee240a9106b3668aa275b6']}

#### Get a Claim

In [None]:
def get_claim(index_name: str, claim_id: str) -> requests.Response:
    """Fetch one GraphRAG-generated claim/covariate from a specific index.

    Issues a GET request to `/source/claim/<index_name>/<claim_id>` under
    `apim_url`, sending the shared `headers`, and returns the raw response.
    """
    endpoint = f"/source/claim/{index_name}/{claim_id}"
    return requests.get(apim_url + endpoint, headers=headers)

In [None]:
# Fetch claim 1; pretty-print the JSON body on success, otherwise show the
# response object followed by its body text.
claim_response = get_claim(index_name, 1)
if not claim_response.ok:
    print(claim_response)
    print(claim_response.text)
else:
    pprint(claim_response.json())

#### Get a Text Unit

In [None]:
def get_text_unit(index_name: str, text_unit_id: str) -> requests.Response:
    """Fetch one GraphRAG-generated text unit from a specific index.

    Issues a GET request to `/source/text/<index_name>/<text_unit_id>` under
    `apim_url`, sending the shared `headers`, and returns the raw response.
    """
    endpoint = f"/source/text/{index_name}/{text_unit_id}"
    return requests.get(apim_url + endpoint, headers=headers)

In [None]:
# Copy a text unit ID from an earlier Source endpoint result (see the 'text_units' field of the response).
text_unit_id = ""
if not text_unit_id:
    raise ValueError(
        "Must provide a text_unit_id from previous source results. Look for 'text_units' in the response."
    )
text_unit_response = get_text_unit(index_name, text_unit_id)
# Show the text on success; otherwise show why the request failed.
if not text_unit_response.ok:
    print(text_unit_response.reason)
    print(text_unit_response.content)
else:
    print(text_unit_response.json()["text"])

## Exploring the GraphRAG knowledge graph
The API currently provides some basic functionality to better understand the knowledge graph that was constructed during the indexing process.

In addition, an option is available to export the graph to a graphml file which can be imported by other open source visualization software (we recommend [Gephi](https://gephi.org/)) for deeper exploration.

#### Basic knowledge graph statistics

In [None]:
def get_graph_stats(index_name: str) -> requests.Response:
    """Fetch basic statistics about the knowledge graph of an index.

    Issues a GET request to `/graph/stats/<index_name>` under `apim_url`,
    sending the shared `headers`, and returns the raw response.
    """
    endpoint = f"/graph/stats/{index_name}"
    return requests.get(apim_url + endpoint, headers=headers)

In [None]:
# Fetch the graph statistics and print both the response object and its body text.
response = get_graph_stats(index_name)
print(response)
print(response.text)

#### Get a GraphML file

In [None]:
def save_graphml_file(index_name: str, graphml_file_name: str) -> None:
    """Download the knowledge graph of an index and write it to a graphml file.

    The body of the GET request to `/graph/graphml/<index_name>` is streamed
    and written to `graphml_file_name` in 1024-byte chunks. The output file
    is only opened after the response status has been checked.

    Raises:
        UserWarning: if `graphml_file_name` does not end in `.graphml`.
        requests.HTTPError: if the service answers with an error status.
    """
    endpoint = f"/graph/graphml/{index_name}"
    url = apim_url + endpoint
    extension = Path(graphml_file_name).suffix
    if extension != ".graphml":
        raise UserWarning(f"{graphml_file_name} must have a .graphml file extension")
    with requests.get(url, headers=headers, stream=True) as download:
        download.raise_for_status()
        with open(graphml_file_name, "wb") as graphml_file:
            for block in download.iter_content(chunk_size=1024):
                graphml_file.write(block)

In [None]:
# Saves the graph as "knowledge_graph.graphml"; the relative file name resolves against the current working directory.
save_graphml_file(index_name, "knowledge_graph.graphml")