# GraphRAG (global search)

### Installation of Libraries

In [None]:
# !pip install graphrag
# !pip install PyMuPDF

In [None]:
import fitz  # PyMuPDF
import os
import shutil
import subprocess

In [None]:
# !mkdir -p ./ragtest/input

### Deleting existing files inside the input directory

In [None]:
# Directory holding the .txt inputs for the indexer; it is emptied before new
# text files are written into it.
directory = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input'

# A fresh workspace has no input directory yet. Treat that as "nothing to
# delete" instead of letting os.listdir raise FileNotFoundError.
files = os.listdir(directory) if os.path.isdir(directory) else []
if not files:
    print(f"No files to delete in {directory}")

# Delete regular files only; any other entry (e.g. a sub-directory) is
# reported and left in place.
for file in files:
    file_path = os.path.join(directory, file)
    if not os.path.isfile(file_path):
        print(f"{file_path} is not a file.")
        continue
    try:
        os.remove(file_path)
        print(f"Deleted {file_path}")
    except OSError as e:
        # Best-effort cleanup: report the failure (locked file, permissions)
        # and continue with the remaining files.
        print(f"Error deleting {file_path}: {e}")

Deleted C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\Creating Large Language Model Applications Utilizing LangChain_ A Primer on Developing LLM Apps Fast.txt


### Importing the data (.txt files)

In [None]:
# !curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt > ./ragtest/input/book.txt

In [None]:
# Source folder with the PDFs and destination folder for the extracted text
# (the destination is the indexer's input directory).
pdf_dir = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\pdfs'
output_dir = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Convert every PDF in pdf_dir into a UTF-8 .txt file with the same base name.
for pdf_file_name in os.listdir(pdf_dir):
    # Compare case-insensitively so files named "*.PDF" are not skipped.
    if not pdf_file_name.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file_name)
    txt_file_name = os.path.splitext(pdf_file_name)[0] + '.txt'
    output_path = os.path.join(output_dir, txt_file_name)

    # Open the document in a context manager so the file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as pdf_document:
        # Concatenate the text of all pages in page order.
        full_text = "".join(page.get_text() for page in pdf_document)

    # Save the extracted text to a .txt file
    with open(output_path, 'w', encoding='utf-8') as text_file:
        text_file.write(full_text)

    print(f'Text extracted and saved to {output_path}')


Text extracted and saved to C:\Users\15011\Documents\Projects\GraphRAG\ragtest\input\37970083.txt


### Setting Up the Workspace Variables

In [None]:
# !python -m graphrag.index --init --root ./ragtest

### Deleting previously generated data in the output folder

In [None]:
# Output folder of a previous indexing run; it is removed so the next run
# starts from a clean state.
dir_path = r'C:\Users\15011\Documents\Projects\GraphRAG\ragtest\output'

if not os.path.exists(dir_path):
    # Nothing to clean up.
    print(f'Directory {dir_path} does not exist.')
else:
    # Remove the folder together with everything inside it.
    shutil.rmtree(dir_path)
    print(f'Directory {dir_path} has been deleted successfully.')


Directory C:\Users\15011\Documents\Projects\GraphRAG\ragtest\output has been deleted successfully.


### Running the Indexing pipeline

In [None]:
# Run the GraphRAG indexing module as a shell command, using ./ragtest
# (relative to the notebook's working directory) as the workspace root.
!python -m graphrag.index --root ./ragtest

⠋ GraphRAG Indexer 
🚀 Reading settings from ragtest\settings.yaml
⠋ GraphRAG Indexer 
⠙ GraphRAG Indexer 
⠼ GraphRAG Indexer 
├── Loading Input (text) - 1 files loaded (0 filtered) ----- 100% 0:00:… 0:00:…
⠼ GraphRAG Indexer 
├── Loading Input (text) - 1 files loaded (0 filtered) ----- 100% 0:00:… 0:00:…
└── create_base_text_units
⠼ GraphRAG Indexer 
├── Loading Input (text) - 1 files loaded (0 filtered) ----- 100% 0:00:… 0:00:…
└── create_base_text_units
⠴ GraphRAG Indexer 
├── Loading Input (text) - 1 files loaded (0 filtered) ----- 100% 0:00:… 0:00:…
└── create_base_text_units
⠴ GraphRAG Indexer 
├── Loading Input (text) - 1 files loaded (0 filtered) ----- 100% 0:00:… 0:00:…
└── create_base_text_units
⠸ GraphRAG Indexer 
├── Loading Input (text) - 1 files loaded (0 filtered) ----- 100% 0:00:… 0:00:…
└── create_base_text_units
    └── Verb chunk ---------------------------------------   0% -:--:-- 0:00:00
⠼ GraphRAG Indexer 
├── Loading Input (text) - 1 files loaded (0 filtered) ----

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

  from .autonotebook import tqdm as notebook_tqdm


## Global Search example

The global search method generates answers by searching over all AI-generated community reports in a map-reduce fashion. This is a resource-intensive method, but it often gives good responses to questions that require an understanding of the dataset as a whole (e.g. What are the most significant values of the herbs mentioned in this notebook?).

### LLM setup

In [3]:
# SECURITY: credentials must not live in the notebook. The key is read from the
# GRAPHRAG_API_KEY environment variable; any key that was previously committed
# in this cell must be treated as leaked and rotated.
api_key = os.environ.get("GRAPHRAG_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GRAPHRAG_API_KEY environment variable to your Azure OpenAI API key."
    )
llm_model = "gpt-4o"

# Chat model client used for the search calls below.
llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
    api_version="2024-02-15-preview",
    deployment_name="gpt-4o",
    api_base= "https://df-pocs-q1.openai.azure.com/"
)

# Tokenizer passed to the context builder and the search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

### Load community reports as context for global search

- Load all community reports in the `create_final_community_reports` table from the GraphRAG indexing engine, to be used as context data for global search.
- Load entities from the `create_final_nodes` and `create_final_entities` tables from the GraphRAG indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the `rank` attribute in the community reports table for context ranking).

In [10]:
# Folder containing the parquet files generated by the indexing pipeline.
# Must be a raw string: in a normal string literal the "\U" in "C:\Users"
# starts a \UXXXXXXXX unicode escape and the cell fails with a SyntaxError.
INPUT_DIR = r"C:\Users\15011\Documents\Projects\GraphRAG\ragtest\output"

# Base names (without the .parquet extension) of the tables to load.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"

# community level in the Leiden community hierarchy from which we will load the community reports
# higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
COMMUNITY_LEVEL = 2

In [11]:
# Read the three indexer tables (nodes, community reports, entities) from
# INPUT_DIR, in that order.
entity_df, report_df, entity_embedding_df = (
    pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")
    for table_name in (ENTITY_TABLE, COMMUNITY_REPORT_TABLE, ENTITY_EMBEDDING_TABLE)
)

# Convert the raw frames into the report/entity objects used for global search,
# restricted to the configured community level.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
# Last expression of the cell: preview the first rows of the report table.
report_df.head()

Report records: 9


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,6,# ACMI and Retraction Issues\n\nThe community ...,1,7.5,ACMI and Retraction Issues,The impact severity rating is high due to the ...,"The community revolves around ACMI, an organiz...",[{'explanation': 'ACMI is the central entity i...,"{\n ""title"": ""ACMI and Retraction Issues"",\...",08ba5aba-c92e-439a-b8c8-5d93633489b9
1,7,# Reviewer 2 and PCR Study\n\nThe community re...,1,7.5,Reviewer 2 and PCR Study,The impact severity rating is high due to the ...,"The community revolves around Reviewer 2, who ...",[{'explanation': 'Reviewer 2 is a pivotal enti...,"{\n ""title"": ""Reviewer 2 and PCR Study"",\n ...",e5ed9b22-f3cb-4974-bd55-83bbfd16aade
2,8,# M. leprae Study and Manuscript Review\n\nThe...,1,6.5,M. leprae Study and Manuscript Review,The impact severity rating is moderately high ...,The community revolves around a study on M. le...,[{'explanation': 'The authors are the key enti...,"{\n ""title"": ""M. leprae Study and Manuscrip...",703bdd07-4b39-4e2e-8952-30262c3ffd0c
3,0,# Patel et al. and Access Microbiology 2023;5:...,0,7.5,Patel et al. and Access Microbiology 2023;5:00...,The impact severity rating is high due to the ...,The community revolves around the manuscript a...,[{'explanation': 'Patel et al. have authored a...,"{\n ""title"": ""Patel et al. and Access Micro...",b5e6f608-39c3-407a-9368-89166ec125d2
4,1,# Tofacitinib and Anti-Leprosy Treatment Commu...,0,7.5,Tofacitinib and Anti-Leprosy Treatment Community,The impact severity rating is high due to the ...,The community revolves around the use of Tofac...,[{'explanation': 'Tofacitinib is a central ent...,"{\n ""title"": ""Tofacitinib and Anti-Leprosy ...",4334ad64-023e-4bee-9746-1e9f3faedb88


#### Build global context based on community reports

In [12]:
# Context builder for global search, fed with the loaded community reports.
# `entities` is optional: default to None if you don't want to use community
# weights for ranking.
context_builder = GlobalCommunityContext(
    token_encoder=token_encoder,
    community_reports=reports,
    entities=entities,
)

#### Perform global search

In [13]:
# Options forwarded to the context builder when assembling report context.
context_builder_params = dict(
    # False means using full community reports. True means using community short summaries.
    use_community_summary=False,
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 5000)
    max_tokens=12_000,
    context_name="Reports",
)

# LLM parameters for the map step.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format=dict(type="json_object"),
)

# LLM parameters for the reduce step.
reduce_llm_params = dict(
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 1000-1500)
    max_tokens=2000,
    temperature=0.0,
)

In [14]:
# Assemble the global search engine from the LLM client, the context builder
# and the parameter dictionaries.
search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    # change this based on the token limit you have on your model
    # (if you are using a model with 8k limit, a good setting could be 5000)
    max_data_tokens=12_000,
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    # set this to True will add instruction to encourage the LLM to incorporate
    # general knowledge in the response, which may increase hallucinations,
    # but could be useful in some use cases.
    allow_general_knowledge=False,
    # set this to False if your LLM model does not support JSON mode.
    json_mode=False,
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    # free form text describing the response type and format, can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
    response_type="multiple paragraphs",
)

In [None]:
result = await search_engine.asearch(
    "what is the address of the primary author?"
)

print(result.response)

In [None]:
# inspect the data used to build the context for the LLM responses
# (the "reports" entry of the search result's context data; shown as the cell output)
result.context_data["reports"]

In [None]:
# inspect number of LLM calls and tokens
# NOTE(review): the value printed as "LLM tokens" is result.prompt_tokens —
# presumably prompt tokens only, not completion tokens; confirm against the library.
print(f"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}")

LLM calls: 13. LLM tokens: 184660
