# 1. Environment Setup

In [None]:
# Core stack: LangChain, the NVIDIA AI Endpoints connector, and Gradio (installed quietly).
%pip install -qq langchain langchain-nvidia-ai-endpoints gradio
# arXiv client and PyMuPDF; presumably required by the ArxivLoader cell further down — TODO confirm.
%pip install -qq arxiv pymupdf
# Upgrade langchain-community. This line is not quiet, so pip's log lands in the cell output.
# NOTE(review): none of these installs pin a version, so a re-run may resolve different releases.
%pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain<1.0.0,>=0.3.21 (from langchain-community)
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting langchain-text-spli

In [None]:
import os
from getpass import getpass

# Never store a real key in the notebook. Reuse a key that is already exported
# in the environment; otherwise prompt for one without echoing it to the output.
# The literal string "NVIDIA_API_KEY" is a template placeholder, not a usable
# key, so it is treated the same as "not set".
if os.environ.get("NVIDIA_API_KEY", "") in ("", "NVIDIA_API_KEY"):
    os.environ["NVIDIA_API_KEY"] = getpass("Enter your NVIDIA API key: ")

In [None]:
from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

# Default look for everything printed through `pprint`: bold text in colour #76B900.
base_style = Style(bold=True, color="#76B900")
console = Console()
# `pprint` is simply `console.print` with the style above pre-bound.
pprint = partial(console.print, style=base_style)

In [None]:
from langchain_core.runnables import RunnableLambda
from functools import partial

def Rprint(perface='State: '):
    """Build a pass-through runnable that prints whatever flows through it.

    Args:
        perface: Text printed immediately before the value (no separator is
            inserted between the two).

    Returns:
        A ``RunnableLambda`` that prints ``perface`` followed by its input with
        the built-in ``print`` and then returns the input unchanged.
    """
    def _echo(value, label=''):
        print(f'{label}{value}')
        return value

    # Bind the label up front so the runnable only needs the value itself.
    return RunnableLambda(partial(_echo, label=perface))

In [None]:
def PPrint(preface="State: "):
    """Build a pass-through runnable that pretty-prints whatever flows through it.

    Args:
        preface: Text handed to ``pprint`` ahead of the value.

    Returns:
        A ``RunnableLambda`` that calls ``pprint(preface, value)`` and then
        returns the value unchanged.
    """
    def _show(value, label=""):
        pprint(label, value)
        return value

    # Bind the label up front so the runnable only needs the value itself.
    return RunnableLambda(partial(_show, label=preface))

# 2. Loading Documents
* `UnstructuredFileLoader`: Generally-useful file loader for arbitrary files; doesn't make too many assumptions about your document structure and is usually sufficient.
* `ArxivLoader`: A more specialized loader that communicates with the arXiv API directly. It is just one example of many such loaders; it makes more assumptions about your data in order to yield cleaner parses and auto-filled metadata (useful when you have multiple documents/formats).

In [None]:
# Document loaders live in `langchain_community`; importing them through
# `langchain.document_loaders` is a deprecated compatibility path.
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.document_loaders import ArxivLoader


# Fetch the paper with arXiv ID 2404.16130 ("From Local to Global: A GraphRAG
# Approach to Query-Focused Summarization"). The result is indexed and measured
# with len() in the following cells.
documents = ArxivLoader(query="2404.16130").load()

In [None]:
## Report how many documents came back, then preview the first one
print("Number of Documents Retrieved:", len(documents))
first_text = documents[0].page_content
print(f"Sample of Document 1 Content (Total Length: {len(first_text)}):")
print(first_text[:1000])

Number of Documents Retrieved: 1
Sample of Document 1 Content (Total Length: 89583):
From Local to Global: A GraphRAG Approach to
Query-Focused Summarization
Darren Edge1†
Ha Trinh1†
Newman Cheng2
Joshua Bradley2
Alex Chao3
Apurva Mody3
Steven Truitt2
Dasha Metropolitansky1
Robert Osazuwa Ness1
Jonathan Larson1
1Microsoft Research
2Microsoft Strategic Missions and Technologies
3Microsoft Office of the CTO
{daedge,trinhha,newmancheng,joshbradley,achao,moapurva,
steventruitt,dasham,robertness,jolarso}@microsoft.com
†These authors contributed equally to this work
Abstract
The use of retrieval-augmented generation (RAG) to retrieve relevant informa-
tion from an external knowledge source enables large language models (LLMs)
to answer questions over private and/or previously unseen document collections.
However, RAG fails on global questions directed at an entire text corpus, such
as “What are the main themes in the dataset?”, since this is inherently a query-
focused summarization (QFS) ta

In [None]:
# Pretty-print the metadata attached to the first loaded document.
pprint(documents[0].metadata)

# 3. Transforming the documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration: a chunk size of 1200 with an overlap of 200. The
# separators are listed from paragraph breaks down to the empty string.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", ";", ",", " ", ""],
    chunk_size=1200,
    chunk_overlap=200,
)

In [None]:
# Break the loaded documents into chunks using the splitter configured above.
docs_split = text_splitter.split_documents(documents)


The next cell drops chunks that are dominated by numbers or other non-text content: a chunk is kept only when its alphabetic characters number at least half of its length (rounded down). The surviving chunks replace the `docs_split` list, and the cell prints how many remain.


In [None]:
 def include_doc(doc):
     ## Some chunks will be overburdened with useless numerical data, so we'll filter it out
     string = doc.page_content
     if len([l for l in string if l.isalpha()]) < (len(string)//2):
         return False
     return True

# Keep only the chunks accepted by `include_doc`, then report how many remain.
docs_split = list(filter(include_doc, docs_split))
print(len(docs_split))

86


In [None]:
# Spot-check a handful of chunks: the first three, index 15, and the last one.
for i in [0, 1, 2, 15, -1]:
    chunk = docs_split[i]
    pprint(f"[Document {i}]")
    print(chunk.page_content)
    pprint("=" * 64)

From Local to Global: A GraphRAG Approach to
Query-Focused Summarization
Darren Edge1†
Ha Trinh1†
Newman Cheng2
Joshua Bradley2
Alex Chao3
Apurva Mody3
Steven Truitt2
Dasha Metropolitansky1
Robert Osazuwa Ness1
Jonathan Larson1
1Microsoft Research
2Microsoft Strategic Missions and Technologies
3Microsoft Office of the CTO
{daedge,trinhha,newmancheng,joshbradley,achao,moapurva,
steventruitt,dasham,robertness,jolarso}@microsoft.com
†These authors contributed equally to this work
Abstract
The use of retrieval-augmented generation (RAG) to retrieve relevant informa-
tion from an external knowledge source enables large language models (LLMs)
to answer questions over private and/or previously unseen document collections.
However, RAG fails on global questions directed at an entire text corpus, such
as “What are the main themes in the dataset?”, since this is inherently a query-
focused summarization (QFS) task, rather than an explicit retrieval task. Prior
QFS methods, meanwhile, do not scal

ical RAG systems. To combine the strengths of these contrasting methods, we
propose GraphRAG, a graph-based approach to question answering over private
text corpora that scales with both the generality of user questions and the quantity
of source text. Our approach uses an LLM to build a graph index in two stages:
first, to derive an entity knowledge graph from the source documents, then to pre-
generate community summaries for all groups of closely related entities. Given a
question, each community summary is used to generate a partial response, before
all partial responses are again summarized in a final response to the user. For a
class of global sensemaking questions over datasets in the 1 million token range,
we show that GraphRAG leads to substantial improvements over a conventional
RAG baseline for both the comprehensiveness and diversity of generated answers.
1
Introduction
Retrieval augmented generation (RAG) (Lewis et al., 2020) is an established approach to using
LLMs to ans

LLMs to answer queries based on data that is too large to contain in a language model’s context
window, meaning the maximum number of tokens (units of text) that can be processed by the LLM
at once (Kuratov et al., 2024; Liu et al., 2023). In the canonical RAG setup, the system has access to
a large external corpus of text records and retrieves a subset of records that are individually relevant
to the query and collectively small enough to fit into the context window of the LLM. The LLM then
Preprint. Under review.
arXiv:2404.16130v2  [cs.CL]  19 Feb 2025
generates a response based on both the query and the retrieved records (Baumel et al., 2018; Dang,
2006; Laskar et al., 2020; Yao et al., 2017). This conventional approach, which we collectively call
vector RAG, works well for queries that can be answered with information localized within a small
set of records. However, vector RAG approaches do not support sensemaking queries, meaning
queries that require global understanding of the 

3.1.2
Text Chunks →Entities & Relationships
In this step, the LLM is prompted to extract instances of important entities and the relationships
between the entities from a given chunk. Additionally, the LLM generates short descriptions for the
entities and relationships. To illustrate, suppose a chunk contained the following text:
4
NeoChip’s (NC) shares surged in their first week of trading on the NewTech Ex-
change. However, market analysts caution that the chipmaker’s public debut may
not reflect trends for other technology IPOs. NeoChip, previously a private entity,
was acquired by Quantum Systems in 2016. The innovative semiconductor firm
specializes in low-power processors for wearables and IoT devices.
The LLM is prompted such that it extracts the following:
• The entity NeoChip, with description “NeoChip is a publicly traded company specializing
in low-power processors for wearables and IoT devices.”
• The entity Quantum Systems, with description “Quantum Systems is a firm that 

For example, if the question is ’What is the capital
of France?’, a direct answer would be ’Paris’.
A direct answer should not provide any irrelevant or
unnecessary information that does not answer the question.
For example, an indirect answer would be ’The
capital of France is located on the river Seine’.",
"empowerment":
"How well does the answer help the reader understand and make informed judgements about
the topic without being misled or making fallacious assumptions.
Evaluate each answer on the quality of
answer as it relates to clearly explaining and providing reasoning and sources behind the claims in the
answer."
}
25
G
Statistical Analysis
Table 6: Pairwise comparisons of six conditions on four metrics across 125 questions and two
datasets. For each question and metric, the winning condition received a score of 100, the losing
condition received a score of 0, and in the event of a tie, each condition was scored 50. These scores
were then averaged over five evaluation runs for

# 4. Refining summaries

In [None]:
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.passthrough import RunnableAssign
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import PydanticOutputParser

from langchain_nvidia_ai_endpoints import ChatNVIDIA

from pydantic import BaseModel, Field
from typing import List
from IPython.display import clear_output

class DocummentSummaryBase(BaseModel):
    # Knowledge base that is refined chunk by chunk while summarizing a document.
    # Every field has an empty default, so `DocummentSummaryBase()` is a valid
    # starting state.
    # NOTE(review): the `description` strings are runtime text, presumably shown
    # to the LLM through the output parser's format instructions, so edit them
    # with care. A class docstring is deliberately avoided because it would
    # presumably be added to the generated schema as well — TODO confirm.

    # The accumulated prose summary.
    running_summary: str = Field(
        default="",
        description='Running description of the document. Do not override; only update!',
    )
    # Short list of the key takeaways.
    main_ideas: List[str] = Field(
        default=[],
        description='Most important information from the document (max 3)',
    )
    # Questions the summary cannot answer yet.
    loose_ends: List[str] = Field(
        default=[],
        description="Open questions that would be good to incorporate into summary, but that are yet unknown (max 3)",
    )



In [None]:
# Prompt for one refinement step of the running summary.
# Template variables:
#   {info_base}           - the knowledge base accumulated so far
#   {format_instructions} - output-format instructions (filled in by `reextract`)
#   {input}               - the text of the chunk being folded in
# The adjacent string literals are concatenated into a single template string.

prompt_template = ChatPromptTemplate.from_template(
    "You are generating a running summary of the document. Make it readable by a technical user."
    " After this, the old knowledge base will be replaced by the new one. Make sure a reader can still understand everything."
    " Keep it short, but as dense and useful as possible! The information should flow from chunk to (loose ends or main ideas) to running_summary."
    " The updated knowledge base keep all of the information from running_summary here: {info_base}."
    "\n\n{format_instructions}. Follow the format precisely, including quotations and commas"
    "\n\nWithout losing any of the info, update the knowledge base with the following: {input}"
)



The `reextract` function builds a runnable chain that:
- Adds the parser's format instructions to the input, fills in the prompt, and sends it to the LLM.
- Cleans up the raw LLM text and parses it into the given `pydantic_class`.
- Returns the parsed object wrapped in a dictionary under the key `'info_base'`.

In [None]:
def reextract(pydantic_class, llm, prompt):
    '''
    Runnable extraction module.

    Builds a chain that returns a knowledge dictionary populated by
    slot-filling extraction.

    Args:
        pydantic_class: Model class the LLM output is parsed into.
        llm: Runnable that receives the filled-in prompt and yields a string.
        prompt: Prompt template; it is given a 'format_instructions' value in
            addition to whatever keys the caller passes in.

    Returns:
        A runnable whose output is ``{'info_base': <parsed pydantic_class instance>}``.
    '''
    parser = PydanticOutputParser(pydantic_object=pydantic_class)
    # Add the parser's format instructions to the incoming dictionary.
    instruct_merge = RunnableAssign({'format_instructions': lambda _: parser.get_format_instructions()})

    def preparse(string):
        # Restore the outer braces when the text contains none at all.
        if '{' not in string:
            string = '{' + string
        if '}' not in string:
            string = string + '}'
        # Strip backslash escapes in front of '_', '[' and ']' and flatten
        # newlines before the text is handed to the parser.
        string = (string
            .replace("\\_", "_")
            .replace("\n", " ")
            .replace("\\]", "]")
            .replace("\\[", "[")
        )
        # print(string)  ## Good for diagnostics
        return string

    # Return a dictionary with 'info_base' containing the parsed output
    return instruct_merge | prompt | llm | preparse | parser | RunnableLambda(lambda x: {'info_base': x})

In [None]:
# Most recent knowledge base produced by a summarizer run. It is kept at module
# level so that partial progress can be inspected from another cell.
latest_summary = ""

def RSummarizer(knowledge, llm, prompt, verbose=False):
    '''
    Build a runnable that folds a sequence of document chunks into a running summary.

    Args:
        knowledge: Initial knowledge-base object; its class is also used as the
            schema that every LLM response is parsed into.
        llm: Runnable passed through to `reextract`.
        prompt: Prompt template passed through to `reextract`.
        verbose: When True, print the chunk counter and the current knowledge
            base after every chunk, clearing the previous output each time.

    Returns:
        A RunnableLambda that accepts an iterable of objects exposing
        `page_content` and returns the final knowledge base. If the iterable is
        empty, `knowledge` is returned unchanged.
    '''

    def summarize_docs(docs):
        global latest_summary

        parse_chain = reextract(knowledge.__class__, llm, prompt)
        state = {'info_base': knowledge}

        for i, doc in enumerate(docs):
            # Feed the previous knowledge base back in together with the next chunk.
            state = parse_chain.invoke({'input': doc.page_content, 'info_base': state['info_base']})
            assert 'info_base' in state

            # Record progress on every step, regardless of `verbose`, so the
            # module-level snapshot is never stale or empty after a run.
            latest_summary = state['info_base']

            if verbose:
                print(f"Considered {i+1} documents")
                pprint(state['info_base'])
                # wait=True defers the clear until the next output arrives.
                clear_output(wait=True)

        return state['info_base']

    return RunnableLambda(summarize_docs)

# Chat model with max_tokens=4096 bound; the parser turns its reply into a plain string.
instruct_model = ChatNVIDIA(
    model="mistralai/mixtral-8x22b-instruct-v0.1",
).bind(max_tokens=4096)
instruct_llm = instruct_model | StrOutputParser()

## Take the first 15 document chunks and accumulate a DocummentSummaryBase
summarizer = RSummarizer(
    knowledge=DocummentSummaryBase(),
    llm=instruct_llm,
    prompt=prompt_template,
    verbose=True,
)
summary = summarizer.invoke(docs_split[:15])

Considered 15 documents


In [None]:
# Show the most recent summary snapshot recorded by the summarizer run above.
pprint(latest_summary)