In [None]:
# https://docs.llamaindex.ai/en/stable/examples/index_structs/doc_summary/DocSummary/
import os
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
import nest_asyncio

nest_asyncio.apply()

In [3]:
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.core.node_parser import SentenceSplitter

In [6]:
wiki_titles = ["Toronto", "Seattle", "Chicago", "Boston", "Houston"]
url = "https://en.wikipedia.org/w/api.php"

In [7]:
from pathlib import Path
import requests

for title in wiki_titles:
    response = requests.get(
        url = url,
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            'explaintext': True,
        }
    ).json()
    page = next(iter(response['query']['pages'].values()))
    wiki_text = page['extract']

    data_path = Path("data")
    if not data_path.exists():
        data_path.mkdir()
    
    with open(data_path / f"{title}.txt", "w") as f:
        f.write(wiki_text)

In [8]:
city_docs = []
for wiki_title in wiki_titles:
    docs = SimpleDirectoryReader(
        input_files=[f"data/{wiki_title}.txt"]
    ).load_data()
    docs[0].doc_id = wiki_title
    city_docs.extend(docs)

In [14]:
gemini = GoogleGenAI(model = 'gemini-2.0-flash')
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=100)

In [16]:
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_google_genai import GoogleGenerativeAIEmbeddings

response_synthesizer = get_response_synthesizer(
    llm = gemini,
    response_mode="tree_summarize",
    use_async = True
)

google_embed = GoogleGenerativeAIEmbeddings(
    model = 'models/embedding-001'
)
embed_model = LangchainEmbedding(GoogleGenerativeAIEmbeddings)
doc_summary_index = DocumentSummaryIndex(
    city_docs,
    llm = gemini,
    embed_model = embed_model,
    transformations = [splitter],
    response_synthesizer = response_synthesizer,
    show_progress = True
)

ValueError: ref_doc_id of node cannot be None when building a document summary index