In [1]:
from llama_index import SimpleDirectoryReader
from llama_index.ingestion import IngestionPipeline
from llama_index.node_parser import CodeSplitter, TokenTextSplitter
from llama_index.extractors import TitleExtractor, QuestionsAnsweredExtractor, EntityExtractor
from setup_llm import load_llm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load raw data from directory
code_documents = SimpleDirectoryReader(
    input_dir="./data",
    required_exts=[".go"],
    recursive=True,
).load_data()

txt_documents = SimpleDirectoryReader(
    input_dir="./data",
    required_exts=[".md"],
    recursive=True,
).load_data()

print(f"Loaded {len(code_documents)} documents")
print(f"Loaded {len(txt_documents)} documents")

Loaded 53 documents
Loaded 2 documents


In [4]:
# Create parser for code and txt
code_parser = CodeSplitter.from_defaults(
    language="go",
)

txt_parser = TokenTextSplitter.from_defaults()

In [2]:
llm = load_llm()

Loading checkpoint shards: 100%|██████████| 8/8 [00:06<00:00,  1.30it/s]


In [5]:
# Define the metadata extractors
title_extractor = TitleExtractor(llm=llm)
qa_extractor = QuestionsAnsweredExtractor(llm=llm, questions=3)
en_extractor = EntityExtractor()

In [6]:
# Define the ingestion pipeline
pipeline_code = IngestionPipeline(
    transformations=[code_parser, title_extractor, qa_extractor, en_extractor],
)

pipeline_txt = IngestionPipeline(
    transformations=[txt_parser, title_extractor, qa_extractor, en_extractor],
)

In [7]:

nodes_code = pipeline_code.run(
    documents=code_documents,
    in_place=True,
    show_progress=True,
)
nodes_txt = await pipeline_txt.run(
    documents=txt_documents,
    in_place=True,
    show_progress=True,
)


Parsing nodes: 100%|██████████| 53/53 [00:00<00:00, 1210.97it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 3/3 [00:08<00:00,  2.86s/it]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 3/3 [00:02<00:00,  1.09it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id

TypeError: object list can't be used in 'await' expression

In [5]:
from llama_index import ServiceContext, VectorStoreIndex, StorageContext

auto_merging_context_c = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-large-en-v1.5",
    node_parser=code_parser,
)

auto_merging_context_t = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-large-en-v1.5",
    node_parser=txt_parser,
)

storage_context_c = StorageContext.from_defaults()
storage_context_c.docstore.add_documents(nodes_code)

storage_context_t = StorageContext.from_defaults()
storage_context_t.docstore.add_documents(nodes_txt)

automerging_index = VectorStoreIndex(
    nodes=nodes_code,
    storage_context=storage_context_c,
    service_context=auto_merging_context_c,
)

automerging_index_t = VectorStoreIndex(
    nodes=nodes_txt,
    storage_context=storage_context_t,
    service_context=auto_merging_context_t,
)

automerging_index.storage_context.persist(persist_dir="./merging_index_c")
automerging_index_t.storage_context.persist(persist_dir="./merging_index_t")

config.json: 100%|██████████| 779/779 [00:00<00:00, 5.94MB/s]
model.safetensors: 100%|██████████| 1.34G/1.34G [00:13<00:00, 96.7MB/s]
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 4.28MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.15MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.77MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 671kB/s]


NameError: name 'nodes_code' is not defined