# Extracting Metadata with LlamaIndex

---
CONCLUSIONS:
* Predefined Metadata Extractor modules work fairly well but the process is very slow
* Custom Metadata Extractor modules can be created. This has not been investigated in the current notebook.
* Pydantic Extractor seems to work well with OpenAIPydanticProgram; however, switching to the local GuidancePydanticProgram breaks the pipeline (https://github.com/run-llama/llama_index/issues/9914)
* GuidancePydanticProgram without the pipeline does not output sufficiently structured data:  OutputParserException: Failed to parse pydantic object from guidance program. Probably the LLM failed to produce data with right json schema

* NOTE: version for llama-cpp-python was upgraded to latest when installing llama-index-llms-llama-cpp. Latest version might give unstable output for {Guidance}. Stable output with llama-cpp-python==0.2.26
* llama-cpp-python with GPU:\
CUDACXX=/usr/local/cuda-12/bin/nvcc CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all-major" FORCE_CMAKE=1 pip install llama-cpp-python==0.2.26 --no-cache-dir --force-reinstall --upgrade
---
---

In [None]:
# !wget -O "data/10k-132.pdf" "https://www.dropbox.com/scl/fi/6dlqdk6e2k1mjhi8dee5j/uber.pdf?rlkey=2jyoe49bg2vwdlz30l76czq6g&dl=1"
# !wget -O "data/10k-vFinal.pdf" "https://www.dropbox.com/scl/fi/qn7g3vrk5mqb18ko4e5in/lyft.pdf?rlkey=j6jxtjwo8zbstdo4wz3ns8zoj&dl=1"

## Metadata Extractors modules
https://docs.llamaindex.ai/en/stable/examples/metadata_extraction/MetadataExtractionSEC/ \
https://docs.llamaindex.ai/en/stable/module_guides/indexing/metadata_extraction/ \
https://docs.llamaindex.ai/en/stable/examples/metadata_extraction/EntityExtractionClimate/ - Entity extractor

In [None]:
import nest_asyncio  # required for the ingestion pipeline
nest_asyncio.apply()

from llama_index.llms.llama_cpp import LlamaCPP

from llama_index.core.schema import MetadataMode
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    #BaseExtractor,
)
from llama_index.extractors.entity import EntityExtractor

from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline


In [None]:
# Local LlamaCPP model (Mistral-7B-Instruct GGUF file) that is passed as `llm`
# to the LLM-backed metadata extractors defined later in this section.
llm = LlamaCPP(
    model_path="/home/dorota/models/mistral-7b-instruct-v0.2.Q6_K.gguf",
    context_window=16384, # n_ctx=0
    max_new_tokens=1024,
    # presumably forwarded to llama-cpp-python to offload layers to the GPU — confirm
    model_kwargs={"n_gpu_layers": 33},
    verbose=False
) 

In [None]:
# Token-based splitter: chunk_size=512 with chunk_overlap=128, using a single
# space as the separator.
text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)

In [None]:
# Metadata extractors used by the ingestion pipeline. Every extractor except
# EntityExtractor is handed the local `llm`.
extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=2, llm=llm),
    EntityExtractor(prediction_threshold=0.5), # default model is tomaarsen/span-marker-mbert-base-multinerd
    # "prev" and "self" presumably select summaries of the previous and the current node — confirm
    SummaryExtractor(summaries=["prev", "self"], llm=llm),
    KeywordExtractor(keywords=10, llm=llm),
    # Placeholder: a commented-out CustomExtractor draft exists further down in this notebook.
    # CustomExtractor()
]

In [None]:
# Transformation order: the text splitter first, followed by every metadata extractor.
transformations = [text_splitter, *extractors]

In [None]:
# Load the PDF and keep only the first returned document
# (presumably to keep the slow extraction run short).
ARTICLE = SimpleDirectoryReader(
    input_files=[
        "/home/dorota/LLM-diploma-project/00_concept_tests/data/40001_2023_Article_1364.pdf"
    ]
).load_data()[:1]

In [None]:
# Build the ingestion pipeline from the splitter + extractors and run it on the
# loaded document(s); the resulting nodes are kept in `nodes`.
pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=ARTICLE)

In [None]:
# Display the metadata attached to the second node (index 1).
nodes[1].metadata


In [None]:
# https://docs.llamaindex.ai/en/stable/module_guides/indexing/metadata_extraction/
# TODO: try custom extractor
# from llama_index.core.extractors import BaseExtractor
# from typing import List, Dict

# class CustomExtractor(BaseExtractor):
#     async def aextract(self, nodes) -> List[Dict]:
#         metadata_list = [
#             {
#                 "custom": node.metadata["document_title"]
#                 + "\n"
#                 + node.metadata["excerpt_keywords"]
#             }
#             for node in nodes
#         ]
#         return metadata_list

## Pydantic Extractor with pipeline and GuidancePydanticProgram
https://docs.llamaindex.ai/en/stable/examples/metadata_extraction/PydanticExtractor/

In [None]:
import nest_asyncio
nest_asyncio.apply()

from pydantic import BaseModel, Field
from typing import List

from llama_index.llms.llama_cpp import LlamaCPP

In [None]:
class NodeMetadata(BaseModel):
    """Node metadata."""

    # All three fields are required (`...` as default).
    # NOTE(review): the class docstring and the field descriptions are presumably
    # exposed through the model schema that the extraction program uses, so
    # they are treated as runtime text and left unchanged — confirm.
    entities: List[str] = Field(..., description="Unique entities in this text chunk.")
    summary: str = Field(..., description="A concise summary of this text chunk.")
    contains_number: bool = Field(
        ...,
        description="Whether the text chunk contains any numbers (ints, floats, etc.)",
    )

In [None]:
# Local LlamaCPP model (Mistral-7B-Instruct GGUF file) for the Pydantic extractor section.
# NOTE(review): no later statement in this section passes `llm` to the Guidance
# program or to the extractor — confirm whether this instance is actually used.
llm = LlamaCPP(
    model_path="/home/dorota/models/mistral-7b-instruct-v0.2.Q6_K.gguf",
    context_window=16384, # n_ctx=0
    max_new_tokens=1024,
    # presumably forwarded to llama-cpp-python to offload layers to the GPU — confirm
    model_kwargs={"n_gpu_layers": 33},
    verbose=False
) 

In [None]:
from llama_index.program.guidance import GuidancePydanticProgram #from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core.extractors import PydanticProgramExtractor

# Extraction prompt handed to the extractor; it exposes the `{context_str}` and
# `{class_name}` placeholders.
EXTRACT_TEMPLATE_STR = """\
Here is the content of the section:
----------------
{context_str}
----------------
Given the contextual information, extract out a {class_name} object.\
"""

# The program's own template consists only of the "{input}" placeholder.
guidance_program = GuidancePydanticProgram.from_defaults(
    output_cls=NodeMetadata,
    prompt_template_str="{input}",
    # extract_template_str=EXTRACT_TEMPLATE_STR
)

# `input_key` uses the same name ("input") as the placeholder in the program template.
program_extractor = PydanticProgramExtractor(
    program=guidance_program,
    input_key="input",
    show_progress=True,
    extract_template_str=EXTRACT_TEMPLATE_STR,
)

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

# Load the PDF and keep only the first returned document.
ARTICLE = SimpleDirectoryReader(
    input_files=[
        "/home/dorota/LLM-diploma-project/00_concept_tests/data/40001_2023_Article_1364.pdf"
    ]
).load_data()[:1]

# Split into sentence-based nodes, then run the Pydantic program extractor on them.
node_parser = SentenceSplitter(chunk_size=1024)
pipeline = IngestionPipeline(transformations=[node_parser, program_extractor])
orig_nodes = pipeline.run(documents=ARTICLE)

# Known failure: the pipeline cannot be run with GuidancePydanticProgram.
# TypeError: GuidancePydanticProgram.program() missing 2 required positional arguments: 'tools_str' and 'query_str'

## Pydantic Extractor with GuidancePydanticProgram without pipeline

In [None]:
from pydantic import BaseModel, Field
from typing import List

In [None]:
class NodeMetadata(BaseModel):
    """Node metadata."""

    # All three fields are required (`...` as default).
    # NOTE(review): the class docstring and the field descriptions are presumably
    # exposed through the model schema that the extraction program uses, so
    # they are treated as runtime text and left unchanged — confirm.
    entities: List[str] = Field(..., description="Unique entities in this text chunk.")
    summary: str = Field(..., description="A concise summary of this text chunk.")
    contains_number: bool = Field(
        ...,
        description="Whether the text chunk contains any numbers (ints, floats, etc.)",
    )

In [None]:
from llama_index.llms.llama_cpp import LlamaCPP
import guidance
# The Guidance program needs a guidance model here: it has to be a
# guidance LlamaCppChat instance, not just a llama-index LlamaCPP.
llm = guidance.models.LlamaCppChat(
    "/home/dorota/models/mistral-7b-instruct-v0.2.Q6_K.gguf",
    n_gpu_layers=10,
    n_ctx=0,
)

In [None]:
from llama_index.program.guidance import GuidancePydanticProgram
from pypdf import PdfReader 
  
reader = PdfReader('/home/dorota/LLM-diploma-project/00_concept_tests/data/40001_2023_Article_1364.pdf')
num_pages = len(reader.pages)

# Only the first page is extracted; change range(1) to range(num_pages)
# to concatenate the text of the whole document.
TEXT = "".join(
    reader.pages[page_num].extract_text() for page_num in range(1)
)


# Prompt for the Guidance program. It is defined flush-left on purpose: when the
# triple-quoted literal is indented together with the surrounding call, every
# prompt line carries that indentation (and the last line trailing spaces)
# into the text sent to the model.
GUIDANCE_PROMPT_TEMPLATE_STR = """\
Here is the content of the section:
----------------
{{query_str}}
----------------
Given the contextual information, extract a pydantic object.\
"""

# Build the program around the NodeMetadata output class and the guidance chat model.
program = GuidancePydanticProgram(
    output_cls=NodeMetadata,
    prompt_template_str=GUIDANCE_PROMPT_TEMPLATE_STR,
    guidance_llm=llm,
    verbose=True,
)

# program() requires both `query_str` and `tools_str`; this prompt has no tools
# placeholder, so an empty string is passed for `tools_str`.
output = program(query_str=TEXT, tools_str='')

# Known failure:
# OutputParserException: Failed to parse pydantic object from guidance program. Probably the LLM failed to produce data with right json schema