# Demo: extract data from a PDF source and export it in a structured format

In [1]:
import sys
import os
# Put a local directory first on the module search path before importing llm_helper
# (presumably this is the checkout that contains llm_helper — confirm).
# NOTE(review): this is a machine-specific absolute Windows path; adjust it for
# your environment before running the notebook.
sys.path.insert(0, r'C:\Repositories others\LLM-data-processer')
from llm_helper import read_pdf2text, InfoExtractor

  from pydantic.v1.fields import FieldInfo as FieldInfoV1


## Main code (API provider: Google)

### Define technology schema

In [2]:
# Schema describing the storage-technology record.
# Each row is (field name, field type, description); the rows are expanded below
# into the nested {'field_type': ..., 'description': ...} layout, in this order.
_storage_field_rows = [
    ('tech_name', 'str', 'The name of the storage technology.'),
    ('tech_description', 'str', 'A brief description of the storage technology.'),
    ('efficiency_text', 'str', 'The text efficiency of the storage technology, could be a range or fixed value.'),
    ('efficiency_avg', 'float', 'The average efficiency of the storage technology.'),
    ('efficiency_low', 'float', 'The lower bound of the efficiency of the storage technology.'),
    ('efficiency_high', 'float', 'The upper bound of the efficiency of the storage technology.'),
]

data_schema_tech_storage = {
    'tech_type': 'TechnologyStorage',
    'fields': {
        field_name: {'field_type': field_type, 'description': description}
        for field_name, field_type, description in _storage_field_rows
    },
}

In [3]:
# Define prompt templates as dictionaries.
# The wording can be changed freely, but do NOT remove the placeholders written
# in braces: {info_source}, {technology_name}, {format_instructions} and
# {malformed_output} (verify the full list expected by llm_helper).

# Base prompt for the extraction request. "system" carries the instructions and
# the information source; "human" carries the request and the format instructions.
base_prompt_dict = {
    "system": (
        "You are an energy technology expert. Generate a simple description for the requested storage technology. "
        # Adjacent literals are joined with no separator, so each sentence
        # needs its own trailing space to avoid "format.If the value".
        "Your output **MUST** come from the following information source and be in the specified JSON format. "
        "If the value cannot be found in the information source, please indicate it clearly in the output."
        "\n\nInformation Source:\n{info_source}"
    ),
    "human": "Generate a simple description for {technology_name}. Your output format instructions are:\n{format_instructions}"
}

# Repair prompt: asks the model to correct a JSON output that failed to parse
# against the schema. Keep the {format_instructions}, {technology_name} and
# {malformed_output} placeholders intact.
fix_prompt_dict = dict(
    system=(
        "You are an expert JSON format fixer. "
        "The previous JSON output failed to parse against the required Pydantic schema. "
        "**You must correct the JSON** to strictly adhere to the schema. "
        "The required format instructions are: {format_instructions}"
    ),
    human=(
        "The original user request was: "
        "'Generate a simple description for {technology_name}'. "
        "The malformed output was: '{malformed_output}'. "
        "Correct the JSON and only output the fixed JSON object."
    ),
)

In [None]:
# Read the PDF that serves as the information source.
# The path is relative to the notebook's working directory.
PDF_SOURCE_PATH = './data/CarnotBattery_Wikipedia.pdf'
pdf_text = read_pdf2text(PDF_SOURCE_PATH)

In [None]:
ie = InfoExtractor()    # create an InfoExtractor instance

# Load the three components the extractor needs, in this order:
# data schema, prompt templates, and information source.
ie.load_data_schema(data_schema_tech_storage)
ie.load_prompt_templates(base_prompt_dict, fix_prompt_dict)
# Presumably the first argument is the technology name that fills
# {technology_name} and the second is the source text that fills
# {info_source} — confirm in llm_helper.
ie.load_info_source("Carnot Battery", pdf_text)

# Run the data extraction. Because this is the last expression in the cell,
# any non-None return value is displayed as the cell output.
ie.extract_tech_info()