# Demo: extract data from a PDF source and export it in a structured format

In [2]:
import os
import sys

import pandas as pd

# Directory that contains the project-local `llm_helper` module. Set the
# LLM_HELPER_DIR environment variable to point at the checkout on machines where
# it lives elsewhere; the original Windows location remains the default.
LLM_HELPER_DIR = os.environ.get("LLM_HELPER_DIR", r'C:\Repositories others\LLM-data-processer')
sys.path.insert(0, LLM_HELPER_DIR)

from llm_helper import read_pdf2text

In [3]:
# Load the source PDF through the project-local helper; presumably this returns
# the document's extracted text as a single string — TODO confirm in llm_helper.
pdf_text = read_pdf2text('./data/CarnotBattery_Wikipedia.pdf')

## main code (API provider: Google)

In [4]:
import json
from typing import List, Dict, Any

# Ensure you have the necessary langchain packages installed:
# pip install langchain-core langchain-google-genai langchain-huggingface pydantic

# --- Pydantic Schema Definition ---
from pydantic import BaseModel, Field
from langchain_core.exceptions import OutputParserException

# NOTE(review): FieldInfoV1 is not referenced in the visible cells — confirm it
# is still needed. The import must sit at column 0 like the others; an indented
# top-level statement is rejected with IndentationError.
from pydantic.v1.fields import FieldInfo as FieldInfoV1


### Define technology schema

In [5]:
## define technology schema

# Target schema for the extraction. It is handed to JsonOutputParser
# (pydantic_object=...), and that parser's format instructions are embedded in
# the prompts sent to the LLM.
# NOTE(review): pydantic normally copies the class docstring and every Field
# description into the generated JSON schema, so rewording any of them would
# change the prompt text — confirm before editing.
# NOTE(review): the recorded run returns a plain dict, so the annotations below
# do not appear to be enforced on the parsed output — confirm.
class TechnologyStorage(BaseModel):
    """Schema for a storage technology."""
    tech_name: str = Field(description="The name of the storage technology.")
    # capacity: float = Field(description="The storage capacity in appropriate units.")
    tech_description: str = Field(description="A brief description of the storage technology.")
    # Free-text efficiency statement (a range or a single value, per the description).
    efficiency_text: str = Field(description="The text efficiency of the storage technology, could be a range or fixed value.")
    # NOTE(review): no unit is specified for the three numeric fields; the recorded
    # run produced percentages (57 / 40 / 81) — confirm whether percent or fraction
    # is intended.
    efficiency_avg: float = Field(description="The average efficiency of the storage technology.")
    efficiency_low: float = Field(description="The lower bound of the efficiency of the storage technology.")
    efficiency_high: float = Field(description="The upper bound of the efficiency of the storage technology.")

### LangChain components

In [6]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI

# JSON parser whose format instructions are derived from the TechnologyStorage schema.
parser = JsonOutputParser(pydantic_object=TechnologyStorage)

# Google Gemini chat model; the API key is read from the GEMINI_API_KEY
# environment variable (None is passed through when it is unset).
llm = ChatGoogleGenerativeAI(
    temperature=0.0,
    google_api_key=os.environ.get("GEMINI_API_KEY"),
    model="gemini-2.5-flash",
)

### Prompt definitions

In [7]:
# 1. Base prompt: asks the model to describe {technology_name} using only the
#    supplied {info_source}, formatted according to {format_instructions}.
#    The first answer may still be malformed JSON.
base_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # Adjacent string literals are concatenated by Python. Each sentence
            # ends with an explicit separator so that no two sentences run
            # together in the final prompt.
            "You are an energy technology expert. Generate a simple description for the requested storage technology. "
            "Your output **MUST** come from the following information source and be in the specified JSON format. "
            "If the value cannot be found in the information source, please indicate it clearly in the output."
            "\n\nInformation Source:\n{info_source}",
        ),
        (
            "human",
            "Generate a simple description for {technology_name}. Your output format instructions are:\n{format_instructions}",
        ),
    ]
)

# 2. Repair prompt: only used when the previous answer could not be parsed.
#    Expects {technology_name}, {format_instructions} and {malformed_output}.
fix_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert JSON format fixer. The previous JSON output failed to parse against the required Pydantic schema."
        " **You must correct the JSON** to strictly adhere to the schema. The required format instructions are: {format_instructions}",
    ),
    (
        "human",
        "The original user request was: 'Generate a simple description for {technology_name}'. "
        "The malformed output was: '{malformed_output}'. Correct the JSON and only output the fixed JSON object.",
    ),
])

### Core Logic Function

In [8]:
def get_tech_info_json(technology_name: str, info_source: str, max_retries: int = 3) -> Dict[str, Any]:
    """
    Ask the LLM for a JSON description of a technology and parse it, asking the
    LLM to repair its own output whenever parsing fails.

    Args:
        technology_name: Name substituted into the base and the fix prompt.
        info_source: Source text the base prompt tells the model to draw from.
        max_retries: Total number of parse attempts. After every failed attempt
            except the last one, the fix prompt is sent to the LLM and its
            answer replaces the output to be parsed.

    Returns:
        Whatever ``parser.parse`` yields for the first output that parses. In
        the recorded run of this notebook that is a plain dict, not a
        ``TechnologyStorage`` instance.

    Raises:
        OutputParserException: if none of the attempts produced parseable
            output (also when ``max_retries`` is less than 1). The last parser
            error, if any, is attached as the cause.
    """
    print(f"Attempting to generate technology description for: **{technology_name}**")

    # The same instructions go into the base prompt and every fix prompt.
    format_instructions = parser.get_format_instructions()

    # 1. First generation with the base prompt (output may be malformed JSON).
    initial_response = (base_prompt | llm).invoke({
        "technology_name": technology_name,
        "info_source": info_source,
        "format_instructions": format_instructions,
    })
    json_output = initial_response.content

    print(f"Initial JSON Output:\n{json_output}")

    # 2. Parse, and on failure ask the LLM to repair its previous answer.
    last_error = None
    for attempt in range(max_retries):
        try:
            parsed = parser.parse(json_output)
        except OutputParserException as e:
            last_error = e
            if attempt >= max_retries - 1:
                # Out of attempts: fall through to the final raise below.
                break

            print(f"❌ Attempt {attempt + 1}: Parsing failed (Error: {e}). Retrying with fix prompt...")

            fix_response = (fix_prompt | llm).invoke({
                "technology_name": technology_name,
                "format_instructions": format_instructions,
                "malformed_output": json_output,
            })

            # The repaired text becomes the input of the next parse attempt.
            json_output = fix_response.content

            print(f"Fixed JSON Output:\n{json_output}")
        else:
            # Report success only once parsing has actually succeeded.
            print(f"\n✅ Attempt {attempt + 1}: Parsing successful!")
            return parsed

    raise OutputParserException(
        f"Failed to parse output after {max_retries} retries. Last output: {json_output}"
    ) from last_error

### Execute the script

In [9]:
tech_to_generate = "Carnot Battery"

# `pdf_text` was already loaded from './data/CarnotBattery_Wikipedia.pdf' in the
# data-loading cell near the top of the notebook, so it is reused here instead
# of parsing the PDF a second time.

# Reset first so that a failed run cannot leave a stale (or undefined) result
# behind for the cell that displays it.
reliable_tech_description = None

try:
    reliable_tech_description = get_tech_info_json(tech_to_generate, pdf_text)

except OutputParserException as e:
    print("\n" + "="*50)
    print(f"🛑 CRITICAL FAILURE: {e}")
    print("="*50)

Attempting to generate technology description for: **Carnot Battery**
Initial JSON Output:
{"tech_name": "Carnot battery", "tech_description": "A Carnot battery is a type of energy storage system that stores electricity in thermal energy storage. During the charging process, electricity is converted into heat and kept in heat storage. During the discharging process, the stored heat is converted back into electricity. It is also known as power-to-heat-to-power.", "efficiency_text": "Carnot batteries generally aim for a 40-70% efficiency range, with an optimal design achieving 57% efficiency and efficiencies up to 81% possible.", "efficiency_avg": 57, "efficiency_low": 40, "efficiency_high": 81}

✅ Attempt 1: Parsing successful!


In [10]:
# Display the result of the extraction cell above (a plain dict in the recorded run).
reliable_tech_description

{'tech_name': 'Carnot battery',
 'tech_description': 'A Carnot battery is a type of energy storage system that stores electricity in thermal energy storage. During the charging process, electricity is converted into heat and kept in heat storage. During the discharging process, the stored heat is converted back into electricity. It is also known as power-to-heat-to-power.',
 'efficiency_text': 'Carnot batteries generally aim for a 40-70% efficiency range, with an optimal design achieving 57% efficiency and efficiencies up to 81% possible.',
 'efficiency_avg': 57,
 'efficiency_low': 40,
 'efficiency_high': 81}