In [1]:
import getpass
import json
import os

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

In [5]:
# 1. SETUP: Ensure your Google API Key is set
# The key must never be written into the notebook source. It is taken from the
# environment when already present; otherwise it is requested interactively
# (input is not echoed) and stored only in this process's environment.
# NOTE(review): a key was previously hardcoded in this cell; treat it as
# compromised and revoke/rotate it.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [9]:

# Fail fast when the API key is missing.
# An exception is raised instead of calling exit(): exit() is a helper meant for
# the interactive shell, and in a notebook it ends the kernel session rather
# than just stopping this cell. Raising halts the cell and shows the reason.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError("Please set your GOOGLE_API_KEY environment variable.")

# 2. DEFINE THE HTML CONTENT (Test Data)
# Hand-written contact-page fixture used as the input for the extraction run.
# It mixes the cases the prompt has to tell apart: a headquarters number, a fax
# number (to be ignored), outlet/branch/store numbers written in several
# formats, a non-Singapore (+60) number, and a footer number without clear
# HQ/outlet context.
# The inline HTML comments below are part of the string itself, so they are
# included in whatever text is built from this fixture.
html_test_content = """
<div id="contact-page">
    <header>
        <h1>Global Tech Singapore - Contact Us</h1>
    </header>
    
    <section class="main-office">
        <h2>Corporate Headquarters</h2>
        <p>123 Marina Bay Blvd, Tower 1</p>
        <p><strong>Tel:</strong> +65 6777 1234</p>
        <p><strong>Fax:</strong> 6777 1235 (Facsimile)</p> <!-- Should be ignored -->
    </section>

    <section class="branches">
        <h3>Our Locations</h3>
        
        <div class="store">
            <h4>Jurong Point Outlet</h4>
            <p>Call us: 9123 4567</p>
        </div>

        <div class="store">
            <h4>Tampines Mall Branch</h4>
            <p>Phone: 65 6789 0000</p>
        </div>
        
        <div class="store">
            <h4>Orchard Store</h4>
            <p>Hotline: 66669999</p> 
        </div>

        <div class="overseas">
            <h4>Malaysia Support</h4>
            <p>+60 3 2123 4567</p> <!-- Should be ignored (Non-SG) -->
        </div>
    </section>
    
    <footer>
        <p>General Enquiries: +65 6222 3333</p> <!-- Context unclear, might be HQ or Unclassified -->
    </footer>
</div>
"""

# 3. DEFINE THE PROMPT TEMPLATE
# Instruction prompt for the extraction model, written as a template string.
# `{content}` at the end is the only placeholder; the braces of the JSON example
# in the OUTPUT FORMAT section are doubled ({{ }}) so that they are emitted as
# literal braces instead of being read as template variables.
prompt_text = """
You are an Information Extraction Agent.

Your input will be a block of HTML from a scraped webpage.
Your task is:
1. Extract ONLY valid Singapore phone numbers.
2. Classify each number as HQ or Outlet based on context.
3. Return a strict JSON result.

-------------------------
SINGAPORE NUMBER RULES
-------------------------
Valid formats:
- +65 XXXXXXXX
- +65 XXXX XXXX
- 65 XXXXXXXX
- 6XXXXXXX, 8XXXXXXX, 9XXXXXXX

Reject:
- Fax numbers ("Fax" or "Facsimile")
- Non-Singapore numbers
- Partial or placeholder numbers

-------------------------
CLASSIFICATION RULES
-------------------------
HQ indicators:
HQ, "Head Office", "Corporate", "Main Office", "(HQ)"

Outlet indicators:
Outlet, "Branch", "Store", location names, "(Outlet)"

If only one number exists → classify as HQ.
If unclear → put into unclassified.

Use nearby text, section titles, headings, labels, or parentheses.

-------------------------
OUTPUT FORMAT (STRICT)
-------------------------
{{
"hq_numbers": [],
"outlet_numbers": [],
"unclassified_numbers": [],
"confidence_score": 0.0
}}

Rules:
- Do NOT hallucinate numbers.
- Only use numbers found in the HTML.
- Preserve digits exactly.
- Return ONLY valid JSON, no markdown formatting.

-------------------------
HTML CONTENT TO ANALYZE
-------------------------
{content}
"""

# Note: We escaped the JSON braces in the prompt using double braces {{ }} 
# so LangChain doesn't confuse them with variables.

# 4. INITIALIZE THE MODEL AND CHAIN
# Output side: JsonOutputParser converts the model reply into a Python
# dictionary, stripping markdown code blocks.
parser = JsonOutputParser()

# Input side: a template with a single variable, "content", which receives the
# HTML to analyze.
prompt = PromptTemplate(input_variables=["content"], template=prompt_text)

# Model: Gemini 2.5 Flash, with temperature 0 for strict extraction.
llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-2.5-flash")

# Pipeline: fill in the prompt, call the model, parse the reply as JSON.
chain = prompt | llm | parser

# 5. RUN THE CHAIN
# Invoke the pipeline on the HTML fixture and pretty-print the parsed result.
# Any exception raised along the way is reported as a single-line message
# rather than a traceback.
try:
    print("Analyzing HTML content with Gemini...")
    result = chain.invoke(dict(content=html_test_content))

    print("\n--- EXTRACTION RESULT ---")
    formatted_result = json.dumps(result, indent=4)
    print(formatted_result)

except Exception as err:
    print(f"Error occurred: {err}")

Analyzing HTML content with Gemini...

--- EXTRACTION RESULT ---
{
    "hq_numbers": [
        "+65 6777 1234"
    ],
    "outlet_numbers": [
        "9123 4567",
        "65 6789 0000",
        "66669999"
    ],
    "unclassified_numbers": [
        "+65 6222 3333"
    ],
    "confidence_score": 1.0
}
