In [1]:
from bs4 import BeautifulSoup
from collections import defaultdict
import re

In [None]:
def parse_html_chunks(html: str):
    """Split an HTML document into newline-joined text chunks, one per section.

    Only the direct children of ``<body>`` (or of the document root when the
    markup has no ``<body>``) are visited. Every child that carries text adds
    one line of the form ``[TAG] class names :: text`` to the chunk being
    built, and a new chunk is started whenever a child looks like the start of
    a section (see ``is_new_section``).

    Args:
        html: Raw HTML markup.

    Returns:
        A list of chunk strings in document order. The list is empty when no
        visited element contains text.
    """
    soup = BeautifulSoup(html, "html.parser")
    chunks = []
    current_chunk = []

    def flush_chunk():
        """Close the chunk being built (if it has any lines) and start a new one."""
        nonlocal current_chunk
        if current_chunk:
            chunks.append("\n".join(current_chunk))
            current_chunk = []

    def is_new_section(tag):
        """Return True for h1-h4 tags and for divs whose class string contains a section keyword."""
        if tag.name in ('h1', 'h2', 'h3', 'h4'):
            return True
        if tag.name == 'div':
            class_text = " ".join(tag.get('class', []))
            return any(kw in class_text for kw in ('heading', 'title', 'section', 'card'))
        return False

    # Walk the direct children only; nested structure is flattened into text.
    root = soup.body if soup.body is not None else soup
    for tag in root.find_all(recursive=False):
        # Script and style elements carry no readable content.
        if tag.name in ('script', 'style'):
            continue

        # Join nested text nodes with a space. Without a separator the text of
        # sibling elements is glued together (e.g. "Price" + "$40" -> "Price$40").
        text = tag.get_text(" ", strip=True)
        if not text:
            continue

        # A section start closes the previous chunk before this line is added.
        if is_new_section(tag):
            flush_chunk()
        current_chunk.append(f"[{tag.name.upper()}] {' '.join(tag.get('class', []))} :: {text}")

    flush_chunk()
    return chunks

In [None]:
# --- Example Usage ---
# Read the HTML file from disk and split it into section chunks.
with open("tracphone_40_dollar_20_gb.html", "r", encoding="utf-8") as file:
    html_data = file.read()

chunks = parse_html_chunks(html_data)
# Debug aid: uncomment to print every chunk under a numbered banner.
# for i, chunk in enumerate(chunks, 1):
#     print(f"\n--- Chunk #{i} ---\n{chunk}")
#     print("\n\n")
# Alternative kept for reference: keep only the text before the first "Gold Unlimited".
# to_process = ' '.join(chunks).split("Gold Unlimited")[0]
# print(to_process)
# Flatten all chunks into one space-separated string.
to_process = ' '.join(chunks)

In [None]:
# Display the flattened text that is later sent to the model as the user message.
to_process

In [3]:
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.chains import create_extraction_chain_pydantic
from langchain_community.llms import Ollama

In [4]:
from ollama import chat

In [None]:
# System prompt for the summarisation call: instructs the model to turn noisy
# page text into grouped, bulleted plain text or markdown.
system_message = """
You are a content transformation assistant. Your task is to extract and organize human-readable, structured summaries from cluttered or semi-structured input such as raw HTML, UI text dumps, or annotated logs.

Follow these strict formatting and behavior rules:

Summarize core content clearly: Identify key concepts like plan names, prices, features, limitations, and promotions. Discard irrelevant UI elements like buttons, form inputs, or JavaScript tokens.

Group similar content: If multiple offerings are present (e.g. phone plans), group them under clearly named headings (e.g. "Platinum Unlimited Plan") and present in a consistent format.

Use bullet points: Organize extracted features, prices, and benefits using clean, consistent bullet points under each group.

Include general/common information separately: If certain features or disclaimers apply to all items (e.g., international calling in all plans), extract and list them under a "General Features" or "Common Info" section.

Ignore noise: Remove or ignore tokens like template placeholders (#priceDollar, {{...}}), navigation links, form elements, marketing filler, or inline formatting.

Output must be plain text or markdown, not HTML or code.

Example input types you may receive include: HTML blobs, noisy UI strings, copied marketing text, or raw component-rendered content.
"""

In [None]:
# Send the flattened page text to the gemma3:4b model together with the
# summarisation system prompt, then print the text of the reply.
response = chat(
    model='gemma3:4b',
    messages=[
        dict(role="system", content=system_message),
        dict(role="user", content=to_process),
    ],
)
print(response.message.content)

## Creating Structured JSON using Pydantic schema definitions

In [7]:
# Hard-coded markdown plan summary used as input for the JSON-extraction step.
# NOTE(review): presumably a saved copy of an earlier model response, so the
# extraction can run without repeating the summarisation call — confirm.
tracphone_message_content = """## Tracfone Broadband Plan Summary - $40 Unlimited Talk & Text

**Plan Details:**

*   **Plan Name:** Broadband Unlimited Talk and Text with 20GB of Data
*   **Price:** $40.00 per month
*   **Hotspot Capable:** Yes
*   **ID Protection:** Included
*   **Contract Required:** No

**Additional Charges & Terms:**

*   **Activation Fee:** $0.00
*   **Provider Monthly Fees:** $0.00
*   **One-Time Fees at Purchase:** Varies
*   **Federal Universal Service Fund (FUSF):** $0.23
*   **Regulatory Cost Recovery:** $0.08
*   **Early Termination Fee:** $0.00
*   **Government Taxes:** Varies by location

**Speeds:**

*   **Typical Download Speed:** 35-143 Mbps (5G)
*   **Typical Upload Speed:** 5-31 Mbps
*   **Typical Latency:** 42-64 ms

**Data Included:**

*   20 GB of Data

**General Features:**

*   Speeds provided are typical and may vary.
*   Speeds are based on 5G network.
*   Access to FCC Consumer Resource Center: fcc.gov/consumer"""

In [9]:
# Text handed to the JSON-extraction call. The hard-coded summary is used here;
# the trailing comment names the live alternative from the summarisation cell.
sample_text_content_input_for_json = tracphone_message_content  # alternative: response.message.content

In [11]:
from pydantic import BaseModel, HttpUrl
from typing import Optional, List

# One named charge attached to a plan.
class Fee(BaseModel):
    name: str  # label of the charge
    amount: str  # kept as text; the extraction prompt asks for the dollar sign to be included

# Typical performance figures for one network type; every value is free text
# (the extraction prompt describes them as ranges such as "number-number Mbps").
class SpeedInfo(BaseModel):
    network_type: str  # name of the network type
    typical_download_speed: str
    typical_upload_speed: str
    typical_latency: str

# A titled link to one policy document.
class NetworkPolicy(BaseModel):
    title: str  # policy title
    policy_link: HttpUrl  # validated as a URL by pydantic

# Customer-support contact details for the provider.
class CustomerSupport(BaseModel):
    website: HttpUrl  # support page URL
    phone_number: str  # kept as text, no format is enforced here

# Top-level record for one plan. Its JSON schema (model_json_schema()) is passed
# as the `format` argument of the extraction chat call later in this notebook.
# NOTE(review): the Optional[...] fields have no default value — under pydantic v2
# they are presumably still required (but nullable); confirm that is intended.
class BroadbandFacts(BaseModel):
    provider: str
    plan_name: str
    disclosure_title: str
    monthly_price: str  # text, dollar sign included per the extraction prompt
    monthly_price_notes: List[str]
    # Fee categories, each a list of name/amount pairs.
    additional_fees: List[Fee]
    one_time_fees: List[Fee]
    early_termination: List[Fee]
    federal_universal_fund_fee: List[Fee]
    regulatory_cost_recovery: List[Fee]
    government_taxes: List[Fee]
    discounts_and_bundles_link: Optional[HttpUrl]
    speeds: List[SpeedInfo]  # one entry per network type
    data_included: str
    additional_data_charges: str
    network_policies: List[NetworkPolicy]
    customer_support: CustomerSupport
    fcc_consumer_info_link: HttpUrl
    reference_code: Optional[str]

In [13]:
# System prompt for the JSON-extraction call; it describes every field of the
# BroadbandFacts schema. The (misspelled) variable name is kept because the
# extraction cell refers to it. This is a plain string: the text contains no
# placeholders, so no f-string prefix is needed.
impoved_system_message = """
Extract the following structured data from the provided text (raw HTML or a plain-text/markdown summary of it). The extracted information should match the following fields exactly:

provider (string): Name of the broadband provider.

plan_name (string): Name of the plan.

disclosure_title (string): Title of the disclosure (e.g., "Mobile Broadband Consumer Disclosure").

monthly_price (string): Monthly cost of the plan (including the dollar sign).

monthly_price_notes (list of strings): Any notes about the monthly price (such as if it's not introductory or doesn't require a contract).

additional_fees (list of objects): Each object should have:

name (string): Name of the recurring fee (e.g., "Provider Monthly Fees").

amount (string): Fee amount (including the dollar sign).

one_time_fees (list of objects): Each object should have:

name (string): Name of the one-time fee (e.g., "Activation Fee").

amount (string): Fee amount (including the dollar sign).

early_termination (list of objects): Each object should have:

name (string): Name of the early termination fee (e.g., "Early Termination Fee").

amount (string): Fee amount (including the dollar sign).

federal_universal_fund_fee (list of objects): Each object should have:

name (string): Name of the fee (e.g., "Federal Universal Service Fund (FUSF)").

amount (string): Fee amount (including the dollar sign).

regulatory_cost_recovery (list of objects): Each object should have:

name (string): Name of the fee (e.g., "Regulatory Cost Recovery").

amount (string): Fee amount (including the dollar sign).

government_taxes (list of objects): Each object should have:

name (string): Name of the tax (e.g., "Government Taxes").

amount (string): Tax amount (including the dollar sign), or the stated note (e.g., "Varies by location") when no fixed amount is given.

discounts_and_bundles_link (URL): URL link for discounts and bundles information (if available).

speeds (list of objects): Each object should represent a network type (5G Ultra Wideband, 5G, 4G LTE) with:

network_type (string): Name of the network type.

typical_download_speed (string): Typical download speed range (e.g., "number-number Mbps").

typical_upload_speed (string): Typical upload speed range (e.g., "number-number Mbps").

typical_latency (string): Typical latency range (e.g., "number-number ms").

data_included (string): Data included with the plan (e.g., "Unlimited").

additional_data_charges (string): Cost for additional data usage (including the dollar sign).

network_policies (list of objects): Each object should have:

title (string): Policy title (e.g., "Network Management", "Privacy").

policy_link (URL): URL link to the policy.

customer_support (object):

website (URL): URL to the customer support page.

phone_number (string): Customer support phone number.

fcc_consumer_info_link (URL): Link to the FCC consumer information page.

reference_code (string): Reference code shown at the end of the facts (if available).

Important Notes:

Clean up extra whitespace and line breaks from extracted text.

Preserve formatting like dollar signs ("$") where present.

All URLs should be extracted completely and valid.

If a field is not present, leave it as null (or empty for optional fields)."""

In [None]:
# Ask gemma3:4b for output constrained to the BroadbandFacts JSON schema,
# using a low sampling temperature.
jsonresponse = chat(
    model="gemma3:4b",
    messages=[
        dict(role="system", content=impoved_system_message),
        dict(role="user", content=sample_text_content_input_for_json),
    ],
    options=dict(temperature=0.1),
    format=BroadbandFacts.model_json_schema(),
)

In [None]:
# Show the raw text of the schema-constrained response.
print(jsonresponse.message.content)

## Uploading to Neo4j

In [None]:
# Raw response text (a string) from the extraction call; parsed in the next cell.
line5_json_data = jsonresponse.message.content

In [None]:
import json  # imported here: the notebook's own `import json` sits in a later cell

# Keep only the fields that come before "network_policies" in the model output.
# Parsing the whole document and then filtering does not depend on how the
# model happened to indent or space its JSON.
try:
    _parsed_facts = json.loads(line5_json_data)
except json.JSONDecodeError:
    # Fallback: the original string surgery, which still works when the text
    # after "network_policies" is malformed but the head uses 2-space indenting.
    line5_input_json_data = json.loads(line5_json_data.split('network_policies')[0].strip()[:-5]+"}")
else:
    line5_input_json_data = {}
    for _key, _value in _parsed_facts.items():
        if _key == 'network_policies':
            break
        line5_input_json_data[_key] = _value

In [None]:
# Display the parsed dict that will be uploaded.
line5_input_json_data

In [None]:
from neo4j import GraphDatabase
import json

In [None]:
import os

# Neo4j connection settings. Set NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD in the
# environment instead of editing this cell, so real credentials never end up in
# the saved notebook. The values below are only fallbacks.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://3ea3293d.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "<API KEY>")

In [None]:
def upload_to_neo4j(data):
    """Open a Neo4j connection and write ``data`` in one write transaction.

    Args:
        data: Mapping of plan fields; it is passed unchanged to
            ``_upload_data_tx``.
    """
    # Using the driver as a context manager closes it even when the write
    # raises; the original only closed it on the success path.
    with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
        with driver.session() as session:
            # `write_transaction` is deprecated in favour of `execute_write`;
            # prefer the new name and fall back for older driver versions.
            run_write = getattr(session, "execute_write", None) or session.write_transaction
            run_write(_upload_data_tx, data)


def _upload_data_tx(tx, data):
    provider = data.get('provider')
    plan_name = data.get('plan_name')

    # Step 1: Create provider and plan, and their relationship
    tx.run("""
        MERGE (p:Provider {name: $provider})
        MERGE (pl:Plan {
            name: $plan_name,
            disclosure_title: $disclosure_title,
            monthly_price: $monthly_price,
            monthly_price_notes: $monthly_price_notes,
            discounts_and_bundles_link: $discounts_and_bundles_link,
            data_included: $data_included,
            additional_data_charges: $additional_data_charges
        })
        MERGE (p)-[:OFFERS]->(pl)
    """, {
        "provider": provider,
        "plan_name": plan_name,
        "disclosure_title": data.get('disclosure_title'),
        "monthly_price": data.get('monthly_price'),
        "monthly_price_notes": '; '.join(data.get('monthly_price_notes', [])),
        "discounts_and_bundles_link": data.get('discounts_and_bundles_link'),
        "data_included": data.get('data_included'),
        "additional_data_charges": data.get('additional_data_charges')
    })

    # Step 2: Add fees
    for fee in data.get('additional_fees', []):
        _merge_fee_and_link(tx, plan_name, fee, "additional")
    for fee in data.get('one_time_fees', []):
        _merge_fee_and_link(tx, plan_name, fee, "one_time")

    # Step 3: Add speeds
    for speed in data.get('speeds', []):
        _merge_speed_and_link(tx, plan_name, speed)


def _merge_fee_and_link(tx, plan_name, fee, fee_type):
    if not fee.get('name') or not fee.get('amount'):
        return
    tx.run("""
        MATCH (pl:Plan {name: $plan_name})
        MERGE (f:Fee {
            name: $name,
            amount: $amount,
            type: $type
        })
        MERGE (pl)-[:HAS_FEE]->(f)
    """, {
        "plan_name": plan_name,
        "name": fee.get('name'),
        "amount": fee.get('amount'),
        "type": fee_type
    })


def _merge_speed_and_link(tx, plan_name, speed):
    if not speed.get('network_type'):
        return
    tx.run("""
        MATCH (pl:Plan {name: $plan_name})
        MERGE (s:Speed {
            network_type: $network_type,
            download_speed: $download,
            upload_speed: $upload,
            latency: $latency
        })
        MERGE (pl)-[:HAS_SPEED]->(s)
    """, {
        "plan_name": plan_name,
        "network_type": speed.get('network_type'),
        "download": speed.get('typical_download_speed'),
        "upload": speed.get('typical_upload_speed'),
        "latency": speed.get('typical_latency')
    })

In [None]:
# === Run the upload ===
# Writes the parsed plan fields to Neo4j using the connection settings defined earlier.
upload_to_neo4j(line5_input_json_data)