In [6]:
from bs4 import BeautifulSoup
from collections import defaultdict
import re

In [8]:
def parse_html_chunks(html: str):
    soup = BeautifulSoup(html, "html.parser")
    chunks = []
    current_chunk = []

    def flush_chunk():
        nonlocal current_chunk
        if current_chunk:
            chunks.append("\n".join(current_chunk))
            current_chunk = []

    # Helper to determine if this is a "heading-ish" or "start of section" element
    def is_new_section(tag):
        if tag.name in ['h1', 'h2', 'h3', 'h4']:
            return True
        if tag.name == 'div' and any(kw in " ".join(tag.get('class', [])) for kw in ['heading', 'title', 'section', 'card']):
            return True
        return False

    # Traverse top-level tags
    for tag in soup.body.find_all(recursive=False) if soup.body else soup.find_all(recursive=False):
        # Skip script/style
        if tag.name in ['script', 'style']:
            continue

        text = tag.get_text(strip=True)
        if not text:
            continue

        # Start new chunk if this is a new section
        if is_new_section(tag):
            flush_chunk()
        current_chunk.append(f"[{tag.name.upper()}] {' '.join(tag.get('class', []))} :: {text}")

    flush_chunk()
    return chunks

In [136]:
# --- Example Usage ---
with open("pbc_5lines.html", "r", encoding="utf-8") as file:
    html_data = file.read()

chunks = parse_html_chunks(html_data)
# for i, chunk in enumerate(chunks, 1):
#     print(f"\n--- Chunk #{i} ---\n{chunk}")
#     print("\n\n")
to_process = ' '.join(chunks).split("Gold Unlimited")[0]
print(to_process)

[HTML]  :: Broadband Facts(3Items)Want to compare plans? Read the labels to get the details on pricing, data allowances and limits, network speeds, and more.Best Value!My current planPlatinum Unlimited$225/mo/per linePrice is 225 dollars and 0 cents per month$60/mo for the first 3 months with Auto PayPrice is #planPriceDollar dollars and #planPriceCent cents per monthprice is dollar #priceDollar and #priceCent cents /moWas priced at #listPriceDollar dollars and #listPriceCent cents now priced at #priceDollar dollars and #priceCent cents	 /mo(0)0 star reviews from 0 customer$60 per month for the first 3 months with Auto Pay/mo/moPay$225/monthBroadband FactsStraight Talk WirelessPlatinum UnlimitedMobile Broadband Consumer DisclosureMonthly Price$65.00This Monthly Price is not an introductory rate.This Monthly Price does not require a contract.Additional Charges & TermsActivation Fee$0.00Provider Monthly Fees$0.00One-Time Fees at the Time of PurchaseFederal Universal Service Fund$0.37Regu

In [21]:
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.chains import create_extraction_chain_pydantic
from langchain_community.llms import Ollama

In [22]:
from ollama import chat

In [24]:
system_message = """
You are a content transformation assistant. Your task is to extract and organize human-readable, structured summaries from cluttered or semi-structured input such as raw HTML, UI text dumps, or annotated logs.

Follow these strict formatting and behavior rules:

Summarize core content clearly: Identify key concepts like plan names, prices, features, limitations, and promotions. Discard irrelevant UI elements like buttons, form inputs, or JavaScript tokens.

Group similar content: If multiple offerings are present (e.g. phone plans), group them under clearly named headings (e.g. "Platinum Unlimited Plan") and present in a consistent format.

Use bullet points: Organize extracted features, prices, and benefits using clean, consistent bullet points under each group.

Include general/common information separately: If certain features or disclaimers apply to all items (e.g., international calling in all plans), extract and list them under a "General Features" or "Common Info" section.

Ignore noise: Remove or ignore tokens like template placeholders (#priceDollar, {{...}}), navigation links, form elements, marketing filler, or inline formatting.

Output must be plain text or markdown, not HTML or code.

Example input types you may receive include: HTML blobs, noisy UI strings, copied marketing text, or raw component-rendered content.
"""

In [138]:
response = chat(
 messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": to_process},
    ],
  model='gemma3:4b'
)
print(response.message.content)

## Straight Talk Broadband Plans

Here's a breakdown of the available broadband plans:

**Platinum Unlimited Plan**

*   **Price:** $225/month per line
*   **Introductory Price:** $60/month for the first 3 months with Auto Pay
*   **Regular Price:** $225/month
*   **Data Included:** Unlimited
*   **Additional Data Charges:** $0.00 per GB
*   **Speeds Provided:**
    *   Typical Download Speed: 195-634 Mbps
    *   Typical Upload Speed: 10-53 Mbps
    *   Typical Latency: 37-57 ms

**Straight Talk Wireless Platinum Unlimited Mobile Broadband**

*   **Price:** $65.00/month
*   **Notes:** This Monthly Price is not an introductory rate and does not require a contract.
*   **Additional Charges & Terms:**
    *   Activation Fee: $0.00
    *   Provider Monthly Fees: $0.00
    *   One-Time Fees at the Time of Purchase: Federal Universal Service Fund $0.37, Regulatory Cost Recovery $0.13, Early Termination Fee: $0.00
    *   Government Taxes: Varies by location
*   **Speeds Provided:**
    *   

## Creating Structured JSON using pydandic schema definitions

In [140]:
sample_text_content_input_for_json = response.message.content

In [142]:
from pydantic import BaseModel, HttpUrl
from typing import Optional, List

class Fee(BaseModel):
    name: str
    amount: str 

class SpeedInfo(BaseModel):
    network_type: str 
    typical_download_speed: str
    typical_upload_speed: str
    typical_latency: str

class NetworkPolicy(BaseModel):
    title: str 
    policy_link: HttpUrl

class CustomerSupport(BaseModel):
    website: HttpUrl
    phone_number: str

class BroadbandFacts(BaseModel):
    provider: str
    plan_name: str
    disclosure_title: str
    monthly_price: str
    monthly_price_notes: List[str]
    additional_fees: List[Fee]
    one_time_fees: List[Fee]
    discounts_and_bundles_link: Optional[HttpUrl]
    speeds: List[SpeedInfo]
    data_included: str
    additional_data_charges: str
    network_policies: List[NetworkPolicy]
    customer_support: CustomerSupport
    fcc_consumer_info_link: HttpUrl
    reference_code: Optional[str]

In [144]:
impoved_system_message = f"""
Extract the following structured data from the provided HTML. The extracted information should match the following fields exactly:

provider (string): Name of the broadband provider.

plan_name (string): Name of the plan.

disclosure_title (string): Title of the disclosure (e.g., "Mobile Broadband Consumer Disclosure").

monthly_price (string): Monthly cost of the plan (including the dollar sign).

monthly_price_notes (list of strings): Any notes about the monthly price (such as if it's not introductory or doesn't require a contract).

additional_fees (list of objects): Each object should have:

name (string): Name of the recurring fee (e.g., "Activation Fee").

amount (string): Fee amount (including the dollar sign).

one_time_fees (list of objects): Each object should have:

name (string): Name of the one-time fee (e.g., "Federal Universal Service Fund").

amount (string): Fee amount (including the dollar sign or "Varies by location").

discounts_and_bundles_link (URL): URL link for discounts and bundles information (if available).

speeds (list of objects): Each object should represent a network type (5G Ultra Wideband, 5G, 4G LTE) with:

network_type (string): Name of the network type.

typical_download_speed (string): Typical download speed range (e.g., "195-634 Mbps").

typical_upload_speed (string): Typical upload speed range (e.g., "10-53 Mbps").

typical_latency (string): Typical latency range (e.g., "37-57 ms").

data_included (string): Data included with the plan (e.g., "Unlimited").

additional_data_charges (string): Cost for additional data usage (including the dollar sign).

network_policies (list of objects): Each object should have:

title (string): Policy title (e.g., "Network Management", "Privacy").

policy_link (URL): URL link to the policy.

customer_support (object):

website (URL): URL to the customer support page.

phone_number (string): Customer support phone number.

fcc_consumer_info_link (URL): Link to the FCC consumer information page.

reference_code (string): Reference code shown at the end of the facts (if available).

Important Notes:

Clean up extra whitespace and line breaks from extracted text.

Preserve formatting like dollar signs ("$") where present.

All URLs should be extracted completely and valid.

If a field is not present, leave it as null (or empty for optional fields)."""

In [146]:
jsonresponse = chat(
    messages=[
        {"role": "system", "content": impoved_system_message},
        {"role": "user", "content": sample_text_content_input_for_json},
    ],
    model="gemma3:4b",
    format=BroadbandFacts.model_json_schema(), options={"temperature": 0.1}
)

In [147]:
print(jsonresponse.message.content)

{
  "provider": "Straight Talk",
  "plan_name": "Platinum Unlimited Plan",
  "disclosure_title": "Mobile Broadband Consumer Disclosure",
  "monthly_price": "$225/month",
  "monthly_price_notes": [
    "This Monthly Price is not an introductory rate and does not require a contract."
  ],
  "additional_fees": [
    {
      "name": "Activation Fee",
      "amount": "$0.00"
    },
    {
      "name": "Provider Monthly Fees",
      "amount": "$0.00"
    },
    {
      "name": "One-Time Fees at the Time of Purchase",
      "amount": "Federal Universal Service Fund $0.37, Regulatory Cost Recovery $0.13, Early Termination Fee: $0.00"
    },
    {
      "name": "Government Taxes",
      "amount": "Varies by location"
    }
  ],
  "one_time_fees": [
    {
      "name": "Federal Universal Service Fund",
      "amount": "$0.37"
    },
    {
      "name": "Regulatory Cost Recovery",
      "amount": "$0.13"
    },
    {
      "name": "Early Termination Fee",
      "amount": "$0.00"
    }
  ],
  "dis

## Uploading to Neo4j

In [150]:
line5_json_data = jsonresponse.message.content

In [152]:
line5_input_json_data = json.loads(line5_json_data.split('network_policies')[0].strip()[:-5]+"}")

In [154]:
line5_input_json_data

{'provider': 'Straight Talk',
 'plan_name': 'Platinum Unlimited Plan',
 'disclosure_title': 'Mobile Broadband Consumer Disclosure',
 'monthly_price': '$225/month',
 'monthly_price_notes': ['This Monthly Price is not an introductory rate and does not require a contract.'],
 'additional_fees': [{'name': 'Activation Fee', 'amount': '$0.00'},
  {'name': 'Provider Monthly Fees', 'amount': '$0.00'},
  {'name': 'One-Time Fees at the Time of Purchase',
   'amount': 'Federal Universal Service Fund $0.37, Regulatory Cost Recovery $0.13, Early Termination Fee: $0.00'},
  {'name': 'Government Taxes', 'amount': 'Varies by location'}],
 'one_time_fees': [{'name': 'Federal Universal Service Fund',
   'amount': '$0.37'},
  {'name': 'Regulatory Cost Recovery', 'amount': '$0.13'},
  {'name': 'Early Termination Fee', 'amount': '$0.00'}],
 'discounts_and_bundles_link': None,
 'speeds': [{'network_type': '5G Ultra Wideband',
   'typical_download_speed': '195-634 Mbps',
   'typical_upload_speed': '10-53 Mbp

In [42]:
from neo4j import GraphDatabase
import json

In [111]:
# Replace with your Neo4j credentials
NEO4J_URI = "neo4j+s://3ea3293d.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "onNoUtCFlSUAz8p9x6j5aN5fZpi12TIsCgka-ZtQQkg"

In [123]:
def upload_to_neo4j(data):
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
    with driver.session() as session:
        session.write_transaction(_upload_data_tx, data)
    driver.close()


def _upload_data_tx(tx, data):
    provider = data.get('provider')
    plan_name = data.get('plan_name')

    # Step 1: Create provider and plan, and their relationship
    tx.run("""
        MERGE (p:Provider {name: $provider})
        MERGE (pl:Plan {
            name: $plan_name,
            disclosure_title: $disclosure_title,
            monthly_price: $monthly_price,
            monthly_price_notes: $monthly_price_notes,
            discounts_and_bundles_link: $discounts_and_bundles_link,
            data_included: $data_included,
            additional_data_charges: $additional_data_charges
        })
        MERGE (p)-[:OFFERS]->(pl)
    """, {
        "provider": provider,
        "plan_name": plan_name,
        "disclosure_title": data.get('disclosure_title'),
        "monthly_price": data.get('monthly_price'),
        "monthly_price_notes": '; '.join(data.get('monthly_price_notes', [])),
        "discounts_and_bundles_link": data.get('discounts_and_bundles_link'),
        "data_included": data.get('data_included'),
        "additional_data_charges": data.get('additional_data_charges')
    })

    # Step 2: Add fees
    for fee in data.get('additional_fees', []):
        _merge_fee_and_link(tx, plan_name, fee, "additional")
    for fee in data.get('one_time_fees', []):
        _merge_fee_and_link(tx, plan_name, fee, "one_time")

    # Step 3: Add speeds
    for speed in data.get('speeds', []):
        _merge_speed_and_link(tx, plan_name, speed)


def _merge_fee_and_link(tx, plan_name, fee, fee_type):
    if not fee.get('name') or not fee.get('amount'):
        return
    tx.run("""
        MATCH (pl:Plan {name: $plan_name})
        MERGE (f:Fee {
            name: $name,
            amount: $amount,
            type: $type
        })
        MERGE (pl)-[:HAS_FEE]->(f)
    """, {
        "plan_name": plan_name,
        "name": fee.get('name'),
        "amount": fee.get('amount'),
        "type": fee_type
    })


def _merge_speed_and_link(tx, plan_name, speed):
    if not speed.get('network_type'):
        return
    tx.run("""
        MATCH (pl:Plan {name: $plan_name})
        MERGE (s:Speed {
            network_type: $network_type,
            download_speed: $download,
            upload_speed: $upload,
            latency: $latency
        })
        MERGE (pl)-[:HAS_SPEED]->(s)
    """, {
        "plan_name": plan_name,
        "network_type": speed.get('network_type'),
        "download": speed.get('typical_download_speed'),
        "upload": speed.get('typical_upload_speed'),
        "latency": speed.get('typical_latency')
    })

In [125]:
# === Run the upload ===
upload_to_neo4j(line5_input_json_data)

  session.write_transaction(_upload_data_tx, data)
