In [1]:
import re
from typing import List, Optional

from langchain.chains import create_extraction_chain_pydantic
from langchain_community.llms import Ollama
from pydantic import BaseModel, Field

In [3]:
# Create the LangChain handle for the locally served llama3.2 3B model,
# with the sampling temperature set very low (0.01).
llm = Ollama(
    model="llama3.2:3b",
    temperature=0.01,
)

  llm=Ollama(model='llama3.2:3b', temperature=0.01)


## With Pydantic Formats

In [6]:
from ollama import chat

In [22]:
# Target shape for the structured-output demo: the JSON schema of this model is
# passed to chat(format=...) and the reply is validated back into it.
# All three fields are required (no defaults are declared).
class Country(BaseModel):
  name: str
  capital: str
  languages: list[str]

# Ask about Canada, constraining the reply to Country's JSON schema.
response = chat(
  model='llama3.2:3b',
  format=Country.model_json_schema(),
  messages=[{'role': 'user', 'content': 'Tell me about Canada.'}],
)

# Turn the JSON reply text into a validated Country instance and show it.
country = Country.model_validate_json(response.message.content)
print(country)

name='Canada' capital='Ottawa' languages=['English', 'French']


## Format mentioned in Prompt directly

#### The following is a partial example HTML for demonstration

In [32]:
# Hand-written sample page used as the user message in the extraction calls that follow.
# It contains one plan (name, price, currency, billing period, discount, part number),
# two features, one promotion and one bundle, so the extracted result is easy to check by eye.
html_input = """
<html>
  <body>
    <h1>Premium Mobile Plan</h1>
    <div class="plan-details">
      <span class="name">Premium Plan</span>
      <span class="price">$99.99</span>
      <span class="currency">USD</span>
      <span class="billing-period">Monthly</span>
      <span class="discounted-price">$89.99</span>
      <span class="discount-duration">12 months</span>
      <span class="autopay-discount">True</span>
      <span class="part-number">PP-2025-01</span>
    </div>
    
    <div class="features">
      <ul>
        <li>
          <span class="description">High-speed internet</span>
          <span class="type">Data</span>
          <span class="subcategory">Unlimited, 5G</span>
        </li>
        <li>
          <span class="description">Global roaming</span>
          <span class="type">Roaming</span>
          <span class="subcategory">International</span>
        </li>
      </ul>
    </div>
    
    <div class="promotion">
      <span class="description">Get 10% off on your first month</span>
      <span class="value">10.0</span>
      <span class="type">Discount</span>
    </div>
    
    <div class="bundle">
      <span class="bundle-name">Data Bundle</span>
      <ul>
        <li>Extra 10GB Data</li>
        <li>Unlimited Calls</li>
      </ul>
    </div>
  </body>
</html>
"""

#### System Message Prompt and Structured output definition is here.

In [35]:
# System prompt for the prompt-only extraction: it lists the fields to pull out of the
# HTML and shows the JSON layout the reply should follow.
# This is a plain string (not an f-string): nothing is interpolated, so the JSON braces
# can be written literally instead of being escaped as {{ }}. The resulting text is
# unchanged, including the trailing spaces after some commas.
system_message = """
Extract the following structured data from the HTML:

Plan Information:
- Name of the plan
- Price, currency, billing period
- Discounted price, discount duration, autopay discount, and part number

Features:
- Feature description, type, and subcategories

Promotions:
- Promotion description, value, and type

Bundles:
- Bundle name and included features

Please return the data in the following JSON format:

{
  "name": "", 
  "price": 0, 
  "currency": "", 
  "billing_period": "", 
  "discounted_price": 0, 
  "discount_duration": "", 
  "autopay_discount": false, 
  "part_number": "", 
  "features": [
    {
      "description": "", 
      "type": "", 
      "subcategory": []
    }
  ], 
  "promotions": [
    {
      "description": "", 
      "value": 0, 
      "type": ""
    }
  ], 
  "bundles": [
    {
      "bundle_name": "", 
      "features": []
    }
  ]
}
"""

#### Structured Output is enabled only for a few models and has to be interacted with in a chat format when using Ollama.

In [39]:
# Prompt-only run: no `format=` is passed, so the JSON layout is requested
# solely through the system message.
response = chat(
    model="llama3.2:3b",
    messages=[
        dict(role="system", content=system_message),
        dict(role="user", content=html_input),
    ],
)

In [44]:
# Dump the whole response object (run metadata plus the assistant message) to see
# how the model wrapped its answer.
print(response)

model='llama3.2:3b' created_at='2025-04-27T14:25:06.6155091Z' done=True done_reason='stop' total_duration=75339109700 load_duration=55256800 prompt_eval_count=622 prompt_eval_duration=4632590200 eval_count=283 eval_duration=70643282600 message=Message(role='assistant', content='Here is the extracted data in JSON format:\n\n```json\n{\n  "name": "Premium Mobile Plan",\n  "price": 99.99,\n  "currency": "USD",\n  "billing_period": "Monthly",\n  "discounted_price": 89.99,\n  "discount_duration": "12 months",\n  "autopay_discount": true,\n  "part_number": "PP-2025-01",\n  "features": [\n    {\n      "description": "High-speed internet",\n      "type": "Data",\n      "subcategory": ["Unlimited", "5G"]\n    },\n    {\n      "description": "Global roaming",\n      "type": "Roaming",\n      "subcategory": ["International"]\n    }\n  ],\n  "promotions": [\n    {\n      "description": "Get 10% off on your first month",\n      "value": 10.0,\n      "type": "Discount"\n    }\n  ],\n  "bundles": [\n

#### Performing some additional output parsing

In [46]:
# Without `format=`, the model wraps its JSON in a Markdown code fence after a line of
# prose. Extract the fenced payload with a regex rather than splitting on the word
# "json", which breaks as soon as the preamble itself contains "json" and raises
# IndexError when there is no fence at all.
_reply_text = response['message']['content']
_fenced = re.search(r"```(?:json)?\s*(.*?)```", _reply_text, flags=re.DOTALL | re.IGNORECASE)
# Fall back to the whole reply when the model returned bare JSON without a fence.
to_validate = (_fenced.group(1) if _fenced else _reply_text).strip()
print(to_validate)


{
  "name": "Premium Mobile Plan",
  "price": 99.99,
  "currency": "USD",
  "billing_period": "Monthly",
  "discounted_price": 89.99,
  "discount_duration": "12 months",
  "autopay_discount": true,
  "part_number": "PP-2025-01",
  "features": [
    {
      "description": "High-speed internet",
      "type": "Data",
      "subcategory": ["Unlimited", "5G"]
    },
    {
      "description": "Global roaming",
      "type": "Roaming",
      "subcategory": ["International"]
    }
  ],
  "promotions": [
    {
      "description": "Get 10% off on your first month",
      "value": 10.0,
      "type": "Discount"
    }
  ],
  "bundles": [
    {
      "bundle_name": "Data Bundle",
      "features": ["Extra 10GB Data", "Unlimited Calls"]
    }
  ]
}



#### Pydantic Validation

In [53]:
# One entry of a plan's feature list; the fields mirror the objects in the "features"
# array of the prompt's JSON template. Every field is optional and defaults to None.
class Feature(BaseModel):
    description: Optional[str] = None
    type: Optional[str] = None
    subcategory: Optional[List[str]] = None

# One promotion attached to a plan; mirrors the objects in the "promotions" array of
# the prompt's JSON template. `value` is numeric (float); all fields default to None.
class Promotion(BaseModel):
    description: Optional[str] = None
    value: Optional[float] = None
    type: Optional[str] = None

# One bundle and the plain-text features it includes; mirrors the objects in the
# "bundles" array of the prompt's JSON template. All fields default to None.
class BundleBenefit(BaseModel):
    bundle_name: Optional[str] = None
    features: Optional[List[str]] = None

# Extra plan attributes referenced by Plan.details. These fields are not part of the
# prompt's JSON template, and every one of them defaults to None.
class PlanDetails(BaseModel):
    speed: Optional[str] = None
    cloud_storage: Optional[str] = None
    hotspot: Optional[str] = None
    device_protection: Optional[bool] = None
    international_calling: Optional[str] = None
    service_days: Optional[int] = None

# Top-level model for one extracted plan. The scalar fields and the features /
# promotions / bundles lists use the same keys as the prompt's JSON template;
# `details` has no counterpart in that template. Every field is optional (default None),
# so a reply that omits a key still validates.
class Plan(BaseModel):
    name: Optional[str] = None
    price: Optional[float] = None
    currency: Optional[str] = None
    billing_period: Optional[str] = None
    discounted_price: Optional[float] = None
    discount_duration: Optional[str] = None
    autopay_discount: Optional[bool] = None
    part_number: Optional[str] = None
    features: Optional[List[Feature]] = None
    promotions: Optional[List[Promotion]] = None
    bundles: Optional[List[BundleBenefit]] = None
    details: Optional[PlanDetails] = None

In [55]:
# Parse the extracted JSON text into a typed Plan instance (nested objects become
# Feature / Promotion / BundleBenefit models) and print its field values.
plan = Plan.model_validate_json(to_validate)
print(plan)

name='Premium Mobile Plan' price=99.99 currency='USD' billing_period='Monthly' discounted_price=89.99 discount_duration='12 months' autopay_discount=True part_number='PP-2025-01' features=[Feature(description='High-speed internet', type='Data', subcategory=['Unlimited', '5G']), Feature(description='Global roaming', type='Roaming', subcategory=['International'])] promotions=[Promotion(description='Get 10% off on your first month', value=10.0, type='Discount')] bundles=[BundleBenefit(bundle_name='Data Bundle', features=['Extra 10GB Data', 'Unlimited Calls'])] details=None


### Format parameter in chat method attempt

In [63]:
# Same prompt and page as the prompt-only run, but this time the reply is also
# constrained to Plan's JSON schema through `format=`.
response = chat(
    model="llama3.2:3b",
    format=Plan.model_json_schema(),
    messages=[
        dict(role="system", content=system_message),
        dict(role="user", content=html_input),
    ],
)

In [67]:
# Show only the assistant's message text from the schema-constrained run.
print(response.message.content)

{"name": "Premium Mobile Plan", "price": 99.99, "currency": "USD", "billing_period": "Monthly", "discounted_price": 89.99, "discount_duration": "12 months", "autopay_discount": true, "part_number": "PP-2025-01", "features": [{"description": "High-speed internet", "type": "Data", "subcategory": ["Unlimited", "5G"]}, {"description": "Global roaming", "type": "Roaming", "subcategory": ["International"]}],"promotions":[{"description": "Get 10% off on your first month", "value": 10.0, "type": "Discount"}],"bundles":[{"bundle_name": "Data Bundle", "features": ["Extra 10GB Data", "Unlimited Calls"]}]}


## The following is a demonstration on the complete StraightTalk plan HTML

In [8]:
# A single plan feature. Shares its name with the Feature model defined earlier in the
# notebook; this version attaches a description to every field. All fields default to None.
class Feature(BaseModel):
    description: Optional[str] = Field(default=None, description="Description of the feature")
    type: Optional[str] = Field(default=None, description="Type of feature")
    subcategory: Optional[List[str]] = Field(default=None, description="Subcategories of the feature")

# A single promotion. Shares its name with the Promotion model defined earlier in the
# notebook; this version attaches a description to every field. All fields default to None.
class Promotion(BaseModel):
    description: Optional[str] = Field(default=None, description="Description of the promotion")
    value: Optional[float] = Field(default=None, description="Value of the promotion")
    type: Optional[str] = Field(default=None, description="Type of promotion")

# A bundle and the features it includes. Shares its name with the BundleBenefit model
# defined earlier in the notebook; this version attaches a description to every field.
class BundleBenefit(BaseModel):
    bundle_name: Optional[str] = Field(default=None, description="Name of the bundle")
    features: Optional[List[str]] = Field(default=None, description="List of features included in the bundle")

# A savings offer: a description plus a list of individual offers kept as free-form
# dicts (no fixed keys are declared for them). Both fields default to None.
class SavingsOffer(BaseModel):
    description: Optional[str] = Field(default=None, description="Description of the savings offer")
    offers: Optional[List[dict]] = Field(default=None, description="List of individual offers")

# Detail attributes of a plan. Shares its name with the PlanDetails model defined earlier
# in the notebook; this version attaches a description to every field. All default to None.
class PlanDetails(BaseModel):
    speed: Optional[str] = Field(default=None, description="Speed information")
    cloud_storage: Optional[str] = Field(default=None, description="Cloud storage details")
    hotspot: Optional[str] = Field(default=None, description="Hotspot information")
    device_protection: Optional[bool] = Field(default=None, description="Device protection availability")
    international_calling: Optional[str] = Field(default=None, description="International calling details")
    service_days: Optional[int] = Field(default=None, description="Number of service days")

# Top-level schema for the full plan page. Its JSON schema (including the per-field
# descriptions) is what the chat call passes as `format=`. Every field is optional and
# defaults to None, so the model may leave out anything it cannot find.
class StraightTalkPlan(BaseModel):
    # Core pricing / identification fields.
    name: Optional[str] = Field(default=None, description="Name of the plan")
    price: Optional[float] = Field(default=None, description="Price of the plan")
    currency: Optional[str] = Field(default=None, description="Currency of the price")
    billing_period: Optional[str] = Field(default=None, description="Billing period")
    discounted_price: Optional[float] = Field(default=None, description="Discounted price if available")
    discount_duration: Optional[str] = Field(default=None, description="Duration of the discount")
    autopay_discount: Optional[bool] = Field(default=None, description="Indicates if autopay discount is available")
    part_number: Optional[str] = Field(default=None, description="Part number of the plan")
    url: Optional[str] = Field(default=None, description="URL to the plan details")
    # Page-level flags.
    best_value: Optional[bool] = Field(default=None, description="Indicates if the plan is marked as best value")
    current_plan: Optional[bool] = Field(default=None, description="Indicates if this is the current plan")
    # Nested collections and detail object.
    features: Optional[List[Feature]] = Field(default=None, description="List of features included in the plan")
    promotions: Optional[List[Promotion]] = Field(default=None, description="List of promotions applicable to the plan")
    bundle_benefits: Optional[List[BundleBenefit]] = Field(default=None, description="List of bundle benefits")
    savings_offers: Optional[List[SavingsOffer]] = Field(default=None, description="List of savings offers")
    plan_details: Optional[PlanDetails] = Field(default=None, description="Detailed information about the plan")

In [76]:
# Minimal system prompt with no field list or JSON template: the output structure is
# left entirely to the schema passed via `format=`.
# Plain string rather than an f-string, since nothing is interpolated.
no_schema_system_message = """
Extract the following structured data from the HTML:"""

In [99]:
# Read the saved plan page HTML. The context manager closes the handle as soon as the
# text has been read, instead of leaving the file open for the rest of the session.
with open("plan_example_html_content.txt", "r") as file:
    plan_example_html_content = file.read()

In [101]:
# Schema-only extraction of the full plan page: the system prompt gives no layout, so
# the structure comes from StraightTalkPlan's JSON schema passed via `format=`.
# Sampling settings such as temperature are passed inside `options`; `temperature` is
# not a direct keyword argument of chat().
response = chat(
    messages=[
        {"role": "system", "content": no_schema_system_message},
        {"role": "user", "content": plan_example_html_content},
    ],
    model="llama3.2:3b",
    format=StraightTalkPlan.model_json_schema(),
    options={"temperature": 0.01},
)

In [102]:
# Show the JSON text the model returned for the full plan page.
print(response.message.content)

{"name": "Platinum Unlimited", "price": 65, "promotions": [], "plan_details": {"device_protection": true, "international_calling": "unlimited", "service_days": 30} }


## Improvement on the above using formats and terms from BroadBand Facts

In [10]:
from pydantic import BaseModel, HttpUrl
from typing import Optional, List

# A named fee. `amount` is kept as a string (the system prompt asks for the dollar sign
# to be preserved). Both fields are required.
class Fee(BaseModel):
    name: str
    amount: str 

# Speed figures for one network type. All values are free-form strings, matching the
# range-style examples given in the system prompt. All fields are required.
class SpeedInfo(BaseModel):
    network_type: str 
    typical_download_speed: str
    typical_upload_speed: str
    typical_latency: str

# A policy title together with its link; `policy_link` is typed as HttpUrl.
# Both fields are required.
class NetworkPolicy(BaseModel):
    title: str 
    policy_link: HttpUrl

# Customer support contact details: a website typed as HttpUrl and a phone number kept
# as a string. Both fields are required.
class CustomerSupport(BaseModel):
    website: HttpUrl
    phone_number: str

# Top-level schema for one "Broadband Facts" disclosure. Its JSON schema is what the
# chat call passes as `format=`, and the field names match the list in the system prompt.
# NOTE(review): discounts_and_bundles_link and reference_code are Optional but declare no
# default — confirm whether they are meant to be required-but-nullable or truly optional.
class BroadbandFacts(BaseModel):
    provider: str
    plan_name: str
    disclosure_title: str
    monthly_price: str
    monthly_price_notes: List[str]
    additional_fees: List[Fee]
    one_time_fees: List[Fee]
    discounts_and_bundles_link: Optional[HttpUrl]
    speeds: List[SpeedInfo]
    data_included: str
    additional_data_charges: str
    network_policies: List[NetworkPolicy]
    customer_support: CustomerSupport
    fcc_consumer_info_link: HttpUrl
    reference_code: Optional[str]

In [12]:
# Field-by-field system prompt for the Broadband Facts extraction; each entry names a
# BroadbandFacts field, its type, and what to put in it.
# - Plain string rather than an f-string, since nothing is interpolated.
# - The fee examples were swapped in the earlier wording: "Activation Fee" is a one-time
#   fee, while "Federal Universal Service Fund" (often "Varies by location") is a
#   recurring charge. The examples now sit under the matching field.
# - The variable name keeps its original spelling ("impoved") because a later cell
#   refers to it by that name.
impoved_system_message = """
Extract the following structured data from the provided HTML. The extracted information should match the following fields exactly:

provider (string): Name of the broadband provider.

plan_name (string): Name of the plan.

disclosure_title (string): Title of the disclosure (e.g., "Mobile Broadband Consumer Disclosure").

monthly_price (string): Monthly cost of the plan (including the dollar sign).

monthly_price_notes (list of strings): Any notes about the monthly price (such as if it's not introductory or doesn't require a contract).

additional_fees (list of objects): Each object should have:

name (string): Name of the recurring fee (e.g., "Federal Universal Service Fund").

amount (string): Fee amount (including the dollar sign or "Varies by location").

one_time_fees (list of objects): Each object should have:

name (string): Name of the one-time fee (e.g., "Activation Fee").

amount (string): Fee amount (including the dollar sign).

discounts_and_bundles_link (URL): URL link for discounts and bundles information (if available).

speeds (list of objects): Each object should represent a network type (5G Ultra Wideband, 5G, 4G LTE) with:

network_type (string): Name of the network type.

typical_download_speed (string): Typical download speed range (e.g., "195-634 Mbps").

typical_upload_speed (string): Typical upload speed range (e.g., "10-53 Mbps").

typical_latency (string): Typical latency range (e.g., "37-57 ms").

data_included (string): Data included with the plan (e.g., "Unlimited").

additional_data_charges (string): Cost for additional data usage (including the dollar sign).

network_policies (list of objects): Each object should have:

title (string): Policy title (e.g., "Network Management", "Privacy").

policy_link (URL): URL link to the policy.

customer_support (object):

website (URL): URL to the customer support page.

phone_number (string): Customer support phone number.

fcc_consumer_info_link (URL): Link to the FCC consumer information page.

reference_code (string): Reference code shown at the end of the facts (if available).

Important Notes:

Clean up extra whitespace and line breaks from extracted text.

Preserve formatting like dollar signs ("$") where present.

All URLs should be extracted completely and valid.

If a field is not present, leave it as null (or empty for optional fields)."""

In [2]:
# Read the saved plan page HTML. The context manager closes the handle as soon as the
# text has been read, instead of leaving the file open for the rest of the session.
with open("plan_example_html_content.txt", "r") as file:
    plan_example_html_content = file.read()

In [4]:
# file.close()

In [16]:
# Broadband Facts extraction: the detailed field-by-field prompt is combined with the
# BroadbandFacts JSON schema, and the temperature is passed through `options`.
response = chat(
    model="llama3.2:3b",
    format=BroadbandFacts.model_json_schema(),
    options=dict(temperature=0.1),
    messages=[
        dict(role="system", content=impoved_system_message),
        dict(role="user", content=plan_example_html_content),
    ],
)

In [17]:
# Show the JSON text the model returned for the Broadband Facts extraction.
print(response.message.content)

{"provider": "Verizon", "plan_name": "Platinum Unlimited", "disclosure_title": "Facts About Your Plan", "monthly_price": "$70.00", "monthly_price_notes": ["$0.00 for additional data usage"], "additional_fees": [], "one_time_fees": [], "discounts_and_bundles_link": "<a href=\"https://www.verizon.com/support/tracfone-open-internet-legal/\">Data Included</a>", "speeds": [{"network_type": "4G LTE", "typical_download_speed": "> 20 Mbps", "typical_upload_speed": "> 5 Mbps", "typical_latency": "< 50 ms"}, {"network_type": "5G", "typical_download_speed": "> 100 Mbps", "typical_upload_speed": "> 10 Mbps", "typical_latency": "< 30 ms"}], "data_included": "Unlimited", "additional_data_charges": "$0.00", "network_policies": [{"title": "Network Management", "policy_link": "https://www.verizon.com/about/our-company/network-management"}, {"title": "Privacy", "policy_link": "https://www.tracfonewirelessinc.com/en/Privacy%2BPolicy/"}, {"title": "Customer Support", "policy_link": "https://www.straightta

In [None]:
'''
Next activity 
1. Modify speed info formats.
2. Try other models for better numeric values accuracy.
3. Experiment with a better system prompt as well.
4. Experiment with temperature.
'''