In [9]:
from bs4 import BeautifulSoup
import sys

In [10]:

# Read the saved page from disk as UTF-8 text.
with open("page.html", encoding="utf-8") as page_file:
    html = page_file.read()

# Parse the markup with the "html.parser" backend.
soup = BeautifulSoup(html, features="html.parser")



In [11]:
import json
# Template for a single product entry. Each value is a free-text
# "type - description" hint that is later serialized into the LLM prompt.
product_fields = {
    "name": "string - Name of the product",
    "price": "string - Price of the product (e.g., ₹29,999 or USD 399)",
    "rating": "string or float - Average star rating (e.g., 4.2)",
    "rating_count": "integer - Total number of people who rated the product",
    "review_count": "integer - Number of textual reviews available",
    "ram": "string - RAM details (e.g., 8GB)",
    "storage": "string - Storage details (e.g., 128GB)",
    "camera": "string - Camera specifications (e.g., 50MP dual rear, 16MP front)",
    "battery": "string - Battery details (e.g., 4500mAh with fast charging)",
}

# Target structure: a "products" list whose entries follow the template above.
schema = {"products": [product_fields]}


In [12]:
from ollama import chat
from ollama import ChatResponse
from ollama import Client

In [13]:
# Client for the Ollama server at the local address below.
client = Client(host="http://localhost:11434")

# Single-turn smoke test. In the recorded run this call raised ConnectionError.
smoke_test_messages = [{"role": "user", "content": "Why is the sky blue?"}]
response = client.chat(model="qwen2.5:3b", messages=smoke_test_messages)

# The reply is printed twice to show both access styles: key lookup ...
print(response["message"]["content"])
# ... or access fields directly from the response object
print(response.message.content)

ConnectionError: Failed to connect to Ollama. Please check that Ollama is downloaded, running and accessible. https://ollama.com/download

In [14]:
def generate_generic_extraction_prompt(schema: dict, soup, max_chars: int = 2500) -> str:
    """
    Build a single-message extraction prompt from a schema and a parsed page.

    Args:
        schema: JSON-serializable dict describing the fields to extract; it is
            embedded in the prompt as indented JSON.
        soup: Object exposing ``get_text(separator=..., strip=...)`` (e.g. a
            BeautifulSoup document); its text becomes the page content.
        max_chars: Maximum number of characters of page text to include.
            Defaults to 2500, the limit this function previously hard-coded.

    Returns:
        The prompt string: instructions, the schema, the (truncated) page text
        wrapped in triple double-quotes, and a closing instruction.
    """
    import json

    readable_text = soup.get_text(separator="\n", strip=True)
    limited_text = readable_text[:max_chars]  # truncate to avoid LLM overload

    # ensure_ascii=False keeps non-ASCII schema text (such as the rupee sign in
    # the price example) as-is instead of turning it into \uXXXX escapes.
    schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

    prompt = f"""
You are an intelligent data extractor.

Your task is to extract structured information from a webpage's text content, using the following schema as a guide:

Schema:
{schema_json}

### Rules:
- **Use only the fields defined in the schema.**
- If the exact field name is not available, it's acceptable to substitute it with a semantically close or commonly equivalent field (e.g., if "name" is missing but "model" appears, use "model" for "name").
- **Do not add new fields** that are not part of the schema.
- If a field is not present or cannot be confidently extracted, omit it or leave it blank.
- Output should be a **valid JSON object** that follows the structure of the schema.
- Only include information that appears explicitly in the content.

Here is the page content:
\"\"\"
{limited_text}
\"\"\"

Now extract and return data that fits the schema above, in JSON format only.
"""

    return prompt


In [None]:
# Build the extraction prompt from the schema and the parsed page.
prompt = generate_generic_extraction_prompt(schema, soup)

# Send it as a single user turn to the gemma3:4b model.
extraction_messages = [{"role": "user", "content": prompt}]
response = client.chat(model="gemma3:4b", messages=extraction_messages)

print(response["message"]["content"])


In [15]:
import os
import dotenv
# Load environment settings via python-dotenv (the recorded run returned True).
# Presumably this supplies GROQ_API_KEY, which a later cell reads from os.environ - confirm.
dotenv.load_dotenv()

True

In [28]:
from groq import Groq
# Groq API client; the key comes from the GROQ_API_KEY environment variable
# (os.environ.get yields None when it is unset).
# NOTE(review): this rebinds `client`, which earlier cells bound to the Ollama
# Client. Re-running those Ollama cells after this one will hit the Groq client.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [29]:
def extract_text_from_soup(soup: BeautifulSoup, max_chars: int = 2500) -> str:
    """
    Return the page's text, cut to at most ``max_chars`` characters.

    The text is obtained from ``soup.get_text`` with newline separators and
    stripping enabled, then sliced from the start.
    """
    return soup.get_text(separator="\n", strip=True)[:max_chars]

In [30]:
def build_extraction_prompt(schema: dict, page_text: str) -> str:
    """
    Create the user prompt asking the model to extract data that fits the schema.

    Args:
        schema: JSON-serializable dict describing the fields to extract; it is
            embedded as indented JSON.
        page_text: Page text to extract from; it is embedded between
            triple double-quote fences.

    Returns:
        The prompt string: the schema, the page content, then the task sentence.
    """
    # ensure_ascii=False keeps non-ASCII schema text (such as a currency symbol
    # in an example) as-is instead of turning it into \uXXXX escapes.
    schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

    return f"""
Schema:
{schema_json}

Here is the page content:
\"\"\"
{page_text}
\"\"\"

Your task is to extract structured information from the webpage's text content, using the schema as a guide.
"""


In [31]:
def get_system_message() -> str:
    """
    Return the fixed system message: an introduction followed by a rule list.
    """
    intro = (
        "You are an intelligent data extractor.\n"
        "\n"
        "Your task is to extract structured information from a webpage's text content. "
        "You have to strictly follow the provided schema as a guide. "
        "Sometimes its possible same field might not exist in the page but similar sounding field might exist, "
        "in that case you can use that field to fill the data.\n"
        "No need to mention it, you can just use the field that is semantically close or commonly equivalent "
        "to the field in the schema. "
        "If you feel not all field data exists you can leave the field empty.\n"
    )

    # Each rule is rendered as one "- " bullet under the "### Rules:" heading.
    rules = (
        "Use only the fields defined in the schema.",
        "If the exact field name is not available, it's acceptable to substitute it with a semantically close "
        "or commonly equivalent field (e.g., if \"name\" is missing but \"model\" appears, "
        "use \"model\" for \"name\").",
        "Do not add new fields that are not part of the schema.",
        "If a field is not present or cannot be confidently extracted, omit it or leave it blank.",
        "Output should be a valid JSON object that follows the structure of the schema.",
        "Only include information that appears explicitly in the content.",
    )
    rule_lines = "\n".join(f"- {rule}" for rule in rules)

    return f"{intro}\n### Rules:\n{rule_lines}"


In [33]:
def extract_structured_data_from_soup(
    soup: BeautifulSoup,
    schema: dict,
    model: str = "meta-llama/llama-4-scout-17b-16e-instruct",
    max_chars: int = 2500,
    show_prompt: bool = False,
):
    """
    Run the whole extraction: page text -> prompt -> chat completion -> reply text.

    Args:
        soup: Parsed page to read text from.
        schema: Schema dict embedded in the user prompt.
        model: Model identifier passed to the chat-completions call.
        max_chars: Maximum number of page-text characters placed in the prompt.
        show_prompt: When true, print the user prompt before sending it.

    Returns:
        The message content of the first choice in the completion.

    Uses the module-level ``client`` for the request.
    """
    user_prompt = build_extraction_prompt(schema, extract_text_from_soup(soup, max_chars))

    if show_prompt:
        print("PROMPT:\n", user_prompt)

    # System message sets the extraction rules; the user message carries schema + page text.
    conversation = [
        {"role": "system", "content": get_system_message()},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(messages=conversation, model=model)

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Groq models tried with this pipeline, with the notes from those manual runs.
# These used to be four consecutive assignments to `model`, so only the last
# one ever took effect; keeping them as data preserves the notes without dead stores.
GROQ_MODEL_NOTES = {
    "llama-3.3-70b-versatile": "Works",
    "deepseek-r1-distill-llama-70b": "Works but speaks a lot ( optimal as no limit )",
    "qwen-qwq-32b": "works but speaks a lot",
    "meta-llama/llama-4-maverick-17b-128e-instruct": "works",
}

# Same value the original cell left bound to `model`.
model = "meta-llama/llama-4-maverick-17b-128e-instruct"

In [35]:
# Run the Groq pipeline on the parsed page with the maverick model.
# In the recorded run the reply came back wrapped in a Markdown ```json fence,
# so the string is not directly parseable as JSON.
result_json = extract_structured_data_from_soup(
    soup,
    schema,
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
)
print(result_json)

```json
{
 "products": [
 {
 "name": "Nothing Phone (2a)5G (Blue,128 GB)",
 "price": "₹19,999",
 "rating": "4.4",
 "rating_count": 97627,
 "review_count": 7907,
 "ram": "8 GB RAM",
 "storage": "128 GB ROM",
 "camera": "50MP (OIS) +50MP |32MP Front Camera",
 "battery": "5000 mAh Battery"
 },
 {
 "name": "Nothing Phone (3a) (Black,128 GB)",
 "price": "₹24,999",
 "rating": "4.5",
 "rating_count": 20038,
 "review_count": 1928,
 "ram": "8 GB RAM",
 "storage": "128 GB ROM",
 "camera": "50MP (Main) +50MP (2X Tele Photo) +8MP (Ultra-Wide) |32MP Front Camera",
 "battery": "5000 mAh Battery"
 },
 {
 "name": "Nothing Phone (3a) (White,256 GB)",
 "price": "",
 "rating": "4.5",
 "rating_count": 20038,
 "review_count": 1928,
 "ram": "8 GB RAM",
 "storage": "256 GB ROM",
 "camera": "50MP (Main) +50MP (2X Tele Photo) +8MP (Ultra-Wide) |32MP Front Camera",
 "battery": "5000 mAh Battery"
 }
 ]
}
```
