In [2]:
from bs4 import BeautifulSoup
import sys

In [3]:

# Load the saved page from disk as UTF-8 text.
with open("page.html", mode="r", encoding="utf-8") as f:
    html = f.read()

# Parse the markup with the "html.parser" backend.
soup = BeautifulSoup(markup=html, features="html.parser")



In [4]:
import json
# Description of a single product entry: each value states the expected type
# and content of the field named by its key.
product_fields = {
    "name": "string - Name of the product",
    "price": "string - Price of the product (e.g., ₹29,999 or USD 399)",
    "rating": "string or float - Average star rating (e.g., 4.2)",
    "rating_count": "integer - Total number of people who rated the product",
    "review_count": "integer - Number of textual reviews available",
    "ram": "string - RAM details (e.g., 8GB)",
    "storage": "string - Storage details (e.g., 128GB)",
    "camera": "string - Camera specifications (e.g., 50MP dual rear, 16MP front)",
    "battery": "string - Battery details (e.g., 4500mAh with fast charging)",
}

# Top-level shape of the extraction result: a list of product entries.
schema = {"products": [product_fields]}


In [5]:
from ollama import chat
from ollama import ChatResponse
from ollama import Client

In [7]:
# Connect to the Ollama server listening on localhost:11434.
client = Client(host='http://localhost:11434')

# Smoke-test the connection with a single-turn user question.
sky_question = {'role': 'user', 'content': 'Why is the sky blue?'}
response = client.chat(model='qwen2.5:3b', messages=[sky_question])

# The reply text can be read with dict-style indexing ...
print(response['message']['content'])
# ... or through attribute access on the response object.
print(response.message.content)

The sky appears blue due to a phenomenon called Rayleigh scattering. Here's an explanation:

1. Sunlight consists of different wavelengths (colors) of light, including blue and violet.

2. When sunlight enters our atmosphere, it collides with air molecules and other small particles in the air.

3. Different wavelengths of light scatter differently when they encounter these particles. Shorter wavelength colors like violet and blue scatter more than longer wavelength colors like red and orange.

4. However, all colors scatter equally in every direction.

5. The scattering causes the shorter wavelengths (blue and violet) to be scattered in almost every direction as sunlight enters Earth's atmosphere.

6. When we look up at a clear sky, our eyes collect light from all directions. Since blue and violet are scattered more uniformly throughout the sky, it appears that there is more blue light than other colors reaching us.

7. The human eye is also somewhat less sensitive to blue light compar

In [8]:
def generate_extraction_prompt(schema: dict, soup, max_chars: int = 5000) -> str:
    """Build the prompt asking a model to extract schema-shaped JSON from a page.

    The page's visible text is pulled out of ``soup``, truncated to
    ``max_chars`` characters, and embedded in the prompt together with a
    pretty-printed copy of ``schema``.

    Args:
        schema: JSON-serialisable dict describing the desired output format.
        soup: Parsed page object; only ``get_text(separator=..., strip=...)``
            is called on it.
        max_chars: Maximum number of characters of page text to include.
            Defaults to 5000 to avoid an overly long context.

    Returns:
        The complete prompt string.

    Raises:
        ValueError: If ``max_chars`` is negative (a negative slice bound would
            silently drop text from the end instead of capping the length).
    """
    import json

    if max_chars < 0:
        raise ValueError("max_chars must be non-negative")

    # One text fragment per line, with surrounding whitespace stripped.
    readable_text = soup.get_text(separator="\n", strip=True)
    limited_text = readable_text[:max_chars]  # Avoid overly long context

    # ensure_ascii=False keeps non-ASCII characters in the schema (e.g. a
    # currency symbol in an example) readable instead of emitting \uXXXX
    # escape sequences into the prompt.
    schema_json = json.dumps(schema, indent=2, ensure_ascii=False)

    prompt = f"""You are a data extractor.

Extract structured information from the following HTML-rendered page text.

Use this schema as the format:
{schema_json}

Only extract data that fits this schema. Do not invent missing fields. Output as JSON.

Page content:
\"\"\"
{limited_text}
\"\"\"
"""

    return prompt


In [9]:
# Build the extraction prompt from the schema and the parsed page.
prompt = generate_extraction_prompt(schema, soup)

# Send the prompt as a single user turn and show the raw reply text.
extraction_request = {'role': 'user', 'content': prompt}
response = client.chat(model='qwen2.5:3b', messages=[extraction_request])

print(response['message']['content'])


```json
[
    {
        "model": "Nothing Phone (2a) 5G",
        "color": "Black",
        "ram_size": "8 GB",
        "storage_size": "128 GB",
        "score_rating": 4.4,
        "total_ratings": 97627
    },
    {
        "model": "Nothing Phone (2a) 5G",
        "color": "Black",
        "ram_size": "12 GB",
        "storage_size": "256 GB",
        "score_rating": 4.4,
        "total_ratings": 14919
    },
    {
        "model": "Nothing Phone (3a)",
        "color": "Blue",
        "ram_size": "8 GB",
        "storage_size": "128 GB",
        "score_rating": 4.5,
        "total_ratings": 20038
    },
    {
        "model": "Nothing Phone (3a)",
        "color": "White",
        "ram_size": "8 GB",
        "storage_size": "128 GB",
        "score_rating": 4.5,
        "total_ratings": 20038
    },
    {
        "model": "Nothing Phone (3a)",
        "color": "Blue",
        "ram_size": "8 GB",
        "storage_size": "128 GB",
        "score_rating": 4.5,
        "total_ratings"