In [None]:
# !playwright install chromium

In [20]:
from typing import List, Optional

import html2text
import nest_asyncio
import pandas as pd

from langchain_groq import ChatGroq

from playwright.async_api import async_playwright
from pydantic import BaseModel, Field
from tqdm import tqdm

nest_asyncio.apply() # fixes asyncio issue with Jupyter notebooks
tqdm.pandas() # fixes tqdm progress bar in Jupyter notebooks

import json

# debugging
import traceback

### Fetch Web content as Markdown

In [11]:
playwright = None
browser = None
USER_AGENT ="Mozilla/5.0 \
            (Windows NT 10.0; Win64; x64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/120.0.0 Safari/537.36"
try:
    playwright = await async_playwright().start() # start Playwright
    # Launch the browser
    browser = await playwright.chromium.launch(
        headless=True,
        # args=["--no-sandbox", "--disable-setuid-sandbox"], # only for containerized environments
        args=[ "--disable-gpu", "--disable-software-rasterizer"], # for local environments
    )
    # Create a new browser context
    context = await browser.new_context(
        user_agent=USER_AGENT,
        viewport={"width": 1280, "height": 800},
    )

    # Create a new page within the context
    page = await context.new_page()
    print(f"Using user-agent: {await page.evaluate('navigator.userAgent')}") # check user agent
    page.set_default_timeout(60000) # set default timeout to 60 seconds
    page.set_default_navigation_timeout(60000) # set default navigation timeout to 60 seconds

    # await page.goto("https://www.lego.com/en-us/categories/price-50-75-dollars") # go to Groq website
    await page.goto("https://reliefweb.int/report/myanmar/initial-environmental-action-plan")

    content = await page.content() # get the content of the page
except Exception as e:
    print(f"Error: {e}")
    traceback.print_exc() 
finally:
    # Close the browser and Playwright resources
    if browser:
        await browser.close()
    if playwright:
        await playwright.stop()
    print("Playwright resources closed.")

# print(content) # print the content of the page

Using user-agent: Mozilla/5.0             (Windows NT 10.0; Win64; x64)             AppleWebKit/537.36 (KHTML, like Gecko)             Chrome/120.0.0 Safari/537.36
Playwright resources closed.


In [12]:
# convert HTML to text to remove unnecessary HTML tags (less token to process)

def html_to_text(html: str) -> str:
    """
    Convert HTML to text using html2text.
    """
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.ignore_emphasis = True
    h.ignore_tables = True
    return h.handle(html)

# convert HTML to text
print(html_to_text(content))# convert HTML to text

Skip to main content

  * Help

Log in

ReliefWeb

Search

## Content Search

What are you looking for?

Search|t

  * Updates
  * Countries
  * Disasters
  * Organizations
  * Topics
  * Jobs
  * Training

Informing humanitarians worldwide 24/7 — a service provided by UN OCHA

Open chatClose chat

## Ask about this document

Myanmar

# Initial Environmental Action Plan

Format

     Manual and Guideline 
Sources

    

  * Shelter Cluster
  * UNHCR

Posted

     22 Dec 2023
Originally published

     22 Dec 2023
Origin

     View original

## Attachments

  * Download Report (PDF | 2.51 MB)

Myanmar confronts a complex environmental landscape, accentuated by a
heightened vulnerability to natural disasters and the escalating impacts of
climate change work.

In the latest update to the Global Climate Risk Index (2021), Myanmar was
identified as the secondmost affected country in the past two decades. The
impact of heatwaves, floods, cyclones, and earthquakes positions Myanmar as
one of 

### Setup LLM

In [13]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [14]:
from groq import Groq

api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=api_key)

# test if the API key is valid and the Groq client is working
completion = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello!"}
    ],
    temperature=0.7,
    max_completion_tokens=128,
    top_p=1,
    stream=False,
    stop=None,
)
print(completion.choices[0].message.content) # print the response from the model

Hello, how can I assist you today?


In [30]:
SYSTEM_PROMPT = """
You are an expert data assistant with strong knowledge of Markdown, website structures, HTML, CSS, JavaScript, and natural language processing.

Task:
Extract structured and relevant data from the provided raw text.

Requirements:

Output the data only in the specified structured format (JSON or CSV), as instructed.
Do not include any explanations or extra text—only output the data in a code block labeled as either json or csv.
When extracting data:
Preserve the data exactly as found; do not modify or interpret content.
Structure it strictly into JSON or CSV format as specified in the instruction.
Leave blank cells/fields for any information not present in the input.
Instructions Example:

If output should be JSON:
Output a code block starting with ```json
If output should be CSV:
Output a code block starting with ```csv
Your response must always follow these rules.
"""
# The text may contain various types of information, including but not limited to:

def create_scrape_prompt(page_content: str) -> str:
    """
    Create a prompt for the Groq model to extract structured data from the page content.
    """
    return f"""
    The following is the raw text extracted from a webpage. 
    Please extract the relevant data and output it in the specified format.
    Raw text:
    {page_content}
    Output format: JSON
    """.strip()

In [31]:
# scrape landing pages

class ProjectInformation(BaseModel):
    """
    Model to hold project information.
    """
    project_name: str = Field(..., description="Name of the project e.g., Widlife Conservation")
    project_description: str = Field(..., description="What is the project about? e.g., This project is about...")
    project_url: str = Field(..., description="URL of the project")
    project_category: Optional[str] = Field(None, description="Category of the project")
    project_benefits: Optional[str] = Field(None, description="Benefits of the project e.g., This project will benefit...")
    project_location: Optional[str] = Field(None, description="Location of the project e.g., Dubai, UAE")
    # project_image_url: Optional[str] = Field(None, description="Image URL of the project")

In [28]:
# build messages
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": create_scrape_prompt(html_to_text(content))},
]

resp_text = client.chat.completions.create(
    model = "llama-3.1-8b-instant",
    messages = messages,
    temperature = 0,
    max_completion_tokens = 512,
).choices[0].message.content

# Strip ```json ... ``` fences if the model included them
if resp_text.startswith("```"):
    resp_text = resp_text.split("```", 2)[1].strip()

In [32]:
try:
    data = json.loads(resp_text)
    # Convert the JSON data to a DataFrame
    df = pd.DataFrame(data)
except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    print(f"Response text: {resp_text}")

Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Response text: I can't provide a Parquet file as output. However, I can provide the data in JSON format, which can be easily converted to Parquet if needed.

```json
[
  {
    "title": "Initial Environmental Action Plan",
    "country": "Myanmar",
    "format": "Manual and Guideline",
    "sources": ["Shelter Cluster", "UNHCR"],
    "posted": "22 Dec 2023",
    "originally_published": "22 Dec 2023",
    "attachments": ["Download Report (PDF | 2.51 MB)"]
  },
  {
    "title": "Myanmar | Floods Typhoon Yagi - Operation Update #2 (MDRMM021)",
    "country": "Myanmar",
    "format": "Situation Report",
    "source": "IFRC",
    "posted": "20 Nov 2024",
    "originally_published": "20 Nov 2024"
  },
  {
    "title": "Myanmar Community Resilience Project: Environmental and Social Commitment Plan (ESCP)",
    "country": "Myanmar",
    "format": "Manual and Guideline",
    "source": "WFP",
    "posted": "30 Jan 2023",
    "original