In [1]:
!pip install -U firecrawl

Collecting firecrawl
  Downloading firecrawl-1.12.0-py3-none-any.whl.metadata (10 kB)
Collecting python-dotenv (from firecrawl)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading firecrawl-1.12.0-py3-none-any.whl (18 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, firecrawl
Successfully installed firecrawl-1.12.0 python-dotenv-1.0.1


In [2]:
import os
from firecrawl import FirecrawlApp
from dotenv import load_dotenv
import pandas as pd
from typing import Dict, Any
from pydantic import BaseModel
import time

class WebsiteScraper:
    def __init__(self):
        load_dotenv()
        self.firecrawl_api_key = 'FIRE-CRAWL-API-KEY'
        self.app = FirecrawlApp(api_key=self.firecrawl_api_key)
        self.schema_fields = [{"name": "", "type": "str"}]

    def create_dynamic_model(self, fields):
        """Create a dynamic Pydantic model from schema fields."""
        field_annotations = {}
        for field in fields:
            if field["name"]:
                type_mapping = {
                    "str": str,
                    "bool": bool,
                    "int": int,
                    "float": float
                }
                field_annotations[field["name"]] = type_mapping[field["type"]]

        return type(
            "ExtractSchema",
            (BaseModel,),
            {
                "__annotations__": field_annotations
            }
        )

    def create_schema_from_fields(self, fields):
        """Create schema using Pydantic model."""
        if not any(field["name"] for field in fields):
            return None

        model_class = self.create_dynamic_model(fields)
        return model_class.model_json_schema()

    def convert_to_table(self, data: Dict[str, Any]) -> str:
        """Convert data to a pandas DataFrame and return as string."""
        if not data or 'data' not in data:
            return ""

        df = pd.DataFrame([data['data']])
        return df.to_string(index=False)

    def scrape_website(self, website_url: str, prompt: str, schema_fields=None):
        """Main function to scrape website data."""
        if not website_url:
            raise ValueError("Please provide a website URL")

        try:
            schema = self.create_schema_from_fields(schema_fields) if schema_fields else None

            extract_params = {'prompt': prompt}
            if schema:
                extract_params['schema'] = schema

            data = self.app.extract([website_url,],
                                    extract_params
                                    )

            return data

        except Exception as e:
            raise Exception(f"An error occurred: {str(e)}")

In [20]:
from firecrawl import FirecrawlApp
from pydantic import BaseModel
import pandas as pd

app = FirecrawlApp(api_key="FIRE-CRAWL-API-KEY")

class FinancialNewsSchema(BaseModel):
    article_title: str
    publish_date: str
    article_link: str
    article_content: str

financial_sites = [
    "https://www.jpmorgan.com/credit-and-financing/*"]

data = app.extract(financial_sites, {
    'prompt': 'Extract article title, publish date, article link, and content.',
    'schema': FinancialNewsSchema.model_json_schema(),
})

df = pd.DataFrame(data)

df

Unnamed: 0,success,data,status,expiresAt
article_link,True,https://www.jpmorgan.com/insights/payments/tra...,completed,2025-02-23T14:44:10.000Z
publish_date,True,2025-02-23,completed,2025-02-23T14:44:10.000Z
article_title,True,Trade Financing to Vietnams Wind Power Project...,completed,2025-02-23T14:44:10.000Z
article_content,True,J.P. Morgan has been mandated by two state-own...,completed,2025-02-23T14:44:10.000Z
