In [2]:
from config.appconfig import FIRECRAWL_API_KEY

import os
from firecrawl import FirecrawlApp
from dotenv import load_dotenv
import pandas as pd
from typing import Dict, Any
from pydantic import BaseModel
import time

In [3]:
class WebsiteScraper:
    """Wrapper around a FirecrawlApp client for prompt-driven extraction.

    The class can optionally turn a list of ``{"name": ..., "type": ...}``
    field specs into a JSON schema (via a dynamically created Pydantic model)
    and pass that schema along with the prompt to ``FirecrawlApp.extract``.
    """

    def __init__(self):
        # NOTE(review): FIRECRAWL_API_KEY is imported at module level, so it is
        # already resolved before load_dotenv() runs here — confirm this call
        # is still needed.
        load_dotenv()
        self.firecrawl_api_key = FIRECRAWL_API_KEY
        self.app = FirecrawlApp(api_key=self.firecrawl_api_key)
        # Placeholder field spec; no method of this class reads it.
        self.schema_fields = [{"name": "", "type": "str"}]

    def create_dynamic_model(self, fields):
        """Create a dynamic Pydantic model from schema fields.

        Args:
            fields: Iterable of dicts, each with a "name" and a "type" key.
                "type" must be one of "str", "bool", "int" or "float".
                Entries whose "name" is empty are skipped.

        Returns:
            A new ``BaseModel`` subclass named "ExtractSchema" whose
            annotations are the named fields.

        Raises:
            KeyError: If a named field uses an unsupported type name, or a
                field dict lacks the "name"/"type" key.
        """
        # Built once per call instead of once per field.
        type_mapping = {
            "str": str,
            "bool": bool,
            "int": int,
            "float": float,
        }

        field_annotations = {}
        for field in fields:
            field_name = field["name"]
            if not field_name:
                continue
            type_name = field["type"]
            if type_name not in type_mapping:
                # Same exception type as a plain dict lookup, but the message
                # names the offending field and the accepted type names.
                raise KeyError(
                    f"Unsupported field type {type_name!r} for field "
                    f"{field_name!r}; expected one of {sorted(type_mapping)}"
                )
            field_annotations[field_name] = type_mapping[type_name]

        return type(
            "ExtractSchema",
            (BaseModel,),
            {"__annotations__": field_annotations},
        )

    def create_schema_from_fields(self, fields):
        """Create a JSON schema from field specs using a Pydantic model.

        Args:
            fields: Iterable of ``{"name": ..., "type": ...}`` dicts.

        Returns:
            The result of ``model_json_schema()`` on the dynamic model, or
            ``None`` when no field has a non-empty name.
        """
        if not any(field["name"] for field in fields):
            return None

        model_class = self.create_dynamic_model(fields)
        return model_class.model_json_schema()

    def convert_to_table(self, data: Dict[str, Any]) -> str:
        """Convert the 'data' entry of a response to a one-row table string.

        Args:
            data: Mapping that is expected to carry the payload under the
                'data' key.

        Returns:
            ``DataFrame.to_string(index=False)`` of a single-row frame built
            from ``data['data']``, or "" when ``data`` is empty, has no
            'data' key, or the payload itself is empty/None.
        """
        if not data or 'data' not in data:
            return ""

        payload = data['data']
        # An empty or None payload has nothing to tabulate; treat it like a
        # missing 'data' key instead of rendering a placeholder frame.
        if not payload:
            return ""

        df = pd.DataFrame([payload])
        return df.to_string(index=False)

    def scrape_website(self, website_url: str, prompt: str, schema_fields=None):
        """Run an extraction for one URL and return the client's response.

        Args:
            website_url: URL passed to the client as a single-element list.
            prompt: Extraction instruction sent under the 'prompt' key.
            schema_fields: Optional list of field specs. When it is falsy, or
                no field has a name, no 'schema' key is sent.

        Returns:
            Whatever ``self.app.extract`` returns.

        Raises:
            ValueError: If ``website_url`` is empty.
            Exception: If schema creation or the extract call fails; the
                original exception is attached as ``__cause__``.
        """
        if not website_url:
            raise ValueError("Please provide a website URL")

        try:
            schema = self.create_schema_from_fields(schema_fields) if schema_fields else None

            extract_params = {'prompt': prompt}
            if schema:
                extract_params['schema'] = schema

            return self.app.extract([website_url], extract_params)

        except Exception as e:
            # Chain explicitly so the traceback marks the original error as
            # the direct cause of the wrapper exception.
            raise Exception(f"An error occurred: {str(e)}") from e

In [6]:
# Build the scraper; its constructor creates the Firecrawl client.
scraper = WebsiteScraper()

# Target site and the natural-language extraction instruction.
website_url = "https://raqibcodes.netlify.app/*"
prompt = "extract the project titles, their details and corresponding links from the projects section"

# Optional field specs that can be turned into an extraction schema.
schema_fields = [
    dict(name="Project_title", type="str"),
    dict(name="Details", type="str"),
    dict(name="Project_link", type="str"),
]

# NOTE(review): an empty list is passed as the schema argument, so
# scrape_website sends the prompt without a schema and `schema_fields`
# above is not used by this call — confirm that is intended.
result = scraper.scrape_website(website_url, prompt, [])
print("Results:\n", result, sep="\n")

Results:

{'success': True, 'data': {'projects': [{'link': 'https://huggingface.co/spaces/raqibcodes/ai_voice_translator', 'title': 'AI Voice Translator', 'details': 'Record your message in English, and receive translations and transcriptions in multiple languages including Russian, French, Korean, Chinese, Spanish, Arabic and Japanese.'}, {'link': 'https://pdfgpt1.streamlit.app/', 'title': 'RAG Chatbot', 'details': 'Developed a Retrieval-Augmented Generation (RAG) chatbot with memory capabilities, supporting diverse document types. Integrated the Chroma vector store for efficient embedding storage and retrieval, optimized responses through advanced prompt engineering and LLM experimentation, and structured a modular, scalable, and collaborative codebase.'}, {'link': 'https://whatsappchats.streamlit.app/', 'title': 'AI-Powered WhatsApp Chat Analysis App', 'details': 'Developed and deployed an AI-powered WhatsApp chat analysis app with a chatbot that processes CSV chat files, enabling u

In [9]:
# Show only the payload stored under the 'data' key of the response.
result['data']

{'projects': [{'link': 'https://huggingface.co/spaces/raqibcodes/ai_voice_translator',
   'title': 'AI Voice Translator',
   'details': 'Record your message in English, and receive translations and transcriptions in multiple languages including Russian, French, Korean, Chinese, Spanish, Arabic and Japanese.'},
  {'link': 'https://pdfgpt1.streamlit.app/',
   'title': 'RAG Chatbot',
   'details': 'Developed a Retrieval-Augmented Generation (RAG) chatbot with memory capabilities, supporting diverse document types. Integrated the Chroma vector store for efficient embedding storage and retrieval, optimized responses through advanced prompt engineering and LLM experimentation, and structured a modular, scalable, and collaborative codebase.'},
  {'link': 'https://whatsappchats.streamlit.app/',
   'title': 'AI-Powered WhatsApp Chat Analysis App',
   'details': 'Developed and deployed an AI-powered WhatsApp chat analysis app with a chatbot that processes CSV chat files, enabling users to uncove

In [8]:
# Static example schema: one text field and three boolean flags.
# Documented with `#` comments rather than a docstring because pydantic copies
# a model's docstring into the generated JSON schema as its description.
# NOTE(review): a later cell defines another class named ExtractSchema, which
# replaces this one once that cell runs.
class ExtractSchema(BaseModel):
    mission: str
    supports_sso: bool
    is_open_source: bool
    is_in_yc: bool

In [10]:
# Display the JSON schema pydantic generates for ExtractSchema.
ExtractSchema.model_json_schema()

{'properties': {'mission': {'title': 'Mission', 'type': 'string'},
  'supports_sso': {'title': 'Supports Sso', 'type': 'boolean'},
  'is_open_source': {'title': 'Is Open Source', 'type': 'boolean'},
  'is_in_yc': {'title': 'Is In Yc', 'type': 'boolean'}},
 'required': ['mission', 'supports_sso', 'is_open_source', 'is_in_yc'],
 'title': 'ExtractSchema',
 'type': 'object'}

In [11]:
# Display the JSON schema that WebsiteScraper builds from `schema_fields`.
scraper.create_schema_from_fields(schema_fields)

{'properties': {'Project_title': {'title': 'Project Title', 'type': 'string'},
  'Details': {'title': 'Details', 'type': 'string'},
  'Project_link': {'title': 'Project Link', 'type': 'string'}},
 'required': ['Project_title', 'Details', 'Project_link'],
 'title': 'ExtractSchema',
 'type': 'object'}

In [None]:
# Call Firecrawl directly (without WebsiteScraper), using a statically
# declared schema for the extraction.
app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)


# Fields requested for each extraction result. Kept as `#` comments rather
# than a docstring because pydantic copies a model's docstring into the
# generated JSON schema as its description.
class ExtractSchema(BaseModel):
    project_title: str
    details: str
    project_link: str


# Prompt plus the JSON schema derived from the model above.
extract_params = {
    'prompt': 'Extract the project titles, their details and corresponding links from the projects section',
    'schema': ExtractSchema.model_json_schema(),
}

data = app.extract(["https://raqibcodes.netlify.app/*"], extract_params)
print(data)