In [16]:
from firecrawl import Firecrawl
from langchain_groq import ChatGroq
from typing import List , Optional , Annotated
from dotenv import load_dotenv
load_dotenv()
import os
from pydantic import BaseModel , Field
import json
from docx import Document

# Firecrawl client shared by the search and scrape cells below.
# The API key is read from the environment (load_dotenv() has already run above).
app = Firecrawl(
    api_key=os.getenv("FIRECRAWL_API_KEY"),  # pyright: ignore[reportArgumentType]
)

In [3]:
# Groq-hosted chat model used for structured extraction; temperature is pinned to 0
# and the API key comes from the GROQ_API_KEY environment variable.
model = ChatGroq(
    api_key=os.getenv("GROQ_API_KEY"),  # pyright: ignore[reportArgumentType]
    model="llama3-70b-8192",
    temperature=0,
)

In [4]:
class searchResult(BaseModel):
    # One search hit. Documented with comments rather than a docstring so the
    # JSON schema generated from this model stays exactly as before.
    # source_url: URL of the page the hit points to.
    source_url: str
   

class searchResults(BaseModel):
    # Container schema for the structured-output model: a list of search hits.
    # Documented with comments rather than a docstring so the generated JSON
    # schema stays exactly as before.
    results: List[searchResult]

In [5]:
# Wrap the chat model so its responses are parsed into the searchResults schema
# (the next cell reads `.results` from the returned object).
struct_model = model.with_structured_output(searchResults)

In [6]:
# Search the web with Firecrawl, then let the structured-output model pull the
# result URLs out of the raw search response.
SEARCH_QUERY = "effect of vaping on lung health"
# Single source of truth for the result count: it feeds both the search limit
# and the number quoted in the prompt, so the two can never drift apart.
SEARCH_LIMIT = 3

search_result = app.search(
    query=SEARCH_QUERY,
    limit=SEARCH_LIMIT,
)

# The raw search response is interpolated into the prompt as text; the model
# replies with an object matching the searchResults schema.
final_data = struct_model.invoke(
    f"""
    Extract exactly {SEARCH_LIMIT} results.  
    Use 'url' → 'source_url'.  
    Return only valid JSON that matches the schema.  

    Search results:
    {search_result}
    """
)

# Flatten the structured reply into a plain list of URLs for the scrape cells.
links = [item.source_url for item in final_data.results]

print(links)

['https://pmc.ncbi.nlm.nih.gov/articles/PMC11580103/', 'https://www.hopkinsmedicine.org/health/wellness-and-prevention/what-does-vaping-do-to-your-lungs', 'https://www.lung.org/quit-smoking/e-cigarettes-vaping/impact-of-e-cigarettes-on-lung']


In [10]:
class contentSchema(BaseModel):
    # Structured fields extracted from one scraped page. Documented with
    # comments rather than a docstring so the JSON schema generated from this
    # model gains no extra top-level description.

    # Main headline of the page.
    title: str = Field(description="main Headline of the page")

    # Summary of the page's main content.
    # NOTE(review): "summery" looks like a typo for "summary"; left unchanged
    # here because the description text is part of the emitted schema.
    page_content: str = Field(description="summery of Main content of the page")

    # Image URLs found on the page. default=None makes the field genuinely
    # optional: without an explicit default, Pydantic v2 still treats an
    # Optional[...] field as required (it merely allows None as a value), so a
    # page with no images would fail validation.
    images: Optional[List[str]] = Field(
        default=None,
        description="List of image URLs on the page",
    )

In [7]:
# Structured-output wrapper around the chat model, targeting contentSchema.
# NOTE(review): no later cell visible here uses content_model — the scrape cells
# pass contentSchema to Firecrawl directly. Confirm whether this is still needed.
content_model = model.with_structured_output(contentSchema)

In [15]:
# Scrape every link into the contentSchema structure, then write all records to
# output.txt in one pass.
#
# Scraping and writing are separated on purpose: the file is only touched once
# every scrape has succeeded, so a failure part-way through does not leave a
# half-written file behind.
scraped_pages = []
for link in links:
    scrape_result = app.scrape(
        url=link,
        formats=[{"type": "json", "schema": contentSchema}],
        only_main_content=True,
    )
    json_data = scrape_result.json
    scraped_pages.append(json_data)

# Mode "w" (not "a") keeps the cell idempotent: re-running it replaces the file
# instead of appending another copy of the same records.
with open("output.txt", "w", encoding="utf-8") as file:
    for page in scraped_pages:
        # One pretty-printed JSON document per page, separated by a newline.
        file.write(json.dumps(page, indent=4))
        file.write("\n")

print("Scraped data saved to output.txt")

Scraped data saved to output.txt


In [17]:
# Build a Word document with a top-level title and one level-2 section per link,
# each containing the JSON extracted from that page.
# NOTE(review): every link is scraped again here, repeating the requests already
# made by the output.txt cell above; consider reusing those results instead.
docs = Document()
docs.add_heading("Scraped Web Page Data", level=1)

for idx, link in enumerate(links, start=1):
    scrape_result2 = app.scrape(
        url=link,
        formats=[{"type": "json", "schema": contentSchema}],
        only_main_content=True,
    )
    json_data2 = scrape_result2.json

    # Section heading: 1-based position followed by the source URL.
    docs.add_heading(f"Result {idx} - {link}", level=2)

    # The extracted data is embedded as pretty-printed JSON text.
    text = json.dumps(json_data2, indent=4)
    docs.add_paragraph(text)

docs.save("scraped_data.docx")

print("Data Saved")



Data Saved
