In [3]:
import requests
from bs4 import BeautifulSoup
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
import re

import os
import httpx

from dotenv import load_dotenv


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sys


In [1]:
queries = [
    # Wearable Devices Verification
    "Does Meta produce wearable devices?",
    "List companies that are involved in wearable technology.",
    "Does Apple manufacture wearable gadgets like smartwatches or fitness bands?",
    "Is Samsung a leader in the wearable device market?",
    "Identify companies producing wearable healthcare devices.",

    # Company Verification Queries
    "Give me companies with revenue greater than 100 million USD.",
    "List all companies with annual revenue exceeding $1 billion.",
    "Verify whether Tesla's revenue is greater than $500 million.",
    "Do startups with revenue over $10 million exist in the fintech sector?",
    "Check whether Amazon's revenue surpasses $100 billion.",

    # Industry Verification Queries
    "Does IBM belong to the technology industry?",
    "Verify if ExxonMobil operates in the oil and gas industry.",
    "Is Google classified under the advertising and media sector?",
    "Identify whether companies like Pfizer belong to the pharmaceutical industry.",
    "Check if SpaceX is part of the aerospace and defense industry.",

    # Sector or Market Focus
    "Does Microsoft operate in the cloud computing sector?",
    "Which companies are involved in the green energy industry?",
    "Verify if Facebook is categorized under social media platforms.",

    # Competitive Presence
    "Who are the competitors of Nvidia in the GPU market?",
    "List companies that dominate the e-commerce industry."
]


In [4]:


async def get_source_from_pegasus(params):
    pegasus_url = os.getenv("CLOUDFUNCTION_SERVICE")
    headers = {"accept": "application/json"}
    async with httpx.AsyncClient(timeout=300) as client:
        try:
            response = await client.get(pegasus_url, headers=headers, params=params)
            response.raise_for_status()
            json_response = response.json()
            return json_response
        except httpx.HTTPStatusError as http_err:
            status_code = http_err.response.status_code
            print(f"Status code: {status_code}")
            return None
        except httpx.RequestError as req_err:
            print(f"Request error occurred: {str(req_err)}")
            return None
        except Exception as e:
            print(f"Exception: {e}")
            return None

In [None]:
pegasus_results_list = []
for query in queries:
    pegasus_result = await (get_source_from_pegasus(params={"query": query, "search_engine": "google"}))
    pegasus_result["query"] = query
    pegasus_results_list.append(pegasus_result)



In [None]:
len(pegasus_results_list)

In [None]:
pegasus_results_list[0]['query_result']

In [16]:
import json

with open("snippets.json", "w") as f:
    json.dump(pegasus_results_list, f, indent=2)

In [26]:
with open("snippets.json") as f:
    pegasus_results_list = json.load(f)

In [27]:
urls_queries_list = []
for item in pegasus_results_list:
    urls = []
    for snippets in item['query_result'][:5]:
        urls.append(snippets['link'])
    
    urls_queries_list.append({
        "query": item["query"],
        "urls" : urls
    })
    

In [28]:
urls_queries_list

[{'query': 'Does Meta produce wearable devices?',
  'urls': ['https://www.designboom.com/technology/meta-true-ar-glasses-orion-smartphones-hands-free-wearable-ai-device-09-26-2024/#:~:text=Meta%20unveils%20Orion%2C%20its%20dubbed,September%2025th%20and%2026th%2C%202024.',
   'https://www.pymnts.com/connectedeconomy/2024/zuckerberg-unveils-metas-wearable-tech-plans-for-the-connected-economy/',
   'https://www.inc.com/kit-eaton/meta-bets-on-augmented-reality-devices-as-future-of-wearable-tech.html',
   'https://finance.yahoo.com/news/wearable-devices-boldly-welcomes-meta-124500350.html',
   'https://mediagrouponlineinc.com/2024/06/20/meta-restructures-reality-labs-into-wearables-and-metaverse-divisions/']},
 {'query': 'List companies that are involved in wearable technology.',
  'urls': ['https://builtin.com/companies/type/wearables-companies',
   'https://wellfound.com/job-collections/top-wearables-companies-hiring-today',
   'https://explodingtopics.com/blog/wearable-startups',
   'htt

In [135]:

async def crawl_url(url, query):
    """
    Crawl a single URL asynchronously and return the result.
    """
    bm25_filter = BM25ContentFilter(
        user_query=query,
        bm25_threshold=1.2,
    )

    md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)

    config = CrawlerRunConfig(
        markdown_generator=md_generator,
        word_count_threshold=10,
        excluded_tags=["nav", "footer", "header"],
        exclude_external_links=True,
        exclude_external_images=True,
        process_iframes=True,
        remove_overlay_elements=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, config=config)
        return result


async def process_chunking(result):
    """
    Process chunking for markdown content asynchronously.
    """
    chunker = SlidingWindowChunking(window_size=128, step=8)
    return chunker.chunk(((result.markdown)))


class SlidingWindowChunking:
    def __init__(self, window_size=400, step=350):
        self.window_size = window_size
        self.step = step

    def chunk(self, text):
        words = text.split()
        chunks = []
        for i in range(0, len(words) - self.window_size + 1, self.step):
            chunks.append(" ".join(words[i : i + self.window_size]))
        return chunks


async def crawl_and_chunk(urls, query):
    tasks = [crawl_url(url, query) for url in urls]
    results = await asyncio.gather(*tasks)

    chunking_tasks = []
    for result in results:
        if result.success:
            print(f"Markdown for {result.url} (BM25 query-based): was SUCCESSFULL")
            # print(result.markdown)
            # print("-" * 47)

            # Chunking process in parallel
            chunking_tasks.append((process_chunking(result)))
        else:
            print(f"Error crawling {result.url}: {result.error_message}")

    chunks_results = await asyncio.gather(*chunking_tasks)
    chunked_strings = []

    for chunks in chunks_results:
        # print("-" * 26, "List of Chunks", "-" * 26)
        # print("-" * 46)
        index = 0
        for chunk in chunks:
            chunked_strings.append(chunk)
            index += 1
            # print("-" * 26, index, "-" * 26)
            # print(chunk)
    return chunked_strings


_ = load_dotenv()



def remove_all_urls(text):
    # Define a regex pattern to match URLs (http, https, and other common formats)
    pattern = r"https?://[^\s]+|www\.[^\s]+"

    # Use re.sub() to replace matched URLs with an empty string
    cleaned_text = re.sub(pattern, "", text)

    return cleaned_text


async def main(query, urls):
    if not urls:
        print("No URLs provided.")
        return

    print("Crawling and processing URLs in parallel...")
    crawled_chunks = await crawl_and_chunk(urls, query)
    if not crawled_chunks:
        print("Warning: No chunks were generated from the URLs.")
        return []  # Or handle this case based on your application's needs
    extractor = CosineSimilarityExtractor(query)
    relevant_chunks = extractor.find_relevant_chunks(crawled_chunks)
    print("-" * 26, " Printing Relevant Chunks ", "-" * 26)
    print(len(relevant_chunks), len(crawled_chunks))
    
    returned_chunks = []
    print('#'*45)
    for chunk in relevant_chunks[:5]:
        print("-" * 47)
        print(chunk[0])
        returned_chunks.append(chunk[0])
    return returned_chunks


class CosineSimilarityExtractor:
    def __init__(self, query):
        self.query = query
        self.vectorizer = TfidfVectorizer()

    def find_relevant_chunks(self, chunks):
        if not chunks:
            raise ValueError("Chunks list is empty. Please provide valid input.")

        vectors = self.vectorizer.fit_transform([self.query] + chunks)
        
        if vectors.shape[0] <= 1:
            raise ValueError("Vectorization resulted in insufficient data for similarity computation.")
        
        similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
        return [(chunks[i], similarities[i]) for i in range(len(chunks))]



## BM25 - Base Markdown

In [130]:
crawled_results_chunks = []
for item in urls_queries_list:
    query = item["query"]
    urls = item["urls"]
    result = await main(query, urls)
    crawled_results_chunks.append({"query" : query, "chunks" : result})


    

Crawling and processing URLs in parallel...
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[FETCH]... ↓ https://www.inc.com/kit-eaton/meta-bets-on-augment... | Status: True | Time: 0.51s
[COMPLETE] ● https://www.inc.com/kit-eaton/meta-bets-on-augment... | Status: True | Total: 0.82s
[FETCH]... ↓ https://www.designboom.com/technology/meta-true-ar... | Status: True | Time: 1.05s
[COMPLETE] ● https://www.designboom.com/technology/meta-true-ar... | Status: True | Total: 1.31s
[FETCH]... ↓ https://www.pymnts.com/connectedeconomy/2024/zucke... | Status: True | Time: 1.49s
[COMPLETE] ● https://www.pymnts.com/connectedeconomy/2024/zucke... | Status: True | Total: 1.76s
[FETCH]... ↓ https://finance.yahoo.com/news/wearable-devices-bo... | Status: True | Time: 1.93s
[COMPLETE] ● https://finance.yahoo.com/news/wearable-devices-bo... | Status: True | Total: 2.15s
[FETCH]... ↓ https://mediagrouponl

In [131]:
len(crawled_results_chunks)

20

In [132]:
with open('markdown-bm25.json', 'w') as f:
    json.dump(crawled_results_chunks, f, indent=2)

In [93]:
len((crawled_results_chunks[0]['chunks']))

5

In [134]:
index = 0
token_count = 0
for item in ((crawled_results_chunks[:1])):
    # print(item)
    for chunk in item['chunks'][:1]:
        print(chunk)
        token_count += len(chunk)
        index += 1
print(token_count)
print(index)

[publish my workpromote my projectshare my vision ](https://www.designboom.com/technology/meta-true-ar-glasses-orion-smartphones-hands-free-wearable-ai-device-09-26-2024/<https:/www.designboom.com/readers-submit/> "submit your work") keep up with our daily and weekly stories daily - [see sample](https://www.designboom.com/technology/meta-true-ar-glasses-orion-smartphones-hands-free-wearable-ai-device-09-26-2024/<https:/www.designboom.com/newsletters/web-daily.php>) weekly - [see sample](https://www.designboom.com/technology/meta-true-ar-glasses-orion-smartphones-hands-free-wearable-ai-device-09-26-2024/<https:/www.designboom.com/newsletters/web-weekly.php>) ![designboom newsletter subscribe](https://www.designboom.com/wp-content/uploads/2024/12/newsletter-subscribe-12-2024.jpg) POPULAR NOW TECH! [ ![nékojita fufu is an attachable mini robot that blows on hot drinks & food to cool them down](https://static.designboom.com/wp-content/uploads/2025/01/yukai-engineering-clip-on-robots-CES-20

In [None]:


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python crawl.py <query>")
        sys.exit(1)

    query = sys.argv[1]

    pegasus_result = asyncio.run(get_source_from_pegasus(params={"query": query}))
    print(pegasus_result)
    urls_list = []
    for result in pegasus_result["query_result"][:5]:
        urls_list.append(result["link"])
        print(result)

    if not urls_list:
        print("No results found.")
        sys.exit(1)

    print("Top 5 URLs:")
    for idx, url in enumerate(urls_list, start=1):
        print(f"{idx}: {url}")

    # Run the crawl and chunking in parallel
    asyncio.run(main(query=query, urls=urls_list))