In [36]:
import requests
from bs4 import BeautifulSoup
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
import re

import os
import httpx

from dotenv import load_dotenv


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import sys


In [39]:
queries = [
    # Wearable Devices Verification
    "Does Meta produce wearable devices?",
    "List companies that are involved in wearable technology.",
    "Does Apple manufacture wearable gadgets like smartwatches or fitness bands?",
    "Is Samsung a leader in the wearable device market?",
    "Identify companies producing wearable healthcare devices.",

    # Company Verification Queries
    "Give me companies with revenue greater than 100 million USD.",
    "List all companies with annual revenue exceeding $1 billion.",
    "Verify whether Tesla's revenue is greater than $500 million.",
    "Do startups with revenue over $10 million exist in the fintech sector?",
    "Check whether Amazon's revenue surpasses $100 billion.",

    # Industry Verification Queries
    "Does IBM belong to the technology industry?",
    "Verify if ExxonMobil operates in the oil and gas industry.",
    "Is Google classified under the advertising and media sector?",
    "Identify whether companies like Pfizer belong to the pharmaceutical industry.",
    "Check if SpaceX is part of the aerospace and defense industry.",

    # Sector or Market Focus
    "Does Microsoft operate in the cloud computing sector?",
    "Which companies are involved in the green energy industry?",
    "Verify if Facebook is categorized under social media platforms.",

    # Competitive Presence
    "Who are the competitors of Nvidia in the GPU market?",
    "List companies that dominate the e-commerce industry.",
]

# NOTE(review): this rebinds `queries`, discarding the 20-entry verification
# list assigned just above in the same cell. A later cell reads `queries[5]`,
# which is out of range for this 5-entry list on a fresh top-to-bottom run —
# confirm which list each downstream cell is meant to use.
queries = [ # Factor Search Queries
    "Provide company background for the company bykea.",
    "What are the long term sales goals for bykea?",
    "what are some customer care policies for bykea?",
    "what is the industry and market position of bykea?",
    "what are bykea's carbon neutrality goals"
]


In [40]:


async def get_source_from_pegasus(params):
    """
    Query the Pegasus search endpoint and return its decoded JSON body.

    The endpoint URL is read from the ``CLOUDFUNCTION_SERVICE`` environment
    variable on every call.

    Args:
        params: Query-string parameters forwarded to the GET request
            (callers in this notebook pass ``{"query": <text>}``).

    Returns:
        The parsed JSON response on success, or ``None`` when the endpoint is
        not configured, the request fails, the server answers with an error
        status, or any other exception occurs. Callers must check for ``None``
        before indexing into the result.
    """
    pegasus_url = os.getenv("CLOUDFUNCTION_SERVICE")
    if not pegasus_url:
        # Fail fast with an actionable message instead of handing None to httpx
        # and reporting whatever low-level error that happens to produce.
        print("Request error occurred: CLOUDFUNCTION_SERVICE is not set.")
        return None

    headers = {"accept": "application/json"}
    async with httpx.AsyncClient(timeout=300) as client:
        try:
            response = await client.get(pegasus_url, headers=headers, params=params)
            response.raise_for_status()
            return response.json()
        except httpx.HTTPStatusError as http_err:
            print(f"Status code: {http_err.response.status_code}")
            return None
        except httpx.RequestError as req_err:
            print(f"Request error occurred: {str(req_err)}")
            return None
        except Exception as e:
            # Best-effort contract: any other failure (e.g. a non-JSON body)
            # is reported and surfaced to the caller as None.
            print(f"Exception: {e}")
            return None

In [41]:
queries[0]

'Provide company background for the company bykea.'

In [42]:
pegasus_result = await (get_source_from_pegasus(params={"query": queries[0]}))

INFO:httpx:HTTP Request: GET https://ai-pegasus-staging-620091903831.us-central1.run.app/rotating_proxy?query=Provide+company+background+for+the+company+bykea. "HTTP/1.1 200 OK"


In [43]:
pegasus_results_list = []
for query in queries:
    pegasus_result = await (get_source_from_pegasus(params={"query": query}))
    pegasus_result["query"] = query
    pegasus_results_list.append(pegasus_result)



INFO:httpx:HTTP Request: GET https://ai-pegasus-staging-620091903831.us-central1.run.app/rotating_proxy?query=Provide+company+background+for+the+company+bykea. "HTTP/1.1 200 OK"


Request error occurred: Server disconnected without sending a response.


TypeError: 'NoneType' object does not support item assignment

In [10]:
pegasus_results_list

[]

In [7]:
web_text = """'All FiltersLocationLocationOffice TypeHybrid CompanyEmployees work in the office + from homeOn-site CompanyEmployees work in the office every dayFully Remote CompanyCompany operates without a physical officeIndustrySizeOpen JobsCompany Has Open JobsRemote JobsCompany Is Fully RemoteCompany Has Open Remote JobsTech StackClearApplyTop Wearables Companies(0)ADD COMPANY PROFILENo company matchesAdjust filters orclear allto view companiesDon’t see yourcompany?Let’s changethat.Create a\n                    profile'"""

In [8]:
snippet = "From medicine to motion tracking and glasses to gloves, here's our list of the top wearables companies hiring now. StrongArm Technologies, Inc."

In [30]:
query = queries[5]

In [31]:
pegasus_result = await (get_source_from_pegasus(params={"query": query}))
pegasus_result["query"] = query




INFO:httpx:HTTP Request: GET https://ai-pegasus-staging-620091903831.us-central1.run.app/rotating_proxy?query=Give+me+companies+with+revenue+greater+than+100+million+USD. "HTTP/1.1 200 OK"


In [32]:
pegasus_result

{'search_engine': 'yahoo',
 'query_result': [{'title': 'en.wikipedia.org · wiki · List_of_largest_companiesList of largest companies in the United States by revenue',
   'snippet': ' This list comprises the largest companies currently in the United States by revenue as of 2024, according to the Fortune 500 tally of companies and Forbes. The Fortune 500 list of companies includes only publicly traded companies, also including tax inversion companies. ',
   'link': 'https://r.search.yahoo.com/_ylt=Awr.2NeonIhn7gEAce5XNyoA;_ylu=Y29sbwNncTEEcG9zAzMEdnRpZAMEc2VjA3Ny/RV=2/RE=1738215849/RO=10/RU=https%3a%2f%2fen.wikipedia.org%2fwiki%2fList_of_largest_companies_in_the_United_States_by_revenue/RK=2/RS=ReQsT9t2upCNPpDCdnvRVsno59g-'},
  {'title': 'companiesmarketcap.com · largest-companies-by-revenueCompanies ranked by revenue - CompaniesMarketCap.com',
   'snippet': " This is the list of the world's largest public companies by revenue (TTM). What is the market capitalization of a company? ",
   

In [11]:
# Pair every search hit's link with the snippet the search engine returned for it.
urls_list = [
    {"url": hit['link'], 'snippet': hit['snippet']}
    for hit in pegasus_result['query_result']
]

In [12]:
urls_list

[{'url': 'https://www.weforum.org/stories/2024/10/augmented-reality-glasses-technology-news-october-2024/',
  'snippet': 'Technology giants Snap and Meta have unveiled new augmented reality (AR) glasses as the firms look to develop wearable tech hardware that can rival or replace smartphones.'},
 {'url': 'https://www.pymnts.com/connectedeconomy/2024/zuckerberg-unveils-metas-wearable-tech-plans-for-the-connected-economy/',
  'snippet': '25 wrz 2024 — Meta CEO Mark Zuckerberg unveiled an ambitious vision for the future of wearable technology. The event showcased significant innovations.'},
 {'url': 'https://www.inc.com/kit-eaton/meta-bets-on-augmented-reality-devices-as-future-of-wearable-tech.html',
  'snippet': "26 wrz 2024 — The future of wearable computing will sit on your face in gear that looks like goggles or sunglasses, Facebook's parent company predicts."},
 {'url': 'https://finance.yahoo.com/news/wearable-devices-boldly-welcomes-meta-124500350.html',
  'snippet': "26 wrz 2024 —

In [2]:

async def crawl_url(url, query):
    """
    Crawl one URL with a fresh AsyncWebCrawler and return the crawl result.

    The markdown generator is configured with a BM25 content filter built from
    ``query`` (threshold 1.2). The run config also sets a word-count threshold
    of 10, excludes ``nav``/``footer``/``header`` tags plus external links and
    images, processes iframes and removes overlay elements.
    """
    run_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query=query,
                bm25_threshold=1.2,
            ),
        ),
        word_count_threshold=10,
        excluded_tags=["nav", "footer", "header"],
        exclude_external_links=True,
        exclude_external_images=True,
        process_iframes=True,
        remove_overlay_elements=True,
    )

    # A new crawler context is opened (and closed) for every URL.
    async with AsyncWebCrawler() as crawler:
        return await crawler.arun(url=url, config=run_config)


async def process_chunking(result):
    """
    Split a crawl result's markdown into sliding word windows.

    Uses ``SlidingWindowChunking(window_size=128, step=8)`` on
    ``result.markdown``. Declared ``async`` so callers can gather it, although
    the body itself awaits nothing.
    """
    window_chunker = SlidingWindowChunking(window_size=128, step=8)
    markdown_text = result.markdown
    return window_chunker.chunk(markdown_text)


class SlidingWindowChunking:
    """
    Split text into overlapping chunks of whitespace-separated words.

    Args:
        window_size: Maximum number of words per chunk.
        step: Number of words the window advances between chunks.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """

    def __init__(self, window_size=400, step=350):
        if window_size <= 0 or step <= 0:
            raise ValueError("window_size and step must be positive integers.")
        self.window_size = window_size
        self.step = step

    def chunk(self, text):
        """
        Return the list of word-window chunks covering all of ``text``.

        Text with no words yields ``[]``. Text that fits in a single window is
        returned as one chunk instead of being dropped. When the stepped
        windows do not reach the end of the text, one extra window aligned to
        the last word is appended so trailing words are never lost.
        """
        words = text.split()
        if not words:
            return []
        if len(words) <= self.window_size:
            return [" ".join(words)]

        last_start = len(words) - self.window_size
        starts = list(range(0, last_start + 1, self.step))
        if starts[-1] != last_start:
            # The regular stride stopped short of the end; cover the tail.
            starts.append(last_start)
        return [" ".join(words[start : start + self.window_size]) for start in starts]


async def crawl_and_chunk(urls, query):
    """
    Crawl every URL concurrently, chunk the successful pages, and return all
    chunks as one flat list.

    Failed crawls are reported with ``print`` and contribute no chunks.
    """
    crawl_results = await asyncio.gather(*(crawl_url(url, query) for url in urls))

    pending_chunk_jobs = []
    for crawl_result in crawl_results:
        if not crawl_result.success:
            print(f"Error crawling {crawl_result.url}: {crawl_result.error_message}")
            continue
        print(f"Markdown for {crawl_result.url} (BM25 query-based): was SUCCESSFULL")
        # Chunking coroutines are collected here and awaited together below.
        pending_chunk_jobs.append(process_chunking(crawl_result))

    per_page_chunks = await asyncio.gather(*pending_chunk_jobs)

    # Flatten the per-page chunk lists, preserving page order then chunk order.
    return [chunk for page_chunks in per_page_chunks for chunk in page_chunks]


# Load variables from a .env file into the process environment;
# get_source_from_pegasus reads CLOUDFUNCTION_SERVICE via os.getenv.
# The return value is bound to `_` so the cell does not display it.
_ = load_dotenv()


async def crawl_only(urls, query):
    """
    Crawl all ``urls`` concurrently and return the crawl results in input
    order, without any chunking or filtering of failed crawls.
    """
    return await asyncio.gather(*(crawl_url(url, query) for url in urls))



def remove_all_urls(text):
    """
    Return ``text`` with every ``http://``, ``https://`` or ``www.`` URL
    removed. Surrounding whitespace is left untouched.
    """
    # A URL is taken to run up to the next whitespace character.
    url_matcher = re.compile(r"https?://[^\s]+|www\.[^\s]+")
    return url_matcher.sub("", text)


async def main(query, urls):
    """
    Crawl ``urls`` concurrently for ``query`` and return the crawl results.

    Args:
        query: Search query forwarded to the crawler's BM25 content filter.
        urls: URLs to crawl.

    Returns:
        The list of crawl results produced by ``crawl_only`` (one per URL), or
        an empty list when no URLs were given or nothing was crawled. The
        return type is always a list so callers can iterate it unconditionally.
    """
    if not urls:
        print("No URLs provided.")
        return []

    print("Crawling and processing URLs in parallel...")
    crawl_results = await crawl_only(urls, query)
    if not crawl_results:
        print("Warning: No chunks were generated from the URLs.")
        return []

    # Relevance ranking with CosineSimilarityExtractor is currently disabled;
    # the raw crawl results are returned as-is.
    return crawl_results


class CosineSimilarityExtractor:
    """Score text chunks against a fixed query using TF-IDF cosine similarity."""

    def __init__(self, query):
        self.query = query
        self.vectorizer = TfidfVectorizer()

    def find_relevant_chunks(self, chunks):
        """
        Return ``(chunk, similarity)`` pairs for every chunk, in input order.

        The vectorizer is fitted on the query together with the chunks, and
        each chunk is scored against the query row.

        Raises:
            ValueError: If ``chunks`` is empty, or if vectorization yields
                fewer than two rows.
        """
        if not chunks:
            raise ValueError("Chunks list is empty. Please provide valid input.")

        # Row 0 is the query; the remaining rows are the chunks.
        tfidf_matrix = self.vectorizer.fit_transform([self.query] + chunks)

        if tfidf_matrix.shape[0] <= 1:
            raise ValueError("Vectorization resulted in insufficient data for similarity computation.")

        query_row = tfidf_matrix[0:1]
        chunk_rows = tfidf_matrix[1:]
        scores = cosine_similarity(query_row, chunk_rows).flatten()
        return list(zip(chunks, scores))



# Expanding Snippets

In [3]:
import json

# Load the saved search results and reshape them into one record per query,
# holding parallel lists of result links and result snippets.
urls_snippets = []
with open('snippets.json') as f:
    urls_snippets = json.load(f)
urls_snippets_query_list = []
for item in urls_snippets:
    # urls_list[i] and snippets[i] come from the same search hit, so the two
    # lists stay index-aligned.
    urls_list = []
    snippets = []
    for url in item['query_result']:
        urls_list.append(url['link'])
        snippets.append(url['snippet'])
    urls_snippets_query_list.append({
        "query": item['query'],
        "urls_list" : urls_list,
        "snippets_list": snippets
    })
# URL list of the first query, kept as a handy sample.
example = urls_snippets_query_list[0]['urls_list']

In [4]:
urls_snippets_query_list[0]

{'query': 'Does Meta produce wearable devices?',
 'urls_list': ['https://www.designboom.com/technology/meta-true-ar-glasses-orion-smartphones-hands-free-wearable-ai-device-09-26-2024/#:~:text=Meta%20unveils%20Orion%2C%20its%20dubbed,September%2025th%20and%2026th%2C%202024.',
  'https://www.pymnts.com/connectedeconomy/2024/zuckerberg-unveils-metas-wearable-tech-plans-for-the-connected-economy/',
  'https://www.inc.com/kit-eaton/meta-bets-on-augmented-reality-devices-as-future-of-wearable-tech.html',
  'https://finance.yahoo.com/news/wearable-devices-boldly-welcomes-meta-124500350.html',
  'https://mediagrouponlineinc.com/2024/06/20/meta-restructures-reality-labs-into-wearables-and-metaverse-divisions/',
  'https://sherwood.news/tech/meta-ray-ban-apple-vision-pro-competition/',
  'https://www.theverge.com/2024/10/11/24267633/meta-hardware-glasses-quest-andrew-bosworth-interview',
  'https://medium.com/@jcorcione/meta-reorganizes-introduces-wearables-unit-despite-job-cuts-d19f1dd10aa6',
 

In [64]:
crawled_results = []

for item in urls_snippets_query_list:
    query = item['query']
    urls_list = item['urls_list']
    result = await main(query, urls_list)
    crawled_results.append({"query" : query, "crawled_pages_list" : result, "snippets_list": item['snippets_list']})
    
for item in crawled_results:
    crawled_pages = item['crawled_pages_list']
    crawled_markdowns = []
    for page in crawled_pages:
        crawled_markdowns.append(page.markdown)
    del item['crawled_pages_list']
    item['crawled_markdowns_list'] = crawled_markdowns




    

Crawling and processing URLs in parallel...
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[INIT].... → Crawl4AI 0.4.247
[FETCH]... ↓ https://sherwood.news/tech/meta-ray-ban-apple-visi... | Status: True | Time: 1.25s
[COMPLETE] ● https://sherwood.news/tech/meta-ray-ban-apple-visi... | Status: True | Total: 1.34s
[FETCH]... ↓ https://www.theverge.com/2024/10/11/24267633/meta-... | Status: True | Time: 0.99s
[COMPLETE] ● https://www.theverge.com/2024/10/11/24267633/meta-... | Status: True | Total: 1.07s
[FETCH]... ↓ https://medium.com/@jcorcione/meta-reorganizes-int... | Status: True | Time: 1.26s
[COMPLETE] ● https://medium.com/@jcorcione/meta-reorganizes-int... | Status: True | Total: 1.33s
[FETCH]... ↓ https://www.pymnts.com/connectedec

In [65]:
len(crawled_results)

20

In [66]:
with open('crawled_markdowns.json', 'w') as f:
    json.dump(crawled_results,f,indent=2)

## Snippet Matching Code

In [5]:
import re
from bs4 import BeautifulSoup
from rapidfuzz import fuzz

def normalize_text(text):
    """
    Reduce ``text`` to a lowercase, punctuation-free, single-spaced form for
    substring matching.

    Markup is stripped with BeautifulSoup first; the substitutions then run in
    a fixed order: drop every character that is neither a word character nor
    whitespace, join digit groups separated by a comma, collapse whitespace
    runs to one space. The result is stripped of surrounding whitespace.
    """
    plain = BeautifulSoup(text, "html.parser").get_text().lower()
    substitutions = (
        (r'[^\w\s]', ''),            # remove special characters
        (r'(\d+),(\d+)', r'\1\2'),   # 10,000 -> 10000
        (r'\s+', ' '),               # normalize spaces
    )
    for pattern, replacement in substitutions:
        plain = re.sub(pattern, replacement, plain)
    return plain.strip()

def remove_noise_tokens(snippet):
    """
    Strip numeric dates (e.g. ``25.09.2024``, ``1/2/24``) and all em dashes and
    hyphens from ``snippet``, then trim surrounding whitespace.
    """
    without_dates = re.sub(r'\d{1,2}[./-]\d{1,2}[./-]\d{2,4}', '', snippet)
    # Hyphens are removed too, so hyphenated words are joined ("a-b" -> "ab").
    without_dashes = re.sub(r'—|-', '', without_dates)
    return without_dashes.strip()

def _locate_in_raw_text(raw_text, normalized_fragment):
    """
    Find the span of ``raw_text`` that corresponds to ``normalized_fragment``.

    The fragment is expected to be lowercase and punctuation-free with tokens
    separated by single spaces. The search is case-insensitive, tolerates
    punctuation inside a token and any run of non-word characters containing
    whitespace between tokens.

    Returns:
        ``(start, end)`` offsets into ``raw_text``, or ``None`` when the
        fragment is empty or cannot be located (for example when markup that
        normalization removed sits inside the matching text).
    """
    tokens = normalized_fragment.split()
    if not tokens:
        return None
    token_patterns = (
        r"[^\w\s]*".join(re.escape(char) for char in token) for token in tokens
    )
    located = re.search(r"\W*\s\W*".join(token_patterns), raw_text, flags=re.IGNORECASE)
    return located.span() if located else None


def reduce_snippet_and_match(snippet, markdown_text, window_length=500, fuzzy_threshold=80, min_tokens=1):
    """
    Locate ``snippet`` inside ``markdown_text`` and return the surrounding context.

    Matching runs on normalized copies of both inputs, in this order:
    the full snippet, progressively shorter prefixes, progressively shorter
    suffixes, and finally a fuzzy ``partial_ratio`` check on prefixes. The
    first hit wins.

    Because normalization changes the text length, a hit found in the
    normalized text is mapped back onto ``markdown_text`` before the context
    window is cut, so the window is centred on the real match. If the mapping
    fails, the normalized offset is used as an approximation.

    Args:
        snippet: Search-engine snippet to look for.
        markdown_text: Page text to search in.
        window_length: Characters of context kept on each side of the match.
        fuzzy_threshold: Minimum ``fuzz.partial_ratio`` score for the fuzzy fallback.
        min_tokens: Shortest reduced snippet (in tokens) that may count as a
            match. The default of 1 accepts a single-token hit; raise it to
            avoid matches on a lone common word.

    Returns:
        The dict produced by ``extract_context`` on success, otherwise
        ``{"found": False, "message": ...}``.
    """
    not_found = {"found": False, "message": "Snippet not found, even with reduced tokens."}

    # Normalize and preprocess inputs
    normalized_snippet = normalize_text(remove_noise_tokens(snippet))
    normalized_markdown = normalize_text(markdown_text)

    tokens = normalized_snippet.split()
    if not tokens:
        # An empty needle is "found" at offset 0 by str.find, which would
        # report a bogus match at the top of the page.
        return not_found

    def context_for(fragment, normalized_idx, span_length=None):
        """Cut the context window for a fragment found at ``normalized_idx``."""
        raw_span = _locate_in_raw_text(markdown_text, fragment)
        if raw_span is not None:
            raw_start, raw_end = raw_span
            length = raw_end - raw_start if span_length is None else span_length
            return extract_context(markdown_text, raw_start, length, window_length)
        # Could not map back to the raw text: use the normalized offset as an
        # approximation.
        length = len(fragment) if span_length is None else span_length
        return extract_context(markdown_text, normalized_idx, length, window_length)

    token_count = len(tokens)
    shortest = min(max(1, min_tokens), token_count)

    # Full snippet first, then drop tokens from the end, then from the front.
    candidates = [tokens[:size] for size in range(token_count, shortest - 1, -1)]
    candidates += [tokens[token_count - size:] for size in range(token_count - 1, shortest - 1, -1)]
    for candidate in candidates:
        fragment = " ".join(candidate)
        start_idx = normalized_markdown.find(fragment)
        if start_idx != -1:
            return context_for(fragment, start_idx)

    # Fuzzy matching as fallback
    for size in range(token_count, shortest - 1, -1):
        fragment = " ".join(tokens[:size])
        if fuzz.partial_ratio(fragment, normalized_markdown) >= fuzzy_threshold:
            anchor = fragment[:10]  # Approx match start
            start_idx = normalized_markdown.find(anchor)
            if start_idx != -1:
                return context_for(anchor, start_idx, span_length=len(fragment))

    return not_found

def extract_context(markdown_text, start_idx, snippet_length, window_length):
    """
    Return the match span plus up to ``window_length`` characters of
    ``markdown_text`` on each side of it.

    The window is clamped to the bounds of ``markdown_text``. ``start_index``
    and ``end_index`` in the result describe the match itself, not the window.
    """
    match_end = start_idx + snippet_length

    window_start = start_idx - window_length
    if window_start < 0:
        window_start = 0

    window_end = match_end + window_length
    if window_end > len(markdown_text):
        window_end = len(markdown_text)

    return {
        "found": True,
        "start_index": start_idx,
        "end_index": match_end,
        "context": markdown_text[window_start:window_end],
    }


In [6]:
import re
import ftfy
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup

# Load Sentence Embedding Model for relevance-based ranking
# NOTE(review): no code visible in this part of the notebook references
# `embedder`; confirm it is still needed before keeping the model load.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def clean_text(text):
    """
    Clean raw page text for downstream use.

    Steps, in order: repair encoding with ftfy, remove URLs, remove e-mail
    addresses, strip HTML tags with BeautifulSoup, drop non-ASCII characters,
    collapse whitespace runs to a single space. The result is not stripped.
    """
    repaired = ftfy.fix_text(text)
    without_urls = re.sub(r"http\S+|www\S+|https\S+", "", repaired)
    without_emails = re.sub(r'\S+@\S+', '', without_urls)
    visible_text = BeautifulSoup(without_emails, "html.parser").get_text()
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', visible_text)
    return re.sub(r'\s+', ' ', ascii_only)

def extract_companies(text):
    """
    Heuristically extract company entries from free text.

    Each regex match yields one entry made of an optional list index, a name,
    an optional description and an optional revenue figure. The description
    stops before a ``revenue``/``sales``/``income`` keyword or before a number
    followed by a unit (million, billion, USD, dollars), so the figure is
    captured as revenue instead of being absorbed into the description.

    Args:
        text: Cleaned text to scan.

    Returns:
        A list of dicts with the keys ``name``, ``description`` and
        ``revenue``. Missing parts are filled with ``"Unknown Company"``,
        ``"No description"`` and ``"Unknown"`` respectively.
    """
    # Named groups keep the field mapping independent of how many capture
    # groups the pattern contains.
    company_pattern = re.compile(
        r"(?P<index>\d+[\.)\-]?\s*)?"
        r"(?P<name>[A-Za-z&\.\-\s]+(?:Inc|Ltd|Corp|Group|Corporation)?)"
        r"\s*(?:\(|:)?"
        r"(?P<description>(?:(?!\b(?:revenue|sales|income)\b"
        r"|\d+(?:\.\d+)?\s*(?:million|billion|USD|dollars))"
        r"[\w\s,\.%&\-])+)?"
        r"(?:revenue|sales|income)?[\:|—]?\s*"
        r"(?P<revenue>\d+(?:\.\d+)?\s*(?:million|billion|USD|dollars)?)?",
        re.MULTILINE,
    )

    companies = []
    for match in company_pattern.finditer(text):
        # Whitespace-only captures are treated the same as missing ones.
        name = (match.group("name") or "").strip() or "Unknown Company"
        description = (match.group("description") or "").strip() or "No description"
        revenue = (match.group("revenue") or "").strip() or "Unknown"

        companies.append({
            "name": name,
            "description": description,
            "revenue": revenue,
        })

    return companies

def filter_companies_by_query(companies, query):
    """
    Keep the companies whose revenue meets the minimum named in ``query``.

    The threshold is the first monetary amount in the query, recognised by a
    leading ``$``, a ``million``/``billion`` scale word, or a trailing
    ``USD``/``dollar(s)``. An amount without a scale word is read as millions.
    When the query names no amount, the threshold defaults to 500 million.

    Company revenues are read from the ``revenue`` string and converted to
    millions before comparison. Entries whose revenue has no million/billion
    scale (including ``"Unknown"``) are skipped.

    Args:
        companies: Dicts as produced by ``extract_companies``.
        query: Natural-language query text.

    Returns:
        The matching company dicts, in input order. Each returned dict gains a
        ``revenue_million`` key (float, in millions); the dicts are updated in
        place rather than copied.
    """
    scale_to_millions = {"million": 1.0, "billion": 1000.0}

    def to_number(raw):
        """Parse a number that may contain thousands separators."""
        return float(raw.replace(",", ""))

    # For now, filter based on revenue mentioned in the query
    min_revenue_million = 500.0  # Default: 500 million
    amount_pattern = re.compile(
        r"(\$\s*)?(\d[\d,]*(?:\.\d+)?)\s*(billion|million)?\s*(usd|dollars?)?",
        re.IGNORECASE,
    )
    for amount in amount_pattern.finditer(query):
        sign, number, scale, currency = amount.groups()
        # Ignore bare numbers ("top 10 companies") that carry no money marker.
        if sign or scale or currency:
            min_revenue_million = to_number(number) * scale_to_millions[(scale or "million").lower()]
            break

    revenue_pattern = re.compile(r"(\d[\d,]*(?:\.\d+)?)\s*(billion|million)", re.IGNORECASE)
    filtered_companies = []
    for company in companies:
        revenue = company.get("revenue")
        if not isinstance(revenue, str):
            continue
        parsed = revenue_pattern.search(revenue)
        if parsed is None:
            continue
        # Compare like with like: both sides are expressed in millions.
        revenue_million = to_number(parsed.group(1)) * scale_to_millions[parsed.group(2).lower()]
        if revenue_million >= min_revenue_million:
            company["revenue_million"] = revenue_million
            filtered_companies.append(company)

    return filtered_companies

def preprocess_text(text, query):
    """
    Clean ``text`` and derive query-relevant company summaries from it.

    Returns a dict with:
        ``cleaned_text``: the output of ``clean_text(text)``.
        ``relevant_chunks``: one summary string per company that
            ``extract_companies`` found in the cleaned text and
            ``filter_companies_by_query`` kept for ``query``.
    """
    cleaned_text = clean_text(text)
    matching_companies = filter_companies_by_query(extract_companies(cleaned_text), query)

    return {
        "cleaned_text": cleaned_text,
        "relevant_chunks": [
            f"{company['name']}: {company['description']} with revenue {company['revenue']}"
            for company in matching_companies
        ],
    }



  from .autonotebook import tqdm as notebook_tqdm
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [7]:
import json
crawled_results = []
with open('crawled_markdowns.json') as f:
    crawled_results = json.load(f)

In [21]:
# For every query, keep the snippets that could be located in their crawled
# page together with the cleaned context window around each match.
empirical_analysis_data = []
for result in crawled_results:
    context_chunks, corresponding_snippets = [], []
    snippet_markdown_pairs = zip(result['snippets_list'], result['crawled_markdowns_list'])
    for snippet, markdown in snippet_markdown_pairs:
        if not (snippet and markdown):
            continue
        match_object = reduce_snippet_and_match(snippet, markdown, window_length=1000)
        if not match_object.get('context'):
            continue
        cleaned_context = preprocess_text(query=result['query'], text=match_object['context'])['cleaned_text']
        context_chunks.append(cleaned_context)
        corresponding_snippets.append(snippet)
    empirical_analysis_data.append({
        "query" : result['query'],
        "snippets_list": result['snippets_list'],
        "corresponding_snippets": corresponding_snippets,
        "crawled_chunks": context_chunks,
    })

  text = BeautifulSoup(text, "html.parser").get_text()


In [31]:
# with open('empirical_analysis_data_v2.json', 'w') as f:
crawled_data = []
with open('crawled_cleaned_content_v2.json') as f:
    crawled_data = json.load(f)

In [None]:
# Scratch loop that was left unfinished (missing colon and body); made
# syntactically valid so the notebook can run top to bottom.
for page in crawled_data:
    for content, snippet in zip(page['content'], page['snippets']):
        pass  # TODO: the loop body was never written

In [32]:
# Same snippet-to-context matching as for the crawled markdowns, applied to
# records that store their texts under 'snippets' and 'content'.
empirical_analysis_data = []
for result in crawled_data:
    context_chunks = []
    corresponding_snippets = []
    for snippet, markdown in zip(result['snippets'], result['content']):
        if not snippet or not markdown:
            continue
        match_object = reduce_snippet_and_match(snippet, markdown, window_length=1000)
        matched_context = match_object.get('context')
        if matched_context:
            context_chunks.append(preprocess_text(query=result['query'], text=match_object['context'])['cleaned_text'])
            corresponding_snippets.append(snippet)
    record = {
        "query" : result['query'],
        "snippets": result['snippets'],
        "corresponding_snippets": corresponding_snippets,
        "crawled_chunks": context_chunks,
    }
    empirical_analysis_data.append(record)


  text = BeautifulSoup(text, "html.parser").get_text()


In [34]:
(empirical_analysis_data[0])

{'query': 'Does Meta produce wearable devices?',
 'snippets': ['Meta unveils Orion, its dubbed first true AR glassesAR glassesA pair of smartglasses can be considered an augmented reality device if it performs pose tracking. Superimposing information onto a field of view is achieved through an optical head-mounted display (OHMD) or embedded wireless glasses with transparent heads-up display (HUD) or augmented reality (AR) overlay.https://en.wikipedia.org › wiki › SmartglassesSmartglasses - Wikipedia that look into the future of smartphones as hands-free and wearable AI devices. Mark Zuckerberg led the reveal during his keynote speech at Meta Connect 2024, which ran between September 25th and 26th, 2024.',
  '25.09.2024 — Meta CEO Mark Zuckerberg unveiled an ambitious vision for the future of wearable technology. The event showcased significant innovations.',
  "26.09.2024 — The future of wearable computing will sit on your face in gear that looks like goggles or sunglasses, Facebook's 

In [35]:
# Persist the matched snippets and their cleaned context chunks as JSON.
# NOTE(review): this writes to the same path that `crawled_data` was loaded
# from. The records written here have no 'content' key, so re-running the cell
# that reads result['content'] from this file would fail — confirm the intended
# output file name.
with open('crawled_cleaned_content_v2.json', 'w') as f:
    json.dump(empirical_analysis_data, f, indent=2)

In [84]:
example_snippet = crawled_results[0]['snippets_list'][1]
example_markdown = crawled_results[0]['crawled_markdowns_list'][1]

In [85]:
matched_index = reduce_snippet_and_match(example_snippet, example_markdown)

In [93]:
type(remove_all_urls(matched_index['context']))

str

In [87]:
example_snippet

'25.09.2024 — Meta CEO Mark Zuckerberg unveiled an ambitious vision for the future of wearable technology. The event showcased significant innovations.'

In [48]:
import json
with open('fast_crawl_v1.json') as f:
    fast_crawl_results = json.load(f)


In [49]:
fast_crawl_results

[{'query': 'Does Meta produce wearable devices?',
  'urls': ['https://www.designboom.com/technology/meta-true-ar-glasses-orion-smartphones-hands-free-wearable-ai-device-09-26-2024/#:~:text=Meta%20unveils%20Orion%2C%20its%20dubbed,September%2025th%20and%2026th%2C%202024.',
   'https://www.pymnts.com/connectedeconomy/2024/zuckerberg-unveils-metas-wearable-tech-plans-for-the-connected-economy/',
   'https://sherwood.news/tech/meta-ray-ban-apple-vision-pro-competition/',
   'https://www.theverge.com/2024/10/11/24267633/meta-hardware-glasses-quest-andrew-bosworth-interview',
   'https://medium.com/@jcorcione/meta-reorganizes-introduces-wearables-unit-despite-job-cuts-d19f1dd10aa6',
   'https://insidetelecom.com/wearable-technology-competition-between-apple-and-meta/',
   'https://twit.tv/posts/tech/meta-and-future-wearable-tech',
   'https://en.wikipedia.org/wiki/Smartglasses'],
  'content': ['royal enfield releases first electric motorcycle ‘flying flea’ with customizable riding modestechn

In [63]:
# For each query, keep only the snippets that can be located in their page
# content, alongside the cleaned context window around each match.
for item in fast_crawl_results:
    filtered_content_list = []
    snippet_list = []
    for content, snippet in zip(item['content'], item['snippets']):
        # Validate both inputs before matching: an empty or missing snippet or
        # page must never reach the matcher.
        if not (snippet and content):
            continue
        filtered_content = reduce_snippet_and_match(snippet, content, window_length=1000)
        if filtered_content.get('context'):
            filtered_content_list.append(preprocess_text(query=item['query'], text=filtered_content['context'])['cleaned_text'])
            snippet_list.append(snippet)
    item['filtered_content'] = filtered_content_list
    # NOTE(review): this replaces the original snippet list in place, so it no
    # longer lines up index-by-index with item['content']; re-running this
    # cell on the modified data would pair texts with the wrong snippets.
    item['snippets'] = snippet_list




  text = BeautifulSoup(text, "html.parser").get_text()


In [67]:
for item in fast_crawl_results:
    print(len(item['filtered_content']) , len(item['snippets']))

3 3
7 7
6 6
0 0
7 7
3 3
2 2
5 5
0 0
8 8
6 6
7 7
8 8
7 7
6 6
5 5
7 7
5 5
0 0
7 7


In [66]:
# Persist the filtered fast-crawl records as JSON.
# NOTE(review): this overwrites the file `fast_crawl_results` was loaded from,
# after the filtering cell replaced each record's 'snippets' list and added
# 'filtered_content'. Presumably a new file name was intended so the original
# input stays available for re-runs — confirm.
with open('fast_crawl_v1.json', 'w') as f:
    json.dump(fast_crawl_results, f, indent=2)