In [9]:
import re
import ftfy
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup

# Load Sentence Embedding Model for relevance-based ranking
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def clean_text(text):
    """
    Cleans raw text by removing markdown, HTML tags, and unnecessary noise.
    """
    text = ftfy.fix_text(text)  # Fix encoding issues
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # Remove Emails
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags using BeautifulSoup
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text

def extract_companies(text):
    """
    Extracts company names, descriptions, and other metadata from the raw text.
    """
    company_pattern = re.compile(r"(\d+[\.)\-]?\s*)?([A-Za-z&\.\-\s]+(?:Inc|Ltd|Corp|Group|Corporation)?)\s*(?:\(|:)?([\w\s,\.%&\-]+)?(?:revenue|sales|income)?[\:|—]?\s*(\d+(?:\.\d+)?\s*(million|billion|USD|dollars)?)?", re.MULTILINE)

    companies = []
    for match in company_pattern.finditer(text):
        # Extract matched groups, handling cases where fewer than 4 groups are captured
        groups = match.groups()

        # Ensure we don't unpack more or fewer than expected
        if len(groups) == 4:
            index, name, description, revenue = groups
        else:
            index, name, description, revenue = None, groups[0], groups[1] if len(groups) > 1 else None, None

        name = name.strip() if name else "Unknown Company"
        description = description.strip() if description else "No description"
        revenue = revenue.strip() if revenue else "Unknown"

        company_info = {
            "name": name,
            "description": description,
            "revenue": revenue
        }
        companies.append(company_info)

    return companies

def filter_companies_by_query(companies, query):
    """
    Filters companies based on a query. The query may reference any kind of company data (revenue, market cap, industry, etc.)
    """
    filtered_companies = []
    
    # For now, filter based on revenue mentioned in the query
    revenue_match = re.search(r"(\d+\.?\d*)\s*(billion|million)?\s*dollar", query, re.IGNORECASE)
    min_revenue_million = 500  # Default: 500 million
    if revenue_match:
        min_revenue_million = float(revenue_match.group(1))
        if revenue_match.group(2) and "billion" in revenue_match.group(2).lower():
            min_revenue_million *= 1000  # Convert billion to million

    for company in companies:
        revenue = company["revenue"]
        if revenue and revenue != "Unknown" and revenue.lower().find("billion") >= 0:
            revenue_value = float(revenue.split()[0].replace(",", ""))
            if revenue_value >= min_revenue_million:
                company["revenue_million"] = revenue_value
                filtered_companies.append(company)

    return filtered_companies

def preprocess_text(text, query):
    """
    Cleans the text, extracts relevant company data, and organizes it to be used as context for LLM-based queries.
    """
    # Step 1: Clean and Normalize Text
    cleaned_text = clean_text(text)

    # Step 2: Extract Companies and Descriptions
    companies = extract_companies(cleaned_text)

    # Step 3: Filter Companies Based on Query
    filtered_companies = filter_companies_by_query(companies, query)

    # Step 4: Rank the relevance of extracted chunks for context
    relevant_chunks = []

    # Creating relevant chunks for LLM context, considering description and other details
    for company in filtered_companies:
        relevant_chunk = f"{company['name']}: {company['description']} with revenue {company['revenue']}"
        relevant_chunks.append(relevant_chunk)

    return {
        "cleaned_text": cleaned_text,
        "relevant_chunks": relevant_chunks
    }

# Example Raw Noisy Text (from crawled chunks)
raw_text = """
[Jump to content]( From Wikipedia, the free encyclopedia Walmart has been the world's largest company by revenue since 2014. 673.82B revenue)
# Biggest U.S. Companies by Revenue
1. [WMT] Walmart Inc. 673.82B
2. [AMZN] Amazon.com, Inc. 620.13B
3. [UNH] UnitedHealth Group 393.90B
4. [AAPL] Apple Inc. 391.04B
5. [BRK.B] Berkshire Hathaway Inc. 369.89B
"""

query = "List all companies with 500 million dollar above revenue"

# Process the text to extract companies with the required revenue
processed_output = preprocess_text(raw_text, query)

# Display the output
print("\n🔹 Cleaned Text:", processed_output["cleaned_text"])
print("\n🔹 Relevant Chunks for LLM Context:", processed_output["relevant_chunks"])



🔹 Cleaned Text:  [Jump to content]( From Wikipedia, the free encyclopedia Walmart has been the world's largest company by revenue since 2014. 673.82B revenue) # Biggest U.S. Companies by Revenue 1. [WMT] Walmart Inc. 673.82B 2. [AMZN] Amazon.com, Inc. 620.13B 3. [UNH] UnitedHealth Group 393.90B 4. [AAPL] Apple Inc. 391.04B 5. [BRK.B] Berkshire Hathaway Inc. 369.89B 

🔹 Relevant Chunks for LLM Context: []


In [10]:
import json
crawled_markdowns = []
with open('crawled_markdowns.json') as f:
    crawled_markdowns = json.load(f)

In [18]:
cleaned_chunks = []
context_tokens = 0
for item in crawled_markdowns[0]['crawled_markdowns_list']:
    query = crawled_markdowns[0]['query']
    result = preprocess_text(item, query)
    context_tokens += len(result['cleaned_text'].split())
    cleaned_chunks.append(result)

In [21]:
type(cleaned_chunks[0]['cleaned_text'])

str

In [22]:
context_tokens

24971

In [1]:
import requests
from bs4 import BeautifulSoup
import asyncio
import aiohttp
from newspaper3k import Article
import time
from concurrent.futures import ThreadPoolExecutor

ModuleNotFoundError: No module named 'newspaper3k'

ModuleNotFoundError: No module named 'newspaper3k'

In [3]:
%pip install newspaper3k

Note: you may need to restart the kernel to use updated packages.
