In [None]:
!pip install requests beautifulsoup4 --quiet tiktoken prettytable tqdm

In [44]:
import requests
from typing import List, Callable, Dict
from prettytable import PrettyTable, ALL
from tqdm import tqdm
import csv
import tiktoken
import requests
from bs4 import BeautifulSoup
import firecrawl
import getpass

FIRECRAWL_API_KEY = getpass.getpass("Mendable API Key: ")


Mendable API Key: ··········


In [46]:
sites = [
    {
        "name": "Daraz",
        "url": "https://www.daraz.com.np/products/haylou-ls-02-touch-screen-square-shape-smart-watch-heart-monitoring-call-alert-msg-ip68-waterproof-black-i105587015-s1027376026.html?spm=a2a0e.searchlistcategory.sku.1.21774688wvJyXz&search=1"
    },
    {
        "name": "Amazon",
        "url": "https://www.amazon.in/Memoir-Surgical-Silver-Plated-Stainless/dp/B0771BPK9X/?_encoding=UTF8&pd_rd_w=LeHvj&content-id=amzn1.sym.0009acdf-e53c-4ef1-bf63-a384e1ba591f%3Aamzn1.symc.cdb151ed-d8fe-485d-b383-800c8b0e3fd3&pf_rd_p=0009acdf-e53c-4ef1-bf63-a384e1ba591f&pf_rd_r=FE2909ZA35203CTX8PPP&pd_rd_wg=XLEG1&pd_rd_r=ed7a7a51-162b-4e6e-aa53-7bd4f853ebfe&ref_=pd_hp_d_atf_ci_mcx_mr_hp_atf_m"
    },
    {
        "name": "Flipkart",
        "url": "https://www.flipkart.com/philips-s1151-03-shaver-men/p/itmf70804a8d10f6?pid=SHVGY2SSQYQU47GV&lid=LSTSHVGY2SSQYQU47GVM471GG&marketplace=FLIPKART&store=zlw%2F79s%2Fu3j&srno=b_1_3&otracker=browse&otracker1=hp_rich_navigation_PINNED_neo%2Fmerchandising_NA_NAV_EXPANDABLE_navigationCard_cc_3_L2_view-all&fm=organic&iid=239dc8f3-0419-4e31-a6cf-23f0e6241998.SHVGY2SSQYQU47GV.SEARCH&ppt=browse&ppn=browse&ssid=8t2ig4ekf40000001716740695571"
    }
]

In [42]:

def count_tokens(input_string: str) -> int:
    tokenizer = tiktoken.get_encoding("cl100k_base")

    tokens = tokenizer.encode(input_string)

    return len(tokens)

def calculate_cost(input_string: str, cost_per_million_tokens: float = 5) -> float:
    num_tokens = count_tokens(input_string)

    total_cost = (num_tokens / 1_000_000) * cost_per_million_tokens

    return total_cost

def view_scraped_content(scrape_url_functions: List[Dict[str, Callable[[str], str]]], sites_list: List[Dict[str, str]], characters_to_display: int = 500, table_max_width: int = 50, cost_per_million_tokens: float = 5) -> List[Dict[str, str]]:
    content_table_headers = ["Site Name"] + [f"{func['name']} content" for func in scrape_url_functions]
    cost_table_headers = ["Site Name"] + [f"{func['name']} cost" for func in scrape_url_functions]

    content_table = PrettyTable()
    content_table.field_names = content_table_headers

    cost_table = PrettyTable()
    cost_table.field_names = cost_table_headers

    scraped_data = []

    for site in sites_list:
        content_row = [site['name']]
        cost_row = [site['name']]
        site_data = {"provider": site['name'], "sites": []}

        for scrape_function in scrape_url_functions:
            function_name = scrape_function['name']
            try:
                content = scrape_function['function'](site['url'])
                content_snippet = content[:characters_to_display]
                content_row.append(content_snippet)

                num_tokens = count_tokens(content)
                cost = calculate_cost(num_tokens, cost_per_million_tokens)
                cost_row.append(f"${cost:.6f}")

                site_data["sites"].append({"name": function_name, "content": content})
            except Exception as e:
                error_message = f"Error: {str(e)}"
                content_row.append(error_message)
                cost_row.append("Error")

                site_data["sites"].append({"name": function_name, "content": error_message})
                continue

        content_table.add_row(content_row)
        cost_table.add_row(cost_row)
        scraped_data.append(site_data)

    content_table.max_width = table_max_width
    content_table.hrules = ALL

    print("Content Table:")
    print(content_table)

    with open('content_table.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(content_table)

    return scraped_data

def beautiful_soup_scrape_url(url: str) -> str:
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return str(soup)
    except Exception as e:
        raise RuntimeError(f"Error fetching URL {url}: {str(e)}")

def scrape_jina_ai(url: str) -> str:
    try:
        response = requests.get("https://r.jina.ai/" + url)
        response.raise_for_status()
        return response.text
    except Exception as e:
        raise RuntimeError(f"Error fetching Jina AI URL {url}: {str(e)}")

In [47]:
scraping_functions = [
    {"name": "Beautiful Soup", "function": beautiful_soup_scrape_url},
    {"name": "Jina AI", "function": scrape_jina_ai}
]
view_scraped_content(scraping_functions, sites)

Processing site Daraz using Beautiful Soup: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]
Processing site Daraz using Jina AI: 100%|██████████| 1/1 [00:00<00:00,  1.21it/s]
Processing site Amazon using Beautiful Soup: 100%|██████████| 1/1 [00:00<00:00,  1.51it/s]
Processing site Amazon using Jina AI: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
Processing site Flipkart using Beautiful Soup: 100%|██████████| 1/1 [00:15<00:00, 15.92s/it]
Processing site Flipkart using Jina AI: 100%|██████████| 1/1 [00:00<00:00,  2.78it/s]

Content Table:
+-----------+----------------------------------------------------+----------------------------------------------------+
| Site Name |               Beautiful Soup content               |                  Jina AI content                   |
+-----------+----------------------------------------------------+----------------------------------------------------+
|   Daraz   |                                                    |   Title: Haylou LS-02 Touch Screen Square Shape    |
|           | <a href="https://bixi.alicdn.com/punish/punish:res |  Smart Watch Heart Monitoring Call Alert Msg IP68  |
|           | ource:template:darazSpace:default_30989755.html?qr |                 Waterproof (Black)                 |
|           | code=srDpXTjO8YgqMJvGueSZOw|ZlNuCA|GZ0tIw_0&amp;uu |                                                    |
|           | id=b2b0e95d38cef1882a309bc6b9e4993b&amp;action=den |                    URL Source:                     |
|           | y&amp;origi




[{'provider': 'Daraz',
  'sites': [{'name': 'Beautiful Soup',
    'content': '\n<a href="https://bixi.alicdn.com/punish/punish:resource:template:darazSpace:default_30989755.html?qrcode=srDpXTjO8YgqMJvGueSZOw|ZlNuCA|GZ0tIw_0&amp;uuid=b2b0e95d38cef1882a309bc6b9e4993b&amp;action=deny&amp;origin=https%3A%2F%2Fwww.daraz.com.np%3A443%2Fproducts%2Fhaylou-ls-02-touch-screen-square-shape-smart-watch-heart-monitoring-call-alert-msg-ip68-waterproof-black-i105587015-s1027376026.html" id="a-link"></a>\n<script>\r\nvar host = location.host;\r\nvar parts = host.split(\'.\');\r\nif (parts.length > 2){\r\n  host = parts.pop();\r\n  host = "." + parts.pop() + "." + host;\r\n}\r\nvar exp = new Date();\r\nvar maxAge = -100;\r\nexp.setTime(exp.getTime() + maxAge);\r\nvar cookie = "x5secdata=;maxAge=" + maxAge + ";expires=" + exp.toUTCString() + ";path=/;domain=" + host + ";";\r\ndocument.cookie = cookie;\r\ndocument.cookie = cookie + \'Secure;SameSite=None\';\r\ndocument.getElementById("a-link").click();\r