Get my competitors' pricing from their websites

In [10]:
competitor_sites = [
    {
        "name": "Connected Stories",
        "url": "https://www.connected-stories.com/pricing"
    },
    {
        "name": "ZBrain",
        "url": "https://zbrain.ai/pricing/"
    },
    {
        "name": "Pecan AI",
        "url": "https://www.pecan.ai/pricing/"
    },
    {
        "name": "Bolt Chat AI",
        "url": "https://www.boltchatai.com/pricing/"
    },
]

Let's setup cost calculations

We can calculate how much it'll cost by using OpenAI's tiktoken library.

In [1]:
pip install tiktoken --quiet

Note: you may need to restart the kernel to use updated packages.


In [3]:
import tiktoken

def count_tokens(input_string: str) -> int:
    tokenizer = tiktoken.get_encoding("cl100k_base")

    tokens = tokenizer.encode(input_string)

    return len(tokens)

def calculate_cost(input_string: str, cost_per_million_tokens: float = 5) -> float:
    num_tokens = count_tokens(input_string)

    total_cost = (num_tokens / 1_000_000) * cost_per_million_tokens

    return total_cost

# Example usage:
input_string = "The indications are that AI assistants purposely built for pharmaceutical marketing leverage custom enterprise pricing models negotiated between vendors and clients rather than publicly listed pricing tiers. This aligns with the specialized and highly regulated nature of AI solutions for this industry."

cost = calculate_cost(input_string)
print(f"The total cost for using gpt-4o is: $US {cost:.6f}")

The total cost for using gpt-4o is: $US 0.000225


Let's see the results in a table

In [4]:
pip install prettytable tqdm --quiet

Note: you may need to restart the kernel to use updated packages.


In [7]:
from typing import List, Callable, Dict
from prettytable import PrettyTable, ALL
from tqdm import tqdm

def view_scraped_content(scrape_url_functions: List[Dict[str, Callable[[str], str]]], sites_list: List[Dict[str, str]], characters_to_display: int = 500, table_max_width: int = 50) -> List[Dict[str, str]]:
    content_table_headers = ["Site Name"] + [f"{func['name']} content" for func in scrape_url_functions]
    cost_table_headers = ["Site Name"] + [f"{func['name']} cost" for func in scrape_url_functions]

    content_table = PrettyTable()
    content_table.field_names = content_table_headers

    cost_table = PrettyTable()
    cost_table.field_names = cost_table_headers

    scraped_data = []

    for site in sites_list:
        content_row = [site['name']]
        cost_row = [site['name']]
        site_data = {"provider": site['name'], "sites": []}

        for scrape_function in scrape_url_functions:
            function_name = scrape_function['name']
            for _ in tqdm([site], desc=f"Processing site {site['name']} using {function_name}"):
                try:
                    content = scrape_function['function'](site['url'])
                    content_snippet = content[:characters_to_display]
                    content_row.append(content_snippet)

                    cost = calculate_cost(content)
                    cost_row.append(f"${cost:.6f}")

                    site_data["sites"].append({"name": function_name, "content": content})
                except Exception as e:
                    error_message = f"Error: {str(e)}"
                    content_row.append(error_message)
                    cost_row.append("Error")

                    site_data["sites"].append({"name": function_name, "content": error_message})
                    continue

        content_table.add_row(content_row)
        cost_table.add_row(cost_row)
        scraped_data.append(site_data)

    content_table.max_width = table_max_width
    content_table.hrules = ALL

    cost_table.max_width = table_max_width
    cost_table.hrules = ALL

    print("Content Table:")
    print(content_table)

    print("\nCost Table:\nThis is how much it would cost to use gpt-4o to parse this content for extraction.")
    print(cost_table)

    return scraped_data


Setup the scrapper
we can setup anything here, but now i'm going with Reader API by Jina AI 

In [8]:
import requests

def scrape_jina_ai(url: str) -> str:
  response = requests.get("https://r.jina.ai/" + url)
  return response.text

Let's run scraper

In [12]:
list_of_scraper_functions = [
      {"name": "Jina AI", "function": scrape_jina_ai}
      ]

all_content = view_scraped_content(list_of_scraper_functions, competitor_sites, 700, 20)

Processing site Connected Stories using Jina AI: 100%|██████████| 1/1 [00:00<00:00,  1.47it/s]
Processing site ZBrain using Jina AI: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
Processing site Pecan AI using Jina AI: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
Processing site Bolt Chat AI using Jina AI: 100%|██████████| 1/1 [00:00<00:00,  1.37it/s]

Content Table:
+-------------------+----------------------+
|     Site Name     |   Jina AI content    |
+-------------------+----------------------+
| Connected Stories | Title: Stories - The |
|                   |  NEXT generation of  |
|                   | Creative Management  |
|                   | Platforms Powered by |
|                   |          AI          |
|                   |                      |
|                   | URL Source: https:// |
|                   |    www.connected-    |
|                   | stories.com/pricing  |
|                   |                      |
|                   |  Markdown Content:   |
|                   |       Pricing        |
|                   |                      |
|                   |  Choose the perfect  |
|                   |    plan for your     |
|                   |  business needs or   |
|                   |  get in touch with   |
|                   |         us!          |
|                   |                   




Now let's use OpenAI and extract just the information we need
Let's see how accurate the extraction task is.

First, we create an extraction function using OpenAI's gpt-4o to get only the pricing content from each scraped website from Jina.

In [13]:
import getpass
from openai import OpenAI

OPENAI_API_KEY = getpass.getpass('Enter your OpenAI API key: ')

client = OpenAI(api_key=OPENAI_API_KEY)

def extract(user_input: str):
  entity_extraction_system_message = {"role": "system", "content": "Get me the three pricing tiers from this website's content, and return as a JSON with three keys: {cheapest: {name: str, price: float}, middle: {name: str, price: float}, most_expensive: {name: str, price: float}}"}

  messages = [entity_extraction_system_message]
  messages.append({"role": "user", "content": user_input})

  response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        stream=False,
        response_format={"type": "json_object"}
    )

  return response.choices[0].message.content

Then, we create a utility function to display that content in a table

In [14]:
def display_extracted_content(results: List[Dict[str, any]], num_objects: int):
    table = PrettyTable()
    table.field_names = ["Site", "Provider Name", "Extracted Content"]

    # Ensure num_objects does not exceed the length of the results list
    num_objects = min(num_objects, len(results))

    # Process the specified number of items from the results list with a progress bar
    for result in tqdm(results[:num_objects], desc="Processing results"):
        provider_name = result["provider"]

        for site in result["sites"]:
            function_name = site["name"]
            content = site["content"]

            # Progress bar for each function
            for _ in tqdm(range(1), desc=f"Extracting content with {provider_name} for {function_name}"):
                extracted_content = extract(content)
                table.add_row([provider_name, function_name, extracted_content])

    table.max_width = 50  # Set the maximum width for better display
    table.hrules = ALL

    print("Extracted Content Table:")
    print(table)

In [15]:
display_extracted_content(all_content, num_objects=9)

Extracting content with Connected Stories for Jina AI:   0%|          | 0/1 [00:02<?, ?it/s]
Processing results:   0%|          | 0/4 [00:02<?, ?it/s]


APIConnectionError: Connection error.