In [2]:
import json
import tiktoken

def main():
    # Step 1: Read and parse the JSON file.
    with open("2024_2025_spring_courses.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    
    # Step 2: Convert the JSON data to a string.
    json_text = json.dumps(data)
    
    # Step 3: Explicitly get the encoding.
    # Since "GPT-4o" is not automatically mapped, we'll use "cl100k_base" as the encoding.
    encoding = tiktoken.get_encoding("cl100k_base")
    
    # Step 4: Encode the text and count the tokens.
    tokens = encoding.encode(json_text)
    token_count = len(tokens)
    
    # Step 5: Calculate the equivalent price.
    # Example rate: $0.03 per 1,000 tokens
    price_per_1000_tokens = 0.03  
    equivalent_price = (token_count / 1000) * price_per_1000_tokens
    
    print(f"Token count: {token_count}")
    print(f"Equivalent price: ${equivalent_price:.4f}")

if __name__ == "__main__":
    main()


Token count: 25958
Equivalent price: $0.7787


In [3]:
import json
import tiktoken

def main():
    # Step 1: Read and parse the JSON file.
    with open("2024_2025_spring_courses.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    
    # Step 2: Convert the JSON data to a string.
    # This serialized string is what we will tokenize.
    json_text = json.dumps(data)
    
    # Step 3: Calculate the total number of characters.
    char_count = len(json_text)
    
    # Step 4: Calculate the total number of words.
    # Here, we split the text by whitespace.
    word_count = len(json_text.split())
    
    # Step 5: Get the tokenizer encoding.
    # Since GPT-4o isn't directly available, we use "cl100k_base" which is common for GPT-4.
    encoding = tiktoken.get_encoding("cl100k_base")
    
    # Step 6: Encode the text and count tokens.
    tokens = encoding.encode(json_text)
    token_count = len(tokens)
    
    # Calculate the average number of characters per token.
    avg_chars_per_token = char_count / token_count if token_count > 0 else 0
    
    # Step 7: Calculate the equivalent price.
    # Example rate: $0.03 per 1,000 tokens.
    price_per_1000_tokens = 0.03
    equivalent_price = (token_count / 1000) * price_per_1000_tokens
    
    # Step 8: Print out all details.
    print("Token and Character Statistics:")
    print("---------------------------------")
    print(f"Total characters: {char_count}")
    print(f"Total words: {word_count}")
    print(f"Total tokens (using 'cl100k_base' encoding): {token_count}")
    print(f"Average characters per token: {avg_chars_per_token:.2f}")
    print()
    print("Cost Calculation:")
    print("-----------------")
    print(f"Price per 1000 tokens: ${price_per_1000_tokens}")
    print(f"Equivalent price: ${equivalent_price:.4f}")
    print()
    print("Calculation Details:")
    print("---------------------")
    print("1. The JSON file is read and serialized using json.dumps(), and its length is used for the character count.")
    print("2. Word count is determined by splitting the text on whitespace (using str.split()).")
    print("3. Tokens are calculated by encoding the text with tiktoken's 'cl100k_base' encoding.")
    print("4. Average characters per token is: total characters / total tokens.")
    print("5. The cost is computed by scaling the token count to a price per 1,000 tokens.")

if __name__ == "__main__":
    main()


Token and Character Statistics:
---------------------------------
Total characters: 104335
Total words: 12160
Total tokens (using 'cl100k_base' encoding): 25958
Average characters per token: 4.02

Cost Calculation:
-----------------
Price per 1000 tokens: $0.03
Equivalent price: $0.7787

Calculation Details:
---------------------
1. The JSON file is read and serialized using json.dumps(), and its length is used for the character count.
2. Word count is determined by splitting the text on whitespace (using str.split()).
3. Tokens are calculated by encoding the text with tiktoken's 'cl100k_base' encoding.
4. Average characters per token is: total characters / total tokens.
5. The cost is computed by scaling the token count to a price per 1,000 tokens.


In [4]:
import json
import tiktoken

def main():
    # Step 1: Read and parse the JSON file.
    with open("2024_2025_spring_courses.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    
    # Step 2: Convert the JSON data to a string.
    # This serialized string is what we will tokenize.
    json_text = json.dumps(data)
    
    # Step 3: Calculate the total number of characters.
    char_count = len(json_text)
    
    # Step 4: Calculate the total number of words.
    # Here, we split the text by whitespace.
    word_count = len(json_text.split())
    
    # Step 5: Get the tokenizer encoding.
    # Since GPT-4o isn't directly available, we use "cl100k_base",
    # which is common for GPT-4 based models.
    encoding = tiktoken.get_encoding("cl100k_base")
    
    # Step 6: Encode the text and count tokens.
    tokens = encoding.encode(json_text)
    token_count = len(tokens)
    
    # Calculate the average number of characters per token.
    avg_chars_per_token = char_count / token_count if token_count > 0 else 0
    
    # Step 7: Calculate the equivalent price.
    # Using the blended pricing for GPT-4o:
    # - Input tokens: $2.50 per 1M tokens
    # - Output tokens: $10.00 per 1M tokens
    # Assuming 80% of tokens are input and 20% output:
    #   Blended cost = (0.8 * 2.50) + (0.2 * 10.00) = $4.00 per 1M tokens,
    # which is equivalent to $0.004 per 1,000 tokens.
    price_per_1000_tokens = 0.004  # $0.004 per 1,000 tokens
    equivalent_price = (token_count / 1000) * price_per_1000_tokens
    
    # Step 8: Print out all details.
    print("Token and Character Statistics:")
    print("---------------------------------")
    print(f"Total characters: {char_count}")
    print(f"Total words: {word_count}")
    print(f"Total tokens (using 'cl100k_base' encoding): {token_count}")
    print(f"Average characters per token: {avg_chars_per_token:.2f}")
    print()
    print("Cost Calculation:")
    print("-----------------")
    print(f"Blended price per 1,000 tokens: ${price_per_1000_tokens:.4f}")
    print(f"Equivalent price for {token_count} tokens: ${equivalent_price:.4f}")
    print()
    print("Calculation Details:")
    print("---------------------")
    print("1. The JSON file is read and serialized using json.dumps(), and its length is used for the character count.")
    print("2. Word count is determined by splitting the text on whitespace (using str.split()).")
    print("3. Tokens are calculated by encoding the text with tiktoken's 'cl100k_base' encoding.")
    print("4. Average characters per token is computed as: total characters / total tokens.")
    print("5. The cost is computed using a blended rate of $4.00 per 1,000,000 tokens (or $0.004 per 1,000 tokens),")
    print("   which assumes 80% input and 20% output tokens for the GPT-4o model.")

if __name__ == "__main__":
    main()


Token and Character Statistics:
---------------------------------
Total characters: 104335
Total words: 12160
Total tokens (using 'cl100k_base' encoding): 25958
Average characters per token: 4.02

Cost Calculation:
-----------------
Blended price per 1,000 tokens: $0.0040
Equivalent price for 25958 tokens: $0.1038

Calculation Details:
---------------------
1. The JSON file is read and serialized using json.dumps(), and its length is used for the character count.
2. Word count is determined by splitting the text on whitespace (using str.split()).
3. Tokens are calculated by encoding the text with tiktoken's 'cl100k_base' encoding.
4. Average characters per token is computed as: total characters / total tokens.
5. The cost is computed using a blended rate of $4.00 per 1,000,000 tokens (or $0.004 per 1,000 tokens),
   which assumes 80% input and 20% output tokens for the GPT-4o model.


In [5]:
import json
import tiktoken

def main():
    # Step 1: Read and parse the JSON file.
    with open("2024_2025_spring_courses.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    
    # Step 2: Convert the JSON data to a string.
    # This serialized string is what we will tokenize.
    json_text = json.dumps(data)
    
    # Step 3: Calculate the total number of characters.
    char_count = len(json_text)
    
    # Step 4: Calculate the total number of words.
    # Splitting the text by whitespace gives an approximate word count.
    word_count = len(json_text.split())
    
    # Step 5: Get the tokenizer encoding.
    # We use "cl100k_base" encoding which is common for GPT-4 and its variants.
    encoding = tiktoken.get_encoding("cl100k_base")
    
    # Step 6: Encode the text and count the tokens (these represent input tokens).
    tokens = encoding.encode(json_text)
    token_count = len(tokens)
    
    # Calculate the average number of characters per token.
    avg_chars_per_token = char_count / token_count if token_count > 0 else 0
    
    # Step 7: Calculate the equivalent cost for input tokens only.
    # Given pricing: $2.50 per 1,000,000 tokens
    # Therefore, cost per token = 2.50 / 1,000,000 dollars.
    input_cost_per_token = 2.50 / 1_000_000  # dollars per token
    equivalent_price_input = token_count * input_cost_per_token
    
    # Step 8: Print out all details.
    print("Token and Character Statistics:")
    print("---------------------------------")
    print(f"Total characters: {char_count}")
    print(f"Total words: {word_count}")
    print(f"Total input tokens (using 'cl100k_base' encoding): {token_count}")
    print(f"Average characters per token: {avg_chars_per_token:.2f}")
    print()
    print("Input Cost Calculation:")
    print("-------------------------")
    print(f"Price per 1,000,000 input tokens: $2.50")
    print(f"Equivalent cost for {token_count} input tokens: ${equivalent_price_input:.4f}")
    print()
    print("Calculation Details:")
    print("---------------------")
    print("1. The JSON file is read and serialized using json.dumps(), and its length is used for the character count.")
    print("2. Word count is determined by splitting the text on whitespace (using str.split()).")
    print("3. Input tokens are calculated by encoding the text with tiktoken's 'cl100k_base' encoding.")
    print("4. The cost is computed using the input pricing of $2.50 per 1,000,000 tokens.")
    
if __name__ == "__main__":
    main()


Token and Character Statistics:
---------------------------------
Total characters: 104335
Total words: 12160
Total input tokens (using 'cl100k_base' encoding): 25958
Average characters per token: 4.02

Input Cost Calculation:
-------------------------
Price per 1,000,000 input tokens: $2.50
Equivalent cost for 25958 input tokens: $0.0649

Calculation Details:
---------------------
1. The JSON file is read and serialized using json.dumps(), and its length is used for the character count.
2. Word count is determined by splitting the text on whitespace (using str.split()).
3. Input tokens are calculated by encoding the text with tiktoken's 'cl100k_base' encoding.
4. The cost is computed using the input pricing of $2.50 per 1,000,000 tokens.


In [7]:
pip install streamlit tiktoken


Collecting packaging<24,>=16.8 (from streamlit)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading packaging-23.2-py3-none-any.whl (53 kB)
    torch (>=1.7torchvision)
          ~~~~~~^[0m[33m
[0mInstalling collected packages: packaging
  Attempting uninstall: packaging
    Found existing installation: packaging 24.1
    Uninstalling packaging-24.1:
      Successfully uninstalled packaging-24.1
Successfully installed packaging-23.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import tkinter as tk
from tkinter import filedialog
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_file():
    """
    Opens a file dialog for the user to select a txt file.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    file_path = filedialog.askopenfilename(
        title="Select a txt file", 
        filetypes=[("Text Files", "*.txt")]
    )
    return file_path

if __name__ == "__main__":
    file_path = choose_file()
    if not file_path:
        print("No file selected. Exiting.")
        sys.exit(1)
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            file_text = file.read()
    except Exception as e:
        print(f"Error processing text file: {e}")
        sys.exit(1)
    
    # Calculate token count using GPT‑4's "cl100k_base" encoding
    token_count = count_tokens(file_text)
    print(f"Token count: {token_count}")


Token count: 1700


In [9]:
import tkinter as tk
from tkinter import filedialog
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_file():
    """
    Opens a file dialog for the user to select a txt file.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    file_path = filedialog.askopenfilename(
        title="Select a txt file", 
        filetypes=[("Text Files", "*.txt")]
    )
    return file_path

if __name__ == "__main__":
    file_path = choose_file()
    if not file_path:
        print("No file selected. Exiting.")
        sys.exit(1)
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            file_text = file.read()
    except Exception as e:
        print(f"Error processing text file: {e}")
        sys.exit(1)
    
    # Calculate token count using GPT‑4's "cl100k_base" encoding
    token_count = count_tokens(file_text)
    
    # Calculate total characters
    char_count = len(file_text)
    
    print(f"Token count: {token_count}")
    print(f"Character count: {char_count}")


Token count: 1700
Character count: 9074


In [10]:
 import tkinter as tk
from tkinter import filedialog
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_file():
    """
    Opens a file dialog for the user to select a txt file.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    file_path = filedialog.askopenfilename(
        title="Select a txt file", 
        filetypes=[("Text Files", "*.txt")]
    )
    return file_path

if __name__ == "__main__":
    file_path = choose_file()
    if not file_path:
        print("No file selected. Exiting.")
        sys.exit(1)
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            file_text = file.read()
    except Exception as e:
        print(f"Error processing text file: {e}")
        sys.exit(1)
    
    # Calculate token count using GPT-4's "cl100k_base" encoding
    token_count = count_tokens(file_text)
    
    # Calculate total characters
    char_count = len(file_text)
    
    # Cost calculation
    price_per_1000_tokens = 0.004  # $0.004 per 1,000 tokens
    equivalent_price = (token_count / 1000) * price_per_1000_tokens
    
    print(f"Token count: {token_count}")
    print(f"Character count: {char_count}")
    print(f"Estimated cost: ${equivalent_price:.4f}")


Token count: 1700
Character count: 9074
Estimated cost: $0.0068


In [3]:
import tkinter as tk
from tkinter import filedialog
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback to cl100k_base if the model isn't recognized
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_file():
    """
    Opens a file dialog for the user to select a txt file.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    file_path = filedialog.askopenfilename(
        title="Select a txt file",
        filetypes=[("Text Files", "*.txt")]
    )
    return file_path

if __name__ == "__main__":
    file_path = choose_file()
    if not file_path:
        print("No file selected. Exiting.")
        sys.exit(1)
    
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            file_text = file.read()
    except Exception as e:
        print(f"Error processing text file: {e}")
        sys.exit(1)
    
    # Remove substrings that should not be counted
    excluded_substrings = [
        "Ask about courses (or type 'exit' to quit)",
        "Assistant: "
    ]
    for ex_substring in excluded_substrings:
        file_text = file_text.replace(ex_substring, "")
    
    # Calculate token count using GPT-4's "cl100k_base" encoding
    token_count = count_tokens(file_text)
    
    # Calculate total characters
    char_count = len(file_text)
    
    # Cost calculation
    price_per_1000_tokens = 0.004  # $0.004 per 1,000 tokens
    equivalent_price = (token_count / 1000) * price_per_1000_tokens
    
    # Print results
    print(f"Token count: {token_count}")
    print(f"Character count: {char_count}")
    print(f"Estimated cost: ${equivalent_price:.4f}")


Token count: 376
Character count: 2159
Estimated cost: $0.0015


In [5]:
import tkinter as tk
from tkinter import filedialog
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback to cl100k_base if the model isn't recognized
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_file():
    """
    Opens a file dialog for the user to select a txt file.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    file_path = filedialog.askopenfilename(
        title="Select a txt file",
        filetypes=[("Text Files", "*.txt")]
    )
    return file_path

if __name__ == "__main__":
    file_path = choose_file()
    if not file_path:
        print("No file selected. Exiting.")
        sys.exit(1)
    
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            file_text = file.read()
    except Exception as e:
        print(f"Error processing text file: {e}")
        sys.exit(1)
    
    # Remove substrings that should not be counted
    excluded_substrings = [
        "Ask about courses (or type 'exit' to quit)",
        "Assistant: "
    ]
    for ex_substring in excluded_substrings:
        file_text = file_text.replace(ex_substring, "")
    
    # Calculate token count using GPT-4's "cl100k_base" encoding
    token_count = count_tokens(file_text)
    
    # Calculate total characters
    char_count = len(file_text)
    
    # New cost calculation using the provided formula:
    # Cost_output = (token_count / 1,000,000) * 10
    cost_output = (token_count / 1000000) * 10
    
    # Print results
    print(f"Token count: {token_count}")
    print(f"Character count: {char_count}")
    print(f"Estimated cost (new formula): ${cost_output:.4f}")


Token count: 318
Character count: 1599
Estimated cost (new formula): $0.0032


In [15]:
import tkinter as tk
from tkinter import filedialog
import os
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback to cl100k_base if the model isn't recognized
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_folder():
    """
    Opens a folder dialog for the user to select a folder containing txt files.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    folder_path = filedialog.askdirectory(title="Select folder containing txt files")
    return folder_path

if __name__ == "__main__":
    folder_path = choose_folder()
    if not folder_path:
        print("No folder selected. Exiting.")
        sys.exit(1)
    
    # List all .txt files in the selected folder
    txt_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.txt')]
    if not txt_files:
        print("No .txt files found in the selected folder.")
        sys.exit(1)
    
    for txt_file in txt_files:
        file_path = os.path.join(folder_path, txt_file)
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                file_text = file.read()
        except Exception as e:
            print(f"Error processing file {txt_file}: {e}")
            continue
        
        # Remove substrings that should not be counted
        excluded_substrings = [
            "Ask about courses (or type 'exit' to quit)",
            "Assistant: "
        ]
        for ex_substring in excluded_substrings:
            file_text = file_text.replace(ex_substring, "")
        
        # Calculate token count using GPT-4's "cl100k_base" encoding
        token_count = count_tokens(file_text)
        # Calculate total characters
        char_count = len(file_text)
        # Cost calculation using the provided formula:
        # Cost_output = (token_count / 1,000,000) * 10
        cost_output = (token_count / 1000000) * 10
        
        # Print results for each file
        print(f"File: {txt_file}")
        print(f"Token count: {token_count}")
        print(f"Character count: {char_count}")
        print(f"Estimated cost (using new formula): ${cost_output:.4f}")
        print("-" * 40)


File: Question10 Time2.txt
Token count: 320
Character count: 1819
Estimated cost (using new formula): $0.0032
----------------------------------------
File: Question10 Time3.txt
Token count: 311
Character count: 1785
Estimated cost (using new formula): $0.0031
----------------------------------------
File: Question10 Time1.txt
Token count: 314
Character count: 1776
Estimated cost (using new formula): $0.0031
----------------------------------------
File: Question10 Time4.txt
Token count: 323
Character count: 1830
Estimated cost (using new formula): $0.0032
----------------------------------------
File: Question10 Time5.txt
Token count: 311
Character count: 1770
Estimated cost (using new formula): $0.0031
----------------------------------------


In [5]:
import tkinter as tk
from tkinter import filedialog
import os
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback to cl100k_base if the model isn't recognized
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_folder():
    """
    Opens a folder dialog for the user to select a folder containing txt files.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    folder_path = filedialog.askdirectory(title="Select folder containing txt files")
    return folder_path

if __name__ == "__main__":
    folder_path = choose_folder()
    if not folder_path:
        print("No folder selected. Exiting.")
        sys.exit(1)
    
    # List all .txt files in the selected folder
    txt_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.txt')]
    if not txt_files:
        print("No .txt files found in the selected folder.")
        sys.exit(1)
    
    for txt_file in txt_files:
        file_path = os.path.join(folder_path, txt_file)
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                file_text = file.read()
        except Exception as e:
            print(f"Error processing file {txt_file}: {e}")
            continue
        
        # Remove trailing blank lines at the bottom of the file while keeping internal blank lines
        lines = file_text.splitlines(keepends=True)
        while lines and not lines[-1].strip():
            lines.pop()
        file_text = ''.join(lines)
        
        # Remove substrings that should not be counted
        excluded_substrings = [
            "Ask about courses (or type 'exit' to quit)",
            "Assistant: "
        ]
        for ex_substring in excluded_substrings:
            file_text = file_text.replace(ex_substring, "")
        
        # Calculate token count using GPT-4's "cl100k_base" encoding
        token_count = count_tokens(file_text)
        # Calculate total characters
        char_count = len(file_text)
        # Cost calculation using the provided formula:
        # Cost_output = (token_count / 1,000,000) * 10
        cost_output = (token_count / 1000000) * 10
        
        # Print results for each file
        print(f"File: {txt_file}")
        print(f"Token count: {token_count}")
        print(f"Character count: {char_count}")
        print(f"Estimated cost (using new formula): ${cost_output:.4f}")
        print("-" * 40)


File: Question2_Time1.txt
Token count: 251
Character count: 1075
Estimated cost (using new formula): $0.0025
----------------------------------------
File: Question2_Time3.txt
Token count: 239
Character count: 1010
Estimated cost (using new formula): $0.0024
----------------------------------------
File: Question2_Time2.txt
Token count: 240
Character count: 1013
Estimated cost (using new formula): $0.0024
----------------------------------------
File: Question2_Time6.txt
Token count: 339
Character count: 1455
Estimated cost (using new formula): $0.0034
----------------------------------------
File: Question2_Time7.txt
Token count: 244
Character count: 1054
Estimated cost (using new formula): $0.0024
----------------------------------------
File: Question2_Time5.txt
Token count: 236
Character count: 989
Estimated cost (using new formula): $0.0024
----------------------------------------
File: Question2_Time4.txt
Token count: 312
Character count: 1344
Estimated cost (using new formula): 

In [24]:
import tkinter as tk
from tkinter import filedialog
import os
import tiktoken
import sys

def count_tokens(text, model="gpt-4o"):
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Fallback to cl100k_base if the model isn't recognized
        encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return len(tokens)

def choose_files():
    """
    Opens a file dialog for the user to select individual txt files.
    """
    root = tk.Tk()
    root.withdraw()  # Hide the main Tkinter window
    file_paths = filedialog.askopenfilenames(
        title="Select txt files", 
        filetypes=[("Text files", "*.txt")]
    )
    return file_paths

if __name__ == "__main__":
    file_paths = choose_files()
    if not file_paths:
        print("No files selected. Exiting.")
        sys.exit(1)
    
    for file_path in file_paths:
        txt_file = os.path.basename(file_path)
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                file_text = file.read()
        except Exception as e:
            print(f"Error processing file {txt_file}: {e}")
            continue
        
        # Remove trailing blank lines at the bottom of the file while keeping internal blank lines
        lines = file_text.splitlines(keepends=True)
        while lines and not lines[-1].strip():
            lines.pop()
        file_text = ''.join(lines)
        
        # Remove substrings that should not be counted
        excluded_substrings = [
            "Ask about courses (or type 'exit' to quit)",
            "Assistant: "
        ]
        for ex_substring in excluded_substrings:
            file_text = file_text.replace(ex_substring, "")
        
        # Calculate token count using GPT-4's "cl100k_base" encoding
        token_count = count_tokens(file_text)
        # Calculate total characters
        char_count = len(file_text)
        # Cost calculation using the provided formula:
        # Cost_output = (token_count / 1,000,000) * 10
        cost_output = (token_count / 1000000) * 10
        
        # Print results for each file
        print(f"File: {txt_file}")
        print(f"Token count: {token_count}")
        print(f"Character count: {char_count}")
        print(f"Estimated cost (using new formula): ${cost_output:.4f}")
        print("-" * 40)


File: Question3 Time6.txt
Token count: 226
Character count: 1210
Estimated cost (using new formula): $0.0023
----------------------------------------


In [16]:
import tiktoken

def calculate_gpt4o_input_cost(text: str, price_per_million_tokens: float = 2.50):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    token_count = len(encoding.encode(text))
    cost = (token_count / 1_000_000) * price_per_million_tokens
    return token_count, cost

text_input = input("Enter your GPT-4o input: ")
tokens, cost = calculate_gpt4o_input_cost(text_input)

print(f"Total tokens: {tokens}")
print(f"Total cost: ${cost:.6f}")


Enter your GPT-4o input:  Are there specific scheduling guidelines or typical semester offerings for required courses like Network Programming (CSC 544) and Database Management Systems (MIS 555) that I should plan for?


Total tokens: 35
Total cost: $0.000087


In [21]:
import json
import tiktoken

def count_cached_input_tokens_and_cost(
    jsonl_file_path: str,
    model: str = "gpt-4o",
    price_per_million_tokens: float = 1.25
) -> None:
    """
    Reads a JSONL file line by line, uses tiktoken to count tokens,
    and calculates cost based on the provided rate.
    
    :param jsonl_file_path: Path to your JSONL file
    :param model: Model name for which you want to use tiktoken encoding
    :param price_per_million_tokens: Price in dollars per 1 million tokens
    """
    
    # Get the tiktoken encoding for the specified model
    encoding = tiktoken.encoding_for_model(model)

    total_tokens = 0
    
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Strip empty lines/spaces
            line = line.strip()
            if not line:
                continue
            
            # Load JSON (if you need to process fields, do it here)
            # data = json.loads(line)
            # For counting tokens, you can encode the entire raw line as is:
            tokens_in_line = len(encoding.encode(line))
            total_tokens += tokens_in_line
    
    # Calculate cost
    total_cost = (total_tokens / 1_000_000) * price_per_million_tokens
    
    print(f"Total tokens: {total_tokens}")
    print(f"Total cost: ${total_cost:.6f}")


if __name__ == "__main__":
    # Example usage:
    # Replace 'course_index_cache.jsonl' with the path to your JSONL file
    jsonl_path = "course_index_cache.json"
    count_cached_input_tokens_and_cost(jsonl_path)


Total tokens: 25566
Total cost: $0.031957


In [20]:
import json
import tiktoken

def count_cached_input_tokens_and_cost(
    json_file_path: str,
    model: str = "gpt-4o",
    price_per_million_tokens: float = 1.25
):
    """
    Reads a JSON file (single object or array of objects), uses tiktoken to count tokens,
    and calculates cost based on the provided rate (default: $1.25 per 1M tokens).
    
    :param json_file_path: Path to your JSON file
    :param model: Model name for which you want to use tiktoken encoding
    :param price_per_million_tokens: Price in dollars per 1 million tokens
    """

    # Get the tiktoken encoding for the specified model
    encoding = tiktoken.encoding_for_model(model)
    total_tokens = 0

    # Load the entire JSON file
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    def encode_item(item):
        # Convert item to a JSON string to tokenize the entire structure
        item_as_string = json.dumps(item, ensure_ascii=False)
        return len(encoding.encode(item_as_string))

    # If the JSON is a list, iterate over each element
    if isinstance(data, list):
        for item in data:
            total_tokens += encode_item(item)
    # If the JSON is a single dictionary or another type, just encode it directly
    else:
        total_tokens += encode_item(data)

    # Calculate cost
    total_cost = (total_tokens / 1_000_000) * price_per_million_tokens

    print(f"Total tokens: {total_tokens}")
    print(f"Total cost: ${total_cost:.6f}")


if __name__ == "__main__":
    # Example usage:
    # Replace 'course_index_cache.json' with the path to your JSON file
    json_path = "course_index_cache.json"
    count_cached_input_tokens_and_cost(json_path)


Total tokens: 25571
Total cost: $0.031964
