In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import os

import pandas as pd
import plotly.express as px
import PyPDF2
import tiktoken
from utils.tokens import num_tokens_from_string


In [3]:
folder_path = "data"

Now we are going to do the same but including the tokens.


In [4]:
def get_pdfs_data(folder_path, encoding):
    pdf_files = sorted(
        [
            os.path.join(folder_path, f)
            for f in os.listdir(folder_path)
            if f.endswith(".pdf")
        ]
    )
    pdfs_data = []
    for file in pdf_files:
        pdf = PyPDF2.PdfReader(open(file, "rb"))
        pdf_pages = len(pdf.pages)
        filename = os.path.basename(file)
        number_of_words = 0
        number_of_tokens = 0
        for i in range(pdf_pages):
            page = pdf.pages[i]
            text_in_page = page.extract_text()
            words = text_in_page.split()
            number_of_words += len(words)
            number_of_tokens += num_tokens_from_string(text_in_page, encoding=encoding)
        pdfs_data.append(
            {
                "filename": filename,
                "number_of_pages": pdf_pages,
                "number_of_words": number_of_words,
                "number_of_tokens": number_of_tokens,
            }
        )

    return pdfs_data

The "cl100k_base" encoding is the one used for text-embedding-ada-002, text-embedding-3-small and text-embedding-3-large, so the number of tokens is the same for all of these models.


In [5]:
encoding = tiktoken.get_encoding("cl100k_base")

In [6]:
result = get_pdfs_data(
    folder_path="../data/asc_842/lease_agreements/", encoding=encoding
)

In [7]:
df = pd.DataFrame(result)

In [8]:
df

Unnamed: 0,filename,number_of_pages,number_of_words,number_of_tokens
0,lease001.pdf,8,4645,9221
1,lease002.pdf,26,8693,17479
2,lease003.pdf,25,8741,17356
3,lease004.pdf,124,47873,98598
4,lease005.pdf,28,14391,31138
...,...,...,...,...
95,lease096.pdf,18,8227,16398
96,lease097.pdf,6,6437,13356
97,lease098.pdf,5,5190,9826
98,lease099.pdf,18,7900,15539


In [9]:
df_metadata = df[["filename", "number_of_pages", "number_of_words"]]
df_metadata.rename(
    columns={
        "filename": "Nombre de archivo",
        "number_of_pages": "Número de páginas",
        "number_of_words": "Número de palabras",
    },
    inplace=True,
)
df_metadata.to_excel("metadata.xlsx", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata.rename(


In [32]:
statistics_number_of_pages = df["number_of_pages"].describe()

print(statistics_number_of_pages)

count    100.000000
mean      22.880000
std       20.809516
min        2.000000
25%        7.750000
50%       16.000000
75%       30.000000
max      124.000000
Name: number_of_pages, dtype: float64


In [31]:
statistics_number_of_words = df["number_of_words"].describe()

print(statistics_number_of_words)

count      100.000000
mean     10534.190000
std       9668.317767
min        779.000000
25%       4678.750000
50%       6976.000000
75%      13255.750000
max      47873.000000
Name: number_of_words, dtype: float64


In [29]:
fig_tokens_hist = px.histogram(
    df,
    x="number_of_tokens",
    title="Distribución del número de tokens",
    labels={"number_of_tokens": "Número de tokens"},
    nbins=20,
)
fig_tokens_hist.update_layout(yaxis_title="Conteo")
fig_tokens_hist.show()

In [33]:
statistics_number_of_tokens = df["number_of_tokens"].describe()

print(statistics_number_of_tokens)

count      100.000000
mean     21157.360000
std      19478.288773
min       1602.000000
25%       9062.750000
50%      14070.000000
75%      26567.250000
max      98598.000000
Name: number_of_tokens, dtype: float64


Now, we are going to calculate the cost of each experiment, based on the number of tokens and the cost of each token. Each experiment consists of 8 runs, so we are going to calculate the cost of each run and then multiply it by 8. This only applies for the input tokens of the model, because the text embeddings is ran once.


## List of prices for text embeddings

- text-embedding-ada-002: $0.100 / 1M tokens
- text-embedding-3-large: $0.130 / 1M tokens
- text-embedding-3-small: $0.020 / 1M tokens


In [11]:
text_embedding_ada_002 = 0.100 / (10**6)
text_embedding_3_small_price_per_token = 0.02 / (10**6)
text_embedding_3_large_price_per_token = 0.13 / (10**6)

In [12]:
df["price_ada_v2"] = df["number_of_tokens"] * text_embedding_ada_002
df["price_text_embedding_3_small"] = (
    df["number_of_tokens"] * text_embedding_3_small_price_per_token
)
df["price_text_embedding_3_large"] = (
    df["number_of_tokens"] * text_embedding_3_large_price_per_token
)

In [13]:
df[
    [
        "filename",
        "price_ada_v2",
        "price_text_embedding_3_small",
        "price_text_embedding_3_large",
    ]
]

Unnamed: 0,filename,price_ada_v2,price_text_embedding_3_small,price_text_embedding_3_large
0,lease001.pdf,0.000922,0.000184,0.001199
1,lease002.pdf,0.001748,0.000350,0.002272
2,lease003.pdf,0.001736,0.000347,0.002256
3,lease004.pdf,0.009860,0.001972,0.012818
4,lease005.pdf,0.003114,0.000623,0.004048
...,...,...,...,...
95,lease096.pdf,0.001640,0.000328,0.002132
96,lease097.pdf,0.001336,0.000267,0.001736
97,lease098.pdf,0.000983,0.000197,0.001277
98,lease099.pdf,0.001554,0.000311,0.002020


In [14]:
total_price_for_embedding_docs_ada_v2 = round(df["price_ada_v2"].sum(), 2)
total_price_for_embedding_docs_text_embedding_3_small = round(
    df["price_text_embedding_3_small"].sum(), 2
)
total_price_for_embedding_docs_text_embedding_3_large = round(
    df["price_text_embedding_3_large"].sum(), 2
)

In [15]:
print(
    f"Total price for embedding documents (ADA v2): ${total_price_for_embedding_docs_ada_v2}"
)
print(
    f"Total price for embedding documents (text-embedding-3-small): ${total_price_for_embedding_docs_text_embedding_3_small}"
)
print(
    f"Total price for embedding documents (text-embedding-3-large): ${total_price_for_embedding_docs_text_embedding_3_large}"
)


Total price for embedding documents (ADA v2): $0.21
Total price for embedding documents (text-embedding-3-small): $0.04
Total price for embedding documents (text-embedding-3-large): $0.28


## List of prices for models

- gpt-4o-mini: $0.150 / 1M input tokens
- gpt-4o-2024-08-06: $2.50 / 1M input tokens
- gpt-3.5-turbo: $0.500 / 1M input tokens


In [16]:
gpt_4o_mini_price_per_token = 0.15 / (10**6)
gpt_4o_2024_08_06_price_per_token = 2.50 / (10**6)
gpt_3_5_turbo_price_per_token = 0.50 / (10**6)

In [17]:
df["price_per_experiment_gpt_4o_mini"] = (
    df["number_of_tokens"] * gpt_4o_mini_price_per_token
)
df["price_per_experiment_gpt_4o_2024_08_06"] = (
    df["number_of_tokens"] * gpt_4o_2024_08_06_price_per_token
)
df["price_per_experiment_gpt_3_5_turbo"] = (
    df["number_of_tokens"] * gpt_3_5_turbo_price_per_token
)

In [18]:
df[
    [
        "filename",
        "price_per_experiment_gpt_4o_mini",
        "price_per_experiment_gpt_4o_2024_08_06",
        "price_per_experiment_gpt_3_5_turbo",
    ]
]

Unnamed: 0,filename,price_per_experiment_gpt_4o_mini,price_per_experiment_gpt_4o_2024_08_06,price_per_experiment_gpt_3_5_turbo
0,lease001.pdf,0.001383,0.023053,0.004611
1,lease002.pdf,0.002622,0.043698,0.008739
2,lease003.pdf,0.002603,0.043390,0.008678
3,lease004.pdf,0.014790,0.246495,0.049299
4,lease005.pdf,0.004671,0.077845,0.015569
...,...,...,...,...
95,lease096.pdf,0.002460,0.040995,0.008199
96,lease097.pdf,0.002003,0.033390,0.006678
97,lease098.pdf,0.001474,0.024565,0.004913
98,lease099.pdf,0.002331,0.038848,0.007769


In [19]:
price_per_experiment_gpt_4o_mini = round(
    df["price_per_experiment_gpt_4o_mini"].sum(), 2
)
price_per_experiment_gpt_4o_2024_08_06 = round(
    df["price_per_experiment_gpt_4o_2024_08_06"].sum(), 2
)
price_per_experiment_gpt_3_5_turbo = round(
    df["price_per_experiment_gpt_3_5_turbo"].sum(), 2
)

In [20]:
print(f"Price per experiment (GPT-4o-mini): ${price_per_experiment_gpt_4o_mini}")
print(
    f"Price per experiment (GPT-4o-2024-08-06): ${price_per_experiment_gpt_4o_2024_08_06}"
)
print(f"Price per experiment (GPT-3.5-turbo): ${price_per_experiment_gpt_3_5_turbo}")

Price per experiment (GPT-4o-mini): $0.32
Price per experiment (GPT-4o-2024-08-06): $5.29
Price per experiment (GPT-3.5-turbo): $1.06


In [21]:
total_price_gpt_4o_mini = price_per_experiment_gpt_4o_mini * 8
total_price_gpt_4o_2024_08_06 = price_per_experiment_gpt_4o_2024_08_06 * 8
total_price_gpt_3_5_turbo = price_per_experiment_gpt_3_5_turbo * 8

In [22]:
print(f"Total price (GPT-4o-mini): ${total_price_gpt_4o_mini}")
print(f"Total price (GPT-4o-2024-08-06): ${total_price_gpt_4o_2024_08_06}")
print(f"Total price (GPT-3.5-turbo): ${total_price_gpt_3_5_turbo}")


Total price (GPT-4o-mini): $2.56
Total price (GPT-4o-2024-08-06): $42.32
Total price (GPT-3.5-turbo): $8.48
