In [1]:
import pandas as pd
import pyarrow

In [2]:
df1 = pd.read_parquet('mimic_cxr_data/train-00000-of-00002.parquet')
df2 = pd.read_parquet('mimic_cxr_data/train-00001-of-00002.parquet')

df = pd.concat([df1, df2])

In [3]:
df

Unnamed: 0,image,findings,impression
0,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.
1,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,Lung volumes remain low. There are innumerable...,Low lung volumes and mild pulmonary vascular c...
2,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,Lung volumes are low. This results in crowding...,Innumerable pulmonary metastases. Possible mil...
3,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,In comparison to study performed on of there i...,New mild pulmonary edema with persistent small...
4,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,The right costophrenic angle is not imaged. Ot...,An enteric tube courses below the level of the...
...,...,...,...
15311,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,New collapse of the left upper lobe around a l...,New upper lobe collapse and some lower lobe at...
15312,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,AP portable upright view of the chest. In this...,Improved aeration in the left upper lobe. Pers...
15313,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,The cardiomediastinal and hilar contours are w...,No acute cardiopulmonary process. No evidence ...
15314,b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01...,The NG tube courses into the left upper abdom...,Appropriately positioned ET and NG tubes. Biba...


In [4]:
df.iloc[0]['findings']

'The lungs are clear of focal consolidation, pleural effusion or pneumothorax. The heart size is normal. The mediastinal contours are normal. Multiple surgical clips project over the left breast, and old left rib fractures are noted. '

In [5]:
df.iloc[0]['impression']

'No acute cardiopulmonary process.'

In [6]:
import re
import pandas as pd

def count_sentences(text: str) -> int:
    if not isinstance(text, str):
        return 0
    # very simple sentence split; same spirit as your script (split on ".")
    parts = re.split(r"[.!?]", text)
    return sum(1 for p in parts if p.strip())

def count_row_sentences(row):
    findings = row.get("findings", None)
    impression = row.get("impression", None)
    text = ""
    if isinstance(findings, str):
        text += findings + " "
    if isinstance(impression, str):
        text += impression
    return count_sentences(text)

df["n_sentences"] = df.apply(count_row_sentences, axis=1)
total_sentences = int(df["n_sentences"].sum())
total_reports = len(df)

total_reports, total_sentences


(30633, 232160)

In [7]:
S = total_sentences
chat_calls = 3 * S
embedding_calls = 6 * S
total_requests = 9 * S

print('chat_calls:', chat_calls)
print('embedding_calls:', embedding_calls)
print('total_requests:', total_requests)

chat_calls: 696480
embedding_calls: 1392960
total_requests: 2089440


In [8]:
import random
import tiktoken
import re

enc_chat = tiktoken.encoding_for_model("gpt-3.5-turbo")
enc_emb = tiktoken.encoding_for_model("text-embedding-ada-002")

def build_query(sentence: str) -> str:
    return f"""Use the below sentence to answer the subsequent question.
    Emr_report:
    \"\"\"
    {sentence}
    \"\"\"
    Question: Does the patient have the specific disease in the chest based on the provied EMR report's sentence? 
    Answer form should be JSON object like following script. The JSON object has two key, "Result", and "Explanation".
    For [Result], if the sentence doesn't have enough information or evidence to classify, you should return "Uncertain". 
    If the sentence has the clear evidence that indicates absence of any abnormalities in chest, you should answer "No". 
    If the sentence has the clear observational evidence that indicates presence of any abnormalities in chest (only for present), you should answer "Yes". 

    For [Explanation], you should give a sentence more than 40 letters and less than 60 letters which explain the reason about why you choose those answers. You should elucidating the rationale behind your choice, not a direct repetition, of the input text.
    [Result] : Uncertain / No / Yes
    """

def count_chat_tokens(sentence: str) -> int:
    system = "You will be provided with a emr sentecne."
    user = build_query(sentence)
    # simple chat token count approximation:
    return len(enc_chat.encode(system)) + len(enc_chat.encode(user))

def count_sentence_tokens(sentence: str) -> int:
    return len(enc_emb.encode(sentence))

# sample a bunch of sentences from df
samples = []
for _, row in df.sample(n=200, random_state=42).iterrows():
    text = ""
    if isinstance(row.get("findings"), str):
        text += row["findings"] + " "
    if isinstance(row.get("impression"), str):
        text += row["impression"]
    parts = [s.strip() for s in re.split(r"[.!?]", text) if s.strip()]
    samples.extend(parts)

samples = samples[:200]

chat_tokens_per_call = sum(count_chat_tokens(s) for s in samples) / len(samples)
embed_tokens_per_sentence = sum(count_sentence_tokens(s) for s in samples) / len(samples)

chat_tokens_per_sentence_all_passes = chat_tokens_per_call * 3  # 3 passes
embed_tokens_per_sentence_all_passes = (embed_tokens_per_sentence * 2) * 3  # ctx+explanation per pass, 3 passes

chat_tokens_per_sentence_all_passes, embed_tokens_per_sentence_all_passes


(716.145, 71.01)

In [11]:
total_chat_tokens = chat_tokens_per_sentence_all_passes * total_sentences
total_embed_tokens = embed_tokens_per_sentence_all_passes * total_sentences

print('total_chat_tokens:', total_chat_tokens)
print('total_embed_tokens:', total_embed_tokens)

total_chat_tokens: 166260223.2
total_embed_tokens: 16485681.600000001


In [12]:
import re
import numpy as np
import tiktoken

# -------------------------
# CONFIG
# -------------------------

CHAT_MODEL = "gpt-3.5-turbo"
EMBED_MODEL = "text-embedding-ada-002"

NUM_PASSES = 3  # how many GPT "trials" per sentence (your script uses 3)

# pricing (you can tweak these)
CHAT_COST_PER_M = 0.050    # $ per 1,000,000 input tokens
EMB_COST_PER_K = 0.00013   # $ per 1,000 input tokens

# estimated tokens in Explanation text (for embedding calls)
# you can refine this later by sampling real outputs
EXPLANATION_TOKENS_EST = 40


# -------------------------
# TOKENIZERS
# -------------------------

enc_chat = tiktoken.encoding_for_model(CHAT_MODEL)
enc_emb = tiktoken.encoding_for_model(EMBED_MODEL)


# -------------------------
# HELPERS
# -------------------------

def build_report_text(row) -> str:
    """Mirror get_report_text from your labeling script."""
    findings = row.get("findings")
    impression = row.get("impression")

    parts = []
    if isinstance(findings, str) and findings.strip():
        parts.append(findings.strip())
    if isinstance(impression, str) and impression.strip():
        parts.append(impression.strip())

    return ". ".join(parts)


def sentence_split(text: str):
    """Very simple splitter, similar to your script's text.split('.') logic."""
    if not isinstance(text, str):
        return []
    raw = re.split(r"[.]", text)
    return [s.strip() for s in raw if s.strip()]


def build_query(sentence: str) -> str:
    """Same query template used in your labeling code."""
    return f"""Use the below sentence to answer the subsequent question.
    Emr_report:
    \"\"\"
    {sentence}
    \"\"\"
    Question: Does the patient have the specific disease in the chest based on the provied EMR report's sentence? 
    Answer form should be JSON object like following script. The JSON object has two key, "Result", and "Explanation".
    For [Result], if the sentence doesn't have enough information or evidence to classify, you should return "Uncertain". 
    If the sentence has the clear evidence that indicates absence of any abnormalities in chest, you should answer "No". 
    If the sentence has the clear observational evidence that indicates presence of any abnormalities in chest (only for present), you should answer "Yes". 

    For [Explanation], you should give a sentence more than 40 letters and less than 60 letters which explain the reason about why you choose those answers. You should elucidating the rationale behind your choice, not a direct repetition, of the input text.
    [Result] : Uncertain / No / Yes
    """


def chat_input_tokens_for_sentence(sentence: str) -> int:
    system_msg = "You will be provided with a emr sentecne."
    user_msg = build_query(sentence)
    return len(enc_chat.encode(system_msg)) + len(enc_chat.encode(user_msg))


def embedding_tokens_for_context(sentence: str) -> int:
    cleaned = sentence.replace("\n", " ")
    return len(enc_emb.encode(cleaned))


# -------------------------
# MAIN TOKEN + COST ESTIMATION
# -------------------------

all_sentences = []

for _, row in df.iterrows():
    text = build_report_text(row)
    sents = sentence_split(text)
    all_sentences.extend(sents)

S = len(all_sentences)
print(f"Total sentences (S): {S}")

chat_tokens_total = 0
emb_context_tokens_total = 0

for sent in all_sentences:
    chat_tok = chat_input_tokens_for_sentence(sent)
    emb_ctx_tok = embedding_tokens_for_context(sent)

    # per sentence across all passes
    chat_tokens_total += NUM_PASSES * chat_tok
    emb_context_tokens_total += NUM_PASSES * emb_ctx_tok

# explanation embedding tokens: estimated
emb_expl_tokens_total = NUM_PASSES * S * EXPLANATION_TOKENS_EST

emb_tokens_total = emb_context_tokens_total + emb_expl_tokens_total

# API calls
chat_calls = NUM_PASSES * S
embedding_calls = NUM_PASSES * 2 * S  # context + explanation per pass

# costs
chat_cost = (chat_tokens_total / 1_000_000) * CHAT_COST_PER_M
emb_cost = (emb_tokens_total / 1_000) * EMB_COST_PER_K
total_cost = chat_cost + emb_cost

print("\n--- API CALLS ---")
print(f"Chat calls:       {chat_calls:,}")
print(f"Embedding calls:  {embedding_calls:,}")

print("\n--- TOKENS ---")
print(f"Chat input tokens (total):        {chat_tokens_total:,}")
print(f"Embedding input tokens (total):   {emb_tokens_total:,}")
print(f"  - Context embeddings tokens:    {emb_context_tokens_total:,}")
print(f"  - Explanation embeddings tokens (est): {emb_expl_tokens_total:,}")

print("\n--- COST ESTIMATE ---")
print(f"Chat cost:        ${chat_cost:,.2f}")
print(f"Embedding cost:   ${emb_cost:,.2f}")
print(f"Total cost:       ${total_cost:,.2f}")


Total sentences (S): 232836

--- API CALLS ---
Chat calls:       698,508
Embedding calls:  1,397,016

--- TOKENS ---
Chat input tokens (total):        167,054,187
Embedding input tokens (total):   36,514,470
  - Context embeddings tokens:    8,574,150
  - Explanation embeddings tokens (est): 27,940,320

--- COST ESTIMATE ---
Chat cost:        $8.35
Embedding cost:   $4.75
Total cost:       $13.10
