# Project Set Up

In [1]:
import sys
import os
current_dir = os.getcwd()
from pathlib import Path
sys.path.append(str(Path(current_dir).parent))

In [2]:
from tools.data_loader import read_data
import json
import pandas as pd
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [23]:

# List of result folders
slm_folders = {
    "Qwen2.5-Math-1.5B-Instruct": "results_Qwen2.5-Math-1.5B-Instruct_500samples_math_500path",
    "Qwen2.5-Math-7B-Instruct": "results_Qwen2.5-Math-7B-Instruct_500samples_math_500path",
}
llm_folders = {
    "Gemini-2.5-Flash": "results_Gemini-2.5-Flash_500samples_math_500_path",
    "Gemini-2.5-Flash-Lite": "results_Gemini-2.5-Flash-Lite_500samples_math_500_path",
    "Gemini-2.5-Pro": "results_Gemini-2.5-Pro_500samples_math_500_path"
}

delegation_folders = {
    "Delegation Model": "results_Gemini-2.5-Flash_Qwen-1.5_500samples_math_500_path"
}

# Cost Analysis

In [12]:
input_cost_dict_per_1M_tokens = {
    "Gemini-2.5-Flash": 0.30,
    "Gemini-2.5-Flash-Lite": 0.10,
    "Gemini-2.5-Pro": 1.25,
    "Qwen2.5-Math-1.5B-Instruct": 0.10,
    "Qwen2.5-Math-7B-Instruct": 0.30,
}

output_cost_dict_per_1M_tokens = {
    "Gemini-2.5-Flash": 2.50,
    "Gemini-2.5-Flash-Lite": 0.40,
    "Gemini-2.5-Pro": 10,
    "Qwen2.5-Math-1.5B-Instruct": 0.10,
    "Qwen2.5-Math-7B-Instruct": 0.30,
}

In [13]:
# Cost analysis plan:
# 1. Extract cost data from results
# 2. Calculate total cost depending on the tokens
# 3. With one model task, calculate the total cost and average cost per query.

## One Model Analysis Table

In [39]:
def col_sum(df, col):
    return int(df[col].sum()) if col in df.columns else 0
def col_mean(df, col):
    return float(df[col].mean()) if col in df.columns and len(df)>0 else None



### SLM Analysis

In [None]:
# Analysis table for selected SLM models
from pathlib import Path
models_to_report = ["Qwen2.5-Math-1.5B-Instruct", "Qwen2.5-Math-7B-Instruct"]
rows = []


for model_name, folder in slm_folders.items():

    results_path = f"{folder}/results_slm.json"

    if results_path is None:
        print(f'Warning: results_slm.json not found for {model_name} (tried {results_path})')
        continue


    with open(results_path, 'r', encoding='utf-8') as f:
        data = json.load(f)


    results_list = data.get('results', [])

    if not results_list:
        print(f'No result entries for {model_name} in {results_path}')
        continue


    results_df = pd.DataFrame(results_list)


    total = len(results_df)

    if 'is_correct' in results_df.columns:
        accuracy = float(results_df['is_correct'].astype(float).mean())
    else:
        accuracy = None



    avg_latency = col_mean(results_df, 'latency_total')

    total_latency = float(results_df['latency_total'].sum()) if 'latency_total' in results_df.columns else None

    avg_input_tokens = col_mean(results_df, 'input_tokens')


    avg_output_tokens = col_mean(results_df, 'output_tokens')

    total_input_tokens = col_sum(results_df, 'input_tokens')


    total_output_tokens = col_sum(results_df, 'output_tokens')


    input_price = input_cost_dict_per_1M_tokens.get(model_name, 0.0)


    output_price = output_cost_dict_per_1M_tokens.get(model_name, input_price)


    total_input_cost = (total_input_tokens / 1_000_000) * input_price


    total_output_cost = (total_output_tokens / 1_000_000) * output_price


    total_cost = total_input_cost + total_output_cost

    
    avg_input_cost = total_input_cost / total if total else None
    avg_output_cost = total_output_cost / total if total else None
    avg_total_cost = total_cost / total if total else None


    rows.append({
        'Model_Name': model_name,
        'Accuracy': f"{accuracy*100:.2f}%",
        'Avg Latency (s)': f"{avg_latency:.2f}",
        'Total Latency (s)': f"{total_latency:.2f}",
        'Avg Input Tokens': f"{avg_input_tokens:.2f}",
        'Avg Output Tokens': f"{avg_output_tokens:.2f}",
        'Total Input Tokens': f"{total_input_tokens:.2f}",
        'Total Output Tokens': f"{total_output_tokens:.2f}",
        'Total Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Cost ($)': f"{total_cost:.6f}",
        'Avg Input Cost ($)': f"{avg_input_cost:.6f}" if avg_input_cost is not None else None,
        'Avg Output Cost ($)': f"{avg_output_cost:.6f}" if avg_output_cost is not None else None,
        'Avg Total Cost ($)': f"{avg_total_cost:.6f}" if avg_total_cost is not None else None,
        'Samples': total
    })
if rows:
    df_analysis = pd.DataFrame(rows).set_index('Model_Name')
    pd.options.display.float_format = '{:,.6f}'.format
    display(df_analysis)
else:
    print('No results found for requested models.')

Unnamed: 0_level_0,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Total Input Tokens,Total Output Tokens,Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Input Cost ($),Avg Output Cost ($),Avg Total Cost ($),Samples
Model_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Qwen2.5-Math-1.5B-Instruct,67.80%,14.11,7057.36,90.88,554.12,45440.0,277060.0,0.004544,0.027706,0.03225,9e-06,5.5e-05,6.4e-05,500
Qwen2.5-Math-7B-Instruct,76.40%,18.22,9110.04,90.88,578.21,45440.0,289105.0,0.013632,0.086732,0.100364,2.7e-05,0.000173,0.000201,500


### LLM Analysis

In [None]:
# Analysis table for selected SLM models
from pathlib import Path
models_to_report = ["Gemini-2.5-Flash", "Gemini-2.5-Pro", "Gemini-2.5-Flash-Lite"]
rows = []

for model_name, folder in llm_folders.items():

    results_path = f"{folder}/results_llm.json"

    if results_path is None:
        print(f'Warning: results_llm.json not found for {model_name} (tried {results_path})')
        continue


    with open(results_path, 'r', encoding='utf-8') as f:
        data = json.load(f)


    results_list = data.get('results', [])

    if not results_list:
        print(f'No result entries for {model_name} in {results_path}')
        continue


    results_df = pd.DataFrame(results_list)


    total = len(results_df)

    if 'is_correct' in results_df.columns:
        accuracy = float(results_df['is_correct'].astype(float).mean())
    else:
        accuracy = None



    avg_latency = col_mean(results_df, 'latency_total')

    total_latency = float(results_df['latency_total'].sum()) if 'latency_total' in results_df.columns else None

    avg_input_tokens = col_mean(results_df, 'input_tokens')


    avg_output_tokens = col_mean(results_df, 'output_tokens') + col_mean(results_df, 'thinking_tokens')

    total_input_tokens = col_sum(results_df, 'input_tokens')

    # Include thinking_tokens
    total_output_tokens = col_sum(results_df, 'output_tokens') + col_sum(results_df, 'thinking_tokens')


    input_price = input_cost_dict_per_1M_tokens.get(model_name, 0.0)


    output_price = output_cost_dict_per_1M_tokens.get(model_name, input_price)


    total_input_cost = (total_input_tokens / 1_000_000) * input_price


    total_output_cost = (total_output_tokens / 1_000_000) * output_price


    total_cost = total_input_cost + total_output_cost

    
    avg_input_cost = total_input_cost / total if total else None
    avg_output_cost = total_output_cost / total if total else None
    avg_total_cost = total_cost / total if total else None


    rows.append({
        'Model_Name': model_name,
        'Accuracy': f"{accuracy*100:.2f}%",
        'Avg Latency (s)': f"{avg_latency:.2f}",
        'Total Latency (s)': f"{total_latency:.2f}",
        'Avg Input Tokens': f"{avg_input_tokens:.2f}",
        'Avg Output Tokens': f"{avg_output_tokens:.2f}",
        'Total Input Tokens': f"{total_input_tokens:.2f}",
        'Total Output Tokens': f"{total_output_tokens:.2f}",
        'Total Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Cost ($)': f"{total_cost:.6f}",
        'Avg Input Cost ($)': f"{avg_input_cost:.6f}" if avg_input_cost is not None else None,
        'Avg Output Cost ($)': f"{avg_output_cost:.6f}" if avg_output_cost is not None else None,
        'Avg Total Cost ($)': f"{avg_total_cost:.6f}" if avg_total_cost is not None else None,
        'Samples': total
    })
if rows:
    df_analysis = pd.DataFrame(rows).set_index('Model_Name')
    pd.options.display.float_format = '{:,.6f}'.format
    display(df_analysis)
else:
    print('No results found for requested models.')

Unnamed: 0_level_0,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Total Input Tokens,Total Output Tokens,Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Input Cost ($),Avg Output Cost ($),Avg Total Cost ($),Samples
Model_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Gemini-2.5-Flash,89.20%,13.73,6864.09,117.99,2130.04,58996.0,1065018.0,0.017699,2.662545,2.680244,3.5e-05,0.005325,0.00536,500
Gemini-2.5-Flash-Lite,85.60%,3.72,1858.45,118.89,1367.53,59444.0,683765.0,0.005944,0.273506,0.27945,1.2e-05,0.000547,0.000559,500
Gemini-2.5-Pro,88.80%,18.66,9330.42,118.03,2695.94,59014.0,1347971.0,0.073767,13.47971,13.553478,0.000148,0.026959,0.027107,500


## Delegation Cost Analysis

In [50]:
def compute_model_table(df, mask, model_name_for_pricing):
    dfm = df[mask].copy()
    n = len(dfm)
    if n == 0:
        return None

    accuracy = None
    if 'is_correct' in dfm.columns:
        try:
            accuracy = float(dfm['is_correct'].astype(float).mean())
        except Exception:
            accuracy = None

    avg_latency = col_mean(dfm, 'latency_total')
    total_latency = col_sum(dfm, 'latency_total')

    avg_input = col_mean(dfm, 'input_tokens')
    avg_output = col_mean(dfm, 'output_tokens')
    avg_thinking = col_mean(dfm, 'thinking_tokens')

    total_input = col_sum(dfm, 'input_tokens')
    total_output = col_sum(dfm, 'output_tokens')
    total_thinking = col_sum(dfm, 'thinking_tokens')

    # For LLM-style models we treat thinking_tokens as part of output when charging
    combined_output_tokens = total_output + total_thinking

    input_price = input_cost_dict_per_1M_tokens.get(model_name_for_pricing, 0.0)
    output_price = output_cost_dict_per_1M_tokens.get(model_name_for_pricing, input_price)

    total_input_cost = (total_input / 1_000_000) * input_price
    total_output_cost = (combined_output_tokens / 1_000_000) * output_price
    total_cost = total_input_cost + total_output_cost

    row = {
        'Samples': n,
        'Model': model_name_for_pricing,
        'Accuracy': f"{accuracy*100:.2f}%",
        'Avg Latency (s)': f"{avg_latency:.2f}",
        'Total Latency (s)': f"{total_latency:.2f}",
        'Avg Input Tokens': f"{avg_input:.2f}",
        'Avg Output Tokens': f"{avg_output:.2f}",
        'Avg Thinking Tokens': f"{avg_thinking:.2f}",
        'Total Input Tokens': f"{total_input:.2f}",
        'Total Output Tokens': f"{total_output:.2f}",
        'Total Thinking Tokens': f"{total_thinking:.2f}",
        'Combined Output Tokens (output+thinking)': f"{combined_output_tokens:.2f}",
        'Total Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Cost ($)': f"{total_cost:.6f}",
        'Avg Total Cost ($)': f"{(total_cost / n) if n else None:.6f}",
    }
    return pd.Series(row)

In [51]:
folder = delegation_folders["Delegation Model"]
results_dir = f"{folder}/results_router.json"
with open(results_dir, "r") as f:
    data = json.load(f)

results = pd.DataFrame(data["results"])

orchestrator_model = "Gemini-2.5-Flash-Lite"
slm_model = "Qwen2.5-Math-1.5B-Instruct"
llm_model = "Gemini-2.5-Flash"

slm_mask = results['hand_off_model'].astype(str).str.contains('SLM', case=False, na=False)
llm_mask = results['hand_off_model'].astype(str).str.contains('LLM', case=False, na=False)

In [52]:
# Table A: SLM hand-offs
print('\nTable A: Hand-off -> SLM (SLM Math Expert)')
table_a = compute_model_table(results, slm_mask, slm_model)
if table_a is not None:
    display_df_a = pd.DataFrame([table_a])
    display(display_df_a)
else:
    print('No SLM hand-off rows found')

# Table B: LLM hand-offs
print('\nTable B: Hand-off -> LLM (LLM Math Expert)')
table_b = compute_model_table(results, llm_mask, llm_model)
if table_b is not None:
    display_df_b = pd.DataFrame([table_b])
    display(display_df_b)
else:
    print('No LLM hand-off rows found')


Table A: Hand-off -> SLM (SLM Math Expert)


Unnamed: 0,Samples,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,252,Qwen2.5-Math-1.5B-Instruct,82.94%,5.16,1300.0,142.31,400.2,0.0,35862.0,100851.0,0.0,100851.0,0.003586,0.010085,0.013671,5.4e-05



Table B: Hand-off -> LLM (LLM Math Expert)


Unnamed: 0,Samples,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,248,Gemini-2.5-Flash,87.50%,17.68,4383.0,183.06,2591.04,1362.83,45399.0,642579.0,337983.0,980562.0,0.01362,2.451405,2.465025,0.00994


In [54]:
def compute_orchestrator_table(df, orchestrator_model_name):
    # Use planning_* tokens
    dfp = df.copy()
    n = len(dfp)
    if n == 0:
        return None

    total_planning_input = col_sum(dfp, 'planning_input_tokens')
    total_planning_output = col_sum(dfp, 'planning_output_tokens')
    total_planning_thinking = col_sum(dfp, 'planning_thinking_tokens')

    input_price = input_cost_dict_per_1M_tokens.get(orchestrator_model_name, 0.0)
    output_price = output_cost_dict_per_1M_tokens.get(orchestrator_model_name, input_price)

    total_input_cost = (total_planning_input / 1_000_000) * input_price
    total_output_cost = ((total_planning_output + total_planning_thinking) / 1_000_000) * output_price
    total_cost = total_input_cost + total_output_cost

    row = {
        'Samples': f"{n:.0f}",
        'Avg Planning Input Tokens': f"{col_mean(dfp, 'planning_input_tokens'):.2f}",
        'Avg Planning Output Tokens': f"{col_mean(dfp, 'planning_output_tokens'):.2f}",
        'Avg Planning Thinking Tokens': f"{col_mean(dfp, 'planning_thinking_tokens'):.2f}",
        'Total Planning Input Tokens': f"{total_planning_input:.2f}",
        'Total Planning Output Tokens':f"{total_planning_output:.2f}",
        'Total Planning Thinking Tokens': f"{total_planning_thinking:.2f}",
        'Total Planning Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Planning Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Planning Cost ($)': f"{total_cost:.6f}",
        'Avg Planning Cost per Sample ($)': f"{(total_cost / n) if n else None:.6f}",
    }
    return pd.Series(row)


In [55]:
# Table C: Orchestrator planning tokens
print('\nTable C: Orchestrator planning tokens (planning_input_tokens / planning_output_tokens)')
table_c = compute_orchestrator_table(results, orchestrator_model)
if table_c is not None:
    display_df_c = pd.DataFrame([table_c])
    display(display_df_c)
else:
    print('No orchestrator planning data found')


Table C: Orchestrator planning tokens (planning_input_tokens / planning_output_tokens)


Unnamed: 0,Samples,Avg Planning Input Tokens,Avg Planning Output Tokens,Avg Planning Thinking Tokens,Total Planning Input Tokens,Total Planning Output Tokens,Total Planning Thinking Tokens,Total Planning Input Cost ($),Total Planning Output Cost ($),Total Planning Cost ($),Avg Planning Cost per Sample ($)
0,500,226.89,295.92,278.92,113444.0,147961.0,139461.0,0.011344,0.114969,0.126313,0.000253


In [58]:
print("Total Cost of this Pipeline: ")
total_cost =display_df_a['Total Cost ($)'].astype(float).sum() + display_df_b['Total Cost ($)'].astype(float).sum() + display_df_c['Total Planning Cost ($)'].astype(float).sum()
print(f"${total_cost:.6f}")
print(f"The cost of Gemini-2.5-Flash takes {display_df_b['Total Cost ($)'].astype(float).sum()/total_cost*100:.2f}%")

Total Cost of this Pipeline: 
$2.605009
The cost of Gemini-2.5-Flash takes 94.63%
