# Project Set Up

In [23]:
import sys
import os
current_dir = os.getcwd()
from pathlib import Path
sys.path.append(str(Path(current_dir).parent))

In [24]:
from tools.data_loader import read_data
import json
import pandas as pd
import matplotlib.pyplot as plt

In [25]:

# List of result folders
slm_folders = {
    "Qwen2.5-Math-1.5B-Instruct": "results_Qwen2.5-Math-1.5B-Instruct_500samples_math_500path",
    "Qwen2.5-Math-7B-Instruct": "results_Qwen2.5-Math-7B-Instruct_500samples_math_500path",
}
llm_folders = {
    "Gemini-2.5-Flash": "results_Gemini-2.5-Flash_500samples_math_500_path",
    "Gemini-2.5-Flash-Lite": "results_Gemini-2.5-Flash-Lite_500samples_math_500_path",
    "Gemini-2.5-Pro": "results_Gemini-2.5-Pro_500samples_math_500_path"
}

delegation_folders = {
    "Delegation Model": "results_Gemini-2.5-Flash_Qwen-1.5_500samples_math_500_path"
}

# Cost Analysis

In [26]:
input_cost_dict_per_1M_tokens = {
    "Gemini-2.5-Flash": 0.30,
    "Gemini-2.5-Flash-Lite": 0.10,
    "Gemini-2.5-Pro": 1.25,
    "Qwen2.5-Math-1.5B-Instruct": 0.10,
    "Qwen2.5-Math-7B-Instruct": 0.30,
}

output_cost_dict_per_1M_tokens = {
    "Gemini-2.5-Flash": 2.50,
    "Gemini-2.5-Flash-Lite": 0.40,
    "Gemini-2.5-Pro": 10,
    "Qwen2.5-Math-1.5B-Instruct": 0.10,
    "Qwen2.5-Math-7B-Instruct": 0.30,
}

In [27]:
# Cost analysis plan:
# 1. Extract cost data from results
# 2. Calculate total cost depending on the tokens
# 3. With one model task, calculate the total cost and average cost per query.

## One Model Analysis Table

In [126]:
def col_sum(df, col):
    return int(df[col].sum()) if col in df.columns else 0
def col_mean(df, col):
    return float(df[col].mean()) if col in df.columns and len(df)>0 else None

### SLM Analysis

In [127]:
# Analysis table for selected SLM models
from pathlib import Path
models_to_report = ["Qwen2.5-Math-1.5B-Instruct", "Qwen2.5-Math-7B-Instruct"]
rows = []


for model_name, folder in slm_folders.items():

    results_path = f"{folder}/results_slm.json"

    if results_path is None:
        print(f'Warning: results_slm.json not found for {model_name} (tried {results_path})')
        continue


    with open(results_path, 'r', encoding='utf-8') as f:
        data = json.load(f)


    results_list = data.get('results', [])

    if not results_list:
        print(f'No result entries for {model_name} in {results_path}')
        continue


    results_df = pd.DataFrame(results_list)


    total = len(results_df)

    if 'is_correct' in results_df.columns:
        accuracy = float(results_df['is_correct'].astype(float).mean())
    else:
        accuracy = None



    avg_latency = col_mean(results_df, 'latency_total')

    total_latency = float(results_df['latency_total'].sum()) if 'latency_total' in results_df.columns else None

    avg_input_tokens = col_mean(results_df, 'input_tokens')


    avg_output_tokens = col_mean(results_df, 'output_tokens')

    total_input_tokens = col_sum(results_df, 'input_tokens')


    total_output_tokens = col_sum(results_df, 'output_tokens')


    input_price = input_cost_dict_per_1M_tokens.get(model_name, 0.0)


    output_price = output_cost_dict_per_1M_tokens.get(model_name, input_price)


    total_input_cost = (total_input_tokens / 1_000_000) * input_price


    total_output_cost = (total_output_tokens / 1_000_000) * output_price


    total_cost = total_input_cost + total_output_cost

    
    avg_input_cost = total_input_cost / total if total else None
    avg_output_cost = total_output_cost / total if total else None
    avg_total_cost = total_cost / total if total else None


    rows.append({
        'Model_Name': model_name,
        'Accuracy': f"{accuracy*100:.2f}%",
        'Avg Latency (s)': f"{avg_latency:.2f}",
        'Total Latency (s)': f"{total_latency:.2f}",
        'Avg Input Tokens': f"{avg_input_tokens:.2f}",
        'Avg Output Tokens': f"{avg_output_tokens:.2f}",
        'Total Input Tokens': f"{total_input_tokens:.2f}",
        'Total Output Tokens': f"{total_output_tokens:.2f}",
        'Total Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Cost ($)': f"{total_cost:.6f}",
        'Avg Input Cost ($)': f"{avg_input_cost:.6f}" if avg_input_cost is not None else None,
        'Avg Output Cost ($)': f"{avg_output_cost:.6f}" if avg_output_cost is not None else None,
        'Avg Total Cost ($)': f"{avg_total_cost:.6f}" if avg_total_cost is not None else None,
        'Samples': total
    })
if rows:
    df_analysis = pd.DataFrame(rows).set_index('Model_Name')
    pd.options.display.float_format = '{:,.6f}'.format
    display(df_analysis)
else:
    print('No results found for requested models.')

Unnamed: 0_level_0,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Total Input Tokens,Total Output Tokens,Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Input Cost ($),Avg Output Cost ($),Avg Total Cost ($),Samples
Model_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Qwen2.5-Math-1.5B-Instruct,67.80%,14.11,7057.36,90.88,554.12,45440.0,277060.0,0.004544,0.027706,0.03225,9e-06,5.5e-05,6.4e-05,500
Qwen2.5-Math-7B-Instruct,76.40%,18.22,9110.04,90.88,578.21,45440.0,289105.0,0.013632,0.086732,0.100364,2.7e-05,0.000173,0.000201,500


### LLM Analysis

In [128]:
# Analysis table for selected SLM models
from pathlib import Path
models_to_report = ["Gemini-2.5-Flash", "Gemini-2.5-Pro", "Gemini-2.5-Flash-Lite"]
rows = []

for model_name, folder in llm_folders.items():

    results_path = f"{folder}/results_llm.json"

    if results_path is None:
        print(f'Warning: results_llm.json not found for {model_name} (tried {results_path})')
        continue


    with open(results_path, 'r', encoding='utf-8') as f:
        data = json.load(f)


    results_list = data.get('results', [])

    if not results_list:
        print(f'No result entries for {model_name} in {results_path}')
        continue


    results_df = pd.DataFrame(results_list)


    total = len(results_df)

    if 'is_correct' in results_df.columns:
        accuracy = float(results_df['is_correct'].astype(float).mean())
    else:
        accuracy = None



    avg_latency = col_mean(results_df, 'latency_total')

    total_latency = float(results_df['latency_total'].sum()) if 'latency_total' in results_df.columns else None

    avg_input_tokens = col_mean(results_df, 'input_tokens')


    avg_output_tokens = col_mean(results_df, 'output_tokens') + col_mean(results_df, 'thinking_tokens')

    total_input_tokens = col_sum(results_df, 'input_tokens')

    # Include thinking_tokens
    total_output_tokens = col_sum(results_df, 'output_tokens') + col_sum(results_df, 'thinking_tokens')


    input_price = input_cost_dict_per_1M_tokens.get(model_name, 0.0)


    output_price = output_cost_dict_per_1M_tokens.get(model_name, input_price)


    total_input_cost = (total_input_tokens / 1_000_000) * input_price


    total_output_cost = (total_output_tokens / 1_000_000) * output_price


    total_cost = total_input_cost + total_output_cost

    
    avg_input_cost = total_input_cost / total if total else None
    avg_output_cost = total_output_cost / total if total else None
    avg_total_cost = total_cost / total if total else None


    rows.append({
        'Model_Name': model_name,
        'Accuracy': f"{accuracy*100:.2f}%",
        'Avg Latency (s)': f"{avg_latency:.2f}",
        'Total Latency (s)': f"{total_latency:.2f}",
        'Avg Input Tokens': f"{avg_input_tokens:.2f}",
        'Avg Output Tokens': f"{avg_output_tokens:.2f}",
        'Total Input Tokens': f"{total_input_tokens:.2f}",
        'Total Output Tokens': f"{total_output_tokens:.2f}",
        'Total Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Cost ($)': f"{total_cost:.6f}",
        'Avg Input Cost ($)': f"{avg_input_cost:.6f}" if avg_input_cost is not None else None,
        'Avg Output Cost ($)': f"{avg_output_cost:.6f}" if avg_output_cost is not None else None,
        'Avg Total Cost ($)': f"{avg_total_cost:.6f}" if avg_total_cost is not None else None,
        'Samples': total
    })
if rows:
    df_analysis = pd.DataFrame(rows).set_index('Model_Name')
    pd.options.display.float_format = '{:,.6f}'.format
    display(df_analysis)
else:
    print('No results found for requested models.')

Unnamed: 0_level_0,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Total Input Tokens,Total Output Tokens,Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Input Cost ($),Avg Output Cost ($),Avg Total Cost ($),Samples
Model_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Gemini-2.5-Flash,89.20%,13.73,6864.09,117.99,2130.04,58996.0,1065018.0,0.017699,2.662545,2.680244,3.5e-05,0.005325,0.00536,500
Gemini-2.5-Flash-Lite,85.60%,3.72,1858.45,118.89,1367.53,59444.0,683765.0,0.005944,0.273506,0.27945,1.2e-05,0.000547,0.000559,500
Gemini-2.5-Pro,88.80%,18.66,9330.42,118.03,2695.94,59014.0,1347971.0,0.073767,13.47971,13.553478,0.000148,0.026959,0.027107,500


## Delegation Cost Analysis

In [122]:
def compute_model_table(df, mask, model_name_for_pricing):
    dfm = df[mask].copy()
    n = len(dfm)
    if n == 0:
        return None

    accuracy = None
    if 'is_correct' in dfm.columns:
        try:
            accuracy = float(dfm['is_correct'].astype(float).mean())
        except Exception:
            accuracy = None

    avg_latency = col_mean(dfm, 'latency_total')
    total_latency = col_sum(dfm, 'latency_total')

    avg_input = col_mean(dfm, 'input_tokens')
    avg_output = col_mean(dfm, 'output_tokens')
    avg_thinking = col_mean(dfm, 'thinking_tokens')

    total_input = col_sum(dfm, 'input_tokens')
    total_output = col_sum(dfm, 'output_tokens')
    total_thinking = col_sum(dfm, 'thinking_tokens')

    # For LLM-style models we treat thinking_tokens as part of output when charging
    combined_output_tokens = total_output + total_thinking

    input_price = input_cost_dict_per_1M_tokens.get(model_name_for_pricing, 0.0)
    output_price = output_cost_dict_per_1M_tokens.get(model_name_for_pricing, input_price)

    total_input_cost = (total_input / 1_000_000) * input_price
    total_output_cost = (combined_output_tokens / 1_000_000) * output_price
    total_cost = total_input_cost + total_output_cost

    row = {
        'Samples': n,
        'Number of Correct': int(dfm['is_correct'].astype(float).sum()),
        'Model': model_name_for_pricing,
        'Accuracy': f"{accuracy*100:.2f}%",
        'Avg Latency (s)': f"{avg_latency:.2f}",
        'Total Latency (s)': f"{total_latency:.2f}",
        'Avg Input Tokens': f"{avg_input:.2f}",
        'Avg Output Tokens': f"{avg_output:.2f}",
        'Avg Thinking Tokens': f"{avg_thinking:.2f}",
        'Total Input Tokens': f"{total_input:.2f}",
        'Total Output Tokens': f"{total_output:.2f}",
        'Total Thinking Tokens': f"{total_thinking:.2f}",
        'Combined Output Tokens (output+thinking)': f"{combined_output_tokens:.2f}",
        'Total Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Cost ($)': f"{total_cost:.6f}",
        'Avg Total Cost ($)': f"{(total_cost / n) if n else None:.6f}",
    }
    return pd.Series(row)

In [59]:
def compute_orchestrator_table(df, orchestrator_model_name):
    # Use planning_* tokens
    dfp = df.copy()
    n = len(dfp)
    if n == 0:
        return None

    accuracy = None
    if 'is_correct' in dfp.columns:
        try:
            accuracy = float(dfp['is_correct'].astype(float).mean())
        except Exception:
            accuracy = None

    total_planning_input = col_sum(dfp, 'planning_input_tokens')
    total_planning_output = col_sum(dfp, 'planning_output_tokens')
    total_planning_thinking = col_sum(dfp, 'planning_thinking_tokens')

    input_price = input_cost_dict_per_1M_tokens.get(orchestrator_model_name, 0.0)
    output_price = output_cost_dict_per_1M_tokens.get(orchestrator_model_name, input_price)

    total_input_cost = (total_planning_input / 1_000_000) * input_price
    total_output_cost = ((total_planning_output + total_planning_thinking) / 1_000_000) * output_price
    total_cost = total_input_cost + total_output_cost

    row = {
        'Samples': f"{n:.0f}",
        'Accuracy': f"{accuracy*100:.2f}%",
        'Avg Planning Input Tokens': f"{col_mean(dfp, 'planning_input_tokens'):.2f}",
        'Avg Planning Output Tokens': f"{col_mean(dfp, 'planning_output_tokens'):.2f}",
        'Avg Planning Thinking Tokens': f"{col_mean(dfp, 'planning_thinking_tokens'):.2f}",
        'Total Planning Input Tokens': f"{total_planning_input:.2f}",
        'Total Planning Output Tokens':f"{total_planning_output:.2f}",
        'Total Planning Thinking Tokens': f"{total_planning_thinking:.2f}",
        'Total Planning Input Cost ($)': f"{total_input_cost:.6f}",
        'Total Planning Output Cost ($)': f"{total_output_cost:.6f}",
        'Total Planning Cost ($)': f"{total_cost:.6f}",
        'Avg Planning Cost per Sample ($)': f"{(total_cost / n) if n else None:.6f}",
    }
    return pd.Series(row)


In [60]:
folder = delegation_folders["Delegation Model"]
results_dir = f"{folder}/results_router.json"
with open(results_dir, "r") as f:
    data = json.load(f)

results = pd.DataFrame(data["results"])

orchestrator_model = "Gemini-2.5-Flash-Lite"
slm_model = "Qwen2.5-Math-1.5B-Instruct"
llm_model = "Gemini-2.5-Flash"

slm_mask = results['hand_off_model'].astype(str).str.contains('SLM', case=False, na=False)
llm_mask = results['hand_off_model'].astype(str).str.contains('LLM', case=False, na=False)

In [61]:
# Table A: SLM hand-offs
print('\nTable A: Hand-off -> SLM (SLM Math Expert)')
table_a = compute_model_table(results, slm_mask, slm_model)
if table_a is not None:
    display_df_a = pd.DataFrame([table_a])
    display(display_df_a)
else:
    print('No SLM hand-off rows found')

# Table B: LLM hand-offs
print('\nTable B: Hand-off -> LLM (LLM Math Expert)')
table_b = compute_model_table(results, llm_mask, llm_model)
if table_b is not None:
    display_df_b = pd.DataFrame([table_b])
    display(display_df_b)
else:
    print('No LLM hand-off rows found')


Table A: Hand-off -> SLM (SLM Math Expert)


Unnamed: 0,Samples,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,252,Qwen2.5-Math-1.5B-Instruct,82.94%,5.16,1300.0,142.31,400.2,0.0,35862.0,100851.0,0.0,100851.0,0.003586,0.010085,0.013671,5.4e-05



Table B: Hand-off -> LLM (LLM Math Expert)


Unnamed: 0,Samples,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,248,Gemini-2.5-Flash,87.50%,17.68,4383.0,183.06,2591.04,1362.83,45399.0,642579.0,337983.0,980562.0,0.01362,2.451405,2.465025,0.00994


In [62]:
# Table C: Orchestrator planning tokens
print('\nTable C: Orchestrator planning tokens (planning_input_tokens / planning_output_tokens)')
table_c = compute_orchestrator_table(results, orchestrator_model)
if table_c is not None:
    display_df_c = pd.DataFrame([table_c])
    display(display_df_c)
else:
    print('No orchestrator planning data found')


Table C: Orchestrator planning tokens (planning_input_tokens / planning_output_tokens)


Unnamed: 0,Samples,Accuracy,Avg Planning Input Tokens,Avg Planning Output Tokens,Avg Planning Thinking Tokens,Total Planning Input Tokens,Total Planning Output Tokens,Total Planning Thinking Tokens,Total Planning Input Cost ($),Total Planning Output Cost ($),Total Planning Cost ($),Avg Planning Cost per Sample ($)
0,500,85.20%,226.89,295.92,278.92,113444.0,147961.0,139461.0,0.011344,0.114969,0.126313,0.000253


In [63]:
print("Total Cost of this Pipeline: ")
total_cost =display_df_a['Total Cost ($)'].astype(float).sum() + display_df_b['Total Cost ($)'].astype(float).sum() 
print(f"${total_cost:.6f}")
print(f"The cost of Gemini-2.5-Flash takes {display_df_b['Total Cost ($)'].astype(float).sum()/total_cost*100:.2f}%")

Total Cost of this Pipeline: 
$2.478696
The cost of Gemini-2.5-Flash takes 99.45%


## Delegation Cost Analysis but a little Cheat (Take examples from other model when they are handing off to LLM instead of using their)

In [37]:
folder = delegation_folders["Delegation Model"]
results_dir = f"{folder}/results_router.json"
with open(results_dir, "r") as f:
    data = json.load(f)

results = pd.DataFrame(data["results"])

In [38]:
ref_folder = llm_folders["Gemini-2.5-Flash-Lite"]
reference_dir = f"{ref_folder}/results_llm.json"

with open(reference_dir, 'r', encoding='utf-8') as f:
    data = json.load(f)

referece_list = data.get('results', [])

In [39]:
references = pd.DataFrame(referece_list)

In [40]:
# Find duplicate questions in references
duplicates = references[references.duplicated(subset=['question'], keep=False)]
print(len(duplicates))

0


In [41]:
# Find duplicate questions in results
duplicates_results = results[results.duplicated(subset=['question'], keep=False)]
print(len(duplicates_results))

0


In [42]:
# For each row in resulst, if the had_off_model is LLM, find the corresponding reference row by comparing equal in questions 
# Replace the answer in results with the answer in references
total = 0
for index, row in results.iterrows():
    if 'LLM' in str(row['hand_off_model']):
        question = row['question']
        # Find the reference row with the same question
        ref_row = references[references['question'] == question]
        if not ref_row.empty:
            total +=1
        results.at[index, 'prediction'] = ref_row['prediction'].values[0]
        results.at[index, 'extract_answer'] = ref_row['extract_answer'].values[0]
        results.at[index, 'is_correct'] = ref_row['is_correct'].values[0]
        results.at[index, 'thinking_tokens'] = ref_row['thinking_tokens'].values[0]
        results.at[index, 'output_tokens'] = ref_row['output_tokens'].values[0]

print(f'Total replaced answers from reference: {total}')
        

Total replaced answers from reference: 248


In [43]:
total = 0
for index, row in results.iterrows():
    if 'SLM' in str(row['hand_off_model']):
        question = row['question']
        # Find the reference row with the same question
        ref_row = references[references['question'] == question]
        if not ref_row.empty:
            total +=1
        results.at[index, 'prediction'] = ref_row['prediction'].values[0]
        results.at[index, 'extract_answer'] = ref_row['extract_answer'].values[0]
        results.at[index, 'is_correct'] = ref_row['is_correct'].values[0]
        results.at[index, 'thinking_tokens'] = ref_row['thinking_tokens'].values[0]
        results.at[index, 'output_tokens'] = ref_row['output_tokens'].values[0]

print(f'Total replaced answers from reference: {total}')

Total replaced answers from reference: 252


In [44]:
# Table A: SLM hand-offs
print('\nTable A: Hand-off -> SLM (SLM Math Expert)')
table_a = compute_model_table(results, slm_mask, slm_model)
if table_a is not None:
    display_df_a = pd.DataFrame([table_a])
    display(display_df_a)
else:
    print('No SLM hand-off rows found')


Table A: Hand-off -> SLM (SLM Math Expert)


Unnamed: 0,Samples,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,252,Qwen2.5-Math-1.5B-Instruct,90.48%,5.16,1300.0,142.31,766.44,0.0,35862.0,193142.0,0.0,193142.0,0.003586,0.019314,0.0229,9.1e-05


In [45]:
# Table A: SLM hand-offs
print('\nTable A: Hand-off -> SLM (SLM Math Expert)')
table_a = compute_model_table(results, slm_mask, slm_model)
if table_a is not None:
    display_df_a = pd.DataFrame([table_a])
    display(display_df_a)
else:
    print('No SLM hand-off rows found')


Table A: Hand-off -> SLM (SLM Math Expert)


Unnamed: 0,Samples,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,252,Qwen2.5-Math-1.5B-Instruct,90.48%,5.16,1300.0,142.31,766.44,0.0,35862.0,193142.0,0.0,193142.0,0.003586,0.019314,0.0229,9.1e-05


In [46]:
# Table B: LLM hand-offs
print('\nTable B: Hand-off -> LLM (LLM Math Expert)')
llm_model = "Gemini-2.5-Flash-Lite"
table_b = compute_model_table(results, llm_mask, llm_model)
if table_b is not None:
    display_df_b = pd.DataFrame([table_b])
    display(display_df_b)
else:
    print('No LLM hand-off rows found')


Table B: Hand-off -> LLM (LLM Math Expert)


Unnamed: 0,Samples,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,248,Gemini-2.5-Flash-Lite,80.65%,17.68,4383.0,183.06,1978.32,0.0,45399.0,490623.0,0.0,490623.0,0.00454,0.196249,0.200789,0.00081


In [47]:
# Table C: Orchestrator planning tokens
print('\nTable C: Orchestrator planning tokens (planning_input_tokens / planning_output_tokens)')
table_c = compute_orchestrator_table(results, orchestrator_model)
if table_c is not None:
    display_df_c = pd.DataFrame([table_c])
    display(display_df_c)
else:
    print('No orchestrator planning data found')


Table C: Orchestrator planning tokens (planning_input_tokens / planning_output_tokens)


Unnamed: 0,Samples,Accuracy,Avg Planning Input Tokens,Avg Planning Output Tokens,Avg Planning Thinking Tokens,Total Planning Input Tokens,Total Planning Output Tokens,Total Planning Thinking Tokens,Total Planning Input Cost ($),Total Planning Output Cost ($),Total Planning Cost ($),Avg Planning Cost per Sample ($)
0,500,85.60%,226.89,295.92,278.92,113444.0,147961.0,139461.0,0.011344,0.114969,0.126313,0.000253


In [48]:
print("Total Cost of this Pipeline: ")
total_cost =display_df_a['Total Cost ($)'].astype(float).sum() + display_df_b['Total Cost ($)'].astype(float).sum()
print(f"${total_cost:.6f}")
print(f"The cost of Gemini-2.5-Flash Lite takes {display_df_b['Total Cost ($)'].astype(float).sum()/total_cost*100:.2f}%")

Total Cost of this Pipeline: 
$0.223689
The cost of Gemini-2.5-Flash Lite takes 89.76%


## Confidence Task Delegation Task

In [64]:
folder = delegation_folders["Delegation Model"]
results_dir = f"{folder}/results_router.json"
with open(results_dir, "r") as f:
    data = json.load(f)

results = pd.DataFrame(data["results"])

In [87]:
confidence_df=pd.read_csv("confidence_task_result_enhanced.csv")

In [88]:
confidence_df['check'].sum() / len(confidence_df)

np.float64(0.72)

In [91]:
slm_work = confidence_df[confidence_df['confidence_token'] == "<|c_math|>"]

In [92]:
slm_work

Unnamed: 0,pred,label,full_pred,full_label,question,confidence_token,pair,check,confidence_correct,problem_id,input_tokens,output_tokens
0,"\left(3, \frac{\pi}{2}\right)","\left( 3, \frac{\pi}{2} \right)","To convert the point \((0,3)\) from rectangul...",We have that $r = \sqrt{0^2 + 3^2} = 3.$ Also...,<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('\\left(3, \\frac{\\pi}{2}\\right)', '\\left(...",False,False,prob_0,67,349
2,\frac{14}{3},\frac{14}{3},To solve for \( f(-2) + f(-1) + f(0) \) where...,$f(-2)+f(-1)+f(0)=\frac{3(-2)-2}{-2-2}+\frac{3...,<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('\\frac{14}{3}', '\\frac{14}{3}')",True,True,prob_2,67,457
3,9,9,To determine the number of positive whole-num...,First prime factorize $196=2^2\cdot7^2$. The ...,<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('9', '9')",True,True,prob_3,34,453
5,42,42,To determine the perimeter of the regular hex...,The side length of the hexagon is equal to the...,<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('42', '42')",True,True,prob_5,61,279
6,27,27,To find the smallest positive perfect cube th...,The sum of three consecutive integers takes th...,<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('27', '27')",True,True,prob_6,39,360
...,...,...,...,...,...,...,...,...,...,...,...,...
488,36,\$36,"To find the original price of the shirt, we n...","If the shirt is $60\%$ off, it is currently $....",<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('36', '\\$36')",False,False,prob_488,59,263
489,15,15,To find the minimum value of the function \( ...,"Since $0 < p \le x \le 15,$ the absolute value...",<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('15', '15')",True,True,prob_489,84,450
492,y = -2x,y = -2x,"To solve the problem, we need to understand t...",Let $\mathbf{v} = \begin{pmatrix} x \\ y \end{...,<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('y = -2x', 'y = -2x')",True,True,prob_492,105,421
493,20,20,"To solve the problem, we need to expand the e...","First, we calculate $(\sqrt{2} + \sqrt{3})^2$:...",<|im_start|>system\r\nYou are helpful assistan...,<|c_math|>,"('20', '20')",True,True,prob_493,87,701


In [97]:
results.head(2)

Unnamed: 0,problem_id,subject,question,ground_truth,prediction,extract_answer,is_correct,latency_total,reason,input_tokens,thinking_tokens,output_tokens,confidence_token
0,prob_0,Precalculus,"Convert the point $(0,3)$ in rectangular coord...","\left( 3, \frac{\pi}{2} \right)",To convert a point from rectangular coordinate...,"(3, \frac{\pi}{2})",False,5.905849,STOP,137,0,505,<|c_math|>
1,prob_1,Intermediate Algebra,Define\n\[p = \sum_{k = 1}^\infty \frac{1}{k^2...,p - q,We are given two sums:\n\[p = \sum_{k = 1}^\in...,p-q,True,18.794235,STOP,193,1553,2571,<|u_math|>


In [99]:
# Loop through the slm_work
# Compare with results, if the question matches, turn the hand_off_model to SLM, answer to the confidence_df answer
for index, row in slm_work.iterrows():
    id = row['problem_id']
    question = row['question']
    full_pred = row['full_pred']
    pred = row['pred']
    check = row['check']
    input_token = row['input_tokens']
    output_token = row['output_tokens']
    # Find the result row with the same question
    res_row = results[results['problem_id'] == id]
    if not res_row.empty:
        results.at[res_row.index[0], 'prediction'] = full_pred
        results.at[res_row.index[0], 'extract_answer'] = pred
        results.at[res_row.index[0], 'is_correct'] = check
        results.at[res_row.index[0], 'input_tokens'] = input_token
        results.at[res_row.index[0], 'output_tokens'] = output_token
        results.at[res_row.index[0], 'thinking_tokens'] = 0


In [125]:
references

Unnamed: 0,problem_id,subject,question,ground_truth,prediction,extract_answer,is_correct,latency_total,reason,input_tokens,thinking_tokens,output_tokens
0,prob_0,Precalculus,"Convert the point $(0,3)$ in rectangular coord...","\left( 3, \frac{\pi}{2} \right)",To convert a point from rectangular coordinate...,"(3, \frac{\pi}{2})",False,2.166934,STOP,95,0,532
1,prob_1,Intermediate Algebra,Define\n\[p = \sum_{k = 1}^\infty \frac{1}{k^2...,p - q,Let the given sum be $S.$ We have\n\[S = \sum_...,p-q,True,3.528530,STOP,148,0,1249
2,prob_2,Algebra,"If $f(x) = \frac{3x-2}{x-2}$, what is the valu...",\frac{14}{3},We are given the function $f(x) = \frac{3x-2}{...,\frac{14}{3},True,1.989725,STOP,95,0,497
3,prob_3,Number Theory,How many positive whole-number divisors does 1...,9,To find the number of positive whole-number di...,9,True,2.880073,STOP,62,0,781
4,prob_4,Algebra,The results of a cross-country team's training...,\text{Evelyn},The problem asks us to determine which student...,Evelyn,False,3.891252,STOP,416,0,1217
...,...,...,...,...,...,...,...,...,...,...,...,...
495,prob_495,Intermediate Algebra,What is the domain of the function $f(x) = \fr...,"(2,12) \cup (12,102)",To find the domain of the function $f(x) = \fr...,"(2, 12) \cup (12, 102)",True,1.887390,STOP,100,0,518
496,prob_496,Intermediate Algebra,Let $z = 1+i$ and $w = \dfrac{3z+1}{5z+7}$. Fi...,\frac{5}{13},We are given $z = 1+i$ and $w = \dfrac{3z+1}{5...,\frac{5}{13},True,3.387243,STOP,78,0,1249
497,prob_497,Geometry,An equiangular octagon has four sides of lengt...,\frac{7}{2},Let the octagon be denoted by $ABCDEFGH$.\nSin...,\frac{7}{2},True,9.102789,STOP_RUN_2_TIME,94,0,4096
498,prob_498,Intermediate Algebra,A sequence $(a_n)$ is defined as follows:\n\[a...,-1,We are given the recurrence relation $a_{i+1} ...,-1,True,6.330208,STOP,109,0,3070


In [114]:
# For each row in resulst, if the confidence_token is "<|u_math|>", find the corresponding reference row by comparing equal in questions 
# Replace the answer in results with the answer in references
total = 0
for index, row in results.iterrows():
    if row['confidence_token'] == "<|u_math|>" or row['confidence_token'] == "No Confidence Token":
        question = row['question']
        # Find the reference row with the same question
        ref_row = references[references['question'] == question]
        if not ref_row.empty:
            total +=1
        results.at[index, 'prediction'] = ref_row['prediction'].values[0]
        results.at[index, 'extract_answer'] = ref_row['extract_answer'].values[0]
        results.at[index, 'is_correct'] = ref_row['is_correct'].values[0]
        results.at[index, 'thinking_tokens'] = ref_row['thinking_tokens'].values[0]
        results.at[index, 'output_tokens'] = ref_row['output_tokens'].values[0]

print(f'Total replaced answers from reference: {total}')
        

Total replaced answers from reference: 155


In [119]:
slm_mask = results['confidence_token'].astype(str) == "<|c_math|>"
llm_mask = results['confidence_token'].astype(str) != "<|c_math|>"

In [129]:
slm_model = "Qwen2.5-Math-1.5B-Instruct"
llm_model = "Gemini-2.5-Flash-Lite"

In [130]:
# Table A: SLM hand-offs
print('\nTable A: Hand-off -> SLM (SLM Math Expert)')
table_a = compute_model_table(results, slm_mask, slm_model)
if table_a is not None:
    display_df_a = pd.DataFrame([table_a])
    display(display_df_a)
else:
    print('No SLM hand-off rows found')

# Table B: LLM hand-offs
print('\nTable B: Hand-off -> LLM (LLM Math Expert)')
table_b = compute_model_table(results, llm_mask, llm_model)
if table_b is not None:
    display_df_b = pd.DataFrame([table_b])
    display(display_df_b)
else:
    print('No LLM hand-off rows found')


Table A: Hand-off -> SLM (SLM Math Expert)


Unnamed: 0,Samples,Number of Correct,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,345,298,Qwen2.5-Math-1.5B-Instruct,86.38%,9.09,3137.0,69.04,455.68,0.0,23818.0,157208.0,0.0,157208.0,0.002382,0.015721,0.018103,5.2e-05



Table B: Hand-off -> LLM (LLM Math Expert)


Unnamed: 0,Samples,Number of Correct,Model,Accuracy,Avg Latency (s),Total Latency (s),Avg Input Tokens,Avg Output Tokens,Avg Thinking Tokens,Total Input Tokens,Total Output Tokens,Total Thinking Tokens,Combined Output Tokens (output+thinking),Total Input Cost ($),Total Output Cost ($),Total Cost ($),Avg Total Cost ($)
0,155,110,Gemini-2.5-Flash-Lite,70.97%,16.43,2545.0,212.14,2188.31,0.0,32881.0,339188.0,0.0,339188.0,0.003288,0.135675,0.138963,0.000897


In [134]:
results['is_correct'].sum() / len(results)

np.float64(0.816)

In [131]:
print("Total Cost of this Pipeline: ")
total_cost =display_df_a['Total Cost ($)'].astype(float).sum() + display_df_b['Total Cost ($)'].astype(float).sum()
print(f"${total_cost:.6f}")
print(f"The cost of Gemini-2.5-Flash Lite takes {display_df_b['Total Cost ($)'].astype(float).sum()/total_cost*100:.2f}%")

Total Cost of this Pipeline: 
$0.157066
The cost of Gemini-2.5-Flash Lite takes 88.47%
