In [106]:
import pandas as pd

from src.model_wrapper import base_model

# Root directory of the evaluated benchmark runs; load_raw_csv appends
# '/<mode_file>/raw.csv' to this path.
base_path_results: str = '../../benchmark_results/clembench/evaluated_runs/H100'
# Names of the per-model-family result folders (presumably the sub-directories of
# base_path_results; 'llama_8B' is the one passed as mode_file below).
model_files: list[str] = ['llama_8B', 'llama_70B', 'mistral']
# Values of the 'model' column used to select the base and the final run of each family.
base_model_llama_8B: str = 'Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0'
final_model_llama_8B: str = 'llama3.1-8B-sft-e1-DFINAL_0.7K-steps-t0.0--llama3.1-8B-sft-e1-DFINAL_0.7K-steps-t0.0'
base_model_llama_70B: str = 'Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0--Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0'
final_model_llama_70B: str = 'llama3.1-70B-sft-e1-DFINAL_0.6K-steps-t0.0--llama3.1-70B-sft-e1-DFINAL_0.6K-steps-t0.0'
base_model_mistral: str = 'Unsloth-Mistral-Small-24B-Instruct-2501-t0.0--Unsloth-Mistral-Small-24B-Instruct-2501-t0.0'
# NOTE(review): identical to base_model_mistral -- looks like a copy-paste placeholder.
# Any base-vs-final comparison for mistral would compare the run with itself; confirm
# the name of the final mistral run and replace this value.
final_model_mistral: str = 'Unsloth-Mistral-Small-24B-Instruct-2501-t0.0--Unsloth-Mistral-Small-24B-Instruct-2501-t0.0'

# Identifier columns of the pivoted frame (same names as the pivot index in
# group_raw_csv); every column not listed here is treated as a metric.
std_cols: list = ['game', 'model', 'experiment', 'episode']

In [97]:
def load_raw_csv(mode_file: str, base_model: str, final_model: str, game: str) -> pd.DataFrame:
    """Load one result folder's ``raw.csv`` and return it pivoted to one row per episode.

    The long-format file (one row per game/model/experiment/episode/metric) is read
    from ``<base_path_results>/<mode_file>/raw.csv``, pivoted so that every metric
    becomes its own column, and then reduced to the metric columns that occur for
    ``game`` in the file.

    Args:
        mode_file: Name of the folder below ``base_path_results`` that holds ``raw.csv``.
        base_model: Forwarded to ``group_raw_csv`` (which currently does not use it).
        final_model: Forwarded to ``group_raw_csv`` (which currently does not use it).
        game: Game whose metric names decide which metric columns are kept.

    Returns:
        The pivoted frame with the identifier columns plus the metrics of ``game``.
        Only columns are restricted; rows belonging to other games in the file
        are not filtered out. Metric values are strings at this point.
    """
    path: str = f'{base_path_results}/{mode_file}/raw.csv'
    # errors='ignore': the 'Unnamed: 0' column only exists when the CSV was saved
    # together with its index, so its absence must not abort the load.
    raw_df = pd.read_csv(path).drop(columns=['Unnamed: 0'], errors='ignore')
    raw_df = raw_df.astype({'metric': 'string', 'value': 'string'})

    # Metric names have to be collected before pivoting, while 'metric' is still a column.
    metrics_for_game: list = list(raw_df.loc[raw_df['game'] == game, 'metric'].unique())
    df_pivoted = group_raw_csv(raw_df, base_model, final_model)

    return reduce_to_metrics_for_game(df_pivoted, metrics_for_game)

def group_raw_csv(df: pd.DataFrame, bm, fm) -> pd.DataFrame:
    """Reshape the long-format results into one row per episode.

    Each distinct entry of the 'metric' column becomes a column holding the matching
    'value'; rows are identified by game, model, experiment and episode.

    Args:
        df: Long-format frame with the columns 'game', 'model', 'experiment',
            'episode', 'metric' and 'value'.
        bm: Not used by this function.
        fm: Not used by this function.

    Returns:
        The wide frame with the four identifier levels as ordinary columns.
    """
    episode_keys = ['game', 'model', 'experiment', 'episode']
    wide = df.pivot(index=episode_keys, columns='metric', values='value')
    # reset_index moves the identifier levels out of the index again.
    return wide.reset_index()

def reduce_to_metrics_for_game(df: pd.DataFrame, columns_to_keep) -> pd.DataFrame:
    """Return a copy of ``df`` limited to the identifier columns and the given metrics.

    Args:
        df: Pivoted results frame.
        columns_to_keep: List of metric column names to retain in addition to
            'game', 'model', 'experiment' and 'episode'.

    Returns:
        A new frame without any other column; ``df`` itself is left unchanged.
    """
    retained = ['game', 'model', 'experiment', 'episode'] + columns_to_keep
    discarded = []
    for column in df.columns:
        if column not in retained:
            discarded.append(column)
    return df.drop(columns=discarded)

In [99]:
# Load the pivoted adventuregame metrics from the llama_8B results folder.
df = load_raw_csv(
    mode_file='llama_8B',
    base_model=base_model_llama_8B,
    final_model=final_model_llama_8B,
    game='adventuregame',
)

In [102]:
# Preview the first rows of the pivoted frame (identifier columns followed by metric columns).
df.head()

metric,game,model,experiment,episode,Aborted,Lose,Main Score,Parsed Request Count,Played,Request Count,...,successful_actions,taking_from_inventory,turn_limit_loss,turn_ratio,turns_over_par,undefined_action,undefined_action_verb,undefined_argument_type,undefined_repr_str,world_state_discrepancy
0,adventuregame,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_home_deliver_three_basic_easy,episode_0,0,0,100.0,13,1,13,...,12,0,0,0.925,3.0,0,0,0,0,0
1,adventuregame,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_home_deliver_three_basic_easy,episode_1,0,0,100.0,31,1,31,...,24,0,0,0.5,19.0,0,0,0,0,0
2,adventuregame,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_home_deliver_three_basic_easy,episode_10,0,1,0.0,17,1,17,...,13,0,0,,,0,0,0,0,0
3,adventuregame,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_home_deliver_three_basic_easy,episode_11,0,1,66.66666666666666,43,1,43,...,22,0,0,,,0,1,0,0,0
4,adventuregame,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_home_deliver_three_basic_easy,episode_12,1,0,0.0,50,0,50,...,26,0,1,,,0,5,0,0,0


In [107]:
# Convert every metric column to a numeric dtype in place. The identifier columns
# listed in std_cols are skipped; values that cannot be parsed become NaN
# (errors='coerce').
for col in df.columns:
    if col in std_cols:
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [108]:
# Mean of every metric per (game, model, experiment, episode) group.
df_all_average_stats = (
    df
    .groupby(by=['game', 'model', 'experiment', 'episode'])
    .mean()
)

In [146]:
# Split the episodes by model, keep only the metric columns and treat missing
# metric values as 0 so they can be summed.
df_base_model = (
    df.loc[df['model'] == base_model_llama_8B]
    .drop(columns=['game', 'model', 'episode', 'experiment'])
    .fillna(0)
)
df_final_model = (
    df.loc[df['model'] == final_model_llama_8B]
    .drop(columns=['game', 'model', 'episode', 'experiment'])
    .fillna(0)
)

In [148]:
# Print, for every metric column, its total over all rows of the base-model frame
# and of the final-model frame as "<metric> <base total> -> <final total>".
# Missing values were already replaced by 0 when the two frames were built.
all_metrics = list(df_base_model.columns)
for metric in all_metrics:
    # Materialise the per-episode values of this metric for each model.
    base_model_metrics = list(df_base_model[metric])
    final_model_metrics = list(df_final_model[metric])
    
    # Metric name is left-aligned to 30 characters; totals are shown with two decimals.
    print(f'{metric:<30} {sum(base_model_metrics):.2f} -> {sum(final_model_metrics):.2f}')

Aborted                        329.00 -> 331.00
Lose                           325.00 -> 267.00
Main Score                     20029.12 -> 27506.78
Parsed Request Count           14284.00 -> 12788.00
Played                         488.00 -> 486.00
Request Count                  14460.00 -> 12963.00
Request Success Ratio          575.91 -> 557.41
Success                        163.00 -> 219.00
Violated Request Count         178.00 -> 179.00
achieved_goal_ratio            43.33 -> 20.00
action_parsing_fail            1250.00 -> 356.00
action_resolution_fail         1358.00 -> 2114.00
bad_plan_dismiss_ratio         123.83 -> 113.69
bad_plan_follow_ratio          4.17 -> 14.30
domain_trait_type_mismatch     263.00 -> 44.00
domain_type_discrepancy        48.00 -> 1.00
entity_already_inventory       12.00 -> 2.00
entity_not_accessible          671.00 -> 1861.00
entity_state_mismatch          43.00 -> 14.00
entity_trait_mismatch          0.00 -> 0.00
finish_speed                   8.69 -> 0.4