In [460]:
import pandas as pd

from src.model_wrapper import base_model

base_path_results: str = '/Users/I518095/Documents/GitHub/clem-project_playpen/benchmark_results/clembench/evaluated_runs/final_models'

base_model_llama_8B: str = 'Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0'
final_model_llama_8B: str = 'llama3.1-8B-sft-e1-DFINAL_0.7K-steps-t0.0--llama3.1-8B-sft-e1-DFINAL_0.7K-steps-t0.0'
base_model_llama_70B: str = 'Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0--Unsloth-meta-llama-3.1-70B-bnb-4bit-t0.0'
base_model_metal_llama_70B: str = 'Meta-Llama-3.1-70B-Instruct-t0.0--Meta-Llama-3.1-70B-Instruct-t0.0'


final_model_llama_70B: str = 'llama3.1-70B-sft-e1-DFINAL_0.6K-steps-t0.0--llama3.1-70B-sft-e1-DFINAL_0.6K-steps-t0.0'
base_model_mistral: str = 'Unsloth-Mistral-Small-24B-Instruct-2501-t0.0--Unsloth-Mistral-Small-24B-Instruct-2501-t0.0'
final_model_mistral: str = 'Mistral-small-2501-DFINAL_0.6K-steps-t0.0--Mistral-small-2501-DFINAL_0.6K-steps-t0.0'

std_cols: list = ['game', 'model', 'experiment', 'episode']
model_files: list[str] = ['llama_8B', 'llama_70B', 'mistral']

current_base_model = base_model_llama_8B
current_final_model = final_model_llama_8B
model_file = 'llama_8B'
current_game = "codenames"
raw_csv = "raw.csv"

In [461]:
def load_raw_csv(mode_file: str, base_model: str, final_model: str, game: str) -> pd.DataFrame:
    path: str = base_path_results + '/' + mode_file + '/' + raw_csv
    df = pd.read_csv(path)
    df.drop(columns=['Unnamed: 0'], inplace=True)
    df.metric = df.metric.astype('string')
    df.value = df.value.astype('string')
    df = df[df['game'] == game]
    metrics_for_game_df = df[df.game == game]
    unique_metrics: list = list(metrics_for_game_df['metric'].unique())
    df_pivoted = group_raw_csv(df, base_model, final_model)
    
    return reduce_to_metrics_for_game(df_pivoted, unique_metrics)

def group_raw_csv(df: pd.DataFrame, bm, fm) -> pd.DataFrame:
    df_pivoted = df.pivot(index=['game', 'model', 'experiment', 'episode'],
                          columns='metric',
                          values='value')

    # Reset the index to make the index columns regular columns
    df_pivoted = df_pivoted.reset_index()
    return df_pivoted

def reduce_to_metrics_for_game(df: pd.DataFrame, columns_to_keep) -> pd.DataFrame:
    std_cols: list = ['game', 'model', 'experiment', 'episode']
    columns_to_keep = std_cols + columns_to_keep
    columns_to_drop = [ c for c in list(df.columns) if c not in columns_to_keep]
    return df.drop(columns=columns_to_drop, axis=1)

df = load_raw_csv(model_file ,current_base_model, current_final_model, current_game)

In [462]:
for col in df.columns:
    if col not in std_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

In [463]:
df_base_model = df[df.model == current_base_model].drop(['game', 'model','episode', 'experiment'], axis=1).fillna(0)
df_final_model = df[df.model == current_final_model].drop(['game', 'model', 'episode', 'experiment'], axis=1).fillna(0)

In [464]:
# print experiment level results:
all_metrics = list(df_base_model.columns)
print("Metric", model_file, ":", current_game)
for metric in all_metrics:
    base_model_metrics = list(df_base_model[metric])
    final_model_metrics = list(df_final_model[metric])
    
    print(f'{metric:<30} {sum(base_model_metrics)/len(base_model_metrics):.2f} -> {sum(final_model_metrics)/len(base_model_metrics):.2f}')

Metric llama_8B : codenames
Aborted                        0.57 -> 0.75
Average Cluegiver Number of Targets 2.66 -> 2.55
Average Cluegiver Team F1      0.52 -> 0.42
Average Cluegiver Team Precision 0.96 -> 0.72
Average Cluegiver Team Recall  0.37 -> 0.32
Average Guesser Number of Guesses 1.76 -> 1.36
Average Guesser Number of Revealed Words 1.27 -> 0.92
Average Guesser Number of Unrevealed Guesses 0.49 -> 0.44
Average Guesser Target F1      0.35 -> 0.23
Average Guesser Target Precision 0.38 -> 0.27
Average Guesser Target Recall  0.33 -> 0.21
Average Guesser Team F1        0.19 -> 0.13
Average Guesser Team Precision 0.39 -> 0.25
Average Guesser Team Recall    0.13 -> 0.09
Cluegiver strip words          0.00 -> 0.05
Efficiency                     0.76 -> 0.52
Episode Negative Recall        0.88 -> 0.93
Episode Recall                 0.40 -> 0.23
Game ended through assassin    0.12 -> 0.11
Guesser strip words            0.07 -> 0.00
Lose                           0.36 -> 0.20
Main Score  

In [466]:
df_curr_base_tuned = df[df.model == current_base_model].drop(['game', 'model', 'episode', 'experiment'], axis=1).fillna(0)
df_curr_base_all_aborted = df_curr_base_tuned[df_curr_base_tuned.Aborted == 1]
df_curr_base_all_lost = df_curr_base_tuned[df_curr_base_tuned.Lose == 1]
df_curr_base_all_success = df_curr_base_tuned[df_curr_base_tuned.Success == 1]

df_curr_fine_tuned = df[df.model == current_final_model].drop(['game', 'model', 'episode', 'experiment'], axis=1).fillna(0)
df_curr_fine_all_aborted = df_curr_fine_tuned[df_curr_fine_tuned.Aborted == 1]
df_curr_fine_all_lost = df_curr_fine_tuned[df_curr_fine_tuned.Lose == 1]
df_curr_fine_all_success = df_curr_fine_tuned[df_curr_fine_tuned.Success == 1]

In [467]:
# print experiment level results:
all_metrics = list(df_base_model.columns)
for metric in all_metrics:
    base_model_metrics = list(df_curr_base_all_success[metric])
    final_model_metrics = list(df_curr_fine_all_success[metric])

    print(f'{metric:<30} {sum(base_model_metrics)/len(base_model_metrics):.2f} -> {sum(final_model_metrics)/len(base_model_metrics):.2f}')

Aborted                        0.00 -> 0.00
Average Cluegiver Number of Targets 2.33 -> 2.05
Average Cluegiver Team F1      0.68 -> 0.58
Average Cluegiver Team Precision 1.00 -> 0.77
Average Cluegiver Team Recall  0.55 -> 0.49
Average Guesser Number of Guesses 2.33 -> 2.05
Average Guesser Number of Revealed Words 2.00 -> 1.86
Average Guesser Number of Unrevealed Guesses 0.33 -> 0.19
Average Guesser Target F1      0.80 -> 0.71
Average Guesser Target Precision 0.81 -> 0.73
Average Guesser Target Recall  0.79 -> 0.70
Average Guesser Team F1        0.53 -> 0.53
Average Guesser Team Precision 0.81 -> 0.72
Average Guesser Team Recall    0.43 -> 0.45
Cluegiver strip words          0.00 -> 0.00
Efficiency                     0.98 -> 0.77
Episode Negative Recall        0.85 -> 0.74
Episode Recall                 1.00 -> 0.78
Game ended through assassin    0.00 -> 0.00
Guesser strip words            0.00 -> 0.00
Lose                           0.00 -> 0.00
Main Score                     100.00 ->

In [468]:
df_curr_fine_tuned = df[df.model == current_base_model]
df_curr_fine_all_aborted = df_curr_fine_tuned[df_curr_fine_tuned.Aborted == 1]

df_curr_fine_all_aborted.head()

metric,game,model,experiment,episode,Aborted,Average Cluegiver Number of Targets,Average Cluegiver Team F1,Average Cluegiver Team Precision,Average Cluegiver Team Recall,Average Guesser Number of Guesses,...,Main Score,Number of turns,Parsed Request Count,Played,Request Count,Request Success Ratio,Success,Violated Request Count,experiment name,experiment variable
0,codenames,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_low,episode_0,1,3.0,0.5,1.0,0.333,0.0,...,,1,1,0,2,0.5,0,1,,
1,codenames,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_low,episode_1,1,3.0,0.5,1.0,0.333,0.0,...,,1,1,0,2,0.5,0,1,,
2,codenames,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_low,episode_2,1,0.0,0.0,0.0,0.0,0.0,...,,1,0,0,1,0.0,0,1,,
3,codenames,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_low,episode_3,1,2.2,0.421,0.8,0.288,1.8,...,,5,8,0,9,0.889,0,1,,
4,codenames,Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unslot...,0_low,episode_4,1,3.0,0.5,1.0,0.333,0.0,...,,1,1,0,2,0.5,0,1,,


In [469]:
transcript_html_filenames = []

for i, row in df_curr_fine_all_aborted.iterrows():
    html_dir = base_path_results + '/' +  model_file  + '/' +   current_base_model + '/' + row["game"] + '/' + row["experiment"] + '/' + row["episode"] + '/transcript.html'
    transcript_html_filenames.append(html_dir)

In [470]:
len(transcript_html_filenames)

74

In [471]:
transcript_html_filenames[0]

'/Users/I518095/Documents/GitHub/clem-project_playpen/benchmark_results/clembench/evaluated_runs/final_models/llama_8B/Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0/codenames/0_low/episode_0/transcript.html'

In [474]:
import webbrowser

# open all transcripts
new = 2
for html in transcript_html_filenames[10:50]:
    url = f"file://{html}" 
    print(url)
    webbrowser.open(url,new=new)

file:///Users/I518095/Documents/GitHub/clem-project_playpen/benchmark_results/clembench/evaluated_runs/final_models/llama_8B/Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0/codenames/10_ambiguous/episode_0/transcript.html
file:///Users/I518095/Documents/GitHub/clem-project_playpen/benchmark_results/clembench/evaluated_runs/final_models/llama_8B/Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0/codenames/10_ambiguous/episode_2/transcript.html
file:///Users/I518095/Documents/GitHub/clem-project_playpen/benchmark_results/clembench/evaluated_runs/final_models/llama_8B/Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0/codenames/10_ambiguous/episode_3/transcript.html
file:///Users/I518095/Documents/GitHub/clem-project_playpen/benchmark_results/clembench/evaluated_runs/final_models/llama_8B/Unsloth-meta-llama-3.1-4bit-plain-t0.0--Unsloth-meta-llama-3.1-4bit-plain-t0.0/codenames/10_ambiguous/episode_5/tran

# Findings Codenames
Looking at the aborted episodes
Before finetuning
5 / 50 observed problems come from p1
the rest p2

- p2 does most errors by guessing the same word as player one used for GUESS: with 20 cases
- Next is guessing the wrong words 13
- then providing too many guesses 7
- hallucinating words 5
After finetuning
- errors occur 50/50


## Finding textmapworld graph-reasoning
Ouput seems to be cut-off more after finetuning

llama 70B before major issues with closing tag
mistral too
llama 8B before tining 22/24 times turn limit
