# Strategy Score on Output (using GPT-4o)

In [1]:
import os
import sys
import json
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import requests
import pandas as pd

from pandas import DataFrame, json_normalize
from typing import List, Dict, Any, Tuple, Union, Optional

# Directory holding the evaluation result files. The path is relative, so it is
# resolved against the kernel's current working directory.
LOGS_PATH: str = "../evaluations/results/"

In [None]:
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): none of the cells visible in this notebook read those variables
# directly — confirm whether this cell is still needed.
import dotenv
dotenv.load_dotenv()

True

In [3]:
# Put the parent directory on the import path; presumably that is where the
# project-local `utils` module lives (verify against the repository layout).
sys.path.append("..")

# Of these helpers, only read_jsonl_as_json is referenced by the cells visible here.
from utils import load_agent_logs_df, read_jsonl_as_json, load_game_summary

In [4]:
# Experiment identifiers. Each one is the filename prefix of a
# "<name>_strategy_skill.json" results file under LOGS_PATH.
# The order matters: DESCRIPTIONS is indexed in parallel with the frames loaded
# from these names when the plots are built.
EXPT_NAMES: List[str] = [
    "2025-01-25_phi_llama_100_games",
    "2025-01-27_llama_phi_100_games",
    "2025-01-28_phi_phi_100_games",
    "2025-01-28_llama_llama_100_games",
    ]

In [5]:
# Human-readable labels, one per entry of EXPT_NAMES and in the same order.
# They are used as the x category and legend name of each violin trace.
DESCRIPTIONS: List[str] = [
    "Crew: Phi, Imp: Llama",
    "Crew: Llama, Imp: Phi",
    "Crew: Phi, Imp: Phi",
    "Crew: Llama, Imp: Llama",
    ]

In [6]:
# One strategy-score results file per experiment, in the same order as EXPT_NAMES.
summary_logs_paths: List[str] = list(
    map(
        lambda expt_name: os.path.join(LOGS_PATH, f"{expt_name}_strategy_skill.json"),
        EXPT_NAMES,
    )
)

In [13]:
# Build one DataFrame per experiment; the list is reset here so re-running the
# cell does not append duplicates.
summary_dfs: List[DataFrame] = []

for summary_logs_path in summary_logs_paths:
    # Read the records, flatten them into columns, then order the rows by
    # game and step.
    # NOTE(review): game_index looks like strings such as "Game 1", in which
    # case this ordering is lexicographic rather than numeric — confirm.
    summary_df: DataFrame = json_normalize(
        read_jsonl_as_json(summary_logs_path)
    ).sort_values(by=["game_index", "step"])
    summary_dfs.append(summary_df)
    print(f"Loaded {len(summary_df)} logs from {summary_logs_path}")

Loaded 7246 logs from ../evaluations/results/2025-01-25_phi_llama_100_games_strategy_skill.json
Loaded 6908 logs from ../evaluations/results/2025-01-27_llama_phi_100_games_strategy_skill.json
Loaded 7390 logs from ../evaluations/results/2025-01-28_phi_phi_100_games_strategy_skill.json
Loaded 6951 logs from ../evaluations/results/2025-01-28_llama_llama_100_games_strategy_skill.json


In [15]:
# Preview the first rows of the first experiment's frame to check the loaded columns.
summary_dfs[0].head()

Unnamed: 0,game_index,step,player_name,player_identity,memory,action,thought,strategy_score,explanation
63,Game 1,0,Player 4: black,Crewmate,- Player 1 (blue) was seen seemingly completin...,CALL MEETING using the emergency button at Caf...,- The death of Player 1 is suspicious and indi...,9,The player effectively identifies the urgency ...
209,Game 1,0,Player 3: pink,Impostor,I am currently in the Cafeteria with Players 1...,KILL Player 1: blue,"Since I'm an Impostor, my goal is to eliminate...",4,"The player, as an Impostor, is aware of the ri..."
390,Game 1,0,Player 1: blue,Crewmate,- No observations or actions have been recorde...,COMPLETE TASK - Fix Wiring,"- The game is in the task phase, and my priori...",8,The player is acting strategically by completi...
894,Game 1,0,Player 2: orange,Crewmate,- Player 1 (blue) is seemingly doing a task in...,COMPLETE TASK - Fix Wiring,- My immediate task is to begin completing the...,8,The player is effectively completing their ass...
0,Game 1,1,Player 2: orange,Crewmate,- I completed the task of Fix Wiring in the Ca...,"SPEAK: ""Let's verify our timelines. Can each o...",- The meeting phase is crucial for identifying...,8,The player is strategically facilitating the d...


In [39]:
# Violin plot of the strategy_score distribution for Impostor rows,
# one violin per experiment.

fig = go.Figure()

for i, summary_df in enumerate(summary_dfs):
    # Filter once and derive both x and y from the filtered rows. Sizing x from
    # the unfiltered frame would make it longer than y.
    is_impostor = summary_df["player_identity"] == "Impostor"
    impostor_scores = np.sort(
        np.array(summary_df.loc[is_impostor, "strategy_score"], dtype=np.float64)
    )
    fig.add_trace(go.Violin(
        x=[DESCRIPTIONS[i]] * len(impostor_scores),
        y=impostor_scores,
        name=DESCRIPTIONS[i],
        box_visible=True,
        meanline_visible=True,
    ))

# Serif fonts throughout so the figure matches the research-paper typography.
axis_font = dict(family="serif", size=18, color="black")

fig.update_layout(
    title="Strategy Scores for Impostors",
    yaxis_title="Strategy Score",
    xaxis_title="",
    showlegend=True,
    plot_bgcolor="rgba(255, 255, 255, 1)",
    # Boxed legend positioned at (1, 1) in the figure's paper coordinates.
    legend=dict(x=1, y=1, traceorder="normal", bgcolor="white", bordercolor="black", borderwidth=1),
    width=750,
    height=500,
    font=dict(family="serif", size=15, color="black"),
)

# The legend already identifies each experiment, so the x tick labels are hidden.
fig.update_xaxes(
    showticklabels=False,
    title_font=axis_font,
    tickfont=axis_font,
    showline=True,
    linewidth=1,
    linecolor="black",
    mirror=False,
)

# Fixed 0-10 axis with unit ticks and light grid lines.
fig.update_yaxes(
    showgrid=True,
    gridwidth=1,
    gridcolor="LightGray",
    range=[0, 10],
    tickmode="linear",
    tick0=0,
    dtick=1,
    title_font=axis_font,
    tickfont=axis_font,
)

fig.show()

In [40]:
# Violin plot of the strategy_score distribution for Crewmate rows,
# one violin per experiment.

fig = go.Figure()

for i, summary_df in enumerate(summary_dfs):
    # Filter once and derive both x and y from the filtered rows. Sizing x from
    # the unfiltered frame would make it longer than y.
    is_crewmate = summary_df["player_identity"] == "Crewmate"
    crewmate_scores = np.sort(
        np.array(summary_df.loc[is_crewmate, "strategy_score"], dtype=np.float64)
    )
    fig.add_trace(go.Violin(
        x=[DESCRIPTIONS[i]] * len(crewmate_scores),
        y=crewmate_scores,
        name=DESCRIPTIONS[i],
        box_visible=True,
        meanline_visible=True,
    ))

# Serif fonts throughout so the figure matches the research-paper typography.
axis_font = dict(family="serif", size=18, color="black")

fig.update_layout(
    title="Strategy Scores for Crewmates",
    yaxis_title="Strategy Score",
    xaxis_title="",
    showlegend=True,
    plot_bgcolor="rgba(255, 255, 255, 1)",
    # Boxed legend positioned at (1, 1) in the figure's paper coordinates.
    legend=dict(x=1, y=1, traceorder="normal", bgcolor="white", bordercolor="black", borderwidth=1),
    width=750,
    height=500,
    font=dict(family="serif", size=15, color="black"),
)

# The legend already identifies each experiment, so the x tick labels are hidden.
fig.update_xaxes(
    showticklabels=False,
    title_font=axis_font,
    tickfont=axis_font,
    showline=True,
    linewidth=1,
    linecolor="black",
    mirror=False,
)

# Fixed 0-10 axis with unit ticks and light grid lines.
fig.update_yaxes(
    showgrid=True,
    gridwidth=1,
    gridcolor="LightGray",
    range=[0, 10],
    tickmode="linear",
    tick0=0,
    dtick=1,
    title_font=axis_font,
    tickfont=axis_font,
)

fig.show()