In [None]:
from tqdm import tqdm

# Generate dataset

# Analysis

## Load the results

In [None]:
from benchmark.analysis.utils.data_loading import load_csv

# Load the recorded game results into `df`, the frame every later cell reads.
# NOTE(review): later cells call len() on, and slice, the 'Past words player N'
# columns, so load_csv presumably parses them into sequences — TODO confirm.
df = load_csv('llm_game_results.csv')

## Model-pair performance

In [None]:
# Label every game with an order-independent model pair so that (A, B) and
# (B, A) games are aggregated together.
df['Model Pair'] = df.apply(lambda row: tuple(sorted([row['Model 1'], row['Model 2']])), axis=1)

# Success rate per pair over all games: a game counts as a win when its
# status text contains 'wins'.
df['Win'] = df['Status'].apply(lambda x: 'wins' in x)
success_rate = df.groupby('Model Pair').agg(Success_Rate=('Win', 'mean'))

# Round statistics use only the games whose status is exactly 'wins'.
# Take an explicit copy: adding columns to a filtered slice of `df` raises
# pandas' SettingWithCopyWarning and may not write where intended.
wins_df = df[df['Status'] == 'wins'].copy()
wins_df['Round Length 1'] = wins_df['Past words player 1'].apply(len)
wins_df['Round Length 2'] = wins_df['Past words player 2'].apply(len)
# Average the two players' word counts to get one round length per game
wins_df['Average Round Length'] = (wins_df['Round Length 1'] + wins_df['Round Length 2']) / 2

# Mean round length of winning games, per model pair
avg_rounds = wins_df.groupby('Model Pair').agg(Average_Rounds=('Average Round Length', 'mean'))

# Left-join so that pairs which never won still appear in the table
# (Average_Rounds is NaN for them) instead of being silently dropped.
result = success_rate.merge(avg_rounds, on='Model Pair', how='left')
print(result)

## Visualise convergence

In [None]:
from importlib import reload

import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import euclidean

import benchmark.analysis.utils.embeding_utils as embeding_utils

# Reload the embedding helpers so that edits to the module are picked up
# without restarting the kernel. (The previous reload target was a name that
# is never imported in this notebook and raised NameError on a fresh kernel.)
reload(embeding_utils)
from benchmark.analysis.utils.embeding_utils import get_embeddings


# Maps an order-independent (model, model) pair to a list of games; each game
# is the list of player-to-player distances over its last `rounds` rounds.
model_combinations = {}
rounds = 6  # Number of final rounds compared per game, adapt as needed

# Calculate embeddings and distances for winning games only
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    if row['Status'] != "wins":
        continue

    embeddings_1 = get_embeddings(row['Past words player 1'][-rounds:])
    embeddings_2 = get_embeddings(row['Past words player 2'][-rounds:])

    # Both players need a full window; otherwise the per-game distance rows
    # would have different lengths and could not be averaged round by round.
    if len(embeddings_1) < rounds or len(embeddings_2) < rounds:
        continue

    row_distances = [euclidean(emb_1, emb_2) for emb_1, emb_2 in zip(embeddings_1, embeddings_2)]

    # Sorting the pair makes (A, B) and (B, A) share one key, the same
    # convention as the 'Model Pair' column.
    model_key = tuple(sorted((row['Model 1'], row['Model 2'])))
    model_combinations.setdefault(model_key, []).append(row_distances)


In [None]:

# For every model pair with enough data, plot the mean (line) and standard
# deviation (band) of the player-to-player embedding distance at each of the
# last `rounds` rounds.
fig, ax = plt.subplots(figsize=(14, 8))
colors = plt.cm.jet(np.linspace(0, 1, len(model_combinations)))

# Position within the final window: 0 is the oldest of the last `rounds` rounds
time_index = np.arange(rounds)

color_index = 0  # Advances only for pairs that are actually plotted
for model_key, distances_lists in model_combinations.items():
    # Keep only games with a full window so the rows stack into a rectangular
    # array; a ragged list would make np.array / np.mean fail.
    complete_games = [game for game in distances_lists if len(game) == rounds]
    if len(complete_games) < rounds:
        continue

    # NOTE: `rounds` doubles as the minimum (and used) number of games per pair
    last_games_array = np.array(complete_games[-rounds:])

    # Statistics across games, one value per round position
    mean_of_last_games = np.mean(last_games_array, axis=0)
    std_of_last_games = np.std(last_games_array, axis=0)

    # Mean line
    ax.plot(time_index, mean_of_last_games, label=f'{model_key} Last {rounds}', marker='o', color=colors[color_index])

    # One-standard-deviation band around the mean
    ax.fill_between(time_index, mean_of_last_games - std_of_last_games, mean_of_last_games + std_of_last_games, color=colors[color_index], alpha=0.3)

    color_index += 1

# The x-axis is the round position inside the final window, not a game index
ax.set_xlabel(f'Round index within the last {rounds} rounds')
ax.set_ylabel('Average Euclidean Distance')
ax.set_title('Embedding distance between players over the final rounds of winning games')
ax.legend()
ax.grid(True)
plt.show()

## Comparison to average of two last words

In [None]:
from importlib import reload

import numpy as np

import benchmark.analysis.utils.embeding_utils as embeding_utils

# Reload the embedding helpers so that edits to the module are picked up
# without restarting the kernel. (The previous reload target was a name that
# is never imported in this notebook and raised NameError on a fresh kernel.)
reload(embeding_utils)
from benchmark.analysis.utils.embeding_utils import get_embeddings

tryout = 4  # Positional (iloc) index of the game to inspect

# Retrieving data
words_player1 = df['Past words player 1'].iloc[tryout]
words_player2 = df['Past words player 2'].iloc[tryout]
embeddings_1 = get_embeddings(words_player1)
embeddings_2 = get_embeddings(words_player2)

# Check the lengths BEFORE averaging: the element-wise mean below is only
# meaningful when both players have one embedding per round.
print("Length of Player 1 embeddings:", len(embeddings_1))
print("Length of Player 2 embeddings:", len(embeddings_2))
if len(embeddings_1) != len(embeddings_2):
    raise ValueError(
        f"Game {tryout}: players have different numbers of embeddings "
        f"({len(embeddings_1)} vs {len(embeddings_2)}); cannot average them round by round."
    )

# Round-by-round midpoint between the two players' embeddings
average_embeddings = (np.array(embeddings_1) + np.array(embeddings_2)) / 2

In [None]:
# plot_distances is a project helper whose implementation is not shown here.
from benchmark.analysis.old_distance_calculous import plot_distances

# Uses embeddings_1, embeddings_2 and average_embeddings exactly as left by
# the previous cell (the `tryout = 4` game), so that cell must be run first.
plot_distances(embeddings_1, embeddings_2, average_embeddings)

### Table for each model

In [None]:

import pandas as pd
from tqdm import tqdm
from importlib import reload
# Import the module object itself so it can be passed to reload(); the
# from-import comes afterwards so calculate_distances is bound to the freshly
# reloaded definition.
import benchmark.analysis.old_distance_calculous
reload(benchmark.analysis.old_distance_calculous)
from benchmark.analysis.old_distance_calculous import calculate_distances

def assign_past_words(row, model):
    """Tag a game row with the word histories seen from `model`'s side.

    Adds two entries to `row` and returns the same (mutated) object:
      * 'Past words current Model' - words played by `model`
      * 'Past words other Model'   - words played by its opponent

    `model` is treated as player 1 whenever row['Model 1'] equals it (this
    includes games where both players are the same model); in every other
    case it is treated as player 2.
    """
    if row['Model 1'] == model:
        own_column, opponent_column = 'Past words player 1', 'Past words player 2'
    else:
        own_column, opponent_column = 'Past words player 2', 'Past words player 1'

    row['Past words current Model'] = row[own_column]
    row['Past words other Model'] = row[opponent_column]
    return row

# Models to summarise: one row of the final table per model
models = ["gpt-4-turbo", "gpt-4o-mini", "gpt-3.5-turbo-0125"]
tqdm.pandas()  # Registers DataFrame.progress_apply


def _safe_stat(stat, values):
    """Apply `stat` (e.g. np.mean) to `values`, returning 0 when `values` is empty."""
    return stat(values) if np.size(values) else 0


results = []
for model in models:
    # Finished games (won, or lost on the round limit) in which `model` played
    model_results = df[
        df['Status'].isin(['wins', 'loses, too many rounds']) &
        ((df['Model 1'] == model) | (df['Model 2'] == model))
    ].copy()

    # A model absent from the data would otherwise crash the two-column
    # assignment below (apply on an empty frame does not yield two columns).
    if model_results.empty:
        print(f"No finished games found for {model}; skipping.")
        continue

    # Re-express each game from this model's point of view
    model_results = model_results.apply(assign_past_words, axis=1, model=model)
    # calculate_distances is a project helper; its two results per row are
    # stored as 'Distances to Previous' and 'Distances to Average'.
    model_results[['Distances to Previous', 'Distances to Average']] = model_results.progress_apply(
        lambda row: pd.Series(calculate_distances(row)), axis=1
    )

    # Per-game mean and spread of both distance series (0 for an empty series)
    model_results['Average Distance to Previous'] = model_results['Distances to Previous'].apply(lambda x: _safe_stat(np.mean, x))
    model_results['Average Distance to Average'] = model_results['Distances to Average'].apply(lambda x: _safe_stat(np.mean, x))
    model_results['Std Dev Distance to Previous'] = model_results['Distances to Previous'].apply(lambda x: _safe_stat(np.std, x))
    model_results['Std Dev Distance to Average'] = model_results['Distances to Average'].apply(lambda x: _safe_stat(np.std, x))

    # Aggregate across games; note the "Std Dev" figures are the mean of the
    # per-game standard deviations, not a pooled standard deviation.
    mean_distance_to_previous = model_results['Average Distance to Previous'].mean()
    mean_distance_to_average = model_results['Average Distance to Average'].mean()
    std_distance_to_previous = model_results['Std Dev Distance to Previous'].mean()
    std_distance_to_average = model_results['Std Dev Distance to Average'].mean()
    sample_size = len(model_results)
    # The smaller of the two mean distances decides the strategy label
    strategy = "Mirroring Strategy" if mean_distance_to_previous < mean_distance_to_average else "Balancing Strategy"

    results.append({
        "Model": model,
        "Mean Distance to Previous": mean_distance_to_previous,
        "Mean Distance to Average": mean_distance_to_average,
        "Std Dev Distance to Previous": std_distance_to_previous,
        "Std Dev Distance to Average": std_distance_to_average,
        "Number of Samples": sample_size,
        "Predominant Strategy": strategy
    })

# Create and display results DataFrame
results_df = pd.DataFrame(results)
print(results_df)


## Dynamics visualization

In [None]:
from benchmark.analysis.visualization import create_fixed_color_lines

### gpt-4o-mini lost

In [None]:
import plotly.graph_objects as go
import numpy as np
from sklearn.decomposition import PCA

tryout = 2  # Positional (iloc) index of the game to visualise

words_player1 = df['Past words player 1'].iloc[tryout]
words_player2 = df['Past words player 2'].iloc[tryout]
embeddings_1 = get_embeddings(words_player1)
embeddings_2 = get_embeddings(words_player2)

# Stack both players' embeddings row-wise so that one PCA is fitted on the
# shared space. np.vstack is used rather than `+`, which only concatenates
# Python lists and would add element-wise if the embeddings were numpy arrays.
embeddings = np.vstack([embeddings_1, embeddings_2])

# Use PCA to reduce to 3 dimensions (fit on both players, project each one)
pca = PCA(n_components=3)
pca.fit(embeddings)
reduced_embeddings_1 = pca.transform(embeddings_1)
reduced_embeddings_2 = pca.transform(embeddings_2)

# The game counts as won when both players end on the same word
won = words_player2[-1] == words_player1[-1]

# One marker per word, labelled with its round number
trace1 = go.Scatter3d(
    x=reduced_embeddings_1[:, 0],
    y=reduced_embeddings_1[:, 1],
    z=reduced_embeddings_1[:, 2],
    mode='markers+text',
    marker=dict(size=4, color='blue'),
    text=np.arange(len(words_player1)),
    name='Model 1'
)

trace2 = go.Scatter3d(
    x=reduced_embeddings_2[:, 0],
    y=reduced_embeddings_2[:, 1],
    z=reduced_embeddings_2[:, 2],
    mode='markers+text',
    marker=dict(size=4, color='red'),
    text=np.arange(len(words_player2)),
    name='Model 2'
)

# Connect each player's points in a single colour per player; the helper's
# result is concatenated with a list below, so it is expected to be a list.
lines_player1 = create_fixed_color_lines(reduced_embeddings_1, len(words_player1), 'blue')
lines_player2 = create_fixed_color_lines(reduced_embeddings_2, len(words_player2), 'red')

# Combine all traces
data = [trace1, trace2] + lines_player1 + lines_player2
if won:
    print("won")
    # Highlight the shared final word with a larger open-diamond marker
    last_point_player = go.Scatter3d(
        x=[reduced_embeddings_2[-1, 0]],
        y=[reduced_embeddings_2[-1, 1]],
        z=[reduced_embeddings_2[-1, 2]],
        mode='markers+text',
        marker=dict(size=8, color='green', symbol="diamond-open"),
        text=[words_player2[-1]],
        name='Final word'
    )
    data.append(last_point_player)

# Define layout with larger figure size
layout = go.Layout(
    title='3D Scatter Plot with Colormap Gradient Lines',
    scene=dict(
        xaxis=dict(title='PCA1'),
        yaxis=dict(title='PCA2'),
        zaxis=dict(title='PCA3'),
    ),
    legend_title_text='Models',
    width=800,
    height=800,
)

# Create figure and show
fig = go.Figure(data=data, layout=layout)
fig.show()

### Win between gpt-4-turbo and gpt-4o-mini

In [None]:
# Displays player 1's word history for the game loaded by the most recently
# run dynamics cell (under top-to-bottom execution: the `tryout = 2` game).
words_player1

In [None]:
# Displays player 2's word history for the game loaded by the most recently
# run dynamics cell (under top-to-bottom execution: the `tryout = 2` game).
words_player2

In [None]:
import plotly.graph_objects as go
import numpy as np
from sklearn.decomposition import PCA

tryout = 125  # Positional (iloc) index of the game to visualise

words_player1 = df['Past words player 1'].iloc[tryout]
words_player2 = df['Past words player 2'].iloc[tryout]
embeddings_1 = get_embeddings(words_player1)
embeddings_2 = get_embeddings(words_player2)

# Stack both players' embeddings row-wise so that one PCA is fitted on the
# shared space. np.vstack is used rather than `+`, which only concatenates
# Python lists and would add element-wise if the embeddings were numpy arrays.
embeddings = np.vstack([embeddings_1, embeddings_2])

# Use PCA to reduce to 3 dimensions (fit on both players, project each one)
pca = PCA(n_components=3)
pca.fit(embeddings)
reduced_embeddings_1 = pca.transform(embeddings_1)
reduced_embeddings_2 = pca.transform(embeddings_2)

# The game counts as won when both players end on the same word
won = words_player2[-1] == words_player1[-1]

# One marker per word, labelled with its round number
trace1 = go.Scatter3d(
    x=reduced_embeddings_1[:, 0],
    y=reduced_embeddings_1[:, 1],
    z=reduced_embeddings_1[:, 2],
    mode='markers+text',
    marker=dict(size=4, color='blue'),
    text=np.arange(len(words_player1)),
    name='Model 1'
)

trace2 = go.Scatter3d(
    x=reduced_embeddings_2[:, 0],
    y=reduced_embeddings_2[:, 1],
    z=reduced_embeddings_2[:, 2],
    mode='markers+text',
    marker=dict(size=4, color='red'),
    text=np.arange(len(words_player2)),
    name='Model 2'
)

# Connect each player's points in a single colour per player; the helper's
# result is concatenated with a list below, so it is expected to be a list.
lines_player1 = create_fixed_color_lines(reduced_embeddings_1, len(words_player1), 'blue')
lines_player2 = create_fixed_color_lines(reduced_embeddings_2, len(words_player2), 'red')

# Combine all traces
data = [trace1, trace2] + lines_player1 + lines_player2
if won:
    print("won")
    # Highlight the shared final word with a larger open-diamond marker
    last_point_player = go.Scatter3d(
        x=[reduced_embeddings_2[-1, 0]],
        y=[reduced_embeddings_2[-1, 1]],
        z=[reduced_embeddings_2[-1, 2]],
        mode='markers+text',
        marker=dict(size=8, color='green', symbol="diamond-open"),
        text=[words_player2[-1]],
        name='Final word'
    )
    data.append(last_point_player)

# Define layout with larger figure size
layout = go.Layout(
    title='3D Scatter Plot with Colormap Gradient Lines',
    scene=dict(
        xaxis=dict(title='PCA1'),
        yaxis=dict(title='PCA2'),
        zaxis=dict(title='PCA3'),
    ),
    legend_title_text='Models',
    width=800,
    height=800,
)

# Create figure and show
fig = go.Figure(data=data, layout=layout)
fig.show()

### Win between gpt-4-turbo

In [None]:
import plotly.graph_objects as go
import numpy as np
from sklearn.decomposition import PCA

tryout = 175  # Positional (iloc) index of the game to visualise

words_player1 = df['Past words player 1'].iloc[tryout]
words_player2 = df['Past words player 2'].iloc[tryout]
embeddings_1 = get_embeddings(words_player1)
embeddings_2 = get_embeddings(words_player2)

# Stack both players' embeddings row-wise so that one PCA is fitted on the
# shared space. np.vstack is used rather than `+`, which only concatenates
# Python lists and would add element-wise if the embeddings were numpy arrays.
embeddings = np.vstack([embeddings_1, embeddings_2])

# Use PCA to reduce to 3 dimensions (fit on both players, project each one)
pca = PCA(n_components=3)
pca.fit(embeddings)
reduced_embeddings_1 = pca.transform(embeddings_1)
reduced_embeddings_2 = pca.transform(embeddings_2)

# The game counts as won when both players end on the same word
won = words_player2[-1] == words_player1[-1]

# One marker per word, labelled with its round number
trace1 = go.Scatter3d(
    x=reduced_embeddings_1[:, 0],
    y=reduced_embeddings_1[:, 1],
    z=reduced_embeddings_1[:, 2],
    mode='markers+text',
    marker=dict(size=4, color='blue'),
    text=np.arange(len(words_player1)),
    name='Model 1'
)

trace2 = go.Scatter3d(
    x=reduced_embeddings_2[:, 0],
    y=reduced_embeddings_2[:, 1],
    z=reduced_embeddings_2[:, 2],
    mode='markers+text',
    marker=dict(size=4, color='red'),
    text=np.arange(len(words_player2)),
    name='Model 2'
)

# Connect each player's points in a single colour per player; the helper's
# result is concatenated with a list below, so it is expected to be a list.
lines_player1 = create_fixed_color_lines(reduced_embeddings_1, len(words_player1), 'blue')
lines_player2 = create_fixed_color_lines(reduced_embeddings_2, len(words_player2), 'red')

# Combine all traces
data = [trace1, trace2] + lines_player1 + lines_player2
if won:
    print("won")
    # Highlight the shared final word with a larger open-diamond marker
    last_point_player = go.Scatter3d(
        x=[reduced_embeddings_2[-1, 0]],
        y=[reduced_embeddings_2[-1, 1]],
        z=[reduced_embeddings_2[-1, 2]],
        mode='markers+text',
        marker=dict(size=8, color='green', symbol="diamond-open"),
        text=[words_player2[-1]],
        name='Final word'
    )
    data.append(last_point_player)

# Define layout with larger figure size
layout = go.Layout(
    title='3D Scatter Plot with Colormap Gradient Lines',
    scene=dict(
        xaxis=dict(title='PCA1'),
        yaxis=dict(title='PCA2'),
        zaxis=dict(title='PCA3'),
    ),
    legend_title_text='Models',
    width=800,
    height=800,
)

# Create figure and show
fig = go.Figure(data=data, layout=layout)
fig.show()