# Werewolf Among Us: Human vs LLM Analysis

Bhavana Jonnalagadda

Exploratory data analysis (EDA) and comparison of the human and LLM Werewolf game datasets.

In [26]:
import ast
import os
import json
import random 
from collections import Counter

import pandas as pd
import numpy as np

# Viz
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# Default renderer chain used for every figure in this notebook.
# Previous setting, kept for reference:
# pio.renderers.default = "notebook+plotly_mimetype+png"
# "notebook_connected" appears to be the variant that makes the plots show up in the
# Quarto HTML output (reason not yet understood -- TODO confirm).
pio.renderers.default = "notebook_connected+plotly_mimetype+png"

In [27]:
## Plotly graph gen setup

# Trace colour cycle: green is moved up to second place so it is more prominent,
# red is moved down to fourth.
cust_colorseq = [
    "#636EFA",
    "#00CC96",
    "#FFA15A",
    "#EF553B",
    "#AB63FA",
    "#19D3F3",
    "#FF6692",
    "#B6E880",
    "#FF97FF",
    "#FECB52",
]

# Generated figure margin (the top margin gets 30 added on top of this value)
mn = 10

# Register a "custom" template that only overrides the margins and the colour cycle;
# it is stacked on the "plotly" template below, which supplies everything else.
pio.templates["custom"] = go.layout.Template(
    layout=go.Layout(
        margin={"l": mn, "r": mn, "t": mn + 30, "b": mn},
        colorway=cust_colorseq,
    ),
    data=go.layout.template.Data(),
)
pio.templates.default = "plotly+custom"

## Load data

In [4]:
hum_datapath = os.path.normpath("../../Data/Output/EDA_WAU")
llm_datapath = os.path.normpath("../../Data/Output/EDA_WA")


def _read_eda_csv(folder, filename, literal_cols=()):
    """Read one exported CSV and parse its stringified-literal columns.

    Parameters
    ----------
    folder : str
        Directory that contains the CSV file.
    filename : str
        Name of the CSV file inside ``folder``.
    literal_cols : iterable of str
        Columns whose cells hold Python literals stored as text; each cell is
        converted with ``ast.literal_eval``.

    Returns
    -------
    pandas.DataFrame
        The table, indexed by the file's first column.
    """
    frame = pd.read_csv(os.path.join(folder, filename), index_col=0)
    for col in literal_cols:
        frame[col] = frame[col].apply(ast.literal_eval)
    return frame


hum_rounds_df = _read_eda_csv(hum_datapath, "allrounds.csv")
hum_text_df = _read_eda_csv(hum_datapath, "alltext.csv", ["strategy"])

llm_rounds_df = _read_eda_csv(llm_datapath, "allrounds.csv", ["players", "roles"])

# The "votes" column is deliberately left as raw text here; it is parsed later,
# after rows with missing votes have been dropped.
llm_text_df = _read_eda_csv(llm_datapath, "alltext.csv", ["players", "roles", "strategy"])

In [5]:
# Preview the first rows of the human rounds table (allrounds.csv)
hum_rounds_df.head()

In [6]:
# Preview the first rows of the LLM rounds table (allrounds.csv)
llm_rounds_df.head()

In [7]:
# Preview the first rows of the human text table (alltext.csv)
hum_text_df.head()

In [8]:
# Preview the first rows of the LLM text table (alltext.csv)
llm_text_df.head()

## EDA

### General win counts

In [9]:
def _winner_per_game(rounds_df):
    """Return the first recorded "winner" value of every game_id as a NumPy array."""
    return np.array(rounds_df.groupby("game_id")["winner"].first().tolist())


llm_outcomes = _winner_per_game(llm_rounds_df)
hum_outcomes = _winner_per_game(hum_rounds_df)

# Fraction of games whose winner is "Villagers"
llm_winperc = np.sum(llm_outcomes == "Villagers") / len(llm_outcomes)
hum_winperc = np.sum(hum_outcomes == "Villagers") / len(hum_outcomes)

llm_winperc, len(llm_outcomes), hum_winperc, len(hum_outcomes)

(0.5454545454545454, 11, 0.37423312883435583, 163)

| Source Dataset | Villagers Win | Number of Games |
|----------------|---------------|-----------------|
| LLMs           | 54.545%       | 11              |
| Human          | 37.423%       | 163             |

In [28]:
# Last recorded "round" and "winner" of each LLM game; the round value is what the
# x axis reports as the number of rounds in the game.
llm_game_ends = llm_rounds_df.groupby("game_id")[["round", "winner"]].last()

fig = px.histogram(llm_game_ends, x="round", color="winner")
fig.update_layout(
    title="LLM Wins by # of Rounds",
    xaxis_title="Number of Rounds in the Game",
)
fig.show()

### Strategies used

In [29]:
## Overall strategy used bar plot
# Each utterance carries a list of strategies; explode to one row per strategy label
# before counting.
hum_strats = hum_text_df["strategy"].explode().value_counts().reset_index()
llm_strats = llm_text_df["strategy"].explode().value_counts().reset_index()

# The two datasets get separate y axes (human on the left, LLM on the right).
fig = go.Figure()
fig.add_trace(
    go.Bar(
        name="Human Strategies",
        x=hum_strats["strategy"],
        y=hum_strats["count"],
        yaxis="y",
        offsetgroup=1,
    )
)
fig.add_trace(
    go.Bar(
        name="LLM Strategies",
        x=llm_strats["strategy"],
        y=llm_strats["count"],
        yaxis="y2",
        offsetgroup=2,
    )
)
fig.update_layout(
    title="Overall Strategy Used in Speech",
    yaxis=dict(title="Human Strategy Use Count"),
    yaxis2=dict(title="LLM Strategy Use Count", overlaying="y", side="right"),
)
# Change the bar mode
fig.update_layout(barmode="group", height=400)
fig.show()

In [33]:
## Strategy used by player role bar plot
# Built as one chain: the original filtered the frame with a boolean mask and then
# assigned a column into the result, which pandas reports as SettingWithCopyWarning
# when copy-on-write is not enabled. `.assign` always works on a fresh copy.
hum_strats = (
    hum_text_df[["strategy", "end_role"]]
    .explode("strategy")  # one row per (strategy label, role)
    .value_counts()
    .reset_index()
    .loc[lambda df: df["strategy"] != "No Strategy"]  # Don't include no strat
    # Make scaled by total strategy use per role
    .assign(count=lambda df: df["count"] / df.groupby("end_role")["count"].transform("sum"))
)

fig = px.bar(hum_strats, x="end_role", y="count", color="strategy", barmode="group")
fig.update_layout(
    yaxis_title="Ratio of Role's Strategy Use",
    xaxis_title="Role",
    title="Humans: Strategy Used by Role",
    height=400,
)
fig.show()

In [34]:
# Explode the per-row "players"/"roles" lists together so each speech gets one row per
# player, then keep only the row of the player who is the speaker. That attaches the
# speaker's role to the speech (assumes the two lists are aligned element by element).
llm_speaker_roles = llm_text_df[["players", "roles", "speaker", "strategy"]].explode(["players", "roles"])
is_speaker_row = llm_speaker_roles["players"] == llm_speaker_roles["speaker"]

llm_strats = (
    llm_speaker_roles[is_speaker_row]
    .explode("strategy")[["roles", "strategy"]]
    .value_counts()
    .reset_index()
)
# Make scaled by total strategy use per role
role_totals = llm_strats.groupby("roles")["count"].transform("sum")
llm_strats["count"] = llm_strats["count"] / role_totals

fig = px.bar(llm_strats, x="roles", y="count", color="strategy", barmode="group")
fig.update_layout(
    yaxis_title="Ratio of Role's Strategy Use",
    xaxis_title="Role",
    title="LLMs: Strategy Used by Role",
    height=400,
)
fig.show()

In [35]:
## Strategy used over time (LLMs)

# Count every strategy label within each round
llm_strats_byround = (
    llm_text_df.explode("strategy")
    .groupby("round")["strategy"]
    .value_counts()
    .reset_index()
)
# Make scaled by total strategy use per round
round_totals = llm_strats_byround.groupby("round")["count"].transform("sum")
llm_strats_byround["count"] = llm_strats_byround["count"] / round_totals

fig = px.line(llm_strats_byround, x="round", y="count", color="strategy", markers=True)
fig.update_traces(line_width=3)
fig.update_layout(
    yaxis_title="Ratio of Round's Strategy Use",
    xaxis_title="Game Round",
    title="LLMs: Strategy Use Over Rounds",
)
fig.show()

In [14]:
## TODO: PCA/Clustering of strategies? (placeholder cell, nothing implemented yet)

### Talking time vs. was voted on

Comparing how often each player was voted for with how much that player talked.

In [36]:
## Humans dataset
# Summed utterance_length per speaker, and number of votes received per player
hum_talklen = hum_text_df.groupby("speaker")["utterance_length"].sum().reset_index()
hum_votedon = hum_rounds_df["voted_for"].value_counts().reset_index()

# Inner join: only players that both spoke and received at least one vote are kept
cmp = hum_talklen.merge(hum_votedon, left_on="speaker", right_on="voted_for", how="inner")
cmp = cmp.rename(
    columns={
        "speaker": "Player",
        "utterance_length": "Total talking time",
        "count": "Was voted for",
    }
)

# Talking time on the left y axis, votes received on the right y axis
fig = go.Figure()
fig.add_trace(
    go.Bar(
        name="Total talking time",
        x=cmp["Player"],
        y=cmp["Total talking time"],
        yaxis="y",
        offsetgroup=1,
    )
)
fig.add_trace(
    go.Bar(
        name="Was voted for",
        x=cmp["Player"],
        y=cmp["Was voted for"],
        yaxis="y2",
        offsetgroup=2,
    )
)
fig.update_layout(
    title="Humans: Player Talking Time vs. Was Voted On",
    yaxis=dict(title="Total talking time"),
    yaxis2=dict(title="Was voted for", overlaying="y", side="right"),
)
# Change the bar mode
fig.update_layout(barmode="group", height=400)
fig.show()


## LLMs dataset
# Rows of llm_text_df counted per speaker name
llm_talklen = llm_text_df["speaker"].value_counts().reset_index()

# First non-null stringified "votes" value of every (game, round), parsed into a dict.
# Rounds without any recorded votes are dropped before parsing.
llm_votes = (
    llm_text_df.groupby(["game_id", "round"])["votes"]
    .first()
    .reset_index()
    .dropna()["votes"]
    .apply(ast.literal_eval)
)

# Tally how many votes each player RECEIVED.
# NOTE(review): this assumes every votes dict maps voter -> chosen target, which is how
# the vote-spread calculation later in the notebook reads it (it counts the dict
# values as the choices). The previous loop incremented the dict keys, i.e. it counted
# how often a player cast a vote rather than how often they were voted on.
voted_on = Counter(target for vote in llm_votes for target in vote.values())
# Explicit column names keep the merge below valid even when no votes were recorded
llm_votedon = pd.DataFrame(list(voted_on.items()), columns=["speaker", "Voted On"])

# Inner join: only players that both spoke and received at least one vote are kept
cmp = llm_talklen.merge(llm_votedon, on="speaker", how="inner")\
                .rename(columns={"speaker": "Player", "count": "Number of Speeches"})

# Number of speeches on the left y axis, votes received on the right y axis
fig = go.Figure(
    data=[
        go.Bar(name='Number of Speeches', x=cmp["Player"], y=cmp["Number of Speeches"], yaxis='y', offsetgroup=1),
        go.Bar(name='Was voted for', x=cmp["Player"], y=cmp["Voted On"], yaxis='y2', offsetgroup=2)
    ],
    layout={
        'yaxis': {'title': 'Number of Speeches'},
        'yaxis2': {'title': 'Was voted for', 'overlaying': 'y', 'side': 'right'},
        "title": "LLMs: Player Talking Time vs. Was Voted On"
    }
)
# Change the bar mode
fig.update_layout(barmode='group', height=400)
fig.show()

### Voting spread

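How unified versus spread out the votes were: the share of voters who picked the most-voted player, computed per game for the human data and per round for the LLM data.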

In [16]:
def human_spread(group):
    """Return the share of a group's rows that voted for the most-voted player.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group; must contain a ``voted_for`` column.

    Returns
    -------
    number
        Count of the most frequent ``voted_for`` value divided by the number of
        rows in ``group`` (rows with a missing vote still count in the
        denominator), or ``0`` when no row has a vote.
    """
    # value_counts ignores missing votes by default and sorts by descending count
    tally = group["voted_for"].value_counts()
    if tally.empty:
        return 0
    top_votes = tally.iloc[0]
    return top_votes / len(group)
# One top-choice ratio per human game
hum_vote_spread = hum_rounds_df.groupby("game_id").apply(human_spread).reset_index()[0]

# Same ratio for every parsed LLM votes dict: how often the most common value occurs,
# divided by the number of entries in the dict
llm_vote_spread = [
    Counter(vote.values()).most_common(1)[0][1] / len(vote)
    for vote in llm_votes
]


# Overlaid histograms, human counts on the left y axis and LLM counts on the right
fig = go.Figure()
fig.add_trace(
    go.Histogram(
        name="Human Voter Spread",
        x=hum_vote_spread,
        opacity=0.75,
        yaxis="y",
        offsetgroup=1,
    )
)
fig.add_trace(
    go.Histogram(
        name="LLM Voter Spread",
        x=llm_vote_spread,
        opacity=0.75,
        yaxis="y2",
        offsetgroup=2,
    )
)
fig.update_layout(
    title="Spread of Votes",
    xaxis={"title": "Ratio That Voted for Top Choice"},
    yaxis={"title": "Human Count"},
    yaxis2={"overlaying": "y", "side": "right", "title": "LLM Count"},
)
# Change the bar mode
fig.update_layout(barmode="overlay", height=400)
fig.show()

In [17]:
# Show the raw "text" column of the human text table
hum_text_df["text"]

0        I don't know why this is necessary considering...
1                                         I am a Villager.
2                                            Like, I mean-
3                                            But, I-I-I-I-
4        Continue. Well, I guess we don't really need t...
                               ...                        
21067                                            We could.
21068                                               Justin
21069                    It's just Justin, Justin, Justin.
21070                                      Just inception.
21071                            So we're voting Mitchell?
Name: text, Length: 21072, dtype: object

In [18]:
# Show the raw "text" column of the LLM text table
llm_text_df["text"]

0      Alright everyone, it's unfortunate that we've ...
1      Thanks, Will. It's indeed a tough start losing...
2      Thanks, Hayley. I agree with you and Will that...
3      Jacob, I appreciate you pointing out that I wa...
4      Thanks for addressing that, Jackson. I complet...
                             ...                        
250    Hayley, your accusations are unfounded and ser...
251    Harold, your insistence on labeling me as the ...
252    Hayley, your tactics of distraction are transp...
253    I want to echo what Harold has shared with us:...
254    I appreciate the spirited debate, but it's cru...
Name: text, Length: 255, dtype: object

Bailis, Suma, Jane Friedhoff, and Feiyang Chen. 2024. “Werewolf Arena: A Case Study in LLM Evaluation via Social Deduction.” July 18, 2024. <https://doi.org/10.48550/arXiv.2407.13943>.

Chi, Yizhou, Lingjun Mao, and Zineng Tang. 2024. “AMONGAGENTS: Evaluating Large Language Models in the Interactive Text-Based Social Deduction Game.” July 24, 2024. <https://doi.org/10.48550/arXiv.2407.16521>.

Cho, Young-Min, Raphael Shu, Nilaksh Das, Tamer Alkhouli, Yi-An Lai, Jason Cai, Monica Sunkara, and Yi Zhang. 2024. “RoundTable: Investigating Group Decision-Making Mechanism in Multi-Agent Collaboration.” November 11, 2024. <https://doi.org/10.48550/arXiv.2411.07161>.

Du, Yinuo, Prashanth Rajivan, and Cleotilde Gonzalez. 2024. “Large Language Models for Collective Problem-Solving: Insights into Group Consensus Decision-Making.” *Proceedings of the Annual Meeting of the Cognitive Science Society* 46 (0). <https://escholarship.org/uc/item/6s060914>.

Lai, Bolin, Hongxin Zhang, Miao Liu, Aryan Pariani, Fiona Ryan, Wenqi Jia, Shirley Anugrah Hayati, James M. Rehg, and Diyi Yang. 2022. “Werewolf Among Us: A Multimodal Dataset for Modeling Persuasion Behaviors in Social Deduction Games.” December 16, 2022. <https://doi.org/10.48550/arXiv.2212.08279>.

Piatti, Giorgio, Zhijing Jin, Max Kleiman-Weiner, Bernhard Schölkopf, Mrinmaya Sachan, and Rada Mihalcea. 2024. “Cooperate or Collapse: Emergence of Sustainable Cooperation in a Society of LLM Agents.” *Advances in Neural Information Processing Systems* 37 (December): 111715–59. <https://proceedings.neurips.cc/paper_files/paper/2024/hash/ca9567d8ef6b2ea2da0d7eed57b933ee-Abstract-Conference.html>.

Wikipedia contributors. 2024. “Mafia (Party Game).” <https://en.wikipedia.org/wiki/Mafia_(party_game)>.

Xu, Zelai, Chao Yu, Fei Fang, Yu Wang, and Yi Wu. 2024. “Language Agents with Reinforcement Learning for Strategic Play in the Werewolf Game.” February 20, 2024. <https://doi.org/10.48550/arXiv.2310.18940>.