In [1]:
from helpers import database
import pandas as pd

In [2]:
reviews = database.get_reviews(language="english")
reviews.head()

Unnamed: 0,id,recommendation_id,author_steamid,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,language,...,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,primarily_steam_deck
0,215760124,,76561199565605732,0,4,325,325,264,1768155336,english,...,1768151687,True,0,0,0.5,0,True,False,False,False
1,215756568,,76561198026330868,39,10,4201,2559,4150,1768154369,english,...,1768148999,True,0,0,0.5,0,True,False,False,False
2,215744888,,76561198066200885,221,44,5543,0,5543,1739371844,english,...,1768140031,True,0,0,0.5,0,True,False,False,False
3,215742083,,76561198880718555,0,8,3239,2734,3147,1768155154,english,...,1768137616,True,0,0,0.5,0,True,False,False,False
4,215740976,,76561199486130374,14,3,200,200,200,1768055486,english,...,1768136629,True,0,0,0.5,0,False,False,False,False


In [3]:
reviews.columns

Index(['id', 'recommendation_id', 'author_steamid', 'author_num_games_owned',
       'author_num_reviews', 'author_playtime_forever',
       'author_playtime_last_two_weeks', 'author_playtime_at_review',
       'author_last_played', 'language', 'review', 'timestamp_created',
       'timestamp_updated', 'voted_up', 'votes_up', 'votes_funny',
       'weighted_vote_score', 'comment_count', 'steam_purchase',
       'received_for_free', 'written_during_early_access',
       'primarily_steam_deck'],
      dtype='object')

In [8]:
def concept_negativity(concept):
    """
    Calculate the negativity for each concept based on the regex pattern:
    - Mentions: The number of reviews containing the pattern.
    - Negativity: The percentage of negative reviews (1 - mean voted_up) in the subset containing the pattern.

    :param concepts: A dictionary with the key being the concept name and the value being the regex pattern.
    :return: A DataFrame with columns 'concept', 'mentions', and 'negativity'.
    """

    records = []

    for concept, pattern in concept.items():
        # Find reviews containing pattern (case-insensitive, regex)
        mask = reviews["review"].str.contains(pattern, case=False, na=False, regex=True)
        subset = reviews[mask]

        if len(subset) > 0: # Only count if mentioned
            negativity = 1 - subset["voted_up"].mean() # Negative ratio = 1 - Positive ratio
            records.append({
                "concept": concept,
                "mentions": len(subset),
                "negativity": negativity * 100
            })

    return pd.DataFrame(records).sort_values("negativity", ascending=False)

In [144]:
def concept_share_of_negative(concepts):
    """
    Calculate the share of negativity for each concept based on the regex pattern:
    - Mentions: The number of reviews containing the pattern.
    - Negativity: The percentage of negative reviews (1 - mean voted_up) in the subset containing the pattern.

    :param concepts: A dictionary with the key being the concept name and the value being the regex pattern.
    :return: A DataFrame with columns 'concept', 'mentions', and 'negativity'.
    """

    negative = reviews[reviews["voted_up"] == False].copy()
    rows = []

    for concept, pattern in concepts.items():
        mask = negative["review"].str.contains(pattern, case=False, na=False, regex=True)
        negative_mentions = int(mask.sum())
        share = (negative_mentions / len(negative)) * 100 if len(negative) > 0 else 0.0
        rows.append({
            "concept": concept,
            "mentions": negative_mentions,
            "negativity": share
        })

    return pd.DataFrame(rows).sort_values("negativity", ascending=False)

In [145]:
def filter_low_signal_concepts(df, min_mentions=50):
    return df[df["mentions"] >= min_mentions]

In [146]:
import plotly.express as px

def create_plot(df, title):
    fig = px.bar(
    df,
    x=df["concept"],
    y=df["negativity"],
    title=title,
    labels={"negativity": "Negativity Ratio", "concept": "Concept"},
    color="negativity",
    color_continuous_scale="RdYlGn_r",
    template="plotly_dark",
    text="negativity"
    )

    fig.update_traces(
    texttemplate='%{y}%',
    textposition="inside",
    hovertemplate="%{y}% negativity (%{customdata[0]:,} mentions)",
    customdata=df[["mentions"]]
    )

    fig.update_layout(
    xaxis_tickformat=".0%",
    xaxis_title="Negativity Ratio",
    yaxis_title="Concept",
    height=600,
    title_font_size=20,
    font_size=12,
    bargap=0.15
    )

    fig.show()


In [147]:
def plot_negativity(df):
    create_plot(df, "Negativity by Concept (Higher = More Negative Mentions) - (negative mentions / total mentions)")

In [148]:
def plot_share_of_negative(df):
    create_plot(df, "Coverage in negative reviews (Higher = More Negative Mentions) - (negative mentions / all negatives)")

In [181]:
split_concepts = {

    # combat
    "hit": r"\b(?:hit|hits|hitbox|hit detection|missed hits?)\b",
    "combat": r"\b(?:combat|fighting|battle system|gameplay)\b",
    "damage": r"\b(?:damage|dmg|low damage|high damage|bullet sponge)\b",
    "health": r"\b(?:health|hp|healing|regen|lifebar|health bar)\b",

    # design
    "open world": r"open.{0,3}world",
    "fast travel": r"\b(?:fast[- ]?travel)\b",
    "runbacks": r"\b(?:runback|runbacks|checkpoint|long walk back|retry distance)\b",
    "design": r"\b(?:design|level design|map design|game design|world design)\b",
    "exploration": r"\b(?:exploration|open world|world map|empty world|map size)\b",
    "cutscenes": r"\b(?:cutscene|cutscenes|cinematic|too many cutscenes|cut scene|cut scenes )\b",
    "performance": r"(performance|lag|stutter|fps|frame|crash|bug|glitch|optimization|optimisation)",
    "controls": r"\b(?:controls?|input lag|button delay|clunky controls?)\b",
    "camera": r"\b(?:camera|camera angle|bad camera)\b",
    "ui/ux": r"(ui|ux|user.{0,3}interface|user.{0,3}experience|menu|navigation)",
    "graphics": r"graphics?|visuals?|looks?",
    "music": r"music|soundtrack|ost",
    "replayability": r"replay|replayability|replayable",
    "narrative choices": r"(choices?|decisions?|branching|multiple.{0,3}endings?)",

    # character
    "characters": r"characters?|cast",
    "voice acting": r"(voice.{0,3}acting?|vo|voiceover|voice.{0,3}over)",

    # difficulty
    "enemies": r"\b(?:enemy|enemies|mob|mobs|trash mobs?)\b",
    "bosses": r"\b(?:boss|bosses|superboss|elite bosses?)\b",
    "difficulty": r"\b(?:difficulty|hard|too hard|too easy|unbalanced|balance issues?)\b",

    # quest
    "minigames": r"\b(?:mini[- ]?games?)\b",
    "quests": r"\b(?:quest|quests|side[- ]?quests?|fetch quests?|optional quests?)\b",
    "story": r"\b(?:story|stories|plot|narrative|writing|pacing|chapter|ending)\b",

    # else
    "value for money": r"(value|worth|price|cost|expensive|cheap|money)",
    "modding": r"modding|mods|modifiable"
}

In [182]:
split_negativity = concept_negativity(split_concepts)


This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



In [183]:
split_negativity = filter_low_signal_concepts(split_negativity)

In [184]:
plot_negativity(split_negativity)

In [185]:
split_share_of_negative = concept_share_of_negative(split_concepts)


This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



In [186]:
split_share_of_negative = filter_low_signal_concepts(split_share_of_negative)

In [187]:
plot_share_of_negative(split_share_of_negative)