In [1]:

from helpers import database
import pandas as pd
import re
import plotly.express as px

In [2]:
reviews = database.get_reviews("english")
reviews.head()

Unnamed: 0,id,recommendation_id,author_steamid,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,language,...,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,primarily_steam_deck
0,215760124,,76561199565605732,0,4,325,325,264,1768155336,english,...,1768151687,True,0,0,0.5,0,True,False,False,False
1,215756568,,76561198026330868,39,10,4201,2559,4150,1768154369,english,...,1768148999,True,0,0,0.5,0,True,False,False,False
2,215744888,,76561198066200885,221,44,5543,0,5543,1739371844,english,...,1768140031,True,0,0,0.5,0,True,False,False,False
3,215742083,,76561198880718555,0,8,3239,2734,3147,1768155154,english,...,1768137616,True,0,0,0.5,0,True,False,False,False
4,215740976,,76561199486130374,14,3,200,200,200,1768055486,english,...,1768136629,True,0,0,0.5,0,False,False,False,False


In [5]:
total_reviews = len(reviews)
total_reviews

12775

In [58]:
character_aliases = {

    # Main character
     "Cloud Strife": [
        "Cloud Strife", "Cloud", "Strife"
    ],
    "Tifa Lockhart": [
        "Tifa Lockhart", "Tifa", "Lockhart"
    ],
    "Aerith Gainsborough": [
        "Aerith Gainsborough", "Aerith", "Aeris", "Gainsborough"
    ],
    "Barret Wallace": [
        "Barret Wallace", "Barret", "Wallace"
    ],
    "Red XIII": [
        "Red XIII", "Red 13", "Nanaki" # Don't use "Red"
    ],
    "Yuffie Kisaragi": [
        "Yuffie Kisaragi", "Yuffie", "Kisaragi"
    ],
    "Cait Sith": [
        "Cait Sith", "Cait", "Sith"
    ],

    # Antagonists / Shinra / Sephiroth group
    "Sephiroth": [
        "Sephiroth"
    ],
    "Zack Fair": [
        "Zack Fair", "Zack", "Fair"
    ],
    "Rufus Shinra": [
        "Rufus Shinra", "Rufus"
    ],
    "Hojo": [
        "Hojo", "Professor Hojo"
    ],
    "Heidegger": [
        "Heidegger"
    ],
    "Scarlet": [
        "Scarlet"
    ],
    "Roche": [
        "Roche"
    ],
    "Reno": [
        "Reno"
    ],
    "Rude": [
        "Rude"
    ],
    "Elena": [
        "Elena"
    ],
    "Tseng": [
        "Tseng"
    ],

    # Avalanche / Important side character
    "Biggs": [
        "Biggs"
    ],
    "Wedge": [
        "Wedge"
    ],
    "Jessie Rasberry": [
        "Jessie Rasberry", "Jessie"
    ],
    "Marlene Wallace": [
        "Marlene Wallace", "Marlene"
    ],
    "Cid Highwind": [
        "Cid Highwind", "Cid", "Highwind"
    ],
    "Vincent Valentine": [
      "Vincent Valentine", "Vincent", "Valentine"
    ],

    # Side character
    "Bugenhagen": [
        "Bugenhagen"
    ],
    "Dyne": [
        "Dyne"
    ],
    "Don Corneo": [
        "Don Corneo", "Corneo" # Don't use "Don"
    ],
    "Kyrie Canaan": [
        "Kyrie Canaan", "Kyrie"
    ],
    "Regina Konigin": [
        "Regina Konigin", "Regina"
    ],
    "Chadley": [
        "Chadley"
    ],
    "Mai": [
        "Mai"
    ],
}

In [59]:
def build_pattern(aliases):
    parts = []
    for alias in aliases:
        alias = alias.strip()
        tokens = alias.split()
        phrase = r"\s+".join(re.escape(t) for t in tokens)
        parts.append(rf"\b{phrase}\b")
    pattern = "|".join(parts) if parts else r"$^"
    return re.compile(pattern, flags=re.IGNORECASE)

character_regex = {
    character: build_pattern(aliases)
    for character, aliases in character_aliases.items()
}

In [60]:
rows = []
for character, regex in character_regex.items():
    mask = reviews["review"].str.contains(regex, na=False, regex=True)
    mentions = int(mask.sum())
    positivity = reviews.loc[mask, "voted_up"].mean() if mentions > 0 else None
    rows.append({
        "character": character,
        "mentions": mentions,
        "share": mentions / total_reviews if total_reviews else 0.0,
        "positivity": float(positivity) if pd.notna(positivity) else None
    })

character_df = pd.DataFrame(rows)
character_df


Unnamed: 0,character,mentions,share,positivity
0,Cloud Strife,499,0.039061,0.697395
1,Tifa Lockhart,495,0.038748,0.810101
2,Aerith Gainsborough,426,0.033346,0.793427
3,Barret Wallace,138,0.010802,0.702899
4,Red XIII,106,0.008297,0.745283
5,Yuffie Kisaragi,155,0.012133,0.690323
6,Cait Sith,159,0.012446,0.63522
7,Sephiroth,319,0.024971,0.739812
8,Zack Fair,237,0.018552,0.738397
9,Rufus Shinra,34,0.002661,0.529412


In [61]:
min_mentions = 10
character_df = character_df[character_df["mentions"] >= min_mentions].copy()
character_df

Unnamed: 0,character,mentions,share,positivity
0,Cloud Strife,499,0.039061,0.697395
1,Tifa Lockhart,495,0.038748,0.810101
2,Aerith Gainsborough,426,0.033346,0.793427
3,Barret Wallace,138,0.010802,0.702899
4,Red XIII,106,0.008297,0.745283
5,Yuffie Kisaragi,155,0.012133,0.690323
6,Cait Sith,159,0.012446,0.63522
7,Sephiroth,319,0.024971,0.739812
8,Zack Fair,237,0.018552,0.738397
9,Rufus Shinra,34,0.002661,0.529412


In [62]:
character_df = character_df.sort_values("positivity", ascending=True, na_position="last").reset_index(drop=True)
character_df


Unnamed: 0,character,mentions,share,positivity
0,Dyne,23,0.0018,0.347826
1,Hojo,22,0.001722,0.409091
2,Rufus Shinra,34,0.002661,0.529412
3,Cid Highwind,78,0.006106,0.589744
4,Chadley,455,0.035616,0.608791
5,Cait Sith,159,0.012446,0.63522
6,Mai,46,0.003601,0.652174
7,Yuffie Kisaragi,155,0.012133,0.690323
8,Cloud Strife,499,0.039061,0.697395
9,Barret Wallace,138,0.010802,0.702899


In [63]:
fig = px.bar(
    character_df,
    x="positivity",
    y="character",
    orientation="h",
    text="mentions",
    labels={
        "positivity": "Sentiment",
        "character": "Character (at least 10 mentions)",
        "mentions": "Mentions"},
    color="positivity",
    color_continuous_scale=[(0, "red"), (1, "green")],
    range_x=[0, 1],
    title="Character sentiment & mentions"
)

fig.update_traces(texttemplate="%{text:,}", textposition="inside")
fig.update_layout(height=max(500, 28 * len(character_df)), coloraxis_showscale=False)
fig.update_xaxes(tickformat=".0%")
fig.update_traces(hovertemplate="<b>%{y}</b><br>Positivity: %{x:.1%}<br>Mentions: %{text:,}<extra></extra>")
fig.show()

In [64]:
character_df = character_df.sort_values("mentions", ascending=True, na_position="last").reset_index(drop=True)
character_df["positivity_label"] = character_df["positivity"].map(lambda v: f"{v:.1%}")
character_df

Unnamed: 0,character,mentions,share,positivity,positivity_label
0,Elena,11,0.000861,0.727273,72.7%
1,Roche,13,0.001018,0.769231,76.9%
2,Kyrie Canaan,15,0.001174,0.733333,73.3%
3,Hojo,22,0.001722,0.409091,40.9%
4,Dyne,23,0.0018,0.347826,34.8%
5,Rufus Shinra,34,0.002661,0.529412,52.9%
6,Mai,46,0.003601,0.652174,65.2%
7,Cid Highwind,78,0.006106,0.589744,59.0%
8,Vincent Valentine,78,0.006106,0.769231,76.9%
9,Red XIII,106,0.008297,0.745283,74.5%


In [65]:

fig = px.bar(
    character_df,
    x="mentions",
    y="character",
    orientation="h",
    text="positivity_label",
    labels={
        "mentions": "Mentions",
        "character": "Character (at least 10 mentions)",
        "positivity": "Sentiment"},
    color="positivity",
    color_continuous_scale=[(0, "red"), (1, "green")],
    title="Character mentions & sentiment"
)
fig.update_traces(textposition="inside")
fig.update_layout(
    height=max(500, 28 * len(character_df)),
    coloraxis_showscale=False)
fig.update_traces(hovertemplate="<b>%{y}</b><br>Mentions: %{x:,}<br>Positivity: %{customdata:.1%}<extra></extra>", customdata=character_df["positivity"])
fig.show()