### __Visualizations__

#### __Imports__

In [89]:
import os, json, copy, sys, re
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from wordcloud import WordCloud
from sklearn.preprocessing import MinMaxScaler
from math import ceil
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import textwrap

sys.path.append(os.path.dirname(os.path.abspath('..')))
from utils.visualizations import text_language_frequency, plot_horizontal_barplot
from utils.text_analysis_functions import data_cleaning

#### __Data__

In [90]:
## Shared data directory: "working_data" three levels above the current working directory.
# os.path.join picks the platform's separator; the previous hard-coded "\\" pieces
# only produced valid paths on Windows.
working_data_dir = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(".")))),
    "working_data",
)

## Reddit
reddit_path = os.path.join(working_data_dir, "reddit_cleaned.json")
## YouTube
youtube_path = os.path.join(working_data_dir, "youtube_cleaned.json")
## OpenGov
opengov_path = os.path.join(working_data_dir, "ogov_cleaned.json")

## YouTube
with open(youtube_path, "r", encoding="utf-8") as f:
    youtube_clean = json.load(f)
## Reddit
with open(reddit_path, "r", encoding="utf-8") as f:
    reddit_clean = json.load(f)
## OpenGov
with open(opengov_path, "r", encoding="utf-8") as f:
    ogov_clean = json.load(f)

# Pre-computed table used by the later cells (platform, period, text, like_scaled, ... columns).
final_table = pd.read_csv(os.path.join(working_data_dir, "transformed_dataset.csv"))

In [91]:
# Flatten each source into a plain list of text bodies.
# YouTube and Reddit items hold their comments under "comments"; each comment has a "body".
youtube_plain_comments = [
    entry["body"] for item in youtube_clean for entry in item["comments"]
]

# OpenGov items carry their text directly under "article_text".
ogov_plain_comments = [item["article_text"] for item in ogov_clean]

reddit_plain_comments = [
    entry["body"] for item in reddit_clean for entry in item["comments"]
]

In [92]:
# One combined corpus, in the order Reddit -> YouTube -> OpenGov.
all_comments = [
    *reddit_plain_comments,
    *youtube_plain_comments,
    *ogov_plain_comments,
]

#### __Preprocessing__

In [93]:
# Author id of every comment, one list per platform (duplicates kept so they can be counted later).
all_users_yt = [
    entry["author_id"] for item in youtube_clean for entry in item["comments"]
]

all_users_reddit = [
    entry["author_id"] for item in reddit_clean for entry in item["comments"]
]

# OpenGov items store the author id on the item itself.
all_users_ogov = [item["author_id"] for item in ogov_clean]

#### __Results__

In [None]:
def plot_user_comment_counts(data_dict, title="User Comment Frequency", wrap_width=30, top_n=None):
    """
    Plots a horizontal bar chart of user comment frequencies.

    Users are sorted by count in descending order, so the most active user
    is drawn at the top of the chart.

    Args:
        data_dict (dict): Dictionary of {user: count}
        title (str): Title of the plot
        wrap_width (int): Max width before wrapping user names
        top_n (int, optional): Limit to top N users by count. A falsy value
            (None or 0) means "no limit".

    Returns:
        None. Shows the figure, or prints a message when there is nothing to plot.
    """
    # zip(*[]) cannot be unpacked into two names, so bail out early on empty input.
    if not data_dict:
        print("No users to plot.")
        return

    sorted_items = sorted(data_dict.items(), key=lambda x: x[1], reverse=True)

    if top_n:
        sorted_items = sorted_items[:top_n]

    users, counts = zip(*sorted_items)
    wrapped_users = ["\n".join(textwrap.wrap(str(user), wrap_width)) for user in users]
    y_pos = range(len(wrapped_users))

    fig, ax = plt.subplots(figsize=(10, 0.5 * len(wrapped_users)))  # Dynamic height
    bars = ax.barh(y_pos, counts, color='skyblue')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(wrapped_users)
    ax.invert_yaxis()  # first (largest) entry at the top
    ax.set_ylabel("User id")
    ax.set_xlabel("Number of Comments")
    ax.set_title(title)

    # Small gap between the bar end and its label: 1% of the longest bar.
    label_gap = max(counts) * 0.01
    for bar, count in zip(bars, counts):
        ax.text(bar.get_width() + label_gap, bar.get_y() + bar.get_height() / 2,
                str(count), va='center')

    plt.tight_layout()
    plt.show()

def plot_comment_count_histogram(
    data_dict,
    title="Distribution of User Comment Counts",
    bins='auto',
    min_comments=None,
    cumulative=False,
    percent_y=True,
    annotate_bars=True,
    show_bin_edges=True
):
    """
    Plots a histogram showing how many users made X number of comments.

    Args:
        data_dict (dict): Dictionary of {user: comment_count}
        title (str): Title of the plot
        bins (int, sequence or str): With the default 'auto', fixed 5-wide bins
            starting at 0 are used. Any other value is forwarded to plt.hist().
        min_comments (int or None): Filter out users with fewer than this many comments
        cumulative (bool): Whether to show a cumulative histogram
        percent_y (bool): Whether to show y-axis as percentage of users
        annotate_bars (bool): Whether to show percentage above each bar
            (only applied when percent_y is True)
        show_bin_edges (bool): Whether to show bin edges as x-axis ticks

    Returns:
        None. Shows the figure, or prints a message when no user passes the filter.
    """
    if min_comments is not None:
        filtered_counts = [count for count in data_dict.values() if count >= min_comments]
    else:
        filtered_counts = list(data_dict.values())

    if not filtered_counts:
        print("No users meet the minimum comment threshold.")
        return

    plt.figure(figsize=(8, 5))

    # isinstance guard: comparing an array-like `bins` with a string would be element-wise.
    if isinstance(bins, str) and bins == 'auto':
        max_count = max(filtered_counts)
        hist_bins = list(range(0, max_count + 5, 5))
    else:
        hist_bins = bins

    # Each user contributes 1/N so bar heights are fractions of users.
    # (density=True would give a probability *density*, i.e. fraction / bin width,
    # which is not a percentage for bins wider than 1.)
    if percent_y:
        weights = [1.0 / len(filtered_counts)] * len(filtered_counts)
    else:
        weights = None

    n, bins_, patches = plt.hist(
        filtered_counts,
        bins=hist_bins,
        color='mediumseagreen',
        edgecolor='black',
        cumulative=cumulative,
        weights=weights
    )

    title_suffix = " (Cumulative)" if cumulative else ""
    ylabel = "Percentage of Users" if percent_y else "Number of Users"
    plt.title(title + title_suffix)
    plt.xlabel("Number of Comments per User")
    plt.ylabel(ylabel)

    if percent_y:
        # Heights are fractions in [0, 1]; render them as 0-100%.
        plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))

    if show_bin_edges:
        plt.xticks(bins_, rotation=45)

    if annotate_bars and percent_y:
        for count, patch in zip(n, patches):
            if count == 0:
                continue
            height = patch.get_height()
            x = patch.get_x() + patch.get_width() / 2
            y = height
            plt.text(x, y + 0.005, f"{count * 100:.1f}%", ha='center', va='bottom', fontsize=9)

    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

def binned_cumulative_table(data_dict, step=5):
    """
    Groups users into comment count bins and computes cumulative totals.

    Bins are half-open integer ranges [k*step, (k+1)*step - 1]; enough bins are
    created to hold the largest count, so a maximum that is an exact multiple
    of ``step`` gets its own bin instead of being folded into the previous one.

    Args:
        data_dict (dict): {user: comment_count}. Entries whose key or value is
            None are ignored.
        step (int): Bin size, e.g., 5 for 0–4, 5–9, etc. Must be positive.

    Returns:
        pd.DataFrame with bin ranges, counts, and cumulative stats
        (columns "Comments Range", "Users in Bin", "Cumulative Users",
        "Cumulative %"). Empty when there are no usable counts.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    counts = [v for k, v in data_dict.items() if k is not None and v is not None]

    if not counts:
        return pd.DataFrame(columns=["Comments Range", "Users in Bin", "Cumulative Users", "Cumulative %"])

    total_users = len(counts)

    # The bin index of the largest count fixes how many bins are needed.
    n_bins = int(max(counts) // step) + 1
    bin_labels = [f"{i * step}–{i * step + step - 1}" for i in range(n_bins)]
    bin_counts = [0] * n_bins

    for c in counts:
        bin_counts[int(c // step)] += 1

    # Running total of users up to and including each bin.
    cumulative_counts = []
    cumulative = 0
    for count in bin_counts:
        cumulative += count
        cumulative_counts.append(cumulative)

    cumulative_percent = [round(c / total_users * 100, 2) for c in cumulative_counts]

    df = pd.DataFrame({
        "Comments Range": bin_labels,
        "Users in Bin": bin_counts,
        "Cumulative Users": cumulative_counts,
        "Cumulative %": cumulative_percent
    })

    return df

def plot_binned_cumulative_table(df, title="Cumulative Percentage of Users by Comment"):
    """
    Plots the cumulative user percentage of one binned table as a line chart.

    Args:
        df (pd.DataFrame): Table with 'Comments Range' and 'Cumulative %' columns,
            as produced by binned_cumulative_table().
        title (str): Title of the plot

    Returns:
        None. Shows the figure, or prints a message when the table is empty.
    """
    # An empty table has no first row for the y-axis lower bound below.
    if df.empty:
        print("No data to plot.")
        return

    x = df["Comments Range"]
    y = df["Cumulative %"]

    plt.figure(figsize=(10, 5))
    plt.plot(x, y, marker='o', color='#1f77b4')
    plt.title(title)
    plt.xlabel("Comment Count Range")
    plt.ylabel("Cumulative % of Users")
    plt.xticks(rotation=45)
    plt.grid(True)

    # Start the y-axis slightly below the first point; leave headroom above 100%.
    y_start = max(0, y.iloc[0] - 5)
    y_max = 101
    plt.ylim(y_start, y_max)

    for xi, yi in zip(x, y):
        # Points at exactly 100% are left unlabeled.
        if yi == 100.0:
            continue
        plt.text(xi, yi - 3, f"{yi:.1f}%", ha='center', va='top', fontsize=9)

    plt.tight_layout()
    plt.show()

def plot_multiple_binned_cumulative_tables(
    tables: dict,
    title="Cumulative User Distribution by Comment Bins",
    y_label="Cumulative % of Users"):
    """
    Plots cumulative percentage curves for multiple platforms in one diagram,
    with staggered percentage annotations to avoid overlap.

    Args:
        tables (dict): {label: DataFrame}, each must contain 'Comments Range' and 'Cumulative %'
        title (str): Plot title
        y_label (str): Y-axis label

    Returns:
        None. Shows the figure, or prints a message when no table has any rows.
    """
    # Without any data points the y-limits below cannot be computed (min/max of nothing).
    if not any(len(df) for df in tables.values()):
        print("No data to plot.")
        return

    plt.figure(figsize=(10, 6))
    all_y_values = []

    for offset_i, (label, df) in enumerate(tables.items()):
        x = df["Comments Range"]
        y = df["Cumulative %"]
        all_y_values.extend(y)

        plt.plot(x, y, marker='o', label=label)

        # Each curve places its labels a bit lower than the previous curve's.
        offset = 1 + offset_i * 2

        for i, (xi, yi) in enumerate(zip(x, y)):
            # Label every second point only, to reduce clutter.
            if i % 2 == 0:
                plt.text(xi, yi - offset, f"{yi:.2f}%", ha='center', va='top', fontsize=8)

    y_min = max(0, min(all_y_values) - 10)
    y_max = min(105, max(all_y_values) + 5)
    plt.ylim(y_min, y_max)

    plt.title(title)
    plt.xlabel("Comment Count Range")
    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend(title="Platform")
    plt.tight_layout()
    plt.show()

def scale_like_counts_to_range(df, column="like_count", target_min=1, target_max=10):
    """
    Linearly rescales a numeric column to the range [target_min, target_max].

    The smallest value maps to target_min and the largest to target_max.
    (Shifting the data before min-max scaling cancels out algebraically,
    so no separate shift for values below 1 is needed.)

    Args:
        df (pd.DataFrame): Table containing ``column``
        column (str): Name of the column to rescale
        target_min (number): Lower bound of the output range
        target_max (number): Upper bound of the output range

    Returns:
        pd.Series: Rescaled values, aligned with ``df``'s index. When all
        values are equal there is no spread to scale, and every non-missing
        value maps to target_min.
    """
    values = df[column]
    x_min = values.min()
    span = values.max() - x_min

    if span == 0:
        # Constant column: (x - min) / 0 would turn every value into NaN.
        return values * 0.0 + target_min

    normalized = (values - x_min) / span
    return target_min + normalized * (target_max - target_min)


##### General

Normalize the scaled likes for Reddit and YouTube separately

In [96]:
# Give every OpenGov row the constant like_scaled value 1
# (presumably because OpenGov has no like counts — TODO confirm).
final_table.loc[final_table['platform'] == 'opengov', 'like_scaled'] = 1

In [98]:
def _min_max_within_platform(x):
    """Min-max normalize one platform's values to [0, 1]; a constant group maps to 0.0."""
    low = x.min()
    span = x.max() - low
    if span == 0:
        # All values equal (OpenGov's like_scaled is fixed at 1): 0/0 would give NaN.
        return x * 0.0
    return (x - low) / span


# Normalize like_scaled separately inside each platform.
final_table['like_scaled_norm'] = (
    final_table
      .groupby('platform')['like_scaled']
      .transform(_min_max_within_platform)
)

In [None]:
search_long = 'από τη στιγμή που τα ομοφυλόφιλα ζευγάρια δεν μπορούν να αποκτήσουν παιδιά δεν δικαιούνται να έχουν το ότι υπάρχουν γονείς ακατάλληλοι δε σημαίνει ότι πρέπει να νομοθετήσουμε να έχουν παιδιά οι λοατκι όπως το ότι υπάρχουν αρτιμελείς άνθρωποι απαράδεκτοι δεν σημαίνει ότι πρέπει να επιδιώκουμε μια κοινωνία ατόμων χωρίς χέρια ή πόδια είναι άλλο να κατανοούμε και να αγκαλιάζουμε τα πάθη τις ιδιαιτερότητες και τις αδυναμίες του καθενός κι άλλο να στρεβλώσουμε τη φύση και τις φυσιολογικές λειτουργίες δεν υπάρχουν καν επαρκείς μελέτες που να συνιστούν ότι είναι ok ένα παιδί να μεγαλώνει χωρίς το ένα πρότυπο γονέα και με το άλλο πρότυπο εις διπλούν τι είναι τα παιδιά πειραματόζωα κατοικίδια τρόπαια χρειάζονται πατέρα και μητέρα δεν μπορεί το εγωιστικό καπρίτσιο των λοατκι να είναι πάνω από τα δικαιώματα των παιδιών'
search_medium = 'διαφωνώ κάθετα γάμος είναι η συζυγία άνδρα και γυναίκα δεν μπορούμε με έναν νόμο να αλλάξουμε τη φύση τη βιολογία των ανθρώπων οι κοινωνικές συνέπειες αν ψηφιστεί αυτό το νομοσχέδιο θα είναι ολέθριες ας σεβαστούμε τουλάχιστον τη φύση αν δε θέλουμε να σεβαστούμε τις πατροπαράδοτες αξίες μας'
search_short = 'πιστεύω ότι γάμος μπορεί να γίνει μόνο μεταξύ ενός άνδρα και μιας γυναίκας ο γάμος μεταξύ ομοφυλόφιλων δεν φυσιολογικό'

# Rows whose text matches each reference comment exactly.
found_long = final_table.loc[final_table["text"].eq(search_long)]
found_medium = final_table.loc[final_table["text"].eq(search_medium)]
found_short = final_table.loc[final_table["text"].eq(search_short)]

In [None]:
# Print five randomly sampled comments from each length bin, shortest bin first.
for length_bin, prefix in (
    ("short", "Short comment: "),
    ("medium", "Medium comment: "),
    ("long", "Long comment: "),
):
    in_bin = final_table[final_table["text_length_bin"] == length_bin]
    for comment in in_bin.sample(n=5)["text"].tolist():
        print(prefix, comment)

In [None]:
# Summary table: comment counts per platform and text-length bin, split by period,
# with a "Total" row per platform, a grand-total row and a row-wise "Total" column.
period_cols = ['pre', 'during', 'post']
bin_rank = {'short': 0, 'medium': 1, 'long': 2, 'Total': 3}

cross = (
    pd.crosstab(
        index=[final_table['platform'], final_table['text_length_bin']],
        columns=final_table['period']
    )
    # Guarantee all three period columns exist, in chronological order.
    .reindex(columns=period_cols, fill_value=0)
    .reset_index()
)

platform_totals = cross.groupby('platform', as_index=False)[period_cols].sum()
platform_totals['text_length_bin'] = 'Total'

combined = pd.concat([cross, platform_totals], ignore_index=True)

# Order rows short -> medium -> long -> Total inside every platform.
combined['sort_order'] = combined['text_length_bin'].map(bin_rank)
combined = combined.sort_values(['platform', 'sort_order']).drop(columns='sort_order')

# Sum the per-platform totals only: `combined` holds both the bin rows and the
# platform "Total" rows, so summing it would count every comment twice.
grand_total = pd.DataFrame({
    'platform': ['Total'],
    'text_length_bin': [''],
    'pre': [platform_totals['pre'].sum()],
    'during': [platform_totals['during'].sum()],
    'post': [platform_totals['post'].sum()]
})

final = pd.concat([combined, grand_total], ignore_index=True)
final = final[['platform', 'text_length_bin'] + period_cols].copy()

final['Total'] = final[period_cols].sum(axis=1)
final

In [None]:
# Same cross-tabulation, this time with pandas' built-in margins labelled "Total".
row_keys = [final_table['platform'], final_table['text_length_bin']]
cross = pd.crosstab(
    index=row_keys,
    columns=final_table['period'],
    margins=True,
    margins_name="Total",
)
cross = cross.reset_index()
cross

In [None]:
# Emit the summary table `final` as LaTeX source, without the row index.
print(final.to_latex(index=False))

##### Scaling Visualization

In [None]:
# Distribution of the scaled like values for YouTube rows only.
youtube_like_scaled = final_table.loc[final_table["platform"] == "youtube", "like_scaled"]
youtube_like_scaled.hist(bins=50, figsize=(8, 5))

plt.xlabel("Value")
plt.ylabel("Frequency")
plt.title("Histogram of scaled likes")
plt.grid(True)
plt.tight_layout()
plt.show()

Word Cloud

In [None]:
# Instance of the project's data_cleaning helper (imported from utils.text_analysis_functions).
cleaning_object = data_cleaning()

In [106]:
# Run every comment in the combined corpus through the Greek stopword remover.
clean_comment_list = [
    cleaning_object.remove_greek_stopwords(text=raw_text) for raw_text in all_comments
]
len(clean_comment_list)

23874

In [None]:
# Whitespace-tokenize every cleaned comment and count how often each token occurs.
all_words = [token for text in clean_comment_list for token in text.split()]

word_frequency = dict(Counter(all_words))

In [None]:
# (word, count) pairs, most frequent first.
Counter(word_frequency).most_common()

In [None]:
# Word cloud of the (at most) 100 most frequent words, sized by frequency.
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    colormap='inferno',
    max_words=100,
)
wc = WordCloud(**cloud_options)
wc.generate_from_frequencies(word_frequency)

plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # an image needs no axes
plt.tight_layout()
plt.show()

##### Userwise

In [110]:
# {author_id: number of comments} per platform.
yt_users_dict, reddit_users_dict, ogov_users_dict = (
    dict(Counter(author_ids))
    for author_ids in (all_users_yt, all_users_reddit, all_users_ogov)
)

In [111]:
# Binned (10 comments per bin) cumulative user tables, one per platform.
yt_table, reddit_table, ogov_table = (
    binned_cumulative_table(users_dict, step=10)
    for users_dict in (yt_users_dict, reddit_users_dict, ogov_users_dict)
)

In [None]:
# LaTeX export of the three tables: YouTube, Reddit, OpenGov.
for user_table in (yt_table, reddit_table, ogov_table):
    print(user_table.to_latex(index=False))

In [None]:
# One cumulative-percentage chart per platform.
for platform_name, user_table in (
    ("YouTube", yt_table),
    ("Reddit", reddit_table),
    ("OpenGov", ogov_table),
):
    plot_binned_cumulative_table(
        user_table,
        title=f"{platform_name} - Cumulative Percentage of Users by Comment",
    )

In [None]:
# All three platforms' cumulative curves in a single chart.
tables_by_platform = dict(YouTube=yt_table, Reddit=reddit_table, OpenGov=ogov_table)
plot_multiple_binned_cumulative_tables(tables_by_platform)

In [115]:
len(reddit_users_dict)  # number of distinct Reddit author ids (821 in the recorded run; the older "3127" note was stale)

821

In [None]:
# Most active users per platform; Reddit shows the top 11, the other two the top 10.
for users_dict, chart_title, limit in (
    (reddit_users_dict, "Reddit user-comment distribution", 11),
    (yt_users_dict, "YouTube user-comment distribution", 10),
    (ogov_users_dict, "OpenGov user-comment distribution", 10),
):
    plot_user_comment_counts(users_dict, title=chart_title, top_n=limit)

##### Timewise

In [None]:
def plot_datetime_frequencies(data_dict, title="Year-Month Frequency Plot", color_code='C0'):
    """
    Draws a line chart of counts over time from 'YYYY-MM' keyed data.

    Args:
        data_dict (dict): {year_month_str: count}, keys formatted as 'YYYY-MM'
        title (str): Title for the plot
        color_code (str): Matplotlib color used for the line and markers
    """
    # Parse the keys into datetimes and order the points chronologically.
    points = sorted(
        ((datetime.strptime(key, "%Y-%m"), value) for key, value in data_dict.items()),
        key=lambda point: point[0],
    )
    months = [month for month, _ in points]
    frequencies = [frequency for _, frequency in points]

    plt.figure(figsize=(10, 5))
    plt.plot(months, frequencies, marker='o', linestyle='-', color=color_code)
    plt.xlabel("Year–Month")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def greek_dates_to_year_month(dates):
    """
    Converts a list of Greek date strings to ['YYYY-MM', ...]

    Each string is split on whitespace into ``<day> <month name> <rest>``;
    the month name (genitive form, e.g. "Φεβρουαρίου") is looked up in a fixed
    table and the year is the part of ``<rest>`` before the first comma.

    Entries that cannot be parsed are skipped with a printed message, so the
    result may be shorter than the input.

    Args:
        dates (iterable): Greek date strings

    Returns:
        list[str]: One 'YYYY-MM' string per successfully parsed date, in input order.
    """
    greek_months = {
        "Ιανουαρίου": "01",
        "Φεβρουαρίου": "02",
        "Μαρτίου": "03",
        "Απριλίου": "04",
        "Μαΐου": "05",
        "Ιουνίου": "06",
        "Ιουλίου": "07",
        "Αυγούστου": "08",
        "Σεπτεμβρίου": "09",
        "Οκτωβρίου": "10",
        "Νοεμβρίου": "11",
        "Δεκεμβρίου": "12"
    }

    result = []
    for date_str in dates:
        try:
            # split(None, ...) tolerates repeated spaces between the parts.
            _day, month_name, rest = date_str.strip().split(None, 2)
        except (AttributeError, ValueError) as e:
            # AttributeError: not a string; ValueError: fewer than three parts.
            print(f"Skipping invalid date: {date_str} → {e}")
            continue

        year = rest.split(",")[0].strip()
        # A comma directly after the month name should not defeat the lookup.
        month = greek_months.get(month_name.rstrip(","))
        if not (year and month):
            # Report the drop instead of losing the entry silently.
            print(f"Skipping invalid date: {date_str} → unrecognized month or missing year")
            continue

        result.append(f"{year}-{month}")
    return result

def build_quarterly_frequency_table(month_count_dict):
    """
    Aggregates monthly counts into calendar quarters.

    Args:
        month_count_dict (dict): {month string parseable by pd.to_datetime: count}

    Returns:
        pd.DataFrame with columns "Quarter" (e.g. '2024Q1'), "Count" (sum over
        the quarter) and "Percentage" (share of the overall total, rounded to
        two decimals), ordered by quarter.
    """
    monthly = pd.DataFrame(list(month_count_dict.items()), columns=["Month", "Count"])

    parsed_months = pd.to_datetime(monthly["Month"])
    monthly["Month"] = parsed_months
    monthly["Quarter"] = parsed_months.dt.to_period("Q").astype(str)

    per_quarter = monthly.groupby("Quarter", as_index=False)["Count"].sum()
    grand_total = per_quarter["Count"].sum()
    share = per_quarter["Count"] / grand_total * 100
    per_quarter["Percentage"] = share.round(2)

    return per_quarter.sort_values("Quarter")


In [118]:
# Per platform: every comment's publication timestamp, plus (for Reddit and YouTube)
# a parallel list of single-entry {timestamp: like_count} dicts.
reddit_comments_flat = [
    comment for element in reddit_clean for comment in element["comments"]
]
all_time_reddit = [comment["published_at"] for comment in reddit_comments_flat]
all_time_reddit_dynamic = [
    {comment["published_at"]: comment["like_count"]} for comment in reddit_comments_flat
]

yt_comments_flat = [
    comment for element in youtube_clean for comment in element["comments"]
]
all_time_yt = [comment["published_at"] for comment in yt_comments_flat]
all_time_yt_dynamic = [
    {comment["published_at"]: comment["like_count"]} for comment in yt_comments_flat
]

# OpenGov items store their date on the item itself, under "date_published".
all_time_ogov = [element["date_published"] for element in ogov_clean]

In [119]:
def _to_year_month(iso_timestamp):
    """Format an ISO timestamp string as 'YYYY-MM'."""
    return datetime.fromisoformat(iso_timestamp).strftime("%Y-%m")


# Reduce every timestamp to its year-month; OpenGov dates are Greek text and need their own parser.
all_time_reddit_formated = [_to_year_month(stamp) for stamp in all_time_reddit]
all_time_yt_formated = [_to_year_month(stamp) for stamp in all_time_yt]
all_time_ogov_formated = greek_dates_to_year_month(all_time_ogov)

# Same reduction for the {timestamp: like_count} dicts -> {year-month: like_count}.
all_time_reddit_dynamic_formatted = [
    {_to_year_month(stamp): likes}
    for entry in all_time_reddit_dynamic
    for stamp, likes in entry.items()
]

all_time_yt_dynamic_formated = [
    {_to_year_month(stamp): likes}
    for entry in all_time_yt_dynamic
    for stamp, likes in entry.items()
]

In [120]:
# Sum the like counts per year-month, first for YouTube and then for Reddit.
all_time_yt_dynamic_formated_d = defaultdict(int)
yt_month_like_pairs = (
    pair for entry in all_time_yt_dynamic_formated for pair in entry.items()
)
for month, likes in yt_month_like_pairs:
    all_time_yt_dynamic_formated_d[month] += likes
all_time_yt_dynamic_formated_dict = dict(all_time_yt_dynamic_formated_d)

all_time_reddit_dynamic_formatted_d = defaultdict(int)
reddit_month_like_pairs = (
    pair for entry in all_time_reddit_dynamic_formatted for pair in entry.items()
)
for month, likes in reddit_month_like_pairs:
    all_time_reddit_dynamic_formatted_d[month] += likes
all_time_reddit_dynamic_formatted_dict = dict(all_time_reddit_dynamic_formatted_d)

In [121]:
# {year-month: number of comments} per platform.
yt_time_dict, reddit_time_dict, ogov_time_dict = (
    dict(Counter(year_months))
    for year_months in (all_time_yt_formated, all_time_reddit_formated, all_time_ogov_formated)
)

In [None]:
# Quarterly comment counts for YouTube and Reddit, exported as LaTeX.
time_quarter_table_yt, time_quarter_table_reddit = (
    build_quarterly_frequency_table(monthly_counts)
    for monthly_counts in (yt_time_dict, reddit_time_dict)
)

for quarter_table in (time_quarter_table_yt, time_quarter_table_reddit):
    print(quarter_table.to_latex(index=False))

In [None]:
# Monthly comment volume: YouTube in C0 (the function's default colour), Reddit in C1.
for monthly_counts, chart_title, line_color in (
    (yt_time_dict, "YouTube Comment Frequency", 'C0'),
    (reddit_time_dict, "Reddit Comment Frequency", 'C1'),
):
    plot_datetime_frequencies(monthly_counts, title=chart_title, color_code=line_color)

In [None]:
def plot_platform_frequencies_clean(
    df,
    title="Monthly Comment Frequency by Platform",
    color_map=None,
    tick_start="2014-01-01",
    tick_end="2025-12-01",
):
    """
    Plots monthly frequency line plot per platform with year-only x-axis ticks and equal spacing.

    Args:
        df (DataFrame): Must have 'platform' and 'date_mini' ('YYYY-MM') columns.
        title (str): Plot title.
        color_map (dict): Optional dict of platform -> color string (e.g., 'blue', 'orange').
            Platforms missing from the dict use matplotlib's default color.
        tick_start (str): First date of the range from which the yearly x ticks are generated.
        tick_end (str): Last date of that range. The defaults reproduce the
            previously hard-coded 2014–2025 ticks.
    """
    # Work on a copy so the caller's frame keeps its original string column.
    df = df.copy()
    df["date_mini"] = pd.to_datetime(df["date_mini"], format="%Y-%m")

    # Rows = months, columns = platforms, values = number of comments (0 where none).
    counts = df.groupby(["platform", "date_mini"]).size().reset_index(name="count")
    pivot = counts.pivot(index="date_mini", columns="platform", values="count").fillna(0)

    plt.figure(figsize=(12, 6))

    for platform in pivot.columns:
        plt.plot(
            pivot.index,
            pivot[platform],
            marker='o',
            label=platform,
            color=color_map.get(platform, None) if color_map else None
        )

    # One tick per year start ("YS") inside the requested range.
    years = pd.date_range(start=tick_start, end=tick_end, freq="YS")

    plt.xticks(
        ticks=years,
        labels=[d.strftime("%Y") for d in years],
        rotation=0
    )

    plt.title(title)
    plt.xlabel("Year")
    plt.ylabel("Comment Count")
    plt.grid(True)
    plt.legend(title="Platform")
    plt.tight_layout()
    plt.show()

In [None]:
# Fixed colour per platform; OpenGov rows are excluded from this plot.
color_map = dict(reddit="C1", youtube="C0", opengov="C2")
without_opengov = final_table[final_table["platform"] != "opengov"]
plot_platform_frequencies_clean(without_opengov, color_map=color_map)

##### Commentwise

In [126]:
# Separate copy of final_table for the comment-level analysis in the following cells.
commentwise_df = final_table.copy()

In [None]:
# Per platform: distribution of like_scaled_norm over (up to) 20 intervals,
# as counts and as a percentage of that platform's non-missing values.
records = []

for plat, grp in commentwise_df.groupby('platform'):
    arr = grp['like_scaled_norm'].dropna()
    total = len(arr)
    if total == 0:
        # Nothing to bin for this platform.
        continue

    if arr.nunique() == 1:
        # A single distinct value cannot be split into 20 bins: use one closed interval.
        interval = pd.Interval(arr.min(), arr.max(), closed='both')
        bins = pd.IntervalIndex([interval])
        binned = pd.cut(arr, bins=bins)
    else:
        try:
            binned = pd.cut(arr, bins=20, include_lowest=True)
        except ValueError:
            # Fall back to quantile bins, merging duplicate edges.
            binned = pd.qcut(arr, q=20, duplicates='drop')

    freq = (
        binned
          .value_counts(sort=False)
          # Name the interval column explicitly: the column reset_index() would
          # otherwise create is 'index' or 'like_scaled_norm' depending on the
          # pandas version.
          .rename_axis('like_scaled_norm_interval')
          .reset_index(name='count')
    )
    freq['percentage'] = (100 * freq['count'] / total).round(2)
    freq['platform']   = plat

    records.append(freq)

if records:
    freq_table = pd.concat(records, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty table with the expected columns.
    freq_table = pd.DataFrame(
        columns=['like_scaled_norm_interval', 'count', 'percentage', 'platform']
    )

print(freq_table.to_latex(index=False))

In [None]:
# Distribution of the per-platform normalized likes over all rows.
normalized_likes = commentwise_df["like_scaled_norm"]
normalized_likes.hist(bins=100, figsize=(10, 3))
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.title("Histogram of scaled likes")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Share of each text-length bin within every platform, drawn as grouped bars.
group_counts = final_table.groupby(['platform', 'text_length_bin']).size().reset_index(name='count')
total_per_platform = group_counts.groupby('platform')['count'].transform('sum')
group_counts['percentage'] = 100 * group_counts['count'] / total_per_platform

pivot_df = group_counts.pivot(index='text_length_bin', columns='platform', values='percentage').fillna(0)

platform_order = list(pivot_df.columns)

# Colours keyed by platform name rather than by column position, so a missing
# platform neither raises IndexError nor shifts the other platforms' colours.
color_map = {
    'opengov': 'C2',
    'reddit': 'C1',
    'youtube': 'C0'
}
# Platforms without a fixed colour fall back to later entries of the colour cycle.
bar_colors = [
    color_map.get(platform, f"C{3 + position}")
    for position, platform in enumerate(platform_order)
]

ax = pivot_df.plot(
    kind='bar',
    figsize=(10, 6),
    color=bar_colors
)

# Write the percentage above every non-empty bar.
for container in ax.containers:
    for bar in container:
        height = bar.get_height()
        if height > 0:
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                height + 0.5,
                f"{height:.1f}%",
                ha='center',
                va='bottom',
                fontsize=9
            )

plt.title("Percentage of Comment Length per Platform")
plt.xlabel("Text Length Bin")
plt.ylabel("Percentage (%)")
plt.xticks(rotation=0)
plt.legend(title="Platform")
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Share of each text-length bin within every period, drawn as grouped bars.
group_counts = final_table.groupby(['period', 'text_length_bin']).size().reset_index(name='count')
total_per_period = group_counts.groupby('period')['count'].transform('sum')
group_counts['percentage'] = 100 * group_counts['count'] / total_per_period

period_order = ["pre", "during", "post"]
color_map = {
    period_order[0]: 'C31',
    period_order[1]: 'C3',
    period_order[2]: 'C28'
}

# Put the period columns in chronological order (pivot alone sorts them
# alphabetically: during, post, pre); a period without data becomes a zero column.
pivot_df = (
    group_counts
      .pivot(index='text_length_bin', columns='period', values='percentage')
      .reindex(columns=period_order)
      .fillna(0)
)

ax = pivot_df.plot(
    kind='bar',
    figsize=(10, 6),
    color=[color_map[col] for col in pivot_df.columns]
)

# Write the percentage above every non-empty bar.
for container in ax.containers:
    for bar in container:
        height = bar.get_height()
        if height > 0:
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                height + 0.5,
                f"{height:.1f}%",
                ha='center',
                va='bottom',
                fontsize=9
            )

plt.title("Percentage of Comment Length per Period")
plt.xlabel("Text Length Bin")
plt.ylabel("Percentage (%)")
plt.xticks(rotation=0)
plt.legend(title="Period")
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
def split_every_n_words(s: str, n: int = 10) -> list[str]:
    """Split *s* on whitespace and regroup the words into chunks of at most *n* words."""
    tokens = s.split()
    chunks = []
    for start in range(0, len(tokens), n):
        chunks.append(" ".join(tokens[start:start + n]))
    return chunks

# Short comments whose text contains "woke" (rows with missing text never match).
mentions_woke = final_table["text"].str.contains("woke", na=False)
is_short = final_table["text_length_bin"] == "short"
matches = final_table[mentions_woke & is_short]
txt = matches["text"].tolist()

print(matches)

Comments weighted by scaled likes

In [None]:
# For each non-OpenGov platform: among the 100 highest-rated comments of every period,
# what percentage falls into each text-length bin.
period_order = ["pre", "during", "post"]
color_map = dict(zip(period_order, ['C31', 'C3', 'C28']))

filtered = final_table.loc[final_table['platform'] != 'opengov']

# Rank once by normalized likes, then keep the first 100 rows of every (platform, period) pair.
ranked = filtered.sort_values('like_scaled_norm', ascending=False)
top_comments = (
    ranked
      .groupby(['platform', 'period'], group_keys=False)
      .head(100)
      .reset_index(drop=True)
)

text_bins = sorted(filtered['text_length_bin'].unique())
periods = list(period_order)

platforms = top_comments['platform'].unique()
n_platforms = len(platforms)
fig, axes = plt.subplots(
    1,
    n_platforms,
    figsize=(5 * n_platforms, 5),
    sharey=True,
)

# With a single column plt.subplots returns one Axes, not a sequence; wrap it for zip().
if n_platforms == 1:
    axes = [axes]

for i, (ax, plat) in enumerate(zip(axes, platforms)):
    is_last = i == n_platforms - 1

    sub = top_comments.loc[top_comments['platform'] == plat]
    bin_period_counts = (
        sub.groupby(['text_length_bin', 'period']).size().unstack(fill_value=0)
    )
    # Same rows and columns in every panel, zero-filled where a combination is absent.
    pivot = bin_period_counts.reindex(index=text_bins, columns=periods, fill_value=0)
    # Column-wise percentages: each period's bars add up to 100.
    pivot_pct = pivot.div(pivot.sum(axis=0), axis=1) * 100

    colors = [color_map[period] for period in pivot_pct.columns]

    pivot_pct.plot(
        kind='bar',
        ax=ax,
        width=0.8,
        color=colors,
        legend=is_last,
    )

    ax.set_title(f"{plat.capitalize()} Top 100 Rated Comments Distribution")
    ax.set_xlabel("Text Length")
    ax.set_ylabel("Percent of Top 100 Comments (%)")
    ax.tick_params(axis='x', rotation=0)

    # Only the last panel carries the legend, placed outside its right edge.
    if is_last:
        ax.legend(title="Period", loc='upper left', bbox_to_anchor=(1.02, 1))

    for container in ax.containers:
        for bar in container:
            h = bar.get_height()
            if h > 0:
                ax.text(
                    bar.get_x() + bar.get_width() / 2,
                    h + 1,
                    f"{h:.1f}%",
                    ha='center',
                    va='bottom',
                    fontsize=7,
                )

plt.tight_layout(rect=[0, 0, 1, 0.8])
plt.show()