In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Folder holding the per-year labeled workbooks.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_DIR = Path(r"C:\Users\Admin\Downloads\Papers we care about\yt_climate_data\excel")

files = sorted(DATA_DIR.glob("climate_comments_*_all_clean_labeled.xlsx"))

print("Found", len(files), "labeled files:")
for f in files:
    print(" -", f.name)

# Fail early with an actionable message: pd.concat([]) would otherwise raise
# an opaque "No objects to concatenate" error further down.
if not files:
    raise FileNotFoundError(
        f"No files matching 'climate_comments_*_all_clean_labeled.xlsx' in {DATA_DIR}"
    )

all_dfs = []
for f in files:
    # The year is the first 4-digit token of the underscore-separated file stem.
    year = next(
        (int(token) for token in f.stem.split("_") if token.isdigit() and len(token) == 4),
        None,
    )
    if year is None:
        # Rows without a year are dropped by every later groupby("year"), so say so now.
        print(f"[Warning] No 4-digit year token in {f.name!r}; 'year' will be missing for its rows.")

    df_year = pd.read_excel(f)
    df_year["year"] = year
    all_dfs.append(df_year)

full_df = pd.concat(all_dfs, ignore_index=True)
print("Combined shape:", full_df.shape)

print(full_df.columns.tolist())
full_df.head()


In [None]:

# Normalise label text (case and surrounding whitespace) in place.
full_df["sentiment_label"] = full_df["sentiment_label"].str.lower().str.strip()

# Keep only rows that have both a comment body and a region assignment.
full_df = full_df.dropna(subset=["comment_text", "region_group"]).copy()

# Cell output: how many rows remain per region.
full_df["region_group"].value_counts(dropna=False)


In [None]:
# Minimum classifier confidence for a row to enter the high-confidence subset.
CONF_THRESHOLD = 0.75

_hc_mask = full_df["sentiment_confidence"] >= CONF_THRESHOLD
hc_df = full_df.loc[_hc_mask].copy()

_n_all = len(full_df)
_n_hc = len(hc_df)
# max(..., 1) guards the division when the frame is empty.
_retention_pct = round(_n_hc / max(_n_all, 1) * 100, 2)

print("All rows:", _n_all)
print("High-confidence rows:", _n_hc)
print("Retention %:", _retention_pct, "%")


In [None]:
def sentiment_breakdown(df):
    """
    Count comments per (region_group, sentiment_label) and express each count
    as a percentage of its region's total.

    Parameters
    ----------
    df : DataFrame with string columns "region_group" and "sentiment_label".

    Returns
    -------
    DataFrame with columns
    ["region_group", "sentiment_label", "count", "pct_within_region"], sorted by
    region and then by sentiment in the order negative < neutral < positive.
    "sentiment_label" is an ordered categorical; labels outside those three
    become missing values in that column.
    """
    sentiment_order = ["negative", "neutral", "positive"]

    # Normalise BEFORE grouping: otherwise "Negative" and "negative " land in
    # separate groups and the region shows duplicate rows with split counts.
    labels = df["sentiment_label"].str.lower().str.strip()

    counts = (
        df.assign(sentiment_label=labels)
        .groupby(["region_group", "sentiment_label"])
        .size()
        .reset_index(name="count")
    )

    # Region totals broadcast back onto each row (no intermediate merge needed).
    region_total = counts.groupby("region_group")["count"].transform("sum")
    counts["pct_within_region"] = counts["count"] / region_total * 100.0

    # Ordered categorical so the sort follows sentiment polarity, not the alphabet.
    counts["sentiment_label"] = pd.Categorical(
        counts["sentiment_label"],
        categories=sentiment_order,
        ordered=True,
    )

    counts = counts.sort_values(["region_group", "sentiment_label"]).reset_index(drop=True)

    return counts[[
        "region_group",
        "sentiment_label",
        "count",
        "pct_within_region",
    ]]

sent_overall_full = sentiment_breakdown(full_df)
sent_overall_hc = sentiment_breakdown(hc_df)

# Show both tables under their headings, all rows first.
for _heading, _table in (
    ("=== Overall sentiment distribution (all rows) ===", sent_overall_full),
    ("=== Overall sentiment distribution (high-confidence only) ===", sent_overall_hc),
):
    print(_heading)
    display(_table)


In [None]:
def yearly_neg_trend(df):
    """
    Share of negative comments per (year, region_group).

    Returns a DataFrame with columns ["year", "region_group", "neg_rate",
    "neg_rate_pct"], ordered by region and then year. "neg_rate" is a
    fraction in [0, 1]; "neg_rate_pct" is the same value times 100.
    The input frame is not modified.
    """
    # 1 where the label is exactly "negative", 0 otherwise.
    flagged = df.assign(
        is_negative=(df["sentiment_label"] == "negative").astype(int)
    )

    # The mean of a 0/1 flag is the negative share of the group.
    neg_rate = flagged.groupby(["year", "region_group"])["is_negative"].mean()
    trend = neg_rate.rename("neg_rate").reset_index()

    trend["neg_rate_pct"] = trend["neg_rate"] * 100.0

    return trend.sort_values(["region_group", "year"])

neg_trend_full = yearly_neg_trend(full_df)
neg_trend_hc = yearly_neg_trend(hc_df)

# Preview the first 30 rows of each trend table.
for _heading, _trend in (
    ("=== Negativity over time (all rows) ===", neg_trend_full),
    ("=== Negativity over time (high-confidence) ===", neg_trend_hc),
):
    print(_heading)
    display(_trend.head(30))


In [None]:
import matplotlib.pyplot as plt

def compute_yearly_total(df):
    """
    Percentage of negative comments per year, pooled over every comment in
    `df` (i.e. not an average of per-region rates).

    Returns a DataFrame with columns ["year", "Total"]; the input is not modified.
    """
    flagged = df.assign(
        is_negative=(df["sentiment_label"] == "negative").astype(int)
    )
    # Mean of the 0/1 flag per year, scaled from a fraction to a percentage.
    yearly_share = flagged.groupby("year")["is_negative"].mean()
    return (yearly_share * 100.0).rename("Total").reset_index()

def to_pivot_with_total(trend_df, base_df):
    """
    Reshape a long (year, region_group, neg_rate_pct) table into one row per
    year with a column per region, and append a pooled "Total" column
    computed from `base_df` via compute_yearly_total().

    Only the "US" and "EU" region columns are kept (in that order, when
    present), so the result has columns ["year", <"US">, <"EU">, "Total"].
    """
    wide = trend_df.pivot(index="year", columns="region_group", values="neg_rate_pct")
    # Drop the "region_group" label from the column axis before flattening the index.
    wide = wide.rename_axis(None, axis=1).reset_index()

    wide = wide.merge(compute_yearly_total(base_df), on="year", how="left")

    region_cols = [name for name in ("US", "EU") if name in wide.columns]
    return wide[["year", *region_cols, "Total"]]

def plot_neg_trends(pivot, title="Negativity over time (% negative comments)"):
    """
    Line chart of % negative comments by year.

    One line is drawn for each of the columns "US", "EU" and "Total" that
    exists in `pivot`; `pivot["year"]` supplies the x values.
    """
    plt.figure(figsize=(10.5, 5.5))

    available = [name for name in ("US", "EU", "Total") if name in pivot.columns]
    for name in available:
        plt.plot(pivot["year"], pivot[name], marker="o", linewidth=2, label=name)

    plt.xlabel("Year")
    plt.ylabel("% Negative")
    plt.title(title)
    plt.grid(True, linestyle="--", alpha=0.35)
    plt.legend()
    plt.tight_layout()
    plt.show()




# Trend over the full dataset.
pivot_full = to_pivot_with_total(neg_trend_full, full_df)
plot_neg_trends(pivot_full, title="Negativity over time")

# Trend over the high-confidence subset. The percentage in the title is derived
# from CONF_THRESHOLD so the label cannot drift from the filter actually applied.
pivot_hc = to_pivot_with_total(neg_trend_hc, hc_df)
plot_neg_trends(
    pivot_hc,
    title=f"Negativity over time — High-confidence ({CONF_THRESHOLD:.0%})",
)



In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def yearly_counts(df):
    """Number of rows per year, as a DataFrame with columns ["year", "count"] sorted by year."""
    rows_per_year = df.groupby("year").size()
    table = rows_per_year.reset_index(name="count")
    return table.sort_values("year")

def yearly_counts_by_region(df):
    """
    Number of rows per year and region in wide form: a "year" column plus
    one count column per distinct region_group value, sorted by year.
    """
    long_counts = df.groupby(["year", "region_group"]).size().reset_index(name="count")
    wide_counts = long_counts.pivot(index="year", columns="region_group", values="count")
    return wide_counts.reset_index().sort_values("year")


# --- Total comments per year (bar chart) ---
counts_total = yearly_counts(full_df)

fig_total, ax_total = plt.subplots(figsize=(10.5, 4.8))
ax_total.bar(counts_total["year"], counts_total["count"])
ax_total.set_xlabel("Year")
ax_total.set_ylabel("Total comments")
ax_total.set_title("Comment activity over time — total per year")
fig_total.tight_layout()
plt.show()

# --- Comments per year split by region (line chart) ---
counts_region = yearly_counts_by_region(full_df)

fig_region, ax_region = plt.subplots(figsize=(10.5, 4.8))
for region_code in ("US", "EU"):
    # Skip a region that has no column in the pivot.
    if region_code not in counts_region.columns:
        continue
    ax_region.plot(
        counts_region["year"],
        counts_region[region_code],
        marker="o",
        linewidth=2,
        label=region_code,
    )
ax_region.set_xlabel("Year")
ax_region.set_ylabel("Comments per year")
ax_region.set_title("Comment activity over time — US vs EU")
ax_region.grid(True, linestyle="--", alpha=0.35)
ax_region.legend()
fig_region.tight_layout()
plt.show()

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from collections import Counter


# Channel names (compared after case-folding) whose region is set to "US"
# regardless of the region computed from their rows.
FORCE_US_CHANNELS = {"pbs newshour"}  

def _pick_channel_col(df):
    cols = {c.lower(): c for c in df.columns}
    for key in ["channel_name", "channel_title", "channel"]:
        if key in cols:
            return cols[key]
    raise KeyError("Expected a channel column: channel_name / channel_title / channel")

def _wilson_ci(neg, N, z=1.96):
    neg = pd.Series(neg, dtype=float)
    N = pd.Series(N, dtype=float)
    with np.errstate(divide="ignore", invalid="ignore"):
        phat = np.where(N > 0, neg / N, np.nan)
        denom = 1 + (z**2) / N
        centre = (phat + (z**2)/(2*N)) / denom
        hw = z * np.sqrt((phat*(1-phat)/N) + (z**2)/(4*N**2)) / denom
    low = np.clip(centre - hw, 0, 1); high = np.clip(centre + hw, 0, 1)
    return centre, low, high

def _canon_text(s: pd.Series) -> pd.Series:
    s = s.astype(str)
    try:
        s = s.str.normalize("NFKC")
    except Exception:
        pass
    s = (s.str.replace("\u200b", "", regex=False)   
           .str.replace("\ufeff", "", regex=False)  
           .str.replace(r"\s+", " ", regex=True)
           .str.strip())
    return s

def _canon_region(s: pd.Series) -> pd.Series:
    """
    Canonicalise region labels: clean the text with _canon_text(), upper-case
    it, fold known aliases into "US" / "EU", and turn placeholder strings
    ("", "NAN", "NONE", "NULL") into missing values.
    """
    aliases = {
        "USA": "US", "U.S.": "US", "UNITED STATES": "US", "US ": "US",
        "EUROPE": "EU", "EUROPEAN UNION": "EU", "E.U.": "EU", "EU ": "EU",
    }
    # _canon_text() casts to str, so missing inputs arrive here as text.
    missing_markers = {"": np.nan, "NAN": np.nan, "NONE": np.nan, "NULL": np.nan}

    upper = _canon_text(s).str.upper()
    return upper.replace(aliases).replace(missing_markers)

def _ensure_channel_stats_full():
    """
    Return the per-channel negativity table, building it from the global
    `full_df` unless a global `channel_stats_full` already exists.

    Returns
    -------
    (stats, status)
        stats  : DataFrame with one row per (channel_name, region_group) and
                 columns channel_name, region_group, neg, N, neg_share,
                 neg_low, neg_high.
        status : None when the existing global `channel_stats_full` was
                 reused, "built" when the table was computed here.

    Raises
    ------
    RuntimeError
        If neither `channel_stats_full` nor `full_df` is defined globally.
    KeyError
        Propagated from _pick_channel_col() when no channel column is found.

    Notes
    -----
    Prints diagnostic channel lists as a side effect when building.
    Because an existing global is returned as-is, re-running the cell does
    not recompute the table after `full_df` changes.
    """
    # Reuse a table that is already in the kernel namespace.
    if "channel_stats_full" in globals():
        return channel_stats_full, None
    if "full_df" not in globals():
        raise RuntimeError("I couldn't find `channel_stats_full` or `full_df` in memory.")

    CH = _pick_channel_col(full_df)
    # Work on a copy so `full_df` itself is left untouched.
    tmp = full_df.copy()

    # Canonicalise channel text and region labels.
    tmp[CH] = _canon_text(tmp[CH])
    tmp["region_group"] = _canon_region(tmp["region_group"])


    # Channels whose names differ only by case are merged; the display name
    # is the most frequent spelling within each lower-cased group.
    key = tmp[CH].str.lower()
    canonical_name = {}
    for k, g in tmp.groupby(key):
        canonical_name[k] = Counter(g[CH].tolist()).most_common(1)[0][0]
    tmp["channel_name"] = key.map(canonical_name)


    def _mode_region(series):
        # Most frequent non-missing region; on a tie "US" wins if it is among
        # the tied values, otherwise the first tied value is used.
        c = Counter(series.dropna())
        if not c:
            return np.nan
        mc = c.most_common()
        top = mc[0][1]
        tied = [r for r, n in mc if n == top]
        return "US" if "US" in tied else tied[0]

    # One region per channel, attached to every row as "channel_region".
    region_mode = (
        tmp.groupby("channel_name")["region_group"]
           .agg(_mode_region).rename("channel_region").reset_index()
    )
    tmp = tmp.merge(region_mode, on="channel_name", how="left")


    # Manual override for the channels listed in FORCE_US_CHANNELS.
    mask_force = tmp["channel_name"].str.casefold().isin(FORCE_US_CHANNELS)
    tmp.loc[mask_force, "channel_region"] = "US"


    # A row counts as negative when its label text starts with "neg" or when
    # the label parses as the number 0.
    lab = tmp["sentiment_label"].astype(str).str.lower().str.strip()
    num = pd.to_numeric(tmp["sentiment_label"], errors="coerce")
    tmp["is_negative"] = np.where(lab.str.startswith("neg") | (num == 0), 1, 0)


    # Per-channel negative count (neg) and row count (N); dropna=False keeps
    # channels whose region is missing.
    g = (tmp.groupby(["channel_name", "channel_region"], dropna=False)["is_negative"]
           .agg(neg="sum", N="count").reset_index()
           .rename(columns={"channel_region": "region_group"}))


    # The bar length is the raw share neg / N; the Wilson bounds provide the
    # error bars (the Wilson centre itself is not stored).
    centre, low, high = _wilson_ci(g["neg"], g["N"])
    g["neg_share"] = g["neg"] / g["N"]
    g["neg_low"] = low
    g["neg_high"] = high


    g = g.sort_values(["region_group", "channel_name"]).reset_index(drop=True)


    # Diagnostics: list the channels assigned to each region.
    us = g[g["region_group"] == "US"]["channel_name"].unique().tolist()
    eu = g[g["region_group"] == "EU"]["channel_name"].unique().tolist()
    print(f"[Diagnostic] US channels ({len(us)}): {sorted(us)}")
    print(f"[Diagnostic] EU channels ({len(eu)}): {sorted(eu)}")
    # This membership test is case-sensitive, unlike the case-folded lookup below.
    if "PBS NewsHour" not in us:

        present = tmp.loc[tmp["channel_name"].str.casefold() == "pbs newshour", ["channel_name","region_group","channel_region"]]
        print("\n[Diagnostic] Rows seen for PBS NewsHour after cleaning:")
        print(present.drop_duplicates())

    return g, "built"

# Build (or reuse) the per-channel table. `_built` is "built" after a fresh
# computation and None when an existing global table was reused.
channel_stats_full, _built = _ensure_channel_stats_full()

def plot_negativity_bars(stats_df, region, top_n=None, sort_by="neg_share", save=True):
    """
    Horizontal bar chart of per-channel negativity share for one region,
    with error bars taken from the "neg_low" / "neg_high" columns.

    Parameters
    ----------
    stats_df : DataFrame with columns region_group, channel_name, neg_share,
               neg_low, neg_high and N.
    region   : value of "region_group" to plot (e.g. "US" or "EU").
    top_n    : if given, keep only the first `top_n` channels after sorting.
    sort_by  : column used for the descending sort (default "neg_share").
    save     : when True, also write the figure to
               "ch5_533_negativity_bars_<region>.png" in the working directory.

    Returns
    -------
    None. Prints a notice and returns early when the region has no channels.
    """
    sub = stats_df[stats_df["region_group"] == region].copy()
    if sub.empty:
        print(f"[{region}] No channels found.")
        return

    sub = sub.sort_values(sort_by, ascending=False)
    if top_n is not None:
        sub = sub.head(top_n)
    # barh draws from the bottom up, so reverse to put the largest bar on top.
    sub = sub.iloc[::-1]

    y = np.arange(len(sub))
    x = sub["neg_share"].to_numpy(dtype=float)
    xerr = np.vstack([
        x - sub["neg_low"].to_numpy(dtype=float),
        sub["neg_high"].to_numpy(dtype=float) - x,
    ])
    # Error-bar lengths must be non-negative. For channels at exactly 0% or
    # 100%, floating-point rounding can put a bound a hair on the wrong side
    # of the share, which matplotlib rejects with a ValueError.
    xerr = np.clip(xerr, 0, None)

    # Figure height grows with the number of bars so labels stay legible.
    fig_h = 0.48 * len(sub) + 1.8
    plt.figure(figsize=(10.5, fig_h))
    plt.barh(y, x, xerr=xerr, capsize=3, alpha=0.85)
    plt.yticks(y, sub["channel_name"])
    plt.xlabel("Negativity share")
    plt.title(f"Channel negativity (%) — {region}")
    plt.xlim(0, 1)

    # Annotate each bar with its percentage and sample size.
    for i, (val, n) in enumerate(zip(x, sub["N"].values)):
        pct = f"{val*100:.1f}%"
        label = f"{pct}  (N={int(n):,})"
        # Keep the label off the axis spine for (near-)zero bars.
        xpos = max(val, 0.005)
        plt.text(xpos + 0.01, i, label, va="center")

    plt.tight_layout()
    if save:
        fname = f"ch5_533_negativity_bars_{region}.png"
        plt.savefig(fname, dpi=200, bbox_inches="tight")
        print(f"Saved: {fname}")
    plt.show()


# One chart per region, every channel included (no top-N cut-off).
for _region_code in ("EU", "US"):
    plot_negativity_bars(channel_stats_full, _region_code, top_n=None)
