# Category Percentage Share Over Time

This notebook reads the CSVs in `output_pass4b/` and plots each category's percentage share of mentions by year. It counts mentions per year and category, normalizes each year to 100%, keeps the `TOP_K` most-mentioned categories (folding the remainder into `Other`), and visualizes the composition as a clean, publication-quality 100% stacked area chart.

In [None]:
# Imports and styling
from pathlib import Path
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, PercentFormatter
from matplotlib import cm

%config InlineBackend.figure_format = 'retina'  # crisp plots

def set_plot_style():
    """Apply the notebook's plotting style to matplotlib's global rcParams.

    Three layers are applied, from most generic to most specific:

    1. The first matplotlib style sheet from a short fallback list that the
       installed matplotlib accepts.
    2. seaborn's ``whitegrid`` / ``notebook`` theme, if seaborn can be imported
       and used (seaborn is optional).
    3. The notebook's explicit rcParams overrides.

    The overrides are applied last on purpose: seaborn's theme sets font
    sizes, spine visibility and grid keys itself, so applying it after the
    overrides would discard the title size and hidden top/right spines
    configured here.

    Returns:
        None. The function only mutates matplotlib's global state, so it is
        safe to call repeatedly.
    """
    # Style-sheet names differ between matplotlib releases; take the first one
    # that loads and ignore the ones this installation does not know.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid', 'ggplot'):
        try:
            plt.style.use(style_name)
            break
        except Exception:
            continue

    # Optional seaborn theme. Best effort: a missing or incompatible seaborn
    # must not prevent the notebook from plotting.
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', context='notebook')
    except Exception:
        pass

    # Explicit overrides go last so they win over both layers above.
    plt.rcParams.update({
        'figure.figsize': (12, 7),
        'figure.dpi': 120,
        'savefig.dpi': 150,
        'axes.titlesize': 16,
        'axes.labelsize': 12,
        'legend.fontsize': 10,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.grid': True,
        'grid.alpha': 0.3,
    })

set_plot_style()


In [None]:
# Parameters
# Folder that is searched for the input CSV files.
DATA_DIR = Path('output_pass4b')

# Number of top categories to display; the remainder becomes 'Other'.
TOP_K = 10

# Accepted spellings of the category column, in priority order: the first
# name found among a file's columns is the one that gets used.
CATEGORY_COL_CANDIDATES = [
    'category',
    'universal_category',
    'universal categories',
    'universal_categories',
    'Category',
]


In [None]:
# Load and concatenate CSVs
# Every CSV in DATA_DIR is reduced to two columns, 'year' and 'category', and
# the per-file frames are stacked into `all_df` (one row per mention).
files = sorted(DATA_DIR.glob('*.csv'))
if not files:
    raise FileNotFoundError(f'No CSV files found in {DATA_DIR.resolve()}')

dfs = []
# Fallback for files without a 'year' column: the first run of four digits in
# the file name is taken as the year.
year_re = re.compile(r'(?P<year>\d{4})')

for f in files:
    df = pd.read_csv(f)

    # Identify category column: first candidate spelling present in this file.
    cat_col = next((c for c in CATEGORY_COL_CANDIDATES if c in df.columns), None)
    if cat_col is None:
        raise ValueError(f'No category-like column found in {f.name}. Columns: {list(df.columns)}')
    df = df.rename(columns={cat_col: 'category'})

    # Ensure year column exists (fallback to filename)
    if 'year' not in df.columns:
        m = year_re.search(f.name)
        if not m:
            raise ValueError(f'Cannot infer year from filename: {f.name}')
        df['year'] = int(m.group('year'))

    # Normalize on an explicit copy so the assignments below never write into
    # a view of the frame returned by read_csv.
    df = df[['year', 'category']].copy()
    df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')

    # Drop rows with a missing year OR a missing category *before* casting the
    # category to str: astype(str) would turn a missing value into the literal
    # text 'nan', which would then be counted and plotted as a real category.
    df = df.dropna(subset=['year', 'category']).copy()
    df['year'] = df['year'].astype(int)
    df['category'] = df['category'].astype(str).str.strip()
    df = df[df['category'] != '']  # whitespace-only labels carry no information

    dfs.append(df)

all_df = pd.concat(dfs, ignore_index=True)
all_df.head()


In [None]:
# Aggregate counts across all categories
# One row per (year, category) with the number of mentions.
agg = (
    all_df.groupby(['year', 'category']).size().reset_index(name='mentions')
)
totals = agg.groupby('category')['mentions'].sum().sort_values(ascending=False)

# 'Other' is reserved for the folded remainder. If the data already contains a
# category with that exact name it is kept out of the top-K selection, so its
# share ends up inside the remainder instead of being overwritten by it.
top_categories = (
    totals.drop(labels='Other', errors='ignore').head(TOP_K).index.tolist()
)

# Wide table: one row per year, one column per category, 0 where a category
# was not mentioned in that year.
pivot_all = (
    agg.pivot(index='year', columns='category', values='mentions')
       .fillna(0)
       .sort_index()
)

# Normalize to percentages per year. A zero row total is mapped to NaN first
# so the division cannot produce inf; such rows end up as all zeros.
row_sums = pivot_all.sum(axis=1)
pct = (pivot_all.div(row_sums.replace(0, np.nan), axis=0) * 100).fillna(0)

# Keep top categories and fold the rest into 'Other' for readability.
# The clip guards against tiny negative values from floating-point rounding.
pct_top = pct[top_categories].copy()
pct_top['Other'] = (100 - pct_top.sum(axis=1)).clip(lower=0)
pct_top


In [None]:
# Plot: 100% stacked area chart of category share over time
# Reads `pct_top` (rows = years, columns = categories, values = percent) and
# leaves the figure in `fig_pct` so a later cell can save it.
set_plot_style()
fig_pct, ax = plt.subplots()

labels = pct_top.columns.tolist()
n = len(labels)

# Take the first n swatches of matplotlib's tab20 by integer index (wrapping
# after the last one). This keeps the colours identical whether or not seaborn
# is installed and needs no import inside this cell.
colors = [cm.tab20(i % cm.tab20.N) for i in range(n)]

ax.stackplot(
    pct_top.index,
    *[pct_top[c].values for c in labels],
    labels=labels,
    colors=colors,
    alpha=0.9
)

ax.set_title('Category Share Over Time (Top Categories + Other)')
ax.set_xlabel('Year')
ax.set_ylabel('Share')
ax.set_ylim(0, 100)
ax.yaxis.set_major_formatter(PercentFormatter(xmax=100))
# One tick per year present in the data. Explicit ticks replace any tick
# locator set on the axis, so no separate integer locator is configured.
ax.set_xticks(pct_top.index.to_list())

# Legend outside for clarity
ax.legend(title='Category', bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0., frameon=True)

plt.tight_layout()
plt.show()


> Optional: Save the percentage plot to a file by running the next cell.

In [None]:
# Optional: save the percentage stacked area plot
# Best effort: any failure (including `fig_pct` not existing yet because the
# plot cell was never run) is reported as a message instead of a traceback.
try:
    fig_pct.savefig('category_share_over_time.png', bbox_inches='tight')
except Exception as err:
    print('Figure not saved (was the plot cell run?):', err)
else:
    print('Saved to category_share_over_time.png')
