# Category Mentions Over Time

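This notebook reads every CSV in `output_pass4b/` and plots how often each category is mentioned over time. It counts mentions per year and category (taking the year from a `year` column, or from a four-digit number in the filename when that column is absent), then visualizes the `TOP_K` most-mentioned categories as a clean, publication-quality line chart.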

In [None]:
# Imports and styling
from pathlib import Path
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import cm

%config InlineBackend.figure_format = 'retina'  # crisp plots

def set_plot_style():
    """Apply the notebook's plotting style to matplotlib's global rcParams.

    Layers are applied from most generic to most specific so that the
    notebook's own settings always win:

    1. The first available matplotlib style sheet from a short candidate list.
    2. The seaborn ``whitegrid`` theme, if seaborn can be imported.
    3. The explicit rcParams overrides below (figure size, font sizes,
       hidden top/right spines, light grid).

    The function only mutates global plotting state and returns ``None``;
    calling it repeatedly is safe.
    """
    # Style names differ between matplotlib releases, so try each in turn and
    # keep the first one that exists.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid', 'ggplot'):
        try:
            plt.style.use(style_name)
            break
        except OSError:
            # Unknown style name in this matplotlib version; try the next one.
            continue

    # seaborn is optional. set_theme() writes its own style/context rcParams,
    # so it has to run BEFORE the overrides below or it would undo them.
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', context='notebook')
    except Exception:
        # Best effort: a missing or incompatible seaborn must not break plotting.
        pass

    # Notebook-specific overrides, applied last so they take precedence.
    plt.rcParams.update({
        'figure.figsize': (12, 7),
        'figure.dpi': 120,
        'savefig.dpi': 150,
        'axes.titlesize': 16,
        'axes.labelsize': 12,
        'legend.fontsize': 10,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.grid': True,
        'grid.alpha': 0.3,
    })

set_plot_style()


In [None]:
# Parameters
# Directory that holds the CSV files to aggregate.
DATA_DIR = Path('output_pass4b')

# Number of categories, ranked by total mentions, shown in the chart.
TOP_K = 12

# Header names accepted for the category column; the first one present in a
# file is used.
CATEGORY_COL_CANDIDATES = [
    'category',
    'universal_category',
    'universal categories',
    'universal_categories',
    'Category',
]

# Header names for a free-text column (not referenced by the cells shown here).
VERBATIM_COL_CANDIDATES = ['verbatim', 'text', 'snippet']


In [None]:
# Load and concatenate CSVs
files = sorted(DATA_DIR.glob('*.csv'))
if not files:
    raise FileNotFoundError(f'No CSV files found in {DATA_DIR.resolve()}')

# First run of four digits in a filename, used when a file has no `year` column.
year_re = re.compile(r'(?P<year>\d{4})')


def load_mentions(csv_path):
    """Read one CSV and return a tidy frame with ``year`` and ``category``.

    Parameters
    ----------
    csv_path : pathlib.Path
        CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``year`` (int) and ``category`` (stripped, non-empty str).
        Rows whose year cannot be parsed or whose category is missing or blank
        are dropped.

    Raises
    ------
    ValueError
        If the file has none of the ``CATEGORY_COL_CANDIDATES`` columns, or if
        it has no ``year`` column and the filename contains no four-digit run.
    """
    raw = pd.read_csv(csv_path)

    # Identify category column (first candidate present wins).
    cat_col = next((c for c in CATEGORY_COL_CANDIDATES if c in raw.columns), None)
    if cat_col is None:
        raise ValueError(f'No category-like column found in {csv_path.name}. Columns: {list(raw.columns)}')

    # Year comes from the data when available, otherwise from the filename.
    if 'year' in raw.columns:
        year = pd.to_numeric(raw['year'], errors='coerce').astype('Int64')
    else:
        m = year_re.search(csv_path.name)
        if not m:
            raise ValueError(f'Cannot infer year from filename: {csv_path.name}')
        year = pd.Series(int(m.group('year')), index=raw.index, dtype='Int64')

    mentions = (
        pd.DataFrame({'year': year, 'category': raw[cat_col]})
        # Drop missing categories BEFORE the str conversion: astype(str) would
        # otherwise turn NaN into the literal text 'nan' and count it as a
        # real category.
        .dropna(subset=['year', 'category'])
        .assign(
            year=lambda d: d['year'].astype(int),
            category=lambda d: d['category'].astype(str).str.strip(),
        )
    )
    return mentions[mentions['category'] != '']


dfs = [load_mentions(f) for f in files]

all_df = pd.concat(dfs, ignore_index=True)
all_df.head()


In [None]:
# Aggregate counts and select top categories
# One row per (year, category) pair holding the number of mentions.
agg = all_df.groupby(['year', 'category']).size().reset_index(name='mentions')

# Rank categories by their mentions summed over all years, largest first.
totals = (
    agg.groupby('category')['mentions']
       .sum()
       .sort_values(ascending=False)
)
top_categories = totals.iloc[:TOP_K].index.tolist()

# Wide table: one row per year, one column per top category; year/category
# combinations with no mentions become 0.
is_top = agg['category'].isin(top_categories)
filtered = agg.loc[is_top]
pivot = filtered.pivot(index='year', columns='category', values='mentions')
pivot = pivot.fillna(0).sort_index()
pivot


In [None]:
# Plot: mentions over time for top categories
set_plot_style()

# Fail early with a readable message; otherwise the axis-limit call below
# would receive NaN bounds from an empty index.
if pivot.empty:
    raise ValueError('No category mentions to plot; check the CSVs loaded from DATA_DIR.')

fig, ax = plt.subplots()

n = len(top_categories)
# Take tab20 entries in order (wrapping after the last one) straight from
# matplotlib, so line colours do not depend on whether seaborn is installed.
colors = [cm.tab20(i % cm.tab20.N) for i in range(n)]

for i, cat in enumerate(top_categories):
    y = pivot.get(cat)
    if y is None:
        # Category missing from the pivot table; nothing to draw.
        continue
    ax.plot(
        pivot.index,
        y.values,
        label=cat,
        color=colors[i],
        linewidth=2.25,
        marker='o',
        markersize=4,
    )

ax.set_title('Category Mentions Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Mentions')
# One tick per year present in the data. set_xticks installs its own fixed
# locator, so no separate locator is configured beforehand.
ax.set_xticks(pivot.index.to_list())
ax.set_xlim(pivot.index.min() - 0.2, pivot.index.max() + 0.2)
ax.grid(True, which='major', axis='both', linestyle='-', alpha=0.25)

# Place legend outside the plot for readability
ax.legend(title='Category', bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0., frameon=True)

plt.tight_layout()
plt.show()


> Optional: Save the figure to a file by running the next cell.

In [None]:
# Optional: save the last figure to a PNG
# Best effort: if `fig` does not exist yet or the write fails, report the
# problem instead of raising.
try:
    fig.savefig(
        'category_mentions_over_time.png',
        bbox_inches='tight',
    )
    print('Saved to category_mentions_over_time.png')
except Exception as err:
    print('Figure not saved (was the plot cell run?):', err)
