In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import datetime
from pathlib import Path

import datasets
import polars as pl
from lets_plot import *
from polars import col as c
from rich.pretty import pprint

LetsPlot.setup_html()

In [19]:
asset_folder = Path("../../assets")

## Speech Count

In [20]:
dataset = datasets.load_dataset(
    "Eugleo/us-congressional-speeches-subset", split="train"
)
dataset_df = dataset.to_polars()

In [40]:
filtered_speech_count = dataset_df.group_by(
    c("date").dt.truncate("1y").alias("year")
).agg(c("speech_id").n_unique().alias("speech_count"))

chart = (
    ggplot(filtered_speech_count, aes(x="year", y="speech_count"))
    + geom_line()
    + xlab("Year")
    + ylab("# of speeches")
    + scale_x_datetime(format="%Y")
)

chart.show()

ggsave(chart + ggsize(400, 200), str(asset_folder / "speech_count.svg"))

'/mnt/ssd-1/mechinterp/gw1/evzen-test/assets/speech_count.svg'

In [21]:
filtered_speech_count["speech_count"].sum()

5038919

## Pairwise Comparisons

In [22]:
pairs = datasets.load_dataset(
    "Eugleo/us-congressional-speeches-emotionality-pairs", split="train"
)
pairs_df = pairs.to_polars()

In [23]:
pairs_df["result"].value_counts()

result,count
i64,u32
-1,1420
1,71964
0,76616


In [34]:
print(pairs[0])

{'speech_1': 'Auschwitz. I am glad I heard that word. I would like everybody in this Chamber. every visitor. Madam Chairman. and everybody across the country to absorb this fact that was just suppressed by the dominant media culture. The angel of death should have been called the devil of death on rail lines at Birkenau. the adjunct satellite camp to Auschwitz which the gentleman mentioned. which actually killed 4 million people. that angel of death. Dr. Mengele. who did in fact escape justice until God finally took him in a drowning accident on a Brazilian beach in 1979. guess what Dr. Mengele. the devil of Auschwitz and Birkenau did when he went to South America to hide from justice. guess what he practiced as a medical doctor. again disregarding his hypocratic oath and any sense of Christian or Jewish decency in this all over Europe. surprise. surprise. surprise. Dr. Mengele was an abortionist in Argentina and Brazil. We are killing innocent human life. Leave these Bush regulations 

In [48]:
def extract_speeches(df, speech_id_col: str, results: pl.Series) -> pl.DataFrame:
    return pl.DataFrame({"speech_id": df[speech_id_col], "result": results})


def geom_labeled_line(label: str, year: int):
    return geom_vline(
        xintercept=datetime.datetime(year, 1, 1),
        color="gray",
        linetype="dashed",
        size=1,
        alpha=0.5,
    ) + geom_text(
        x=datetime.datetime(year, 1, 1),
        y=0.7,
        label=label,
        nudge_y=0.05,
        color="black",
        size=6,
        background_color="white",
    )


results = (
    pl.concat(
        [
            extract_speeches(pairs_df, "speech_1_id", 1 - pairs_df["result"]),
            extract_speeches(pairs_df, "speech_2_id", pairs_df["result"]),
        ]
    )
    .join(dataset_df, on="speech_id")
    .filter(c("chamber").is_in(["H", "S"]))
    .with_columns(c("date").dt.truncate("1y").alias("year"))
    .group_by("year", "chamber")
    .agg(c("result").mean().alias("mean_result"))
)

chart = (
    ggplot(results)
    + geom_line(aes(x="year", y="mean_result", color="chamber"))
    + xlab("Year")
    + ylab("% emotional speeches")
    + scale_x_datetime(format="%Y")
    + geom_labeled_line("WW1", 1917)
    + geom_labeled_line("WW2", 1941)
    + geom_labeled_line("Viet.", 1964)
    + geom_labeled_line("TV", 1977)
    + theme(legend_position="top")
    + scale_y_continuous(format="{.0%}")
)

chart.show()

ggsave(chart + ggsize(400, 250), str(asset_folder / "emotionality_timeline.svg"))

'/mnt/ssd-1/mechinterp/gw1/evzen-test/assets/emotionality_timeline.svg'