In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
def set_seed(seed: int) -> None:
    import os
    import random

    # import numpy as np

    # np.random.seed(seed)
    random.seed(seed)
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


set_seed(42)

Random seed set as 42


In [3]:
import datetime

import polars as pl
from lets_plot import *
from polars import col as c

LetsPlot.setup_html()

In [4]:
word_count = (
    pl.scan_parquet("../data/processed/*.parquet")
    .group_by("date")
    .agg(c("word_count").mean().alias("mean_word_count"))
    .sort("date")
    .with_columns(mean_word_count=c("mean_word_count").rolling_mean_by("date", "1y"))
    .collect()
)

(
    ggplot(word_count, aes(x="date", y="mean_word_count"))
    + geom_line()
    + geom_line(stat="smooth", color="red", linetype="longdash", size=1.5)
    + scale_x_datetime()
)

In [5]:
speech_count = (
    pl.scan_parquet("../data/processed/*.parquet")
    .group_by(c("date").dt.year().alias("year"))
    .agg(c("speech_id").n_unique().alias("speech_count"))
    .collect()
)

(
    ggplot(speech_count, aes(x="year", y="speech_count"))
    + geom_line()
    + geom_line(stat="smooth", color="red", linetype="longdash", size=1.5)
)

In [6]:
# How many speeches will I use

min_limit = 64 / 1.5
max_limit = 1024 / 1.5

filtered_speech_count = (
    pl.scan_parquet("../data/processed/*.parquet")
    .filter(c("word_count") > min_limit, c("word_count") < max_limit)
    .group_by(c("date").dt.year().alias("year"))
    .agg(c("speech_id").n_unique().alias("speech_count"))
    .collect()
)

(
    ggplot(filtered_speech_count, aes(x="year", y="speech_count"))
    + geom_line()
    + geom_line(stat="smooth", color="red", linetype="longdash", size=1.5)
    + ggtitle("Speech count after filtering")
    + xlab("Year")
    + ylab("Speech count")
)

In [7]:
filtered_speech_count.select("speech_count").sum()

speech_count
u32
4112076


In [15]:
num_samples = 5000
seed = 42


def sample_pairs(
    num: int, seed: int, path: str = "../data/processed/*.parquet"
) -> pl.DataFrame:
    ids = (
        pl.scan_parquet("../data/processed/*.parquet")
        .filter(c("word_count") > min_limit, c("word_count") < max_limit)
        .select("speech_id")
        .collect()
        .get_column("speech_id")
        .sample(num * 2, seed=seed)
    )

    examples = (
        pl.scan_parquet("../data/processed/*.parquet")
        .filter(c("speech_id").is_in(ids))
        .collect()
    )

    result = pl.DataFrame(
        {
            "speech_1": examples[:num].get_column("text"),
            "speech_1_id": examples[:num].get_column("speech_id"),
            "speech_2": examples[num:].get_column("text"),
            "speech_2_id": examples[num:].get_column("speech_id"),
        }
    )

    return result


df = sample_pairs(num_samples, seed)

In [19]:
def cost_estimate(texts: list[str], token_chars: int = 4, token_cost=0.000005) -> float:
    return sum(len(text) / token_chars for text in texts) * token_cost


analysis_template = """
Consider the following two excerpts from two political speeches. Note that they contain spelling mistakes and typos since they are were obtained by OCR.

Excerpt 0: {excerpt_0}

Excerpt 1: {excerpt_1}

The goal is to judge which of these two, if any, is more emotional. Don't take the topic itself into account. Sometimes you will need to read between the lines and infer the emotionality level, e.g. in cases when the text is passive aggresive. Is there any difference between Excerpt 1 and Excerpt 2 in terms of emotionality? If there is not, say so.

Only after you’re done with the analysis, end the message with `result: 0` if Excerpt 0 is the more emotional, or `result: 1` if Excerpt 1 is more emotional. Use `result: -1`  if the result is unclear or there is no large difference.
"""

texts = (
    df.with_columns(
        pl.struct(["speech_1", "speech_2"])
        .map_elements(
            lambda s: analysis_template.format(
                excerpt_0=s["speech_1"],
                excerpt_1=s["speech_2"],
            ),
            return_dtype=pl.String,
        )
        .alias("text")
    )
    .get_column("text")
    .to_list()
)

cost_estimate(texts)

16.210755000000002

In [20]:
print(texts[4])


Consider the following two excerpts from two political speeches. Note that they contain spelling mistakes and typos since they are were obtained by OCR.

Excerpt 0: I can see nio possible obijection to this lill It sineply appropriates $2!10.tltitk0 to enable tire Secreta ry of tire lTrersury to .pay bonds nur ig er thie 1st of * taury. There is sonr co1nflict et ae e trnir of tire corion bods ril tir registered bonds. which rnkes it necessary fr the Urite Stares to express its ehie whether it will ray- err tire lot of Jarniry or not. T1his will asuthorize tire issue of an eqiral arronrot of 5 per cent. aends under the lay of 1870 irt paynt of tie bonds issrer under the at of 1853. It wo tusd be uaoed. natrirlly. iat thie Secretary would have tih anthority to do that nuder tie general provisions of law. hut it sems the law of 1s70 ceulinee the exchange therein provided for to tre issue of 5 per cent. heads for tre 520 hiod. which were not of tie character and description nentionred in