In [1]:
from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

In [2]:
# Single shared session for the notebook; getOrCreate() reuses an already
# active session instead of starting a second one when the cell is re-run.
spark = (
    SparkSession.builder
    .appName("advent-of-code-2024")
    .getOrCreate()
)

In [3]:
def read_input(input_file_name: str) -> DataFrame:
    """Load a text file of whitespace-separated integer pairs.

    Each line is read into the ``value`` column, split on runs of whitespace
    into ``pair_array``, and the first two items of that array are cast to
    integers as ``value_0`` and ``value_1``.

    Args:
        input_file_name: Path of the text file to read.

    Returns:
        A DataFrame with the columns ``value``, ``pair_array``, ``value_0``
        and ``value_1``.
    """
    lines_df = spark.read.text(input_file_name)
    split_df = lines_df.withColumn("pair_array", F.split(F.col("value"), r"\s+"))

    # One integer column per position of the pair.
    parsed_columns = {}
    for position in range(2):
        item = F.col("pair_array").getItem(position)
        parsed_columns[f"value_{position}"] = item.cast(IntegerType())

    return split_df.withColumns(parsed_columns)

In [4]:
def part_1(input_file_name: str) -> int:
    """Sum the absolute differences of the two columns paired by sorted rank.

    Each input column is ranked in ascending order independently. The smallest
    ``value_0`` is paired with the smallest ``value_1``, the second smallest
    with the second smallest, and so on; the result is the sum of
    ``abs(value_0 - value_1)`` over those pairs.

    Args:
        input_file_name: Path of the text file passed to ``read_input``.

    Returns:
        The summed difference, or 0 when the input yields no pairs.
    """
    raw_df = read_input(input_file_name)

    ranked_df_list = []
    for idx in range(2):
        value_column = f"value_{idx}"
        # row_number() produces consecutive ranks 1..n for the ordering.
        # monotonically_increasing_id() is only unique and increasing: it
        # encodes the partition id in the upper bits, so ids from two
        # independently sorted frames line up only if each sort ends up in a
        # single partition. A window without partitionBy pulls all rows into
        # one partition, which is the price of a global rank.
        rank_window = Window.orderBy(F.col(value_column))
        ranked_df = raw_df.select(F.col(value_column)).withColumn(
            "id", F.row_number().over(rank_window)
        )
        ranked_df_list.append(ranked_df)

    total = (
        ranked_df_list[0]
        .join(ranked_df_list[1], "id", "inner")
        .drop("id")
        .withColumn("value_difference", F.abs(F.col("value_0") - F.col("value_1")))
        .agg(F.sum(F.col("value_difference")))
        .collect()[0][0]
    )
    # F.sum yields NULL (None in Python) when there is nothing to add up.
    return total if total is not None else 0


# Sanity check: part_1 must return 11 for the sample file before the real run.
assert part_1("test-input.txt") == 11

# Each part_1 call reads its file again and runs a Spark job via collect().
print(f'Solution: {part_1("input.txt")}')

Solution: 3574690


In [15]:
def part_2(input_file_name: str) -> int:
    """Compute the similarity score between the two input columns.

    Every ``value_0`` entry is multiplied by the number of rows whose
    ``value_1`` equals it, and the products are summed. Entries without a
    match contribute nothing because the join is an inner join.

    Args:
        input_file_name: Path of the text file passed to ``read_input``.

    Returns:
        The summed score, or 0 when no ``value_0`` matches any ``value_1``.
    """
    raw_df = read_input(input_file_name)

    # Occurrences of each distinct right-column value. The key is renamed so
    # the join condition can name each side uniquely: both frames derive from
    # raw_df, and raw_df itself also carries a value_1 column.
    right_counts_df = (
        raw_df.groupBy(F.col("value_1"))
        .count()
        .withColumnRenamed("value_1", "right_value")
    )

    total = (
        raw_df.join(right_counts_df, F.col("value_0") == F.col("right_value"))
        .withColumn("similarity_score", F.col("value_0") * F.col("count"))
        .agg(F.sum("similarity_score"))
        .collect()[0][0]
    )
    # F.sum yields NULL (None in Python) when the join produces no rows.
    return total if total is not None else 0


# Sanity check: part_2 must return 31 for the sample file before the real run.
assert part_2("test-input.txt") == 31

# Each part_2 call reads its file again and runs a Spark job via collect().
print(f'Solution: {part_2("input.txt")}')

Solution: 22565391
