In [1]:
import os
from itertools import chain

def prepare_files(file_name):
    """Split a raw puzzle input file into one text file per board.

    The first line of ``file_name`` (the comma-separated drawn numbers) is
    written to ``<base>/chosen_numbers.txt``. Every following run of non-blank
    lines is written, stripped and newline-joined, to
    ``<base>/<stem>_<idx>.txt`` with ``idx`` counting up from 0. ``<stem>`` is
    the file's own name up to its first dot and ``<base>`` is a directory of
    that name located next to the input file; it is created when missing.

    Args:
        file_name: Path of the raw input file.

    Raises:
        StopIteration: Propagated from ``next()`` when the input file is empty.
    """
    # Split off the directory first so that dots or separators in the parent
    # path cannot end up inside the output names.
    directory, leaf_name = os.path.split(file_name)
    base_name = leaf_name.split(".")[0]
    output_dir = os.path.join(directory, base_name)
    # open(..., "w") does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)

    with open(file_name, "r") as file:
        # The stripped first line is written back unchanged.
        chosen_numbers = next(file).strip()
        with open(os.path.join(output_dir, "chosen_numbers.txt"), "w") as chosen_numbers_file:
            chosen_numbers_file.write(chosen_numbers)

        board_idx = 0
        current_file_content = []
        # The trailing "" acts as a sentinel so the last board is flushed even
        # when the file does not end with a blank line.
        for line in chain(file, [""]):
            line = line.strip()
            if line:
                current_file_content.append(line)
            elif current_file_content:
                board_path = os.path.join(output_dir, f"{base_name}_{board_idx}.txt")
                with open(board_path, "w") as current_file:
                    current_file.write("\n".join(current_file_content))
                current_file_content = []
                board_idx += 1

# Split both raw input files into per-board files before Spark reads them.
for _input_file in ("test-input.txt", "input.txt"):
    prepare_files(_input_file)

In [2]:
from pyspark.sql import SparkSession, functions as F, Window

In [3]:
# Get (or reuse) a session against the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("advent-of-code-2021")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/13 09:45:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
def load_boards(path):
    """Read the per-board text files under ``path`` into one DataFrame.

    Args:
        path: Location handed to ``spark.read.text``.

    Returns:
        DataFrame with the text reader's ``value`` column plus an integer
        ``board_idx`` column parsed from the ``...input_<idx>.txt`` suffix of
        each line's source file name. Lines from files whose name does not
        match that pattern (e.g. ``chosen_numbers.txt``, which
        ``prepare_files`` stores in the same directory) get a null
        ``board_idx`` and are dropped.
    """
    # Raw string: "\." in a regular literal is an invalid escape sequence.
    board_idx_pattern = r"^.*input_(\d+)\.txt$"
    return spark.read.text(path) \
        .withColumn("board_idx", F.regexp_extract(F.input_file_name(), board_idx_pattern, 1).cast("int")) \
        .filter(F.col("board_idx").isNotNull())

def load_chosen_numbers(path):
    """Return the drawn numbers stored in ``<path>/chosen_numbers.txt``.

    Only the first line of the file is read; it is expected to be a
    comma-separated list of integers.
    """
    numbers_path = f"{path}/chosen_numbers.txt"
    with open(numbers_path, "r") as numbers_file:
        first_line = next(numbers_file)
    return list(map(int, first_line.strip().split(",")))

def load_data(path):
    """Load everything stored under ``path``.

    Returns:
        Tuple ``(boards, chosen_numbers)``: the result of ``load_boards(path)``
        followed by the result of ``load_chosen_numbers(path)``.
    """
    boards = load_boards(path)
    drawn_numbers = load_chosen_numbers(path)
    return boards, drawn_numbers

# Boards and drawn numbers from "test-input/" (checked by the asserts in the
# part 1 / part 2 cells) and from "input/" (used for the printed solutions).
df_test, chosen_numbers_test = load_data("test-input/")
df, chosen_numbers = load_data("input/")

In [5]:
from functools import reduce
from operator import add, or_

# Number of cell columns parsed from each board line (col_0 .. col_{board_size - 1});
# also used to derive the sum that identifies a fully marked row or column.
board_size = 5
# Sentinel value written over a cell once its number has been drawn.
number_mark = -1

def prepare_boards(df):
    """Turn raw board lines into one row of integer cells per board line.

    Args:
        df: DataFrame with a text ``value`` column and a ``board_idx`` column.

    Returns:
        DataFrame with ``board_idx``, a zero-based ``row_idx`` numbered within
        each board, and integer columns ``col_0`` .. ``col_{board_size - 1}``
        holding the whitespace-separated values of the line.
    """
    # The window orders by a constant, so the row numbering within a board has
    # no guaranteed relation to the order of lines in the source file.
    row_idx = (F.row_number().over(Window.partitionBy("board_idx").orderBy(F.lit(1))) - 1).alias("row_idx")
    # Raw string: "\s" in a regular literal is an invalid escape sequence.
    cells = F.split(F.trim("value"), r"\s+")
    cell_columns = [
        cells.getItem(idx).cast("int").alias(f"col_{idx}")
        for idx in range(board_size)
    ]
    return df.select("board_idx", row_idx, *cell_columns)

# Names of the cell columns: col_0 .. col_{board_size - 1}.
col_columns = [f"col_{idx}" for idx in range(board_size)]


def mark_numbers(df, chosen_number):
    """Return ``df`` with every cell equal to ``chosen_number`` replaced by ``number_mark``.

    Only the cell columns listed in ``col_columns`` are touched.
    """
    return df.replace(to_replace=chosen_number, value=number_mark, subset=col_columns)

def check_board_win(df):
    """Return the distinct ``board_idx`` values of boards with a fully marked row or column.

    A row is complete when its cells sum to ``number_mark * board_size``; a
    column is complete when the column's sum over all rows of the board equals
    the same value.

    NOTE(review): the sum test assumes ``number_mark`` is negative, board
    numbers are non-negative, and (for the column check) every board has
    exactly ``board_size`` rows -- confirm against the input data.

    Args:
        df: DataFrame with ``board_idx`` and the ``col_columns`` cell columns.

    Returns:
        Single-column DataFrame of winning ``board_idx`` values, de-duplicated.
    """
    # Derived from number_mark rather than hard-coding -board_size, which is
    # only correct while the mark happens to be -1.
    full_line_sum = number_mark * board_size

    row_wins = df \
        .select([
            "board_idx",
            reduce(add, [F.col(column) for column in col_columns]).alias("row_sum")
        ]) \
        .filter(F.col("row_sum") == full_line_sum) \
        .select("board_idx")

    agg_columns = [F.sum(f"col_{idx}").alias(f"col_sum_{idx}") for idx in range(board_size)]
    filter_columns = [F.col(f"col_sum_{idx}") == full_line_sum for idx in range(board_size)]
    col_wins = df \
        .groupBy("board_idx") \
        .agg(*agg_columns) \
        .filter(reduce(or_, filter_columns)) \
        .select("board_idx")

    # A board may win by a row and a column at once; keep it only once.
    return row_wins.union(col_wins) \
        .dropDuplicates()

def sum_unmarked_numbers(df, board_idx):
    """Build a one-value DataFrame with the sum of all unmarked cells of one board.

    Marked cells (``number_mark``) are replaced by 0 so they do not contribute.
    The result is not collected here; the caller fetches the value.
    """
    board_rows = df.filter(F.col("board_idx") == board_idx)
    unmarked_only = board_rows.replace(number_mark, 0, subset=col_columns)
    per_row_total = reduce(add, map(F.col, col_columns)).alias("row_sum")
    return unmarked_only.select(per_row_total).select(F.sum("row_sum"))

In [6]:
# Part 1
def part_1(df, chosen_numbers):
    """Score of the first board to win.

    Numbers are marked in the order given until some board has a complete row
    or column. The score is the sum of that board's unmarked cells multiplied
    by the number that completed it.

    Args:
        df: Raw boards DataFrame as returned by ``load_boards``.
        chosen_numbers: Drawn numbers, in draw order.

    Returns:
        The winning board's score.

    Raises:
        ValueError: If no board wins after all numbers have been drawn.
    """
    df = prepare_boards(df).repartition(4, "board_idx")
    winning_board_idx = None
    winning_number = None
    for chosen_number in chosen_numbers:
        # Cached because each state is read by the win check and is the base
        # for the next draw (and for the final score).
        df = mark_numbers(df, chosen_number).cache()
        # A single take(1) both answers "is there a winner?" and fetches it,
        # instead of running the win check once for count() and again for take().
        winners = check_board_win(df).take(1)
        if winners:
            winning_board_idx = winners[0][0]
            winning_number = chosen_number
            break

    # Compare against None explicitly: board index 0 is a valid winner.
    if winning_board_idx is None:
        raise ValueError("No winning board found!")

    numbers_sum = sum_unmarked_numbers(df, winning_board_idx).take(1)[0][0]

    return numbers_sum * winning_number

# Check part 1 against the expected answer for the test input, then print the
# result for the real input.
assert part_1(df_test, chosen_numbers_test) == 4512
print(f"Solution: {part_1(df, chosen_numbers)}")

21/12/13 09:45:54 WARN BlockManager: Block rdd_12_3 already exists on this machine; not re-adding it
21/12/13 09:45:54 WARN BlockManager: Block rdd_12_2 already exists on this machine; not re-adding it
                                                                                

Solution: 38594


In [7]:
# Part 2
def part_2(df, chosen_numbers):
    """Score of the last board to win.

    Numbers are marked in the order given until every board has a complete row
    or column. The board that was not yet a winner before that final draw is
    the last winner; its score is the sum of its unmarked cells multiplied by
    the final drawn number.

    Args:
        df: Raw boards DataFrame as returned by ``load_boards``.
        chosen_numbers: Drawn numbers, in draw order.

    Returns:
        The last winning board's score.

    Raises:
        ValueError: If there are no boards, or not every board has won after
            all numbers have been drawn.
    """
    df = prepare_boards(df).repartition(4, "board_idx")
    total_boards_count = df.agg(F.countDistinct("board_idx")).collect()[0][0]
    # Without this guard, zero boards would satisfy "all boards have won" on
    # the first draw and set().pop() would raise KeyError.
    if total_boards_count == 0:
        raise ValueError("No winning board found!")

    previous_winning_boards = set()
    last_winning_board_idx = None
    last_winning_chosen_number = None
    for chosen_number in chosen_numbers:
        df = mark_numbers(df, chosen_number).cache()
        board_wins = check_board_win(df)
        current_winning_boards = set(board[0] for board in board_wins.collect())

        if len(current_winning_boards) == total_boards_count:
            # Boards that completed on this very draw; if several did, an
            # arbitrary one of them is picked.
            last_winning_board_idx = (current_winning_boards - previous_winning_boards).pop()
            last_winning_chosen_number = chosen_number
            break
        previous_winning_boards = current_winning_boards

    # Compare against None explicitly: board index 0 is a valid result.
    if last_winning_board_idx is None:
        raise ValueError("No winning board found!")

    numbers_sum = sum_unmarked_numbers(df, last_winning_board_idx).take(1)[0][0]

    return numbers_sum * last_winning_chosen_number

# Check part 2 against the expected answer for the test input, then print the
# result for the real input.
assert part_2(df_test, chosen_numbers_test) == 1924
print(f"Solution: {part_2(df, chosen_numbers)}")

21/12/13 09:46:42 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:42 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:43 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:44 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:44 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:45 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:46 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:47 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:47 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:48 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:49 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:50 WARN CacheManager: Asked to cache already cached data.
21/12/13 09:46:55 WARN CacheManager: Asked to cache already cached data.        
21/12/13 09:46:56 WARN CacheManager: Asked 

Solution: 21184
