In [1]:
import os, pathlib, zipfile

import kaggle, kagglehub
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import sklearn
import sklearn.ensemble

In [2]:
### Downloading the competition
# Slug of the Kaggle competition to fetch; change it to download another one.
competition_name = "spaceship-titanic"

# Download the competition and wrap the returned location in a Path so it can
# be joined with file names in later cells.
competition_path = pathlib.Path(kagglehub.competition.competition_download(competition_name))

# Names of the entries found in the download location.
competition_files: list = os.listdir(competition_path)

print(competition_files)

['sample_submission.csv', 'test.csv', 'train.csv']


In [3]:
# Let polars print up to 40 rows when a table is displayed.
pl.Config.set_tbl_rows(40)

# Read the three competition CSV files, in this order, from the download directory.
train_df, test_df, sample_submission_df = (
    pl.read_csv(competition_path / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

Helper functions


In [4]:
def split_passenger_id(p_id: str) -> list[int]:
    """Split an underscore-separated id and return its first two parts as ints.

    Args:
        p_id: Identifier text such as ``"0001_01"``.

    Returns:
        A two-element list ``[first_part, second_part]``.

    Raises:
        ValueError: If one of the first two parts is not an integer literal.
        IndexError: If the text contains no underscore.
    """
    pieces = p_id.split("_")
    # Convert position 0 before touching position 1 so that errors surface in
    # left-to-right order.
    return [int(pieces[position]) for position in (0, 1)]


def split_cabin(cabin: str) -> list[str]:
    """Split a cabin string on ``"/"`` and return the resulting parts.

    Args:
        cabin: Slash-separated cabin text such as ``"B/0/P"``.

    Returns:
        The list of substrings between the slashes, in order.
    """
    cabin_parts = cabin.split("/")
    return cabin_parts


def split_name(name: str) -> list[str]:
    """Split a name on single space characters.

    Args:
        name: Space-separated name text such as ``"Maham Ofracculy"``.

    Returns:
        The list of substrings between the spaces; consecutive spaces yield
        empty strings because the separator is a literal single space.
    """
    name_tokens = name.split(" ")
    return name_tokens

In [5]:
# --- Combine train and test so both get identical feature engineering -------
# The test split has no target. Add an all-null "Transported" column that is
# explicitly typed like the training target; a bare [None, ...] Series would
# have dtype Null, which does not match the training column's dtype when the
# frames are stacked vertically.
test_df = test_df.with_columns(
    pl.lit(None, dtype=train_df.schema["Transported"]).alias("Transported")
)
df = pl.concat([train_df, test_df])

# --- Split the composite string columns with native polars expressions ------
# str.split keeps the work inside polars instead of calling a Python function
# per row; null inputs stay null in every derived column.
id_parts = pl.col("PassengerId").str.split("_")
name_parts = pl.col("Name").str.split(" ")
cabin_parts = pl.col("Cabin").str.split("/")

# Derived columns come first (ids, then names, then cabin parts), followed by
# the remaining original columns in their existing order.
# NOTE(review): name_first / name_last are presumably high-cardinality; the
# histogram gradient boosting model limits categorical cardinality, so confirm
# before feeding them to it as categorical features.
df = df.select(
    [
        id_parts.list.get(0).cast(pl.UInt16).alias("id_part1"),
        id_parts.list.get(1).cast(pl.UInt8).alias("id_part2"),
        name_parts.list.get(0).cast(pl.Categorical).alias("name_first"),
        name_parts.list.get(1).cast(pl.Categorical).alias("name_last"),
        cabin_parts.list.get(0).cast(pl.Categorical).alias("cabin_part1"),
        cabin_parts.list.get(1).cast(pl.UInt16).alias("cabin_part2"),
        cabin_parts.list.get(2).cast(pl.Categorical).alias("cabin_part3"),
        pl.all().exclude(["PassengerId", "Name", "Cabin"]),
    ]
)

# --- Tighten the dtypes of the remaining feature columns --------------------
df = df.with_columns(
    [
        pl.col("HomePlanet", "Destination").cast(pl.Categorical),
        pl.col("Age").cast(pl.UInt8),
        pl.col("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck").cast(
            pl.UInt16
        ),
    ]
)

In [10]:
# Histogram-based gradient boosting classifier.
# `scoring` and `validation_fraction` are only used when early stopping is
# performed, and the default early_stopping="auto" enables it only for more
# than 10,000 samples. It is switched on explicitly so that the F1-scored 20%
# validation split decides when to stop, instead of always running all
# `max_iter` boosting iterations.
classifier = sklearn.ensemble.HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=1000,  # upper bound; early stopping may end training sooner
    max_leaf_nodes=None,
    max_depth=None,
    min_samples_leaf=10,
    l2_regularization=1.0,
    categorical_features="from_dtype",  # categorical columns are taken from the dataframe dtypes
    warm_start=True,
    early_stopping=True,
    scoring="f1",
    validation_fraction=0.2,
    verbose=1,
    random_state=42,
)

In [None]:
# Fit on the labelled rows only: the test rows appended to `df` have a null
# "Transported" target and cannot be used for training.
train_rows = df.filter(pl.col("Transported").is_not_null())

# Features are everything except the target and the two name columns.
# NOTE(review): name_first / name_last are Categorical and presumably contain
# more distinct values than the classifier accepts for a categorical feature
# (at most 255) — verify, and re-add them if an encoding is chosen for them.
X_train = train_rows.drop(["Transported", "name_first", "name_last"])
y_train = train_rows.get_column("Transported").to_numpy()

classifier.fit(X_train, y_train)