In [38]:
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import sklearn
import os, pathlib, shutil, zipfile
import kaggle, kagglehub
from polars.selectors import categorical

In [41]:
### Fetch the competition data
# Kaggle competition slug; swap in a different slug to download another competition.
competition_name = "spaceship-titanic"

# kagglehub returns the local directory that holds the downloaded files.
competition_path = pathlib.Path(kagglehub.competition.competition_download(competition_name))

# Names of the files that ship with the competition.
competition_files: list = os.listdir(competition_path)
print(competition_files)

# Keep a copy of the sample submission next to the notebook.
shutil.copyfile(competition_path / "sample_submission.csv", "sample_submission.csv")


['sample_submission.csv', 'test.csv', 'train.csv']


'sample_submission.csv'

### File and Data Field Descriptions
train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

- PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
- CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- Destination - The planet the passenger will be debarking to.
- Age - The age of the passenger.
- VIP - Whether the passenger has paid for special VIP service during the voyage.
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- Name - The first and last names of the passenger.
- Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

sample_submission.csv - A submission file in the correct format.

- PassengerId - Id for each passenger in the test set.
- Transported - The target. For each passenger, predict either True or False.

In [40]:
# Show up to 40 rows when a frame is displayed.
pl.Config.set_tbl_rows(40)

# Read the three competition CSVs from the download directory, in this order.
train_df, test_df, sample_submission_df = (
    pl.read_csv(competition_path / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Number of training rows; used later to split the combined frame back apart.
train_df_height = train_df.shape[0]

Helper functions


In [26]:
def split_passenger_id(p_id: str) -> list[int]:
    """Split a ``gggg_pp`` passenger id into ``[group, number_in_group]``.

    Args:
        p_id: Passenger id string with its parts separated by ``_``.

    Returns:
        A two-element list holding the first two parts converted to ``int``.

    Raises:
        ValueError: If one of the first two parts is not an integer literal.
        IndexError: If the id contains no ``_`` separator.
    """
    tokens = p_id.split("_")
    group_number = int(tokens[0])
    member_number = int(tokens[1])
    return [group_number, member_number]


def split_cabin(cabin: str) -> list:
    """Split a ``deck/num/side`` cabin string on ``/``.

    Args:
        cabin: Cabin identifier with its parts separated by ``/``.

    Returns:
        The list of string parts, in their original order.
    """
    cabin_parts = cabin.split(sep="/")
    return cabin_parts


def split_name(name: str) -> list[str]:
    """Split a passenger name on single space characters.

    Args:
        name: Full name, e.g. ``"First Last"``.

    Returns:
        The space-separated parts. Consecutive spaces yield empty strings,
        because the separator is the literal ``" "``.
    """
    name_parts = name.split(sep=" ")
    return name_parts

In [27]:
# Give the test rows an all-null target column so both frames share one schema.
# The null literal is typed with the training target's dtype: a Series built from
# `[None] * n` has dtype Null, and stacking it under a typed column only works
# when the concat implicitly upcasts it.
test_df = test_df.with_columns(
    pl.lit(None, dtype=train_df.schema["Transported"]).alias("Transported")
)

# Training rows come first; the combined frame is later split back apart by position.
df = pl.concat([train_df, test_df])

# Break the three composite string columns into their parts with native Polars
# string expressions. This replaces per-row Python callbacks (`map_elements`)
# and three separate select/hstack/drop passes with one vectorised `select`.
_id_parts = pl.col("PassengerId").str.split("_")  # "gggg_pp"       -> ["gggg", "pp"]
_name_parts = pl.col("Name").str.split(" ")  # "First Last"    -> ["First", "Last"]
_cabin_parts = pl.col("Cabin").str.split("/")  # "deck/num/side" -> ["deck", "num", "side"]

# Output column order: the derived columns first, then every remaining original
# column in its original order, with the three composite columns dropped.
df = df.select(
    # `group` indicates the group the passenger is travelling with.
    _id_parts.list.get(0).cast(pl.UInt16).alias("group"),
    # `num_in_group` is the passenger number within the group.
    # People in a group are often family members, but not always.
    _id_parts.list.get(1).cast(pl.UInt8).alias("num_in_group"),
    _name_parts.list.get(0).cast(pl.Categorical).alias("first_name"),
    _name_parts.list.get(1).cast(pl.Categorical).alias("last_name"),
    _cabin_parts.list.get(0).cast(pl.Categorical).alias("deck_of_cabin"),
    _cabin_parts.list.get(1).cast(pl.UInt16).alias("num_of_cabin"),
    # `side_of_cabin` can be either P for Port or S for Starboard.
    _cabin_parts.list.get(2).cast(pl.Categorical).alias("side_of_cabin"),
    pl.all().exclude("PassengerId", "Name", "Cabin"),
)

# Narrow the column dtypes: planets become categoricals, age and the five
# amenity amounts become small unsigned integers.
df = df.with_columns(
    pl.col("HomePlanet", "Destination").cast(pl.Categorical),
    pl.col("Age").cast(pl.UInt8),
    pl.col("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck").cast(pl.UInt16),
)

# RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed
# at each of the Spaceship Titanic's many luxury amenities.
# `total_bill` is their row-wise sum, with a missing amount counted as 0.
# Each amount is widened to UInt32 before adding: the sum of five UInt16 values
# can exceed the UInt16 maximum of 65,535, so the addition must not happen in UInt16.
df = df.with_columns(
    (
        pl.col("RoomService").cast(pl.UInt32).fill_null(0)
        + pl.col("FoodCourt").cast(pl.UInt32).fill_null(0)
        + pl.col("ShoppingMall").cast(pl.UInt32).fill_null(0)
        + pl.col("Spa").cast(pl.UInt32).fill_null(0)
        + pl.col("VRDeck").cast(pl.UInt32).fill_null(0)
    )
    .cast(pl.UInt32)
    .alias("total_bill")
)

# Boolean flag for the starboard side of the ship ('S'; the other value is 'P' for port).
# The bare comparison is used instead of when/then/otherwise so that a missing
# `side_of_cabin` stays null rather than being recorded as False, which would
# label passengers with an unknown cabin as port side.
df = df.with_columns(
    (pl.col("side_of_cabin") == "S").alias("is_starboard_side")
)

# Prepend a 1-based row counter (column "index"), then cache the frame on disk.
df = df.with_row_index(name="index", offset=1)
df.write_parquet("raw_df.parquet", statistics="full")

In [33]:
# Feature groups, spelled as the columns are named in raw_df.parquet.
# Left out of every group: 'index', 'first_name', 'last_name', 'side_of_cabin'.
# Flagged as dubious in an earlier note: 'group', 'num_in_group', 'deck_of_cabin'.
# NOTE(review): these lists keep the pre-rename names ('CryoSleep', 'VIP',
# 'HomePlanet', 'Destination', 'Age'); confirm nothing later uses them to select
# from the renamed frames.
boolean_features = ["CryoSleep", "VIP", "is_starboard_side"]
categorical_features = ["deck_of_cabin", "HomePlanet", "Destination"]
numerical_features = ["num_in_group", "num_of_cabin", "Age", "total_bill"]
target_feature = ["Transported"]

# Load the cached frame, keep only the chosen columns (in group order), and
# switch the remaining CamelCase names to snake_case.
df = (
    pl.read_parquet("raw_df.parquet")
    .select(boolean_features + categorical_features + numerical_features + target_feature)
    .rename(
        {
            "Age": "age",
            "CryoSleep": "cryo_sleep",
            "VIP": "vip",
            "HomePlanet": "home_planet",
            "Destination": "destination",
        }
    )
)

# The first `train_df_height` rows are the training rows; everything after is test.
train_df = df.head(train_df_height)
test_df = df.slice(train_df_height)

In [34]:
# Persist both splits as parquet files with full column statistics.
for _frame, _file_name in ((train_df, "train_df.parquet"), (test_df, "test_df.parquet")):
    _frame.write_parquet(_file_name, statistics="full")