In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the raw passenger table.
# NOTE(review): the modelling cell below reads df["Transported"] as the target;
# confirm that this CSV actually contains that label column.
df = pd.read_csv("data/test.csv")

#region --- Fill NaN HomePlanet based on Group ---

# The group id is the part of PassengerId before the first underscore.
passenger_id_parts = df['PassengerId'].str.split('_')
df['Group'] = passenger_id_parts.str.get(0)
# Planets to draw from when a group has no known HomePlanet at all.
choices_home_planet = ["Earth", "Europa", "Mars"]

def fill_home_planet(group, choices=None):
    """Fill missing ``HomePlanet`` values within one group of rows.

    If the group has at least one known ``HomePlanet``, the first known value
    (in row order) is used for every missing entry. Otherwise one planet is
    drawn at random with ``np.random.choice``; seed NumPy's global random
    state beforehand if repeatable results are needed.

    Args:
        group: DataFrame with a ``HomePlanet`` column (one group handed over
            by ``groupby(...).apply``).
        choices: Optional sequence of planets to draw from when the group has
            no known value. Defaults to the module-level
            ``choices_home_planet``.

    Returns:
        A new DataFrame with ``HomePlanet`` filled. ``group`` itself is left
        untouched, because mutating the object passed to a groupby UDF is not
        supported by pandas.
    """
    known_planets = group['HomePlanet'].dropna()

    if not known_planets.empty:
        # Reuse the first planet already recorded in this group.
        fill_value = known_planets.iloc[0]
    else:
        # Nothing known in this group: fall back to a random planet.
        if choices is None:
            choices = choices_home_planet
        fill_value = np.random.choice(choices)

    # assign() builds a new frame instead of writing into the caller's object.
    return group.assign(HomePlanet=group['HomePlanet'].fillna(fill_value))

# Impute HomePlanet separately for each 'Group' value; group_keys=False keeps
# the group label out of the result index.
# NOTE(review): recent pandas releases warn when apply() operates on the
# grouping column itself -- confirm the behaviour on the pandas version in use.
groups_by_id = df.groupby('Group', group_keys=False)
df = groups_by_id.apply(fill_home_planet)
#endregion

#region --- Fill NaN CryoSleep based on Spendings ---
spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
# Row-wise spending total; missing amounts are skipped, so a row with no
# recorded spending at all totals 0.
total_spending_col = df[spending_cols].sum(axis=1, skipna=True)
df['TotalSpending'] = total_spending_col
mask_cryosleep_nan = df['CryoSleep'].isna()
# An unknown CryoSleep becomes False when the row's total is above 0, and
# True otherwise.
spent_nothing = ~total_spending_col.gt(0)
df.loc[mask_cryosleep_nan, 'CryoSleep'] = spent_nothing[mask_cryosleep_nan]
#endregion

# Spending columns whose remaining gaps are zero-filled at the end of this
# section (the same five columns as spending_cols, in a different order).
col_features = ["RoomService", "Spa", "VRDeck", "FoodCourt", "ShoppingMall"]

#region --- Fill NaN spendings based on Age and CryoSleep status ---
# Rows whose missing spendings are treated as 0: passengers younger than 13,
# passengers in CryoSleep, or rows where every spending column is missing.
kids_and_cryosleeping_mask = (
    (df["Age"] < 13) |
    (df["CryoSleep"].eq(True)) |
    (df[spending_cols].isna().all(axis=1))
)
# Fill only the gaps: amounts that were actually recorded are kept instead of
# being overwritten with 0 for every masked row.
df.loc[kids_and_cryosleeping_mask, spending_cols] = (
    df.loc[kids_and_cryosleeping_mask, spending_cols].fillna(0)
)
#endregion

# Fill remaining cols with 0 where NaN
df[col_features] = df[col_features].fillna(0)

In [None]:
# Model inputs: the CryoSleep flag plus the five spending amounts.
col_features = ["CryoSleep", "RoomService", "Spa", "VRDeck", "FoodCourt", "ShoppingMall"]
X = pd.get_dummies(df[col_features])
y = df["Transported"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows and report accuracy as a percentage.
prediction = model.predict(X_valid)
accuracy = accuracy_score(y_valid, prediction)
print(f"Score accuracy on validation: {accuracy*100:.2f}%")