# 🔎 Analyze: Exploratory Data Analysis (EDA) for Spaceship Titanic

This notebook guides a quick EDA of the dataset to understand features, missing values, and basic distributions.

📘 The thought process behind these steps is documented in the `analyze` notebook and in the cleaning notebook, `clean`.

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
# Amenity spending columns, and the subset of columns fed to the model.
spending_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
feature_cols = ["CryoSleep", "RoomService", "Spa", "VRDeck", "FoodCourt", "ShoppingMall"]

def preprocess(df):
    """Return a cleaned copy of ``df`` with CryoSleep and spending NaNs resolved.

    Steps, in order:
      1. Add ``TotalSpending``: the row-wise sum of ``spending_cols``, with
         missing amounts skipped (a row with no recorded amounts sums to 0).
      2. Fill missing ``CryoSleep`` from spending: ``False`` when
         ``TotalSpending > 0``, otherwise ``True``.
      3. Replace every missing spending amount with 0.
      4. Cast ``CryoSleep`` to ``bool``.

    The previous version first zero-filled spending only for selected rows
    (age < 13, CryoSleep, or all amounts missing) and then zero-filled every
    remaining NaN anyway. The selective step had no effect on the result and
    was the only reason an ``Age`` column was required, so it was dropped.
    Output is unchanged for frames that do have ``Age``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CryoSleep`` and every column in ``spending_cols``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` plus ``TotalSpending``, with no NaN in ``CryoSleep``
        or in the spending columns.

    Raises
    ------
    KeyError
        If ``CryoSleep`` or one of ``spending_cols`` is missing.
    """
    df = df.copy()

    # Must be computed before the zero-fill below only for readability; NaNs
    # are skipped by the sum, so the value is the same either way.
    df["TotalSpending"] = df[spending_cols].sum(axis=1, skipna=True)

    # Unknown CryoSleep: any recorded spending means the passenger was awake.
    mask_cryo_nan = df["CryoSleep"].isna()
    df.loc[mask_cryo_nan, "CryoSleep"] = ~df.loc[mask_cryo_nan, "TotalSpending"].gt(0)

    # Every missing spending amount is treated as "nothing spent".
    df[spending_cols] = df[spending_cols].fillna(0)

    # CryoSleep has no NaN left at this point, so the cast is lossless.
    df["CryoSleep"] = df["CryoSleep"].astype(bool)

    return df

In [11]:
# Read the raw train/test CSVs from the local data directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

# Run both frames through the same preprocessing.
train_p, test_p = preprocess(train), preprocess(test)

# Feature matrices: select the model columns and pass them through
# get_dummies so any non-numeric feature would be one-hot encoded.
X_train_full, X_test_full = (
    pd.get_dummies(frame[feature_cols]) for frame in (train_p, test_p)
)

# Outer-align the columns so both matrices have the same layout; a column
# present on only one side is added to the other filled with 0.
X_train_full, X_test_full = X_train_full.align(
    X_test_full,
    join="outer",
    axis=1,
    fill_value=0,
)

# Target vector.
y = train_p["Transported"].astype(bool)

In [12]:
# Hold out 20% of the labelled rows, stratified on the target.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_full,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random forest with a depth cap, seeded with random_state=42.
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    random_state=42,
)
model.fit(X_tr, y_tr)

# Accuracy on the held-out split, reported as a percentage.
val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, val_pred)
print(f"Validation accuracy: {val_acc*100:.2f}%")

Validation accuracy: 80.28%


In [None]:
# Refit the same estimator on every labelled row, then predict the test set.
model.fit(X_train_full, y)
test_pred = model.predict(X_test_full)

# Two-column submission frame: passenger id alongside its predicted label.
submission = pd.DataFrame(
    {"PassengerId": test_p["PassengerId"], "Transported": test_pred}
)

# Persist without the index column.
submission.to_csv("data/submission.csv", index=False)
print("Wrote submission.csv")

Wrote submission.csv
