In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from typing import Union, Optional, Dict, Tuple, Any

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay, accuracy_score

In [10]:
# Read train.csv and discard rows in which every field is missing.
spaceship = (
    pd.read_csv('../datasets/spaceship_titanic/train.csv')
    .dropna(how='all')
)
spaceship

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [11]:
# Per-column dtypes and non-null counts; memory_usage='deep' also measures the
# actual size of the object (string) columns instead of estimating it.
spaceship.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 3.4 MB


In [12]:
# Number of missing values in each column.
spaceship.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [16]:
# Keep only rows with at most one missing value (rows with two or more NaNs are dropped).
not_nan_rows_mask = spaceship.isna().sum(axis=1) < 2
# .copy() detaches the filtered frame from the original so that later column
# assignments on `spaceship` do not raise SettingWithCopyWarning.
spaceship = spaceship[not_nan_rows_mask].copy()

In [18]:
# Missing values per column after removing the rows that had two or more NaNs.
spaceship.isna().sum()

PassengerId       0
HomePlanet      163
CryoSleep       167
Cabin           158
Destination     146
Age             151
VIP             165
RoomService     149
FoodCourt       142
ShoppingMall    164
Spa             153
VRDeck          151
Name            158
Transported       0
dtype: int64

In [20]:
# Encode the boolean target as 0/1. `assign` returns a new frame instead of writing
# into the filtered one, which avoids SettingWithCopyWarning and keeps the cell idempotent.
spaceship = spaceship.assign(Transported=spaceship['Transported'].astype('int8'))

In [21]:
# Separate the predictor columns from the target column.
x = spaceship.drop(columns='Transported')
y = spaceship['Transported']

In [22]:
# Hold out 25% of the rows; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [24]:
# Shapes of the feature and target partitions produced by the split.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((6354, 13), (2119, 13), (6354,), (2119,))

In [None]:
def preprocess_first(
    data: pd.DataFrame,
    age_mode: str = 'median',
    vip_mode: str = 'median',
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Turn raw passenger rows into a numeric feature frame and record the fill statistics.

    The input frame is copied, never modified. The function derives ``Group`` /
    ``GroupSize`` from ``PassengerId``, ``CabinSide`` from ``Cabin`` and ``TotalBill``
    from the five spending columns, fills missing values, one-hot encodes
    ``HomePlanet`` / ``Destination`` / ``CabinSide`` and returns the columns in a
    fixed order.

    Args:
        data: Frame containing ``PassengerId``, ``HomePlanet``, ``CryoSleep``,
            ``Cabin``, ``Destination``, ``Age``, ``VIP``, ``Name`` and the spending
            columns ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa``,
            ``VRDeck``.
        age_mode: ``'median'`` or ``'mean'``; statistic of ``Age`` used as the
            global fallback for missing ages.
        vip_mode: ``'median'`` or ``'mean'``; statistic of ``TotalBill`` among VIP
            passengers used as the threshold when imputing a missing ``VIP`` flag.

    Returns:
        ``(features, data_info)`` where ``features`` has the columns
        ``GroupSize``, ``HomePlanet_*``, ``CryoSleep``, ``CabinSide_*``,
        ``Destination_*``, ``Age``, ``VIP``, ``TotalBill`` and ``data_info`` holds
        the values computed here: ``home_planet``, ``cryo_sleep``, ``destination``,
        ``cabin_side``, ``age``, ``bill_cols``, ``age_mode``, ``vip_mode``,
        ``vip_threshold``, ``cols_to_fill`` and ``data_columns``.

    Raises:
        ValueError: if ``age_mode`` or ``vip_mode`` is not ``'median'`` or ``'mean'``.
    """
    valid_modes = ('median', 'mean')
    for arg_name, arg_value in (('age_mode', age_mode), ('vip_mode', vip_mode)):
        if arg_value not in valid_modes:
            raise ValueError(f"{arg_name} must be one of {valid_modes}, got {arg_value!r}")

    data = data.copy()
    data_info: Dict[str, Any] = {}

    # PassengerId looks like "gggg_pp"; the part before "_" identifies the travel group.
    data['Group'] = data['PassengerId'].str.split('_').str[0]
    # NOTE: the group size is counted within the rows passed in, not the full dataset.
    data['GroupSize'] = data.groupby('Group')['Group'].transform('count').astype(np.int16)
    # Cabin looks like "deck/num/side"; only the last component is kept.
    data['CabinSide'] = data['Cabin'].str.split('/').str[-1]

    bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # A missing bill is treated as no spending.
    data[bill_cols] = data[bill_cols].fillna(0).astype('int32')
    # TotalBill must exist before the VIP threshold is computed and before the
    # individual bill columns are dropped below.
    data['TotalBill'] = data[bill_cols].sum(axis=1)

    data_info['home_planet'] = data['HomePlanet'].mode()[0]
    data_info['cryo_sleep'] = data['CryoSleep'].mode()[0]
    data_info['destination'] = data['Destination'].mode()[0]
    data_info['cabin_side'] = data['CabinSide'].mode()[0]
    data_info['age'] = data['Age'].median() if age_mode == 'median' else data['Age'].mean()

    data_info['bill_cols'] = bill_cols
    data_info['age_mode'] = age_mode
    data_info['vip_mode'] = vip_mode

    # Rows whose VIP flag is missing compare as False and are therefore excluded.
    vip_bills = data.loc[data['VIP'] == True, 'TotalBill']
    data_info['vip_threshold'] = vip_bills.median() if vip_mode == 'median' else vip_bills.mean()

    cols_to_fill = {
        'HomePlanet': data_info['home_planet'],
        'CryoSleep': data_info['cryo_sleep'],
        'Destination': data_info['destination'],
        'CabinSide': data_info['cabin_side'],
        'Age': data_info['age']
    }

    data_info['cols_to_fill'] = cols_to_fill

    # NOTE(review): `fill_na_logic` is not defined in this function; it must already
    # exist in the notebook namespace. It is called as fill_na_logic(group_series,
    # global_value) and presumably returns the group's series with gaps filled -
    # confirm against its definition.
    for col, global_val in cols_to_fill.items():
        group_filled = data.groupby('Group')[col].transform(
            lambda s: fill_na_logic(s, global_val)
        )
        # infer_objects converts object columns that became fully typed (e.g. all-bool)
        # back to a concrete dtype after the fill.
        data[col] = data[col].fillna(group_filled).infer_objects()

    # A missing VIP flag is imputed as True only when the passenger spent more than
    # the VIP threshold.
    vip_mask = data['VIP'].isna()
    new_vip_values = (data.loc[vip_mask, 'TotalBill'] > data_info['vip_threshold']).astype(bool)
    data.loc[vip_mask, 'VIP'] = new_vip_values

    data['VIP'] = data['VIP'].astype('int8')
    data['CryoSleep'] = data['CryoSleep'].astype('int8')
    data['Age'] = data['Age'].astype('int16')

    data = pd.get_dummies(data, columns=['HomePlanet', 'Destination', 'CabinSide'], dtype='int8')

    drop_cols = ['Name', 'PassengerId', 'Cabin', 'Group'] + bill_cols
    data = data.drop(columns=drop_cols)

    # Sort each dummy family so the column order does not depend on category order.
    home_planet_cols = sorted(c for c in data.columns if c.startswith('HomePlanet_'))
    cabin_side_cols = sorted(c for c in data.columns if c.startswith('CabinSide_'))
    destination_cols = sorted(c for c in data.columns if c.startswith('Destination_'))

    columns_order = (
        ['GroupSize'] +
        home_planet_cols +
        ['CryoSleep'] +
        cabin_side_cols +
        destination_cols +
        ['Age', 'VIP', 'TotalBill']
    )

    data = data.reindex(columns=columns_order, fill_value=0)

    # Dummy columns are the only ones whose names contain "_"; VIP and CryoSleep are 0/1 flags.
    ohe_cols = [c for c in data.columns if '_' in c or c in ['VIP', 'CryoSleep']]
    data[ohe_cols] = data[ohe_cols].astype('int8')

    data_info["data_columns"] = list(data.columns)

    return data, data_info