In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import xgboost as xgb
train = pd.read_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\train.csv")
test = pd.read_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\test.csv")
data = pd.concat([train, test], ignore_index=True, sort=False)

From EDA we saw numerical values had massive outliers, so we winsorize.

In [2]:
from scipy.stats.mstats import winsorize
numeric_cols = data.select_dtypes(include=[np.number]).columns
for column in numeric_cols:
    data[column] = winsorize(data[column], limits=[0, 0.02])

People in the same group or family share characteristics as we saw from the EDA, so it is useful to create columns for these.

In [3]:
#DEFINING FAMILY AND GROUP

# Split the PassengerId column
id_split = data['PassengerId'].str.split('_', expand=True)
# Assign the group column to the dataframe
data['Group'] = id_split[0]
# Find family members by grouping on last name
name_split = data['Name'].str.split(' ', expand=True)
# Assign the split columns to the dataframe
data['Surname'] = name_split[1]
data['Family'] = data['Group'].astype(str) + "_" + name_split[1]
data['GroupSize'] = data.groupby('Group')['Group'].transform('size')

We fill in missing values based on assumptions which we justified in the EDA

In [4]:
def impute_based_on_assumptions(data, age_col, cryo_col):
    numeric_columns = data.select_dtypes(include=[np.number]).columns.drop(age_col)

    # Assumption 1: Under 13s have spent no money and are not VIPs
    data.loc[(data[age_col] < 13), numeric_columns] = data.loc[(data[age_col] < 13), numeric_columns].fillna(0)
    data.loc[(data[age_col] < 13), 'VIP'] = data.loc[(data[age_col] < 13), 'VIP'].fillna(False)

    # Assumption 2: Those in cryosleep spent no money
    data.loc[data[cryo_col] == True, numeric_columns] = data.loc[data[cryo_col] == True, numeric_columns].fillna(0)
    
    #Assumption 3: Those in cryosleep are not VIPs ----- NOT VALID
    #data.loc[data[cryo_col] == True, 'VIP'] = data.loc[data[cryo_col] == True, 'VIP'].fillna(False)
    
    
    data['NoSpend'] = data['VRDeck'] + data['ShoppingMall'] + data['Spa'] + data['RoomService'] + data['FoodCourt'] == 0
    

    #Assumption 4: Single Travellers with no expenses who are not VIPs are in Cryosleep
    data.loc[(data['GroupSize'] == 1) & (data['VIP'] == False) & (data['NoSpend']==True), cryo_col].fillna(True)

    # Assumption 5: Single travellers who are not in cryosleep have spent money
    data.loc[(data['GroupSize'] == 1) & (data[cryo_col] == False), 'NoSpend'].fillna(True)

    return data

data = impute_based_on_assumptions(data, 'Age', 'CryoSleep')
data = impute_based_on_assumptions(data, 'Age', 'CryoSleep')

People in the same family/cabin/group usually have similar data, so we fill missing values based on this.

In [5]:
#Filling missing HomePlanet and VIP values based on other group members
def impute_from_group(data, group_col, impute_columns):
    for col in impute_columns:
        # For each group, find the mode of the column
        group_mode = data.groupby(group_col)[col].apply(lambda x: x.mode().get(0, np.nan))
        # Join this back to the original dataframe
        data = data.join(group_mode, on=group_col, rsuffix='_group_mode')
        
        # Impute missing values for non-single group members from the group mode
        group_counts = data[group_col].value_counts()
        non_single_groups = group_counts[group_counts > 1].index
        condition = (data[col].isnull()) & (data[group_col].isin(non_single_groups))
        data.loc[condition, col] = data.loc[condition, col + '_group_mode']
        
        # Drop the temporary columns
        data = data.drop(columns=[col + '_group_mode'])

    return data

# Impute 'HomePlanet' and 'VIP' based on group members
data = impute_from_group(data, 'Group', ['HomePlanet', 'VIP'])

New columns which might be relevant

In [6]:
# Split the Cabin column
cabin_split = data['Cabin'].str.split('/', expand=True)

data['Deck'] = cabin_split[0]
data['Side'] = cabin_split[2]

#NonEssential Spending
data['Luxury'] = data['VRDeck'] + data['Spa'] + data['RoomService']
#data['NonEssentialSpending'] = np.log(data['NonEssentialSpending'] + 1)

#Essential Spending
data['Essential'] = data['ShoppingMall'] + data['FoodCourt']
#data['EssentialSpending'] = np.log(data['EssentialSpending'] + 1)

# We remove unnecessary columns
PassengerID = data['PassengerId']
Transported = data['Transported']
datan = data.drop(columns=['PassengerId', 'Cabin','Name','Family','Group','Surname','Transported'])
datan = datan.drop(columns=['VRDeck','Spa','RoomService','ShoppingMall','FoodCourt'])
print(datan.isnull().sum())

HomePlanet     157
CryoSleep      310
Destination    274
Age            270
VIP            168
GroupSize        0
NoSpend          0
Deck           299
Side           299
Luxury         469
Essential      332
dtype: int64


In [7]:
from sklearn.preprocessing import StandardScaler

# Features to be scaled
features_to_scale = ['Age', 'GroupSize', 'Luxury', 'Essential']

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the selected features
datan[features_to_scale] = scaler.fit_transform(datan[features_to_scale])

Imputing with Iterative Imputer and with KNN proved to give similar results. I may try voting with one model trained on a data set which was processed with iterative impuitation and one processed with KNN to see if that can help.

In [8]:
from sklearn.impute import IterativeImputer

# One-hot encode categorical variables
encoded = pd.get_dummies(datan, columns=['HomePlanet', 'CryoSleep', 'Destination', 'VIP','Deck','Side','NoSpend'])

# Apply Iterative Imputer
iterative_imputer = IterativeImputer()
data_imputed = iterative_imputer.fit_transform(encoded)

# Convert back to DataFrame
dataf = pd.DataFrame(data_imputed, columns=encoded.columns)

# Check if there are still any missing values
print(dataf.isnull().sum())
dataf['Transported'] = Transported
dataf['PassengerId'] = PassengerID


# Splitting back into train and test sets
train = dataf[dataf['Transported'].notna()].copy()
test = dataf[dataf['Transported'].isna()].copy()
test.drop(columns=['Transported'], inplace=True)

Age                          0
GroupSize                    0
Luxury                       0
Essential                    0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
CryoSleep_False              0
CryoSleep_True               0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
VIP_False                    0
VIP_True                     0
Deck_A                       0
Deck_B                       0
Deck_C                       0
Deck_D                       0
Deck_E                       0
Deck_F                       0
Deck_G                       0
Deck_T                       0
Side_P                       0
Side_S                       0
NoSpend_False                0
NoSpend_True                 0
dtype: int64


In [9]:
# Save the DataFrame to a CSV file
train.to_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process\\CombinedTrain.csv", index=False)
test.to_csv("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic\\Stacking\\Process\\CombinedTest.csv", index=False)

In [10]:
train.head(100)

Unnamed: 0,Age,GroupSize,Luxury,Essential,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,...,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S,NoSpend_False,NoSpend_True,Transported,PassengerId
0,0.710937,-0.648668,-0.445806,-0.365928,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,False,0001_01
1,-0.331693,-0.648668,-0.062512,-0.345663,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,True,0002_01
2,2.031602,-0.014569,3.270830,1.765479,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,False,0003_01
3,0.293885,-0.014569,1.477213,0.619907,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,False,0003_02
4,-0.887763,-0.648668,0.029216,-0.234205,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,True,0004_01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.331693,0.619530,-0.436524,-0.087789,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,True,0103_01
96,-1.374323,0.619530,-0.445806,-0.365928,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,True,0103_02
97,-1.582849,0.619530,-0.445806,-0.365928,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,False,0103_03
98,-0.123167,-0.648668,-0.373187,-0.026190,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,False,0105_01


In [11]:
test.head(100)

Unnamed: 0,Age,GroupSize,Luxury,Essential,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S,NoSpend_False,NoSpend_True,PassengerId
8693,-0.123167,-0.648668,-0.445806,-0.365928,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0013_01
8694,-0.679237,-0.648668,1.095558,-0.360563,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0018_01
8695,0.154868,-0.648668,-0.445806,-0.365928,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0019_01
8696,0.641428,-0.648668,-0.027568,3.598870,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0021_01
8697,-0.609728,-0.648668,-0.440346,0.012552,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0023_01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8788,0.919463,-0.014569,2.178826,-0.354007,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0214_02
8789,-0.401202,-0.648668,-0.445806,-0.365928,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0215_01
8790,-0.679237,-0.648668,-0.041218,-0.358179,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0218_01
8791,-0.262185,-0.648668,0.391761,6.745317,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0226_01
