# Spaceship Titanic

## Import

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## Loading the Data

In [2]:
train = pd.read_csv("/Users/andrei/Downloads/spaceship-titanic/train.csv")

In [3]:
X = train.drop(columns = ["Transported", "PassengerId", "Name"])

In [4]:
y = train[["Transported"]]

In [5]:
y = y.Transported.astype(int)

In [6]:
y

0       0
1       1
2       0
3       0
4       1
       ..
8688    0
8689    0
8690    1
8691    0
8692    1
Name: Transported, Length: 8693, dtype: int64

## Preprocessing

### Imputing

In [7]:
X[['Deck','Room', 'Side']] = X.Cabin.str.split("/",expand=True)

In [8]:
X.drop(columns = ["Room"], inplace = True)

In [9]:
cat_imputer = SimpleImputer(strategy = "most_frequent")

In [10]:
age_imputer = SimpleImputer()

In [11]:
mean_imputer = SimpleImputer(fill_value = 0)

In [12]:
X[["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]] = cat_imputer.fit_transform(X[["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]])

In [13]:
X[["Age"]] = age_imputer.fit_transform(X[["Age"]])

In [14]:
X[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]] = mean_imputer.fit_transform(X[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]])

In [15]:
X.isna().sum()

HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Deck              0
Side              0
dtype: int64

### Encoding

In [16]:
X["CryoSleep"], X["VIP"] = X["CryoSleep"].astype(int), X["VIP"].astype(int)

In [17]:
X["isPort"] = np.where(X["Side"] == "P", 0, 1)

In [18]:
X.drop(columns = ["Side"], inplace = True)

In [19]:
X.drop(columns = ["Cabin"], inplace = True)

In [20]:
ohe = OneHotEncoder(sparse=False)

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(X[['HomePlanet','Destination','Deck']])
cols = ohe.get_feature_names_out(['HomePlanet','Destination','Deck'])
#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=X.index, columns=cols)

#Extract only the columns that didnt need to be encoded
data_other_cols = X.drop(columns=['HomePlanet','Destination','Deck'])

#Concatenate the two dataframes : 
X_ohe = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [21]:
X = X_ohe

### Normalization

In [22]:
import matplotlib.pyplot as plt

In [23]:
from sklearn.preprocessing import StandardScaler, RobustScaler

In [24]:
standard = StandardScaler()
robust = RobustScaler()

In [25]:
X["Age"] = standard.fit_transform(X[["Age"]])

In [26]:
X["RoomService"] = robust.fit_transform(X[["RoomService"]])

In [27]:
X["VRDeck"] = robust.fit_transform(X[["VRDeck"]])

In [28]:
X["FoodCourt"] = robust.fit_transform(X[["FoodCourt"]])

In [29]:
X["ShoppingMall"] = robust.fit_transform(X[["ShoppingMall"]])

In [30]:
X["Spa"] = robust.fit_transform(X[["Spa"]])

In [31]:
X.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,...,Deck_T,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,isPort
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0,0.709437,0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.336717,0,1.397436,0.076271,0.555556,6.168539,0.619718,1
2,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,2.034566,1,0.551282,30.305085,0.0,75.449438,0.690141,1
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0,0.290975,0,0.0,10.872881,8.244444,37.404494,2.71831,1
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0,-0.894666,0,3.884615,0.59322,3.355556,6.348315,0.028169,1


### Split

In [45]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Prediction 1

In [76]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [77]:
xgb_class = xgb.XGBClassifier(max_depth=1000, n_estimators=100, learning_rate=0.01, use_label_encoder=False)

xgb_class.fit(X_train, y_train,
    eval_set=[(X_val, y_val)],  
    early_stopping_rounds= 100,
    eval_metric = "error")

[0]	validation_0-error:0.22024
[1]	validation_0-error:0.22427
[2]	validation_0-error:0.22484
[3]	validation_0-error:0.22657
[4]	validation_0-error:0.22887
[5]	validation_0-error:0.22829
[6]	validation_0-error:0.22829
[7]	validation_0-error:0.22829
[8]	validation_0-error:0.22427
[9]	validation_0-error:0.22312
[10]	validation_0-error:0.22369
[11]	validation_0-error:0.22369
[12]	validation_0-error:0.22369
[13]	validation_0-error:0.22484
[14]	validation_0-error:0.22599
[15]	validation_0-error:0.22542
[16]	validation_0-error:0.22312
[17]	validation_0-error:0.22484
[18]	validation_0-error:0.22714
[19]	validation_0-error:0.22599
[20]	validation_0-error:0.22542
[21]	validation_0-error:0.22082
[22]	validation_0-error:0.22254
[23]	validation_0-error:0.22369
[24]	validation_0-error:0.22369
[25]	validation_0-error:0.22484
[26]	validation_0-error:0.22427
[27]	validation_0-error:0.22484
[28]	validation_0-error:0.22369
[29]	validation_0-error:0.22369
[30]	validation_0-error:0.22139
[31]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=1000, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

## Gridsearch for parameters

In [55]:
from sklearn.model_selection import GridSearchCV

In [56]:
xgb_class.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=1000, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)>

In [78]:
gs = GridSearchCV(xgb_class,
                 param_grid={"max_depth":range(100, 1000, 50), "learning_rate":[0.001, 0.01, 0.1], "n_estimators":range(10, 200, 5)},
                 scoring = "roc_auc")
             

In [None]:
results = gs.fit(X_train, y_train, verbose = 1)























































































































































































































































































































































































































































































































































