In [2]:
import pandas as pd
import numpy as np

In [3]:
# This notebook trains an XGBoost gradient-boosting classifier (XGBClassifier)
# to predict the `Transported` column.

# Read the training data.
spaceship_file_path = 'train.csv'
spaceship_data = pd.read_csv(spaceship_file_path)

# Target: the `Transported` column of the training data.
y = spaceship_data["Transported"]

# Columns used as model inputs.
features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
]

# Training feature matrix, before any encoding or imputation.
X_train_full = spaceship_data.loc[:, features]

# Read the test data; the same feature columns are selected from it later.
test_file_path = 'test.csv'
test_data = pd.read_csv(test_file_path)

In [4]:
# One-hot encode the categorical features of the train and test data.
# pd.get_dummies is simpler than scikit-learn's OneHotEncoder, but it encodes
# each frame independently: a category that appears in only one of the two
# files would leave the frames with different columns.
X_train = pd.get_dummies(X_train_full)

# Align the test matrix to the training columns so both frames always have the
# same columns in the same order. A dummy column missing from the test data is
# added and filled with 0; a dummy column the training data never produced is
# dropped, since the model cannot use it.
X_test = pd.get_dummies(test_data[features]).reindex(
    columns=X_train.columns, fill_value=0
)

In [5]:
# Show the column labels of the encoded training matrix (numeric features
# followed by the dummy columns created by get_dummies).
X_train.columns


Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'CryoSleep_False', 'CryoSleep_True', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'VIP_False',
       'VIP_True'],
      dtype='object')

In [6]:
# Show the column labels of the encoded test matrix; they should match the
# training columns shown above.
X_test.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'CryoSleep_False', 'CryoSleep_True', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'VIP_False',
       'VIP_True'],
      dtype='object')

In [7]:
# Both X_train and X_test contain missing numerical values; impute them.
from sklearn.impute import SimpleImputer

# Median imputation. The medians are learned from X_train only (fit_transform)
# and then reused for X_test (transform), so both frames are filled with the
# same values. X_test must therefore have the same columns as X_train.
my_imputer = SimpleImputer(strategy='median')

# The imputer returns plain arrays, so rebuild the DataFrames with the original
# column labels and the original row index. Keeping the index means the rows
# stay aligned with `y` and with `test_data` even if those frames do not use
# the default 0..n-1 index.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [8]:
# Count the remaining missing values per column after imputation.
imputed_X_train.isna().sum()

Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
CryoSleep_False              0
CryoSleep_True               0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
VIP_False                    0
VIP_True                     0
dtype: int64

In [9]:
# Display the target Series (the `Transported` column of the training data).
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [10]:
# Preview the first rows of the imputed test matrix.
imputed_X_test.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
0,27.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,19.0,0.0,9.0,0.0,2823.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,31.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,38.0,0.0,6652.0,0.0,181.0,585.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,20.0,10.0,0.0,635.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [11]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with the library's default hyperparameters.
my_model = XGBClassifier()

# Train on the imputed training matrix and the `Transported` target.
# Left as the last expression so the fitted estimator is displayed.
my_model.fit(X=imputed_X_train, y=y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [12]:
# Display the target Series again (same `Transported` column as shown earlier).
y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [13]:
# Predict the class of every row in the imputed test matrix.
# The classifier returns an array of integer labels (0 or 1), not booleans.
my_list = my_model.predict(X=imputed_X_test)


In [14]:
# Display the raw predictions returned by the model.
my_list

array([1, 0, 1, ..., 1, 1, 0])

In [15]:
# The submission needs False/True rather than 0/1, so convert every predicted
# label to a Python bool.
predictions = list(map(bool, my_list))

In [16]:
# Use cross-validation to evaluate the performance of the model.
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Evaluate imputer + classifier together on the un-imputed matrix. Inside a
# pipeline the medians are re-learned from the training folds of every split,
# so the held-out fold never influences the values used to fill it. Scoring on
# `imputed_X_train` would use medians computed from all rows, including the
# validation rows. Unfitted clones are used so the already-fitted `my_imputer`
# and `my_model` are left untouched.
cv_pipeline = make_pipeline(clone(my_imputer), clone(my_model))
scores = cross_val_score(cv_pipeline, X_train, y, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())


Mean cross-validation score: 0.79


In [17]:
# Build the submission table: one row per test passenger, holding the
# passenger id and the predicted `Transported` flag.
output = test_data[['PassengerId']].assign(Transported=predictions)

# Write the table to disk without the row index, then confirm.
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
