# Spaceship Titanic

- [Link to Source](https://www.kaggle.com/competitions/spaceship-titanic/overview)

---

In [17]:
import sys

!{sys.executable} -m pip install hyperopt --quiet
!{sys.executable} -m pip install xgboost --quiet

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [18]:
import os
import time

import sagemaker
import time
import boto3
import numpy as np
import pandas as pd
from numpy import array
import xgboost as xgb

from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [19]:
# S3 location that holds the raw CSVs and receives every artefact of this run.
bucket = "test-sagemaker-examples-1357942113492"
prefix = "DEMO_Spaceship"

# SageMaker handles reused by the training and batch-transform cells below.
sess = sagemaker.Session()
region = sess.boto_region_name
role = sagemaker.get_execution_role()

In [20]:
# Load the raw train/test CSVs straight from S3.
train, test = (
    pd.read_csv(f's3://{bucket}/{prefix}/train.csv'),
    pd.read_csv(f's3://{bucket}/{prefix}/test.csv'),
)

In [21]:
# Preview the first rows of the raw training frame as loaded from S3.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [22]:
def scale_columns(df, column):
    """Mean-impute and min-max scale one numeric column of ``df``.

    Missing values are replaced by the column mean, then every value is mapped
    linearly onto [0, 1] using the column's own minimum and maximum. The
    scaled values are written back into ``df[column]`` (the frame is modified
    in place) and are also returned.

    Args:
        df: DataFrame that holds the column.
        column: Label of the numeric column to scale.

    Returns:
        The scaled column as a float Series. A constant column yields all
        zeros instead of the NaN produced by 0/0.
    """
    values = df[column].fillna(df[column].mean())
    lowest = values.min()
    span = values.max() - lowest

    if span == 0:
        # Every value equals the minimum: define the scaled value as 0.
        df[column] = (values - lowest).astype(float)
    else:
        # No integer truncation here: min/max come from the untruncated data,
        # so truncating only the numerator would push results outside [0, 1].
        df[column] = (values - lowest) / span
    return df[column]

def change_to_int(column_data, nan_value):
    """Fill missing entries with ``nan_value`` and cast the column to ``int``.

    Args:
        column_data: Column (Series) to convert.
        nan_value: Replacement used for missing entries before the cast.

    Returns:
        A new integer column; ``column_data`` itself is not modified.
    """
    return column_data.fillna(nan_value).astype(int)

def one_hot_encode(df, column):
    """One-hot encode a categorical column after imputing its mode.

    Missing values in ``df[column]`` are replaced, in ``df`` itself, by the
    most frequent value. The distinct values are then sorted and every row is
    turned into an indicator vector over them.

    Args:
        df: DataFrame that holds the column; ``df[column]`` is overwritten
            with the imputed values.
        column: Label of the categorical column.

    Returns:
        Tuple ``(hot_encoded, num_unique_val)``: a float ndarray of shape
        ``(len(df), num_unique_val)`` whose columns follow the sorted distinct
        values, and the number of distinct values.
    """
    most_common = df[column].value_counts().idxmax()
    # Assign the result back: an in-place fillna on the selected column
    # (`df[column].fillna(..., inplace=True)`) is a chained operation that is
    # not guaranteed to update `df`.
    df[column] = df[column].fillna(most_common)

    # Sorted distinct values plus, per row, the position of its value in that
    # sorted list. This is the same integer coding LabelEncoder produces, and
    # avoids `OneHotEncoder(sparse=False)`, whose `sparse` argument was
    # renamed to `sparse_output` and later removed from scikit-learn.
    classes, codes = np.unique(df[column].to_numpy(), return_inverse=True)
    num_unique_val = len(classes)

    # Row i of the identity matrix is the indicator vector for code i.
    hot_encoded = np.eye(num_unique_val)[codes]
    return hot_encoded, num_unique_val

In [23]:
def dataset_transform(df):
    """Turn a raw passenger frame into an all-numeric feature frame.

    Steps, in order: move the ``Transported`` label (when present) to the
    first column as 0/1; cast ``VIP`` and ``CryoSleep`` to int with missing
    values treated as False; drop ``PassengerId``, ``Cabin`` and ``Name``; add
    a ``Cost`` column summing the five spending columns; min-max scale the
    numeric columns; one-hot encode ``HomePlanet`` and ``Destination`` into
    ``<column>-<index>`` columns.

    All work happens on a copy, so the frame passed in is left untouched.

    NOTE(review): scaling ranges and one-hot categories are derived from the
    frame passed in, so separate calls (e.g. train and test) are scaled and
    encoded independently of each other.

    Args:
        df: Raw DataFrame containing ``PassengerId``, ``Cabin``, ``Name``,
            ``VIP``, ``CryoSleep``, ``Age``, the five spending columns,
            ``HomePlanet`` and ``Destination``; ``Transported`` is optional.

    Returns:
        Tuple ``(features, passenger_id)``: the transformed DataFrame and the
        ``PassengerId`` column of the input.
    """
    # Work on a private copy so the caller's frame is never mutated.
    df = df.copy()

    if "Transported" in df.columns:
        df.insert(0, 'Transported', df.pop('Transported').astype(int))

    passenger_id = df['PassengerId']

    for flag in ('VIP', 'CryoSleep'):
        df[flag] = change_to_int(df[flag], False)

    df = df.drop(columns=['PassengerId', 'Cabin', 'Name'])

    # Plain `+` propagates NaN: a row with any missing spending value gets a
    # NaN Cost, which scale_columns then fills with the mean Cost.
    df['Cost'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']

    list_to_scale = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cost']
    for name in list_to_scale:
        df[name] = scale_columns(df, name)

    list_to_hot_encode = ['HomePlanet', 'Destination']
    for name in list_to_hot_encode:
        hot_encoded, unique_val_count = one_hot_encode(df, name)
        new_columns = [f'{name}-{j}' for j in range(unique_val_count)]
        df[new_columns] = hot_encoded
    # The original categorical columns are fully represented by the new ones.
    df = df.drop(columns=list_to_hot_encode)

    return df, passenger_id

In [24]:
# Transform both splits and keep their PassengerId columns for the submission.
# NOTE: `train`/`test` are rebound to the transformed frames, which no longer
# contain `PassengerId`; running this cell a second time therefore fails with
# a KeyError inside dataset_transform. Re-run the CSV load cell first.
train, train_pass = dataset_transform(train)
test, test_pass = dataset_transform(test)

In [25]:
# Preview the transformed training frame (label first, then the features).
train.head()

Unnamed: 0,Transported,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cost,HomePlanet-0,HomePlanet-1,HomePlanet-2,Destination-0,Destination-1,Destination-2
0,0,0,0.493671,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,0,0.303797,0,0.007608,0.000302,0.001064,0.0245,0.001823,0.020452,1.0,0.0,0.0,0.0,0.0,1.0
2,0,0,0.734177,1,0.003001,0.119948,0.0,0.29967,0.00203,0.288521,0.0,1.0,0.0,0.0,0.0,1.0
3,0,0,0.417722,0,0.0,0.043035,0.015793,0.148563,0.007997,0.14383,0.0,1.0,0.0,0.0,0.0,1.0
4,1,0,0.202532,0,0.021149,0.002348,0.006428,0.025214,8.3e-05,0.030317,1.0,0.0,0.0,0.0,0.0,1.0


In [26]:
# PassengerId column returned by dataset_transform for the training split.
train_pass

0       0001_01
1       0002_01
2       0003_01
3       0003_02
4       0004_01
         ...   
8688    9276_01
8689    9278_01
8690    9279_01
8691    9280_01
8692    9280_02
Name: PassengerId, Length: 8693, dtype: object

In [27]:
# Preview the transformed test frame (same features, no `Transported` column).
test.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cost,HomePlanet-0,HomePlanet-1,HomePlanet-2,Destination-0,Destination-1,Destination-2
0,1,0.341772,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0,0.240506,0,0.0,0.000356,0.0,0.14226,0.0,0.08412,1.0,0.0,0.0,0.0,0.0,1.0
2,1,0.392405,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,0.481013,0,0.0,0.263206,0.0,0.009121,0.026266,0.220341,0.0,1.0,0.0,0.0,0.0,1.0
4,0,0.253165,0,0.000865,0.0,0.07658,0.0,0.0,0.019159,1.0,0.0,0.0,0.0,0.0,1.0


In [28]:
train_file = "train_modified.csv"
test_file = "test_modified.csv"

# Write both frames without header row and without index column.
train.to_csv(f'./{train_file}', index=False, header=False)
test.to_csv(f'./{test_file}', index=False, header=False)

# S3 keys always use "/" as separator, so build them explicitly instead of
# with os.path.join (whose separator depends on the local OS). A single
# bucket handle is reused for both uploads.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object(f'{prefix}/train/{train_file}').upload_file(train_file)
s3_bucket.Object(f'{prefix}/test/{test_file}').upload_file(test_file)


In [29]:
# Hyperopt search space for the local XGBoost tuning run. Entries built with
# `hp.*` are sampled per trial; `n_estimators` and `seed` are fixed values.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

In [None]:
def objective(space):
    """Hyperopt objective: fit an XGBoost classifier and score it.

    Args:
        space: Mapping of sampled hyperparameters with the keys
            ``n_estimators``, ``max_depth``, ``gamma``, ``reg_alpha``,
            ``reg_lambda``, ``min_child_weight``, ``colsample_bytree`` and
            ``seed``.

    Returns:
        Dict with ``loss`` (the negated accuracy on ``X_test``, since hyperopt
        minimises the loss) and ``status``.

    Note:
        Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the
        notebook namespace. No visible cell defines them — TODO confirm they
        are created before this function is called.
    """
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # Sampled in the search space but previously never passed on.
        reg_lambda=space['reg_lambda'],
        min_child_weight=int(space['min_child_weight']),
        # Must stay a fraction: int() truncated every sampled value below 1
        # down to 0.
        colsample_bytree=space['colsample_bytree'],
        random_state=space['seed'],
        # Configured on the estimator rather than in fit(): recent XGBoost
        # releases no longer accept these as fit() arguments.
        eval_metric="auc",
        early_stopping_rounds=10,
    )

    evaluation = [(X_train, y_train), (X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=evaluation, verbose=False)

    # predict() already returns class labels, so no thresholding is needed.
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [58]:
from sagemaker.inputs import TrainingInput

# Container image for the XGBoost algorithm (version 1.7-1) in this region.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")

# Where the training job stores its model artefact.
output_path = f's3://{bucket}/{prefix}/output'

hyperpar = dict(
    objective='binary:logistic',
    eval_metric='error',
    num_round='100',
)

# Training channel: the header-less CSV uploaded earlier.
input_train = TrainingInput(
    content_type="csv",
    s3_data=f's3://{bucket}/{prefix}/train/{train_file}',
)

# NOTE: this rebinds `xgb`, shadowing the `import xgboost as xgb` module alias
# from the import cell; `objective` (which calls `xgb.XGBClassifier`) cannot
# be used after this cell has run.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    hyperparameters=hyperpar,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    output_path=output_path,
    sagemaker_session=sess,
)

In [59]:
# Start the training job; the "train" channel points at the uploaded CSV.
xgb.fit({"train": input_train})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-05-31-12-46-41-909


2023-05-31 12:46:42 Starting - Starting the training job...
2023-05-31 12:47:07 Starting - Preparing the instances for training......
2023-05-31 12:48:14 Downloading - Downloading input data...
2023-05-31 12:48:39 Training - Downloading the training image......
2023-05-31 12:49:45 Uploading - Uploading generated training model[34m[2023-05-31 12:49:40.052 ip-10-0-109-150.eu-west-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-05-31 12:49:40.142 ip-10-0-109-150.eu-west-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-05-31:12:49:40:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-05-31:12:49:40:INFO] Failed to parse hyperparameter eval_metric value error to Json.[0m
[34mReturning the value itself[0m
[34m[2023-05-31:12:49:40:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2023-05-31:12:49:4

In [None]:
# Batch inference: score the uploaded test CSV with the trained model and
# write the results under the `batch-inference` prefix.
batch_output = f's3://{bucket}/{prefix}/batch-inference'
batch_input = f's3://{bucket}/{prefix}/test/{test_file}'

transformer = xgb.transformer(
    instance_type="ml.m4.xlarge",
    instance_count=1,
    output_path=batch_output,
    accept="text/csv",
    assemble_with="Line",
    strategy="MultiRecord",
)

# The input is split per line, i.e. one CSV record per test row.
transformer.transform(batch_input, split_type="Line", content_type="text/csv")
transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-05-31-12-51-58-041
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2023-05-31-12-51-58-810


........................

In [None]:
# !aws s3 cp --recursive $transformer.output_path ./

In [14]:
# The downloaded batch-transform output is the input file name plus ".out".
predictions = pd.read_csv(f"{test_file}.out", header=None)

# concat below aligns on the index, so the row counts have to match.
assert len(predictions) == len(test_pass), "expected one prediction per test passenger"

# Each row holds one score; scores above 0.5 count as transported. For values
# in [0, 1] this matches the former per-row bool(round(float(x))).
predictions = (predictions[[0]] > 0.5).rename(columns={0: "Transported"})

result = pd.concat([test_pass, predictions], axis=1)

print(result)

result.to_csv("results.csv", index=False)

     PassengerId  Transported
0        0013_01         True
1        0018_01        False
2        0019_01         True
3        0021_01         True
4        0023_01         True
...          ...          ...
4272     9266_02         True
4273     9269_01        False
4274     9271_01         True
4275     9273_01         True
4276     9277_01        False

[4277 rows x 2 columns]
