## Import libraries

In [1]:
# data analysis and wrangling

import pandas as pd
import numpy as np
import random as rnd
import re

# visualization

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
%matplotlib inline

# machine learning

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Load the Data

In [2]:
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [4]:
train.info()
print()
print('----------'*6)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB

------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data c

we can see there are lot of missing values, so it is important to handle them well.

In [5]:
print(train.columns.values)
print('-----'*5)
print(train['HomePlanet'].value_counts())
print('-----'*5)
print(train['Destination'].value_counts())
print('-----'*5)
print(train['VIP'].value_counts())
print('-----'*5)
print(train['Transported'].value_counts())
print('-----'*5)
print(train['Cabin'].value_counts())
print('-----'*5)
print(train['CryoSleep'].value_counts())
print('-----'*5)
print(train['Age'].value_counts())
print('-----'*5)

['PassengerId' 'HomePlanet' 'CryoSleep' 'Cabin' 'Destination' 'Age' 'VIP'
 'RoomService' 'FoodCourt' 'ShoppingMall' 'Spa' 'VRDeck' 'Name'
 'Transported']
-------------------------
Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64
-------------------------
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64
-------------------------
False    8291
True      199
Name: VIP, dtype: int64
-------------------------
True     4378
False    4315
Name: Transported, dtype: int64
-------------------------
G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64
-------------------------
False    5439
True     3037
Name: CryoSleep, dtype: int64
-------------------------
24.0    324
18.0    320
21.0    311
19.0    293
23.0    292
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0  

### Plan for replacing missing values

* HomePlanet: Since most of the data is Earth, we will replace missing values with Earth.
* Destination: Since most of the data is Earth, we will replace the missing values with TRAPPIST-1e
* VIP: Most of them are False so we will replace the missing values with False.
* Cabin: Cabin consists of Deck, Num, Side. It can be divided and analyzed
* CryoSleep: Missing values will be replaced as False.

## Handling Missing Values

In [6]:
Missing_features = ['FoodCourt', 'Spa', 'ShoppingMall', 'RoomService', 'VRDeck', 'Cabin', 'CryoSleep', 'VIP', 'HomePlanet', 'Destination', 'Age']
for feature in Missing_features:
    if feature == 'Age':
        fill = train[feature].mean()
    else:
        fill = train[feature].value_counts().index[0]
    train[feature] = train[feature].fillna(fill)
    test[feature] = test[feature].fillna(fill)

## Divide and Analyze Cabins

In [7]:
def extract_deck(s):
    return s.split('/')[0]

def extract_num(s):
    return s.split('/')[1]

def extract_side(s):
    return s.split('/')[2]

train['Deck'] = train['Cabin'].apply(extract_deck)
train['Num'] = train['Cabin'].apply(extract_num)
train['Side'] = train['Cabin'].apply(extract_side)

test['Deck'] = test['Cabin'].apply(extract_deck)
test['Num'] = test['Cabin'].apply(extract_num)
test['Side'] = test['Cabin'].apply(extract_side)

## Convert categorical features into numerical

In [8]:
features_cat = ['HomePlanet', 'Destination', 'Deck', 'Side']
for feature in features_cat:
    train[pd.get_dummies(train[feature], prefix=feature).columns] = pd.get_dummies(train[feature], prefix=feature)
    test[pd.get_dummies(test[feature], prefix=feature).columns] = pd.get_dummies(test[feature], prefix=feature)

To handle the name data, combining the train and test dataset. We will split it later.

In [9]:
data = pd.concat([train[test.columns], test])
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.00000,False,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.00000,False,109.0,9.0,25.0,...,0,0,0,0,0,1,0,0,0,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.00000,True,43.0,3576.0,0.0,...,1,0,0,0,0,0,0,0,0,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.00000,False,0.0,1283.0,371.0,...,1,0,0,0,0,0,0,0,0,1
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.00000,False,303.0,70.0,151.0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.00000,False,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,1
4273,9269_01,Earth,False,G/734/S,TRAPPIST-1e,42.00000,False,0.0,847.0,17.0,...,0,0,0,0,0,0,1,0,0,1
4274,9271_01,Mars,True,D/296/P,55 Cancri e,28.82793,False,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
4275,9273_01,Europa,False,D/297/P,TRAPPIST-1e,28.82793,False,0.0,2680.0,0.0,...,0,0,0,1,0,0,0,0,1,0


## Handling name

In [10]:
def extract_last_name(s):
    return str(s).split(' ')[-1]

data['LastName'] = data['Name'].apply(extract_last_name)

dict_names = data['LastName'].value_counts().to_dict()

def same_name(s):
    return dict_names[s]-1

data['SameName'] = data['LastName'].apply(same_name)

we will add column ['SameName'] to train and test set

In [11]:
to_train = data.iloc[0:8693]
train['SameName'] = pd.Series(to_train['SameName'])

In [12]:
to_test = data.iloc[8693:]
test['SameName'] = pd.Series(to_test['SameName'])

## Handling with age

let us replace age with ordinals based on these groups

In [13]:
def age_group(s):
    if s==0:
        return -1
    elif (s > 0) and (s <= 11):
        return 1
    elif (s > 11) and (s <= 22):
        return 2
    elif (s > 22) and (s <= 33):
        return 3
    elif (s > 33) and (s <= 45):
        return 4
    elif (s > 45) and (s <= 56):
        return 5
    elif (s > 56) and (s <= 67):
        return 6
    elif (s > 67) and (s <= 79):
        return 7
    elif (s > 79) and (s <= 80):
        return 8

    train['Age_Group'] = train['Age'].apply(age_group)
    test['Age_Group'] = test['Age'].apply(age_group)

## Drop Columns

* passengerId is unique Id for each passenger. It is not important while building model
* we converted HomePlanet into 3 numerical columns so we don't need the original column
* we analysed Cabin, Destination, Name, Age and we don't need them.

In [14]:
train.drop(['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name', 'Age', 'Deck', 'Side'], axis = 1, inplace = True)

In [15]:
test.drop(['PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name', 'Age', 'Deck', 'Side'], axis = 1, inplace = True)

## Convert False to 0 and True to 1

In [16]:
train['CryoSleep'] = train['CryoSleep'].astype(int)
train['VIP'] = train['VIP'].astype(int)
train['Num'] = train['Num'].astype(int)

test['CryoSleep'] = test['CryoSleep'].astype(int)
test['VIP'] = test['VIP'].astype(int)
test['Num'] = test['Num'].astype(int)

In [17]:
x_train = train.drop(['Transported'], axis = 1)
y_train = train['Transported']
x_test = test

In [18]:
x_test.shape, x_train.shape

((4277, 25), (8693, 25))

## Apply Models

### Random Forest

In [19]:
parameter = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_depth': [3, 5, 7, 9, 11, 12],
    'min_samples_leaf': [18, 19, 20],
    'min_samples_split': [8, 9, 10]
}

rfc_model = RandomForestClassifier()
rfc_grid = GridSearchCV(rfc_model, param_grid = parameter, cv = 5, scoring = 'accuracy', n_jobs = -1)
rfc_grid.fit(x_train, y_train)
print('Best parameters:', rfc_grid.best_params_)
print()
print('Best Accuracy:', rfc_grid.best_score_)

Best parameters: {'max_depth': 11, 'min_samples_leaf': 20, 'min_samples_split': 10, 'n_estimators': 20}

Best Accuracy: 0.7962762483365771


In [20]:
rf = RandomForestClassifier(max_depth = 12, min_samples_leaf = 18, min_samples_split = 9, n_estimators = 25)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)

## KGBClassifier

In [21]:
param_grid = {'n_estimators': [10, 25, 50, 75,100], 'learning_rate': [0.2, 0.15, 0.1, 0.05],
             'eval_metric': ['mlogloss']}
grid = GridSearchCV(XGBClassifier(), param_grid = param_grid, cv = 5, scoring = 'accuracy')
grid.fit(x_train, y_train)
best_params = grid.best_params_
print('Best score of cross validation: {:.2f}' .format(grid.best_score_))
print('Best parameters:', best_params)



Best score of cross validation: 0.78
Best parameters: {'eval_metric': 'mlogloss', 'learning_rate': 0.1, 'n_estimators': 75}


In [22]:
xgb = XGBClassifier()
xgb.set_params(**best_params)
xgb.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=75, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [23]:
y_pred_xgb = xgb.predict(x_test)

## Submission

In [24]:
subs = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
subs

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [25]:
subs['Transported'] = y_pred_xgb
subs.to_csv('./Transported', index = False)