In [37]:
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [38]:
np.random.seed(10)
def fill_by_distribution(series):
    probs = series.value_counts(normalize=True)
    return series.apply(lambda x: np.random.choice(probs.index, p=probs.values) if pd.isna(x) else x)

In [39]:
std = StandardScaler()

In [86]:
train = pd.read_csv('Spaceship_Titanic.csv')
test = pd.read_csv('test.csv')
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [41]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Data normalizing

### X_train

In [42]:
X_train = train.drop(['Transported'], axis=1)
y_train = train['Transported']
X_test = test

In [43]:
X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre


In [44]:
X_train['PassengerId'] = (X_train['PassengerId'].str.replace(r'.*_', '', regex=True) == '02').astype(int)

In [45]:
X_train['HomePlanet'].describe()

count      8492
unique        3
top       Earth
freq       4602
Name: HomePlanet, dtype: object

In [46]:
X_train['Cabin'].str.split('/').str[-1].describe()

count     8494
unique       2
top          S
freq      4288
Name: Cabin, dtype: object

In [47]:
X_train['Cabin'] = X_train['Cabin'].str.split('/').str[-1]

In [48]:
X_train['Destination'].describe()

count            8511
unique              3
top       TRAPPIST-1e
freq             5915
Name: Destination, dtype: object

In [49]:
for col in X_train.columns:
    X_train[col] = fill_by_distribution(X_train[col])

In [50]:
for col in ['CryoSleep', 'VIP']:
    X_train[col] = X_train[col].astype('int8')

In [51]:
for col in ['HomePlanet', 'Cabin', 'Destination']:
    X_train = pd.get_dummies(X_train, columns=[col], dtype='int8')

In [52]:
X_train.drop(['Name'], axis=1, inplace=True)

In [53]:
cols = ["Age", "RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

X_train_scaling = pd.DataFrame(std.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

In [54]:
X_train_scaling

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,-0.440374,-0.747665,0.700976,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,-1.088448,1.724249,-0.509402,1.010291,-1.010291,-0.518701,-0.321858,0.663829
1,-0.440374,-0.747665,-0.335231,-0.155796,-0.173665,-0.278933,-0.246369,0.206091,-0.227426,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
2,-0.440374,-0.747665,2.013505,6.418647,-0.271329,1.940311,-0.287824,5.593460,-0.223044,-1.088448,1.724249,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
3,2.270796,-0.747665,0.286494,-0.155796,-0.334959,0.513699,0.327370,2.635038,-0.096852,-1.088448,1.724249,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
4,-0.440374,-0.747665,-0.887874,-0.155796,0.113411,-0.240981,-0.037435,0.220071,-0.264232,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,-0.440374,-0.747665,0.839137,6.418647,-0.334959,3.957975,-0.287824,1.161943,-0.201136,-1.088448,1.724249,-0.509402,1.010291,-1.010291,1.927892,-0.321858,-1.506411
8689,-0.440374,1.337498,-0.749713,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,3.106958,-1.506411
8690,-0.440374,-0.747665,-0.197070,-0.155796,-0.334959,-0.284532,2.816335,-0.272708,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
8691,-0.440374,-0.747665,0.217413,-0.155796,-0.334959,0.368113,-0.287824,0.034842,2.568949,-1.088448,1.724249,-0.509402,-0.989814,0.989814,1.927892,-0.321858,-1.506411


In [55]:
X_train

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,1
1,0,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,0,0,1,0,0,1
2,0,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1,0,0,1,0,0,1
3,1,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1,0,0,1,0,0,1
4,0,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0,1,0,1,0,1,0,0
8689,0,1,18.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,1,0
8690,0,0,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,0,0,0,1,0,0,1
8691,0,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0,1,0,0,1,1,0,0


In [56]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PassengerId                8693 non-null   int64  
 1   CryoSleep                  8693 non-null   int8   
 2   Age                        8693 non-null   float64
 3   VIP                        8693 non-null   int8   
 4   RoomService                8693 non-null   float64
 5   FoodCourt                  8693 non-null   float64
 6   ShoppingMall               8693 non-null   float64
 7   Spa                        8693 non-null   float64
 8   VRDeck                     8693 non-null   float64
 9   HomePlanet_Earth           8693 non-null   int8   
 10  HomePlanet_Europa          8693 non-null   int8   
 11  HomePlanet_Mars            8693 non-null   int8   
 12  Cabin_P                    8693 non-null   int8   
 13  Cabin_S                    8693 non-null   int8 

### X_test

In [57]:
X_test['PassengerId'] = (X_test['PassengerId'].str.replace(r'.*_', '', regex=True) == '02').astype(int)
X_test['Cabin'] = X_test['Cabin'].str.split('/').str[-1]

for col in X_test.columns:
    X_test[col] = fill_by_distribution(X_test[col])

for col in ['CryoSleep', 'VIP']:
    X_test[col] = X_test[col].astype('int8')

for col in ['HomePlanet', 'Cabin', 'Destination']:
    X_test = pd.get_dummies(X_test, columns=[col], dtype='int8')

X_test.drop(['Name'], axis=1, inplace=True)

cols = ["Age", "RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

X_test_scaling = pd.DataFrame(std.transform(X_test), columns=X_test.columns, index=X_test.index)

In [87]:
X_test_scaling

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,-0.440374,1.337498,-0.127989,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
1,-0.440374,-0.747665,-0.680633,-0.155796,-0.334959,-0.278933,-0.287824,2.192935,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
2,-0.440374,1.337498,0.148333,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,-1.088448,1.724249,-0.509402,-0.989814,0.989814,1.927892,-0.321858,-1.506411
3,-0.440374,-0.747665,0.631896,-0.155796,-0.334959,3.854074,-0.287824,-0.115438,0.246670,-1.088448,1.724249,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
4,-0.440374,-0.747665,-0.611552,-0.155796,-0.320162,-0.284532,0.765136,-0.273582,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2.270796,1.337498,0.355574,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
4273,-0.440374,-0.747665,0.908218,-0.155796,-0.334959,0.242437,-0.259634,-0.264845,-0.139793,0.918739,-0.579963,-0.509402,1.010291,-1.010291,-0.518701,-0.321858,0.663829
4274,-0.440374,1.337498,0.010172,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,-1.088448,-0.579963,1.963088,1.010291,-1.010291,1.927892,-0.321858,-1.506411
4275,-0.440374,-0.747665,0.010172,-0.155796,-0.334959,1.382856,-0.287824,-0.273582,0.192337,-1.088448,1.724249,-0.509402,1.010291,-1.010291,-0.518701,-0.321858,0.663829


In [59]:
X_test_scaling

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,-0.440374,1.337498,-0.127989,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
1,-0.440374,-0.747665,-0.680633,-0.155796,-0.334959,-0.278933,-0.287824,2.192935,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
2,-0.440374,1.337498,0.148333,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,-1.088448,1.724249,-0.509402,-0.989814,0.989814,1.927892,-0.321858,-1.506411
3,-0.440374,-0.747665,0.631896,-0.155796,-0.334959,3.854074,-0.287824,-0.115438,0.246670,-1.088448,1.724249,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
4,-0.440374,-0.747665,-0.611552,-0.155796,-0.320162,-0.284532,0.765136,-0.273582,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2.270796,1.337498,0.355574,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,0.918739,-0.579963,-0.509402,-0.989814,0.989814,-0.518701,-0.321858,0.663829
4273,-0.440374,-0.747665,0.908218,-0.155796,-0.334959,0.242437,-0.259634,-0.264845,-0.139793,0.918739,-0.579963,-0.509402,1.010291,-1.010291,-0.518701,-0.321858,0.663829
4274,-0.440374,1.337498,0.010172,-0.155796,-0.334959,-0.284532,-0.287824,-0.273582,-0.265984,-1.088448,-0.579963,1.963088,1.010291,-1.010291,1.927892,-0.321858,-1.506411
4275,-0.440374,-0.747665,0.010172,-0.155796,-0.334959,1.382856,-0.287824,-0.273582,0.192337,-1.088448,1.724249,-0.509402,1.010291,-1.010291,-0.518701,-0.321858,0.663829


## Models

In [60]:
# model = LogisticRegression()
# model.fit(X_train_scaling, y_train)
# y_preds = model.predict(X_test_scaling)
# print('Test accuracy:', accuracy_score(y_test, model.predict(X_test_scaling)))
# print('Train accuracy:', accuracy_score(y_train, model.predict(X_train_scaling)))
# print(classification_report(y_test, y_preds))
# confusion_matrix(y_test, y_preds)
# # y_preds, np.array(y_test)

In [61]:
# KNN = KNeighborsClassifier(n_neighbors=5)
# KNN.fit(X_train_scaling, y_train)
# y_preds_KNN = KNN.predict(X_test_scaling)
# print('Test accuracy:', accuracy_score(y_test, KNN.predict(X_test_scaling)))
# print('Train accuracy:', accuracy_score(y_train, KNN.predict(X_train_scaling)))
# print(classification_report(y_test, y_preds_KNN))
# confusion_matrix(y_test, y_preds_KNN)

In [62]:
# GNB = GaussianNB()
# GNB.fit(X_train_scaling, y_train)
# y_preds_GNB = GNB.predict(X_test_scaling)
# print('Test accuracy:', accuracy_score(y_test, GNB.predict(X_test_scaling)))
# print('Train accuracy:', accuracy_score(y_train, GNB.predict(X_train_scaling)))
# print(classification_report(y_test, y_preds_GNB))
# confusion_matrix(y_test, y_preds_GNB)

In [63]:
# GNB = GaussianNB()
# GNB.fit(X_train, y_train)
# y_preds_GNB = GNB.predict(X_test)
# print('Test accuracy:', accuracy_score(y_test, GNB.predict(X_test)))
# print('Train accuracy:', accuracy_score(y_train, GNB.predict(X_train)))
# print(classification_report(y_test, y_preds_GNB))
# confusion_matrix(y_test, y_preds_GNB)

In [64]:
# model = DecisionTreeClassifier(max_depth=3, min_samples_split=5, min_samples_leaf=5, criterion='gini')
# model.fit(X_train_scaling, y_train)
# y_preds = model.predict(X_test_scaling)
# print('Test accuracy:', accuracy_score(y_test, model.predict(X_test)))
# print('Train accuracy:', accuracy_score(y_train, model.predict(X_train)))
# print(classification_report(y_test, y_preds))
# confusion_matrix(y_test, y_preds)

In [65]:
# model_rf = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_split=7, min_samples_leaf=5, random_state=0, verbose=0)
# model_rf.fit(X_train_scaling, y_train)

# print(f'Test accuracy: {accuracy_score(y_test, model_rf.predict(X_test_scaling))} \nTrain accuracy: {accuracy_score(y_train, model_rf.predict(X_train_scaling))} \n {accuracy_score(y_test, model_rf.predict(X_test_scaling)) -  accuracy_score(y_train, model_rf.predict(X_train_scaling))}')
# print(classification_report(y_test, model_rf.predict(X_test_scaling)))

In [66]:
# adaboost_model = AdaBoostClassifier(n_estimators=500, learning_rate=1, random_state=0)
# adaboost_model.fit(X_train_scaling, y_train)

# print(f'Test accuracy: {accuracy_score(y_test, adaboost_model.predict(X_test_scaling))} \nTrain accuracy: {accuracy_score(y_train, adaboost_model.predict(X_train_scaling))} \n {accuracy_score(y_test, adaboost_model.predict(X_test_scaling)) -  accuracy_score(y_train, adaboost_model.predict(X_train_scaling))}')
# print(classification_report(y_test, adaboost_model.predict(X_test_scaling)))

In [70]:
gradientboost_model = GradientBoostingClassifier(n_estimators=100, learning_rate=.03, random_state=42)
gradientboost_model.fit(X_train_scaling, y_train)
y_preds = gradientboost_model.predict(X_test)

# print(f'Test accuracy: {accuracy_score(y_test, gradientboost_model.predict(X_test_scaling))} \nTrain accuracy: {accuracy_score(y_train, gradientboost_model.predict(X_train_scaling))} \n {accuracy_score(y_test, gradientboost_model.predict(X_test_scaling)) -  accuracy_score(y_train, gradientboost_model.predict(X_train_scaling))}')
# print(classification_report(y_test, gradientboost_model.predict(X_test_scaling)))

In [71]:
y_preds

array([False, False,  True, ..., False, False, False], shape=(4277,))

In [78]:
submission = pd.read_csv('sample_submission.csv')

In [79]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [80]:
submission['Transported'] = y_preds
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,True
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [85]:
submission.to_csv("sample_submission.csv", index=False)

In [84]:
print(submission.head())

  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01         True
3     0021_01        False
4     0023_01         True
