In [1]:
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
np.random.seed(10)
def fill_by_distribution(series):
    probs = series.value_counts(normalize=True)
    return series.apply(lambda x: np.random.choice(probs.index, p=probs.values) if pd.isna(x) else x)

In [3]:
std = StandardScaler()

In [4]:
train = pd.read_csv('Spaceship_Titanic.csv')
test = pd.read_csv('test.csv')
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Data normalizing

### X_train

In [8]:
X_train = train.drop(['Transported'], axis=1)
y_train = train['Transported']
X_test = test

In [112]:
X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [12]:
X_train['PassengerId'] = (X_train['PassengerId'].str.replace(r'.*_', '', regex=True) == '02').astype(int)

In [13]:
X_train['HomePlanet'].describe()

count      6786
unique        3
top       Earth
freq       3691
Name: HomePlanet, dtype: object

In [14]:
X_train['Cabin'].str.split('/').str[-1].describe()

count     6796
unique       2
top          S
freq      3421
Name: Cabin, dtype: object

In [15]:
X_train['Cabin'] = X_train['Cabin'].str.split('/').str[-1]

In [16]:
X_train['Destination'].describe()

count            6815
unique              3
top       TRAPPIST-1e
freq             4733
Name: Destination, dtype: object

In [17]:
for col in X_train.columns:
    X_train[col] = fill_by_distribution(X_train[col])

In [18]:
for col in ['CryoSleep', 'VIP']:
    X_train[col] = X_train[col].astype('int8')

In [19]:
for col in ['HomePlanet', 'Cabin', 'Destination']:
    X_train = pd.get_dummies(X_train, columns=[col], dtype='int8')

In [20]:
X_train.drop(['Name'], axis=1, inplace=True)

In [21]:
cols = ["Age", "RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

X_train_scaling = pd.DataFrame(std.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

In [22]:
X_train_scaling

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
2333,-0.436053,-0.740328,-0.060541,-0.153461,-0.329956,-0.255854,-0.286805,0.296183,-0.259658,0.914219,-0.579232,-0.506329,1.005479,-1.005479,-0.519301,-0.319742,0.662688
2589,2.293300,-0.740328,-0.821096,-0.153461,-0.329956,0.464706,-0.236085,-0.280129,-0.259658,0.914219,-0.579232,-0.506329,1.005479,-1.005479,-0.519301,-0.319742,0.662688
8302,-0.436053,1.350753,-0.060541,-0.153461,-0.329956,-0.290618,-0.286805,-0.280129,-0.259658,-1.093829,1.726423,-0.506329,-0.994550,0.994550,1.925667,-0.319742,-1.509005
8177,2.293300,-0.740328,-0.613672,-0.153461,-0.236309,-0.289354,0.186034,0.577310,-0.259658,-1.093829,-0.579232,1.975002,1.005479,-1.005479,-0.519301,-0.319742,0.662688
500,2.293300,1.350753,0.492590,-0.153461,-0.329956,-0.290618,-0.286805,-0.280129,-0.259658,-1.093829,1.726423,-0.506329,1.005479,-1.005479,1.925667,-0.319742,-1.509005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,-0.436053,-0.740328,-0.751955,-0.153461,-0.309145,-0.289354,-0.051203,0.255770,-0.259658,0.914219,-0.579232,-0.506329,-0.994550,0.994550,-0.519301,-0.319742,0.662688
5191,-0.436053,-0.740328,1.460570,-0.153461,0.695694,-0.290618,-0.237721,0.389306,0.107492,-1.093829,-0.579232,1.975002,-0.994550,0.994550,-0.519301,-0.319742,0.662688
5390,-0.436053,-0.740328,-0.475389,-0.153461,-0.095097,-0.290618,0.491989,-0.280129,-0.237354,0.914219,-0.579232,-0.506329,1.005479,-1.005479,-0.519301,3.127522,-1.509005
860,-0.436053,-0.740328,0.354307,-0.153461,0.233409,-0.290618,2.373529,-0.280129,-0.259658,-1.093829,-0.579232,1.975002,1.005479,-1.005479,-0.519301,-0.319742,0.662688


In [23]:
X_train

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
2333,0,0,28.0,0,0.0,55.0,0.0,656.0,0.0,1,0,0,1,0,0,0,1
2589,1,0,17.0,0,0.0,1195.0,31.0,0.0,0.0,1,0,0,1,0,0,0,1
8302,0,1,28.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1,1,0,0
8177,1,0,20.0,0,63.0,2.0,289.0,976.0,0.0,0,0,1,1,0,0,0,1
500,1,1,36.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0,0,18.0,0,14.0,2.0,144.0,610.0,0.0,1,0,0,0,1,0,0,1
5191,0,0,50.0,0,690.0,0.0,30.0,762.0,428.0,0,0,1,0,1,0,0,1
5390,0,0,22.0,0,158.0,0.0,476.0,0.0,26.0,1,0,0,1,0,0,1,0
860,0,0,34.0,0,379.0,0.0,1626.0,0.0,0.0,0,0,1,1,0,0,0,1


In [24]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6954 entries, 2333 to 7270
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PassengerId                6954 non-null   int64  
 1   CryoSleep                  6954 non-null   int8   
 2   Age                        6954 non-null   float64
 3   VIP                        6954 non-null   int8   
 4   RoomService                6954 non-null   float64
 5   FoodCourt                  6954 non-null   float64
 6   ShoppingMall               6954 non-null   float64
 7   Spa                        6954 non-null   float64
 8   VRDeck                     6954 non-null   float64
 9   HomePlanet_Earth           6954 non-null   int8   
 10  HomePlanet_Europa          6954 non-null   int8   
 11  HomePlanet_Mars            6954 non-null   int8   
 12  Cabin_P                    6954 non-null   int8   
 13  Cabin_S                    6954 non-null   int8   

### X_test

In [25]:
X_test['PassengerId'] = (X_test['PassengerId'].str.replace(r'.*_', '', regex=True) == '02').astype(int)
X_test['Cabin'] = X_test['Cabin'].str.split('/').str[-1]

for col in X_test.columns:
    X_test[col] = fill_by_distribution(X_test[col])

for col in ['CryoSleep', 'VIP']:
    X_test[col] = X_test[col].astype('int8')

for col in ['HomePlanet', 'Cabin', 'Destination']:
    X_test = pd.get_dummies(X_test, columns=[col], dtype='int8')

X_test.drop(['Name'], axis=1, inplace=True)

cols = ["Age", "RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

X_test_scaling = pd.DataFrame(std.transform(X_test), columns=X_test.columns, index=X_test.index)

In [26]:
X_test

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
304,1,0,19.0,0,417.0,349.0,634.0,3.0,1057.0,0,0,1,0,1,0,0,1
2697,0,0,18.0,0,4.0,904.0,0.0,0.0,1.0,1,0,0,0,1,0,0,1
8424,0,1,41.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,1
1672,0,0,35.0,0,0.0,338.0,436.0,0.0,0.0,1,0,0,1,0,0,0,1
8458,1,1,43.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7175,0,1,16.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,1
3187,1,1,0.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0,0,1
1302,0,0,17.0,0,21.0,0.0,690.0,260.0,5.0,1,0,0,0,1,0,0,1
5934,0,1,42.0,0,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0,0,0,1


In [27]:
X_test_scaling

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_P,Cabin_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
304,2.293300,-0.740328,-0.682814,-0.153461,0.289894,-0.070026,0.750496,-0.277493,0.647064,-1.093829,-0.579232,1.975002,-0.994550,0.994550,-0.519301,-0.319742,0.662688
2697,-0.436053,-0.740328,-0.751955,-0.153461,-0.324010,0.280774,-0.286805,-0.280129,-0.258800,0.914219,-0.579232,-0.506329,-0.994550,0.994550,-0.519301,-0.319742,0.662688
8424,-0.436053,1.350753,0.838297,-0.153461,-0.329956,-0.290618,-0.286805,-0.280129,-0.259658,0.914219,-0.579232,-0.506329,-0.994550,0.994550,-0.519301,-0.319742,0.662688
1672,-0.436053,-0.740328,0.423449,-0.153461,-0.329956,-0.076978,0.426544,-0.280129,-0.259658,0.914219,-0.579232,-0.506329,1.005479,-1.005479,-0.519301,-0.319742,0.662688
8458,2.293300,1.350753,0.976580,-0.153461,-0.329956,-0.290618,-0.286805,-0.280129,-0.259658,-1.093829,1.726423,-0.506329,1.005479,-1.005479,-0.519301,-0.319742,0.662688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7175,-0.436053,1.350753,-0.890238,-0.153461,-0.329956,-0.290618,-0.286805,-0.280129,-0.259658,0.914219,-0.579232,-0.506329,-0.994550,0.994550,-0.519301,-0.319742,0.662688
3187,2.293300,1.350753,-1.996500,-0.153461,-0.329956,-0.290618,-0.286805,-0.280129,-0.259658,0.914219,-0.579232,-0.506329,-0.994550,0.994550,-0.519301,-0.319742,0.662688
1302,-0.436053,-0.740328,-0.821096,-0.153461,-0.298740,-0.290618,0.842119,-0.051713,-0.255369,0.914219,-0.579232,-0.506329,-0.994550,0.994550,-0.519301,-0.319742,0.662688
5934,-0.436053,1.350753,0.907439,-0.153461,-0.329956,-0.290618,-0.286805,-0.280129,-0.259658,-1.093829,-0.579232,1.975002,1.005479,-1.005479,-0.519301,-0.319742,0.662688


## Models

In [29]:
model = LogisticRegression()
model.fit(X_train_scaling, y_train)
y_preds = model.predict(X_test_scaling)
print('Test accuracy:', accuracy_score(y_test, model.predict(X_test_scaling)))
print('Train accuracy:', accuracy_score(y_train, model.predict(X_train_scaling)))
print(classification_report(y_test, y_preds))
confusion_matrix(y_test, y_preds)
# y_preds, np.array(y_test)

Test accuracy: 0.7786083956296722
Train accuracy: 0.7919183203911417
              precision    recall  f1-score   support

       False       0.80      0.74      0.77       861
        True       0.76      0.81      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



array([[639, 222],
       [163, 715]])

In [41]:
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(X_train_scaling, y_train)
y_preds_KNN = KNN.predict(X_test_scaling)
print('Test accuracy:', accuracy_score(y_test, KNN.predict(X_test_scaling)))
print('Train accuracy:', accuracy_score(y_train, KNN.predict(X_train_scaling)))
print(classification_report(y_test, y_preds_KNN))
confusion_matrix(y_test, y_preds_KNN)

Test accuracy: 0.7763082231167338
Train accuracy: 0.8281564567155594
              precision    recall  f1-score   support

       False       0.78      0.77      0.77       861
        True       0.78      0.78      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



array([[662, 199],
       [190, 688]])

In [42]:
GNB = GaussianNB()
GNB.fit(X_train_scaling, y_train)
y_preds_GNB = GNB.predict(X_test_scaling)
print('Test accuracy:', accuracy_score(y_test, GNB.predict(X_test_scaling)))
print('Train accuracy:', accuracy_score(y_train, GNB.predict(X_train_scaling)))
print(classification_report(y_test, y_preds_GNB))
confusion_matrix(y_test, y_preds_GNB)

Test accuracy: 0.7055779183438758
Train accuracy: 0.7029048029910843
              precision    recall  f1-score   support

       False       0.85      0.49      0.62       861
        True       0.65      0.91      0.76       878

    accuracy                           0.71      1739
   macro avg       0.75      0.70      0.69      1739
weighted avg       0.75      0.71      0.69      1739



array([[426, 435],
       [ 77, 801]])

In [43]:
GNB = GaussianNB()
GNB.fit(X_train, y_train)
y_preds_GNB = GNB.predict(X_test)
print('Test accuracy:', accuracy_score(y_test, GNB.predict(X_test)))
print('Train accuracy:', accuracy_score(y_train, GNB.predict(X_train)))
print(classification_report(y_test, y_preds_GNB))
confusion_matrix(y_test, y_preds_GNB)

Test accuracy: 0.7073030477285797
Train accuracy: 0.7043428242737992
              precision    recall  f1-score   support

       False       0.85      0.49      0.63       861
        True       0.65      0.92      0.76       878

    accuracy                           0.71      1739
   macro avg       0.75      0.71      0.69      1739
weighted avg       0.75      0.71      0.69      1739



array([[425, 436],
       [ 73, 805]])

In [85]:
model = DecisionTreeClassifier(max_depth=3, min_samples_split=5, min_samples_leaf=5, criterion='gini')
model.fit(X_train_scaling, y_train)
y_preds = model.predict(X_test_scaling)
print('Test accuracy:', accuracy_score(y_test, model.predict(X_test)))
print('Train accuracy:', accuracy_score(y_train, model.predict(X_train)))
print(classification_report(y_test, y_preds))
confusion_matrix(y_test, y_preds)

Test accuracy: 0.7176538240368028
Train accuracy: 0.7158469945355191
              precision    recall  f1-score   support

       False       0.67      0.85      0.75       861
        True       0.80      0.59      0.68       878

    accuracy                           0.72      1739
   macro avg       0.73      0.72      0.71      1739
weighted avg       0.73      0.72      0.71      1739



array([[729, 132],
       [359, 519]])

In [88]:
model_rf = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_split=7, min_samples_leaf=5, random_state=0, verbose=0)
model_rf.fit(X_train_scaling, y_train)

print(f'Test accuracy: {accuracy_score(y_test, model_rf.predict(X_test_scaling))} \nTrain accuracy: {accuracy_score(y_train, model_rf.predict(X_train_scaling))} \n {accuracy_score(y_test, model_rf.predict(X_test_scaling)) -  accuracy_score(y_train, model_rf.predict(X_train_scaling))}')
print(classification_report(y_test, model_rf.predict(X_test_scaling)))

Test accuracy: 0.7843588269120184 
Train accuracy: 0.8057233247052057 
 -0.021364497793187276
              precision    recall  f1-score   support

       False       0.79      0.77      0.78       861
        True       0.78      0.80      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



In [90]:
adaboost_model = AdaBoostClassifier(n_estimators=500, learning_rate=1, random_state=0)
adaboost_model.fit(X_train_scaling, y_train)

print(f'Test accuracy: {accuracy_score(y_test, adaboost_model.predict(X_test_scaling))} \nTrain accuracy: {accuracy_score(y_train, adaboost_model.predict(X_train_scaling))} \n {accuracy_score(y_test, adaboost_model.predict(X_test_scaling)) -  accuracy_score(y_train, adaboost_model.predict(X_train_scaling))}')
print(classification_report(y_test, adaboost_model.predict(X_test_scaling)))

Test accuracy: 0.7768832662449684 
Train accuracy: 0.7976704055220017 
 -0.02078713927703335
              precision    recall  f1-score   support

       False       0.80      0.74      0.77       861
        True       0.76      0.82      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



In [103]:
gradientboost_model = GradientBoostingClassifier(n_estimators=100, learning_rate=.03, random_state=42)
gradientboost_model.fit(X_train_scaling, y_train)

print(f'Test accuracy: {accuracy_score(y_test, gradientboost_model.predict(X_test_scaling))} \nTrain accuracy: {accuracy_score(y_train, gradientboost_model.predict(X_train_scaling))} \n {accuracy_score(y_test, gradientboost_model.predict(X_test_scaling)) -  accuracy_score(y_train, gradientboost_model.predict(X_train_scaling))}')
print(classification_report(y_test, gradientboost_model.predict(X_test_scaling)))

Test accuracy: 0.7855089131684876 
Train accuracy: 0.8048605119355766 
 -0.01935159876708903
              precision    recall  f1-score   support

       False       0.82      0.72      0.77       861
        True       0.76      0.85      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.78      0.78      1739
weighted avg       0.79      0.79      0.78      1739

