# Spaceship Titanic: Model Selection

## Importing libraries

In [1]:
import pandas as pd
import numpy as np

## Loading & preparing data

In [2]:
df = pd.read_csv('./data/train.csv')

In [3]:
df.head(2)

Unnamed: 0,CryoSleep,RoomService,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,-0.34059,-0.276663,-0.269023,0,0,1,0,0,0,1
1,0,-0.175364,0.211505,-0.230194,1,1,0,0,0,0,1


In [4]:
X = df.loc[:, df.columns != 'Transported'].values
y = df.loc[:, 'Transported'].values

In [5]:
X

array([[ 0.        , -0.34058987, -0.27666342, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        , -0.1753636 ,  0.2115053 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        , -0.27540886,  5.69428913, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.        , -0.34058987, -0.27577423, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        , -0.34058987,  0.03722284, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        , -0.14959437, -0.27666342, ...,  0.        ,
         0.        ,  1.        ]])

In [6]:
y

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [7]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=422)

## Training ML models

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

def train_test_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    cv_scores = cross_val_score(model, X, y)
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print("\nModel Name:", type(model))
    print("Cross Validation Scores:", cv_scores, '~', cv_scores.mean())
    print("Accuracy Score:", accuracy)
    print("Confusion Matrix:\n", cm)


In [23]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = []

models.append(XGBClassifier())
models.append(LogisticRegression())
models.append(KNeighborsClassifier())
models.append(DecisionTreeClassifier())
models.append(RandomForestClassifier())
models.append(SVC())

for model in models:
    train_test_model(model)


Model Name: <class 'xgboost.sklearn.XGBClassifier'>
Cross Validation Scores: [0.76423232 0.76308223 0.76998275 0.79516686 0.77502877] ~ 0.77349858489099
Accuracy Score: 0.7780333525014376
Confusion Matrix:
 [[612 247]
 [139 741]]

Model Name: <class 'sklearn.linear_model._logistic.LogisticRegression'>
Cross Validation Scores: [0.76883266 0.7613571  0.76193214 0.75776755 0.78365938] ~ 0.7667097673292125
Accuracy Score: 0.7636572742955722
Confusion Matrix:
 [[673 186]
 [225 655]]

Model Name: <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Cross Validation Scores: [0.68200115 0.74008051 0.74640598 0.75949367 0.7589183 ] ~ 0.7373799208703599
Accuracy Score: 0.7469810235767682
Confusion Matrix:
 [[645 214]
 [226 654]]

Model Name: <class 'sklearn.tree._classes.DecisionTreeClassifier'>
Cross Validation Scores: [0.71937895 0.7320299  0.73835538 0.74683544 0.74453395] ~ 0.7362267244841982
Accuracy Score: 0.7423806785508913
Confusion Matrix:
 [[561 298]
 [150 730]]

Model Nam

### Conclusion
After training & testing models we want to tune  the best four. These are **XGBClassifier**(0.773), **Logistic Regression**(0.766), **RandomForestClassifer**(0.764), **SVC**(0.782).