# Import packages and Settings

In [1]:
import os

n_core = os.cpu_count()-3
print(n_core)

3


In [2]:
import pandas as pd

import matplotlib.pyplot as plt

from xgboost.sklearn import XGBClassifier

from catboost import CatBoostClassifier

%matplotlib inline

In [3]:
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

feature = 'Transported'
train_data[feature] = train_data[feature].astype(int)

feature = 'CryoSleep'
train_data[feature] = train_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')
test_data[feature] = test_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')

feature = 'VIP'
train_data[feature] = train_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')
test_data[feature] = test_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')

label = 'Transported'

id = test_data['PassengerId']

In [5]:
models = [
    LogisticRegression(random_state=0),
    SVC(random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    XGBClassifier(random_state=0),
    CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
]

# Base Model

In [12]:
def preprocess(columns_to_drop, train_data, test_data, label):
    X_train = train_data.drop([label] + columns_to_drop, axis=1)
    Y_train = train_data[label]
    X_test = test_data.drop(columns_to_drop, axis=1)

    columns_num = X_train.select_dtypes(include='number').columns
    columns_cat = X_train.select_dtypes(include='O').columns

    X_train[columns_num] = X_train[columns_num].fillna(X_train[columns_num].median())
    X_train[columns_cat] = X_train[columns_cat].fillna(X_train[columns_cat].mode().loc[0])

    X_test[columns_num] = X_test[columns_num].fillna(X_train[columns_num].median())
    X_test[columns_cat] = X_test[columns_cat].fillna(X_train[columns_cat].mode().loc[0])

    encoder_onehot = OneHotEncoder(dtype=int, sparse_output=False, handle_unknown='ignore')

    train_data_onehot = encoder_onehot.fit_transform(X_train[columns_cat])
    train_feature_name_onehot = encoder_onehot.get_feature_names_out()
    train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot)

    test_data_onehot = encoder_onehot.transform(X_test[columns_cat])
    test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot)

    X_train = pd.concat([X_train[columns_num], train_data_onehot], axis=1)
    X_test = pd.concat([X_test[columns_num], test_data_onehot], axis=1)

    return X_train, Y_train, X_test

In [7]:
def evaluation(X_train, Y_train):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []

    cv = ShuffleSplit(random_state=0)

    for model in models:
        model = clone(model)
        cv_scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring='accuracy', n_jobs=n_core)
        model_scores.append(cv_scores.mean())

    cv_results = pd.DataFrame(columns=['name', 'score_mean'])
    cv_results['name'] = model_names
    cv_results['score_mean'] = model_scores

    return cv_results

In [26]:
columns_to_drop = ['PassengerId', 'Name']

X_train, Y_train, X_test = preprocess(columns_to_drop, train_data, test_data, label)
print(X_train.shape, Y_train.shape, X_test.shape)

(8693, 6574) (8693,) (4277, 6574)


In [9]:
cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.774023
1,SVC,0.781839
2,DecisionTreeClassifier,0.765517
3,RandomForestClassifier,0.787586
4,GradientBoostingClassifier,0.786897
5,XGBClassifier,0.783563
6,CatBoostClassifier,0.788161


In [10]:
def predict(model, X_train, Y_train, X_test, id, label, filename = 'result_basic.csv'):
    model = clone(model)

    model.fit(X_train, Y_train)
    predictions = model.predict(X_test).astype(bool)

    result = pd.DataFrame({
        id.name: id,
        label: predictions
    })
    result.to_csv('./Result/' + filename, index=False)

In [27]:
model = DecisionTreeClassifier(random_state=0)
predict(model, X_train, Y_train, X_test, id, label)

# Improve

In [28]:
train_data_new, test_data_new = train_data.copy(), test_data.copy()

train_data_new['Cabin'] = train_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])
test_data_new['Cabin'] = test_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])

train_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data_new['Cabin'].str.split('/', expand=True)
test_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = test_data_new['Cabin'].str.split('/', expand=True)

columns_to_drop = ['PassengerId', 'Name', 'Cabin', 'CabinNum']

X_train, Y_train, X_test = preprocess(columns_to_drop, train_data_new, test_data_new, label)
print(X_train.shape, Y_train.shape, X_test.shape)

(8693, 24) (8693,) (4277, 24)


In [20]:
cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.780575
1,SVC,0.781839
2,DecisionTreeClassifier,0.733793
3,RandomForestClassifier,0.786207
4,GradientBoostingClassifier,0.796437
5,XGBClassifier,0.792874
6,CatBoostClassifier,0.803563


In [21]:
cv = ShuffleSplit(random_state=0)
model = RandomForestClassifier(random_state=0, oob_score=True)
selector = RFECV(estimator=model, cv=cv, scoring='accuracy', n_jobs=n_core)
selector.fit(X_train, Y_train)

In [22]:
importances = pd.DataFrame(columns=['feature', 'importance'])
importances['feature'] = selector.get_feature_names_out()
importances['importance'] = selector.estimator_.feature_importances_

importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,Age,0.17184
2,RoomService,0.125341
5,Spa,0.12041
6,VRDeck,0.112883
3,FoodCourt,0.108934
0,CryoSleep,0.104521
4,ShoppingMall,0.094884
7,HomePlanet_Earth,0.029127
14,CabinDeck_E,0.017816
8,HomePlanet_Europa,0.017548


In [29]:
features_to_drop = [f for f in X_train.columns if f not in selector.get_feature_names_out()]
features_to_drop

['VIP', 'CabinDeck_A', 'CabinDeck_B', 'CabinDeck_D', 'CabinDeck_T']

In [30]:
X_train = X_train.drop(features_to_drop, axis=1)
X_test = X_test.drop(features_to_drop, axis=1)

model = RandomForestClassifier(random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, id, label, 'result.csv')