# Import packages and Settings

In [1]:
import os

n_core = os.cpu_count()-3
print(n_core)

9


In [2]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from xgboost.sklearn import XGBClassifier

from catboost import CatBoostClassifier

%matplotlib inline

In [3]:
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

feature = 'Transported'
train_data[feature] = train_data[feature].astype(int)

feature = 'CryoSleep'
train_data[feature] = train_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')
test_data[feature] = test_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')

feature = 'VIP'
train_data[feature] = train_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')
test_data[feature] = test_data[feature].replace({'False': 0, 'True': 1}).astype('Int64')

label = 'Transported'

id = test_data['PassengerId']

In [5]:
models = [
    LogisticRegression(random_state=0),
    SVC(random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    XGBClassifier(random_state=0),
    CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
]

# Functions

In [6]:
def preprocess(columns_to_drop, train_data, test_data, label):
    X_train = train_data.drop([label] + columns_to_drop, axis=1)
    Y_train = train_data[label]
    X_test = test_data.drop(columns_to_drop, axis=1)

    columns_num = X_train.select_dtypes(include='number').columns
    columns_cat = X_train.select_dtypes(include='O').columns

    X_train[columns_num] = X_train[columns_num].fillna(X_train[columns_num].median())
    X_train[columns_cat] = X_train[columns_cat].fillna(X_train[columns_cat].mode().loc[0])

    X_test[columns_num] = X_test[columns_num].fillna(X_train[columns_num].median())
    X_test[columns_cat] = X_test[columns_cat].fillna(X_train[columns_cat].mode().loc[0])

    encoder_onehot = OneHotEncoder(dtype=int, sparse_output=False, handle_unknown='ignore')

    train_data_onehot = encoder_onehot.fit_transform(X_train[columns_cat])
    train_feature_name_onehot = encoder_onehot.get_feature_names_out()
    train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot)

    test_data_onehot = encoder_onehot.transform(X_test[columns_cat])
    test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot)

    X_train = pd.concat([X_train[columns_num], train_data_onehot], axis=1)
    X_test = pd.concat([X_test[columns_num], test_data_onehot], axis=1)

    return X_train, Y_train, X_test

In [7]:
def evaluation(X_train, Y_train):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []

    cv = ShuffleSplit(random_state=0)

    for model in models:
        model = clone(model)
        cv_scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring='accuracy', n_jobs=n_core)
        model_scores.append(cv_scores.mean())

    cv_results = pd.DataFrame(columns=['name', 'score_mean'])
    cv_results['name'] = model_names
    cv_results['score_mean'] = model_scores

    return cv_results

In [8]:
def predict(model, X_train, Y_train, X_test, id, label, filename = 'result_basic.csv'):
    model = clone(model)

    model.fit(X_train, Y_train)
    predictions = model.predict(X_test).astype(bool)

    result = pd.DataFrame({
        id.name: id,
        label: predictions
    })
    result.to_csv('./Result/' + filename, index=False)

In [9]:
def tuneParameters(model, tuning_parameters, X_train, Y_train, n_core):
    cv = ShuffleSplit(random_state=0)

    tune_model = GridSearchCV(model, param_grid=tuning_parameters, scoring='accuracy', cv=cv, n_jobs=n_core)

    tune_model.fit(X_train, Y_train)
    
    return tune_model.best_score_, tune_model.best_params_

# Base Model

In [26]:
columns_to_drop = ['PassengerId', 'Name']

X_train, Y_train, X_test = preprocess(columns_to_drop, train_data, test_data, label)
print(X_train.shape, Y_train.shape, X_test.shape)

(8693, 6574) (8693,) (4277, 6574)


In [9]:
cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.774023
1,SVC,0.781839
2,DecisionTreeClassifier,0.765517
3,RandomForestClassifier,0.787586
4,GradientBoostingClassifier,0.786897
5,XGBClassifier,0.783563
6,CatBoostClassifier,0.788161


In [27]:
model = DecisionTreeClassifier(random_state=0)
predict(model, X_train, Y_train, X_test, id, label)

# Improve

## Attempt1

In [9]:
train_data_new, test_data_new = train_data.copy(), test_data.copy()

train_data_new['Cabin'] = train_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])
test_data_new['Cabin'] = test_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])

train_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data_new['Cabin'].str.split('/', expand=True)
test_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = test_data_new['Cabin'].str.split('/', expand=True)

columns_to_drop = ['PassengerId', 'Name', 'Cabin', 'CabinNum']

X_train, Y_train, X_test = preprocess(columns_to_drop, train_data_new, test_data_new, label)
print(X_train.shape, Y_train.shape, X_test.shape)

(8693, 24) (8693,) (4277, 24)


In [20]:
cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.780575
1,SVC,0.781839
2,DecisionTreeClassifier,0.733793
3,RandomForestClassifier,0.786207
4,GradientBoostingClassifier,0.796437
5,XGBClassifier,0.792874
6,CatBoostClassifier,0.803563


In [21]:
cv = ShuffleSplit(random_state=0)
model = RandomForestClassifier(random_state=0, oob_score=True)
selector = RFECV(estimator=model, cv=cv, scoring='accuracy', n_jobs=n_core)
selector.fit(X_train, Y_train)

In [22]:
importances = pd.DataFrame(columns=['feature', 'importance'])
importances['feature'] = selector.get_feature_names_out()
importances['importance'] = selector.estimator_.feature_importances_

importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,Age,0.17184
2,RoomService,0.125341
5,Spa,0.12041
6,VRDeck,0.112883
3,FoodCourt,0.108934
0,CryoSleep,0.104521
4,ShoppingMall,0.094884
7,HomePlanet_Earth,0.029127
14,CabinDeck_E,0.017816
8,HomePlanet_Europa,0.017548


In [29]:
features_to_drop = [f for f in X_train.columns if f not in selector.get_feature_names_out()]
features_to_drop

['VIP', 'CabinDeck_A', 'CabinDeck_B', 'CabinDeck_D', 'CabinDeck_T']

In [30]:
X_train = X_train.drop(features_to_drop, axis=1)
X_test = X_test.drop(features_to_drop, axis=1)

model = RandomForestClassifier(random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, id, label, 'result.csv')

## Attempt2

In [10]:
train_data_new, test_data_new = train_data.copy(), test_data.copy()

train_data_new['Cabin'] = train_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])
test_data_new['Cabin'] = test_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])

train_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data_new['Cabin'].str.split('/', expand=True)
test_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = test_data_new['Cabin'].str.split('/', expand=True)

train_data_new['TotalSpending'] = train_data_new[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_data_new['TotalSpending'] = test_data_new[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

train_data_new['LuxurySpending'] = train_data_new[['RoomService', 'Spa', 'VRDeck']].sum(axis=1)
test_data_new['LuxurySpending'] = test_data_new[['RoomService', 'Spa', 'VRDeck']].sum(axis=1)

train_data_new['EssentialSpending'] = train_data_new[['FoodCourt', 'ShoppingMall']].sum(axis=1)
test_data_new['EssentialSpending'] = test_data_new[['FoodCourt', 'ShoppingMall']].sum(axis=1)

train_data_new['HasSpending'] = train_data_new['TotalSpending'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasSpending'] = test_data_new['TotalSpending'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['PassengerGroup'] = train_data_new['PassengerId'].apply(lambda x: x.split('_')[0])
test_data_new['PassengerGroup'] = test_data_new['PassengerId'].apply(lambda x: x.split('_')[0])

count_train = train_data_new['PassengerGroup'].value_counts()
count_test = test_data_new['PassengerGroup'].value_counts()

train_data_new['PassengerGroupSize'] = train_data_new['PassengerGroup'].apply(lambda x: count_train[x])
test_data_new['PassengerGroupSize'] = test_data_new['PassengerGroup'].apply(lambda x: count_test[x])

train_data_new['Alone'] = train_data_new['PassengerGroupSize'].apply(lambda x: 1 if x == 1 else 0)
test_data_new['Alone'] = test_data_new['PassengerGroupSize'].apply(lambda x: 1 if x == 1 else 0)

columns_to_drop = ['PassengerId', 'Name', 'Cabin', 'CabinNum', 'PassengerGroup']

X_train, Y_train, X_test = preprocess(columns_to_drop, train_data_new, test_data_new, label)
print(X_train.shape, Y_train.shape, X_test.shape)

(8693, 30) (8693,) (4277, 30)


In [10]:
cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.782299
1,SVC,0.782299
2,DecisionTreeClassifier,0.736552
3,RandomForestClassifier,0.784828
4,GradientBoostingClassifier,0.796207
5,XGBClassifier,0.797356
6,CatBoostClassifier,0.808736


### Feature Selection

In [11]:
cv = ShuffleSplit(random_state=0)
model = RandomForestClassifier(random_state=0, oob_score=True)
selector = RFECV(estimator=model, cv=cv, scoring='accuracy', n_jobs=n_core)
selector.fit(X_train, Y_train)

importances = pd.DataFrame(columns=['feature', 'importance'])
importances['feature'] = selector.get_feature_names_out()
importances['importance'] = selector.estimator_.feature_importances_

importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
8,LuxurySpending,0.183731
1,Age,0.147848
7,TotalSpending,0.102294
9,EssentialSpending,0.079989
3,FoodCourt,0.049263
2,RoomService,0.048569
10,HasSpending,0.047883
5,Spa,0.047826
6,VRDeck,0.04632
4,ShoppingMall,0.039706


In [12]:
features_to_drop = [f for f in X_train.columns if f not in selector.get_feature_names_out()]
features_to_drop

['VIP', 'CabinDeck_A', 'CabinDeck_B', 'CabinDeck_D', 'CabinDeck_T']

In [13]:
X_train = X_train.drop(features_to_drop, axis=1)
X_test = X_test.drop(features_to_drop, axis=1)

cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.784138
1,SVC,0.782529
2,DecisionTreeClassifier,0.741609
3,RandomForestClassifier,0.786322
4,GradientBoostingClassifier,0.798046
5,XGBClassifier,0.795862
6,CatBoostClassifier,0.808621


In [55]:
model = RandomForestClassifier(random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, id, label, 'result.csv')

### Hyperparameter Tuning

In [11]:
features_to_drop = ['VIP', 'CabinDeck_A', 'CabinDeck_B', 'CabinDeck_D', 'CabinDeck_T']

X_train = X_train.drop(features_to_drop, axis=1)
X_test = X_test.drop(features_to_drop, axis=1)

In [12]:
tuning_parameters = {
    'C': [0.01, 0.1, 1, 10, 100],
}

model = LogisticRegression(max_iter=100000, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7858620689655174
{'C': 0.01}


In [13]:
tuning_parameters = {
    'C': [0.01, 0.1, 1, 10, 100],
}

model = SVC(cache_size=2000, max_iter=100000, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7848275862068965
{'C': 10}


In [27]:
tuning_parameters = {
    'max_depth': [i for i in range(2, 12, 2)]
}

model = DecisionTreeClassifier(random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7922988505747127
{'max_depth': 6}


In [28]:
tuning_parameters = {
    'min_samples_split': [i for i in range(2, 12, 2)]
}

model = DecisionTreeClassifier(max_depth=6, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7922988505747127
{'min_samples_split': 2}


In [29]:
tuning_parameters = {
    'max_leaf_nodes': [i for i in range(5, 100, 5)]
}

model = DecisionTreeClassifier(max_depth=6, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7931034482758621
{'max_leaf_nodes': 45}


In [30]:
tuning_parameters = {
    'min_samples_leaf': [i for i in range(1, 10, 2)]
}

model = DecisionTreeClassifier(max_depth=6, max_leaf_nodes=45, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7945977011494254
{'min_samples_leaf': 9}


In [31]:
tuning_parameters = {
    'max_features': ['sqrt', 'log2', X_train.shape[1]]
}

model = DecisionTreeClassifier(max_depth=6, max_leaf_nodes=45, min_samples_leaf=9, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7945977011494254
{'max_features': 25}


In [15]:
tuning_parameters = {
    'max_depth': [i for i in range(2, 12, 2)]
}

model = RandomForestClassifier(random_state=0, oob_score=True)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8026436781609195
{'max_depth': 10}


In [16]:
tuning_parameters = {
    'min_samples_split': [i for i in range(2, 12, 2)]
}

model = RandomForestClassifier(max_depth=10, random_state=0, oob_score=True)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'min_samples_split': 4}


In [20]:
tuning_parameters = {
    'max_leaf_nodes': [i for i in range(5, 100, 5)]
}

model = RandomForestClassifier(max_depth=10, min_samples_split=4, random_state=0, oob_score=True)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.7971264367816092
{'max_leaf_nodes': 95}


In [21]:
tuning_parameters = {
    'min_samples_leaf': [i for i in range(1, 10, 2)]
}

model = RandomForestClassifier(max_depth=10, min_samples_split=4, random_state=0, oob_score=True)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'min_samples_leaf': 1}


In [22]:
tuning_parameters = {
    'n_estimators': [50, 100, 300, 500, 1000]
}

model = RandomForestClassifier(max_depth=10, min_samples_split=4, random_state=0, oob_score=True)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'n_estimators': 50}


In [23]:
tuning_parameters = {
    'max_samples': [i * 0.1 for i in range(1, 11)]
}

model = RandomForestClassifier(max_depth=10, min_samples_split=4, n_estimators=50, random_state=0, oob_score=True)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'max_samples': 1.0}


In [25]:
tuning_parameters = {
    'max_features': ['sqrt', 'log2', X_train.shape[1]]
}

model = RandomForestClassifier(max_depth=10, min_samples_split=4, n_estimators=50, random_state=0, oob_score=True)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'max_features': 'sqrt'}


In [32]:
tuning_parameters = {
    'n_estimators': [50, 100, 300, 500, 1000],
}

model = GradientBoostingClassifier(random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8026436781609195
{'n_estimators': 300}


In [33]:
tuning_parameters = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 4, 6, 8, 10],
}

model = GradientBoostingClassifier(n_estimators=300, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8020689655172415
{'max_depth': 4, 'min_samples_split': 2}


In [34]:
tuning_parameters = {
    'max_features': ['sqrt', 'log2', X_train.shape[0]]
}

model = GradientBoostingClassifier(n_estimators=300, max_depth=4, min_samples_split=2, random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8022988505747126
{'max_features': 'log2'}


In [36]:
tuning_parameters = {
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3]
}

model = GradientBoostingClassifier(n_estimators=300, max_depth=4, min_samples_split=2, max_features='log2', random_state=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8022988505747126
{'learning_rate': 0.1}


In [37]:
tuning_parameters = {
    'n_estimators': [50, 100, 300, 500, 1000],
}

model = XGBClassifier(seed=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.801264367816092
{'n_estimators': 50}


In [38]:
tuning_parameters = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_child_weight': [2, 4, 6, 8, 10]
}

model = XGBClassifier(n_estimators=50, seed=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8027586206896553
{'max_depth': 4, 'min_child_weight': 6}


In [39]:
tuning_parameters = {
    'gamma': [0, 0.0001, 0.001, 0.01, 0.1,]
}

model = XGBClassifier(n_estimators=50, max_depth=4, min_child_weight=6, seed=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8028735632183908
{'gamma': 0.01}


In [40]:
tuning_parameters = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}

model = XGBClassifier(n_estimators=50, max_depth=4, min_child_weight=6, gamma=0.01, seed=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'colsample_bytree': 0.6, 'subsample': 0.7}


In [41]:
tuning_parameters = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}

model = XGBClassifier(n_estimators=50, max_depth=4, min_child_weight=6, gamma=0.01, subsample=0.7, colsample_bytree=0.6, seed=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'reg_alpha': 1e-05}


In [42]:
tuning_parameters = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
}

model = XGBClassifier(n_estimators=50, max_depth=4, min_child_weight=6, gamma=0.01, subsample=0.7, colsample_bytree=0.6, reg_alpha=1e-5, seed=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'reg_lambda': 1}


In [43]:
tuning_parameters = {
    'eta': [0.01, 0.05, 0.1, 0.2, 0.3]
}

model = XGBClassifier(n_estimators=50, max_depth=4, min_child_weight=6, gamma=0.01, subsample=0.7, colsample_bytree=0.6, reg_alpha=1e-5, reg_lambda=1, seed=0)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8029885057471265
{'eta': 0.3}


In [44]:
tuning_parameters = {
    'iterations': [50, 100, 300, 500, 1000],
}

model = CatBoostClassifier(random_state=0, verbose=0, allow_writing_files=False)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8086206896551724
{'iterations': 1000}


In [46]:
tuning_parameters = {
    'depth': [2, 4, 6, 8, 10]
}

model = CatBoostClassifier(iterations=1000, random_state=0, verbose=0, allow_writing_files=False)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, 1)
print(best_score)
print(best_params)

0.8086206896551724
{'depth': 6}


In [47]:
tuning_parameters = {
    'l2_leaf_reg': [1e-5, 1e-2, 0.1, 1, 100]
}

model = CatBoostClassifier(iterations=1000, depth=6, random_state=0, verbose=0, allow_writing_files=False)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8072413793103449
{'l2_leaf_reg': 1}


In [51]:
tuning_parameters = {
    'random_strength': [1e-5, 0.01, 1, 10, 100]
}

model = CatBoostClassifier(iterations=1000, depth=6, l2_leaf_reg=1, random_state=0, verbose=0, allow_writing_files=False)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8072413793103449
{'random_strength': 1}


In [52]:
tuning_parameters = {
    'bagging_temperature': [0, 1e-5, 0.01, 1, 10, 100]
}

model = CatBoostClassifier(iterations=1000, depth=6, l2_leaf_reg=1, random_strength=1, random_state=0, verbose=0, allow_writing_files=False)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8072413793103449
{'bagging_temperature': 0}


In [53]:
tuning_parameters = {
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
}

model = CatBoostClassifier(iterations=1000, depth=6, l2_leaf_reg=1, random_strength=1, bagging_temperature=0, random_state=0, verbose=0, allow_writing_files=False)
best_score, best_params = tuneParameters(model, tuning_parameters, X_train, Y_train, n_core)
print(best_score)
print(best_params)

0.8072413793103449
{'grow_policy': 'SymmetricTree'}


In [60]:
X_train.dtypes

CryoSleep                      Int64
Age                          float32
RoomService                  float32
FoodCourt                    float32
ShoppingMall                 float32
Spa                          float32
VRDeck                       float32
TotalSpending                float32
LuxurySpending               float32
EssentialSpending            float32
HasSpending                    int64
PassengerGroupSize             int64
Alone                          int64
HomePlanet_Earth               int32
HomePlanet_Europa              int32
HomePlanet_Mars                int32
Destination_55 Cancri e        int32
Destination_PSO J318.5-22      int32
Destination_TRAPPIST-1e        int32
CabinDeck_C                    int32
CabinDeck_E                    int32
CabinDeck_F                    int32
CabinDeck_G                    int32
CabinSide_P                    int32
CabinSide_S                    int32
dtype: object

In [61]:
tuned_models = [
    ('LR', LogisticRegression(C=0.01, max_iter=100000, random_state=0)),
    ('SVC', SVC(C=10, cache_size=2000, max_iter=100000, random_state=0)),
    ('DT', DecisionTreeClassifier(max_depth=6, max_leaf_nodes=45, min_samples_leaf=9, max_features=25, random_state=0)),
    ('RF', RandomForestClassifier(max_depth=10, min_samples_split=4, n_estimators=50, max_features='sqrt', random_state=0, oob_score=True)),
    ('GB', GradientBoostingClassifier(n_estimators=300, max_depth=4, min_samples_split=2, max_features='log2', learning_rate=0.1, random_state=0)),
    ('XGB', XGBClassifier(n_estimators=50, max_depth=4, min_child_weight=6, gamma=0.01, subsample=0.7, colsample_bytree=0.6, reg_alpha=1e-5, reg_lambda=1, eta=0.3, seed=0)),
    ('CB', CatBoostClassifier(iterations=1000, depth=6, l2_leaf_reg=1, random_strength=1, bagging_temperature=0, grow_policy='SymmetricTree', random_state=0, verbose=0, allow_writing_files=False))
]

columns_float = X_train.select_dtypes(include=['Int64']).columns
X_train[columns_float] = X_train[columns_float].astype('int32')
X_test[columns_float] = X_test[columns_float].astype('int32')

model = VotingClassifier(tuned_models)
predict(model, X_train, Y_train, X_test, id, label, 'result.csv')



## Attempt3

In [42]:
train_data_new, test_data_new = train_data.copy(), test_data.copy()

train_data_new['Cabin'] = train_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])
test_data_new['Cabin'] = test_data_new['Cabin'].fillna(train_data_new['Cabin'].mode().loc[0])

train_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = train_data_new['Cabin'].str.split('/', expand=True)
test_data_new[['CabinDeck', 'CabinNum', 'CabinSide']] = test_data_new['Cabin'].str.split('/', expand=True)

for f in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train_data_new[f+'_Log'] = np.log1p(train_data_new[f])
    test_data_new[f+'_Log'] = np.log1p(test_data_new[f])

train_data_new['TotalSpending'] = train_data_new[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_data_new['TotalSpending'] = test_data_new[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

train_data_new['TotalSpending_Log'] = np.log1p(train_data_new['TotalSpending'])
test_data_new['TotalSpending_Log'] = np.log1p(test_data_new['TotalSpending'])

train_data_new['LuxurySpending'] = train_data_new[['RoomService', 'Spa', 'VRDeck']].sum(axis=1)
test_data_new['LuxurySpending'] = test_data_new[['RoomService', 'Spa', 'VRDeck']].sum(axis=1)

train_data_new['LuxurySpending_Log'] = np.log1p(train_data_new['LuxurySpending'])
test_data_new['LuxurySpending_Log'] = np.log1p(test_data_new['LuxurySpending'])

train_data_new['EssentialSpending'] = train_data_new[['FoodCourt', 'ShoppingMall']].sum(axis=1)
test_data_new['EssentialSpending'] = test_data_new[['FoodCourt', 'ShoppingMall']].sum(axis=1)

train_data_new['EssentialSpending_Log'] = np.log1p(train_data_new['EssentialSpending'])
test_data_new['EssentialSpending_Log'] = np.log1p(test_data_new['EssentialSpending'])

train_data_new['HasSpending'] = train_data_new['TotalSpending'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasSpending'] = test_data_new['TotalSpending'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['PassengerGroup'] = train_data_new['PassengerId'].apply(lambda x: x.split('_')[0])
test_data_new['PassengerGroup'] = test_data_new['PassengerId'].apply(lambda x: x.split('_')[0])

count_train = train_data_new['PassengerGroup'].value_counts()
count_test = test_data_new['PassengerGroup'].value_counts()

train_data_new['PassengerGroupSize'] = train_data_new['PassengerGroup'].apply(lambda x: count_train[x])
test_data_new['PassengerGroupSize'] = test_data_new['PassengerGroup'].apply(lambda x: count_test[x])

train_data_new['Alone'] = train_data_new['PassengerGroupSize'].apply(lambda x: 1 if x == 1 else 0)
test_data_new['Alone'] = test_data_new['PassengerGroupSize'].apply(lambda x: 1 if x == 1 else 0)

columns_to_drop = ['PassengerId', 'Name', 'Cabin', 'CabinNum', 'PassengerGroup']

X_train, Y_train, X_test = preprocess(columns_to_drop, train_data_new, test_data_new, label)
print(X_train.shape, Y_train.shape, X_test.shape)

(8693, 38) (8693,) (4277, 38)


In [17]:
cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.788506
1,SVC,0.782299
2,DecisionTreeClassifier,0.738276
3,RandomForestClassifier,0.781609
4,GradientBoostingClassifier,0.796207
5,XGBClassifier,0.797356
6,CatBoostClassifier,0.808391


### Feature Selection

In [18]:
cv = ShuffleSplit(random_state=0)
model = RandomForestClassifier(random_state=0, oob_score=True)
selector = RFECV(estimator=model, cv=cv, scoring='accuracy', n_jobs=n_core)
selector.fit(X_train, Y_train)

importances = pd.DataFrame(columns=['feature', 'importance'])
importances['feature'] = selector.get_feature_names_out()
importances['importance'] = selector.estimator_.feature_importances_

importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
15,LuxurySpending_Log,0.143379
1,Age,0.122805
14,LuxurySpending,0.108675
13,TotalSpending_Log,0.075096
12,TotalSpending,0.054457
16,EssentialSpending,0.053692
17,EssentialSpending_Log,0.049071
19,PassengerGroupSize,0.03516
0,CryoSleep,0.027588
8,FoodCourt_Log,0.027033


In [19]:
features_to_drop = [f for f in X_train.columns if f not in selector.get_feature_names_out()]
features_to_drop

['VIP',
 'Alone',
 'HomePlanet_Mars',
 'Destination_55 Cancri e',
 'Destination_PSO J318.5-22',
 'Destination_TRAPPIST-1e',
 'CabinDeck_A',
 'CabinDeck_B',
 'CabinDeck_C',
 'CabinDeck_D',
 'CabinDeck_F',
 'CabinDeck_T',
 'CabinSide_P']

In [20]:
X_train = X_train.drop(features_to_drop, axis=1)
X_test = X_test.drop(features_to_drop, axis=1)

cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.788621
1,SVC,0.782529
2,DecisionTreeClassifier,0.731379
3,RandomForestClassifier,0.781724
4,GradientBoostingClassifier,0.795287
5,XGBClassifier,0.794713
6,CatBoostClassifier,0.805517


In [55]:
columns_to_drop = ['PassengerId', 'Name', 'Cabin', 
                   'CabinNum', 'PassengerGroup', 'PassengerGroupSize', 
                   'RoomService', 'FoodCourt', 'ShoppingMall', 
                   'Spa', 'VRDeck', 'TotalSpending', 
                   'LuxurySpending', 'EssentialSpending']

X_train, Y_train, X_test = preprocess(columns_to_drop, train_data_new, test_data_new, label)
print(X_train.shape, Y_train.shape, X_test.shape)

(8693, 29) (8693,) (4277, 29)


In [22]:
cv_results = evaluation(X_train, Y_train)
cv_results

Unnamed: 0,name,score_mean
0,LogisticRegression,0.792414
1,SVC,0.778391
2,DecisionTreeClassifier,0.738851
3,RandomForestClassifier,0.78092
4,GradientBoostingClassifier,0.799885
5,XGBClassifier,0.80092
6,CatBoostClassifier,0.808621
