In [2]:
import numpy as np
import pandas as pd
import random as rnd
import re

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from pycaret.classification import *
from pycaret.anomaly import *

In [56]:
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [None]:
full_df = pd.concat([train_df, test_df], ignore_index=True)
sub_id = test_df.PassengerId

In [7]:
train_df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [58]:
missing_features_freq = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name', 
                        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
missing_features_mean = ['Age']

def missing_fill(df):
    for feature in missing_features_freq:
        most_freq = df[feature].value_counts().index[0]
        df[feature].fillna(most_freq, inplace=True)
        
    for feature in missing_features_mean:
        df[feature].fillna(df[feature].mean(), inplace=True)   
missing_fill(full_df)

In [59]:
# Pandas.Series.str.split     
# when using expand=True, the split elements will expand out into separate columns. 
# If NaN is present, it is propagated throughout the columns during the split.
# 然后可以同时赋值给三列。如果不使用expand的话，不能同时赋值给三列
full_df[['Deck','Num','Side']] = full_df['Cabin'].str.split('/', expand=True)

In [60]:
full_df['Surname'] = full_df['Name'].str.split(expand=True)[0]

In [61]:
# numpy.log1p return log(1+x)
full_df['TotalCharge'] = np.log1p(full_df['RoomService'] + full_df['FoodCourt'] \
                                  + full_df['ShoppingMall'] + full_df['Spa'] + full_df['VRDeck'])

In [62]:
drop_features = ['PassengerId', 'Cabin', 'Name']
for feature in drop_features:
    full_df.drop(feature, axis=1, inplace=True)

In [63]:
le_features = ['Deck', 'Surname']
for feature in le_features:
    le = LabelEncoder()
    full_df[feature]= le.fit_transform(full_df[feature])

In [64]:
full_df['Num'] = full_df['Num'].astype('int')
full_df['CryoSleep'] = full_df['CryoSleep'].astype('str')
full_df['VIP'] = full_df['VIP'].astype('str')

In [65]:
dum_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Side']
full_df = full_df.join(pd.get_dummies(full_df[dum_features]))
for feature in dum_features:
    full_df.drop(feature, axis=1, inplace=True)

In [66]:
train_df = full_df[full_df['Transported'].notna()]
test_df = full_df[full_df['Transported'].isna()]

In [68]:
test_df = test_df.drop('Transported', axis=1)

In [76]:
test_df = test_df.reset_index(drop=True)

<font color=black size=5 face=雅黑>**Modeling**</font>

In [None]:
model = pycaret.classification.setup(data=train_df, target='Transported', session_id=100, silent=True)

In [48]:
catboost_model = pycaret.classification.create_model('catboost', fold=10, round=6)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.804598,0.896098,0.791667,0.820598,0.805873,0.609306,0.609704
1,0.783251,0.887583,0.794872,0.78481,0.789809,0.566099,0.566148
2,0.829228,0.914584,0.836538,0.83121,0.833866,0.658194,0.658208
3,0.807882,0.902325,0.794872,0.82392,0.809135,0.615872,0.616274
4,0.817734,0.909447,0.833333,0.815047,0.824089,0.63504,0.635208
5,0.848684,0.929379,0.858974,0.848101,0.853503,0.697054,0.697114
6,0.824013,0.915446,0.868167,0.803571,0.834621,0.647171,0.649381
7,0.824013,0.901155,0.829582,0.826923,0.82825,0.647813,0.647816
8,0.802632,0.89401,0.826367,0.795666,0.810726,0.604694,0.605167
9,0.809211,0.900679,0.826367,0.805643,0.815873,0.617987,0.618202


In [51]:
cat_tune = pycaret.classification.tune_model(catboost_model, n_iter=20)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7997,0.8934,0.7885,0.8146,0.8013,0.5994,0.5997
1,0.7816,0.8883,0.7885,0.7859,0.7872,0.5629,0.5629
2,0.8227,0.9132,0.8365,0.8208,0.8286,0.6449,0.6451
3,0.8062,0.8997,0.7949,0.8212,0.8078,0.6126,0.6129
4,0.8342,0.9113,0.8462,0.8328,0.8394,0.668,0.6681
5,0.8438,0.9275,0.859,0.8401,0.8494,0.6871,0.6873
6,0.8289,0.9173,0.881,0.8035,0.8405,0.6569,0.6602
7,0.8273,0.9011,0.8296,0.8323,0.8309,0.6544,0.6545
8,0.8141,0.8939,0.8457,0.8018,0.8232,0.6276,0.6286
9,0.8092,0.8989,0.8264,0.8056,0.8159,0.618,0.6182


In [None]:
tmp_pred = pycaret.classification.predict_model(cat_tune, data=test_df)

In [78]:
submission = pd.DataFrame({
    'PassengerId': sub_id,
    'Transported': tmp_pred['Label']
})

In [80]:
submission.to_csv('output/submission_pycate.csv', index=False)   # 0.80406

In [81]:
top4_model = compare_models(fold=10, n_select=4)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8151,0.9051,0.8261,0.8155,0.8206,0.6299,0.6303,2.586
lightgbm,Light Gradient Boosting Machine,0.8049,0.9017,0.8033,0.8138,0.8083,0.6098,0.6101,0.13
gbc,Gradient Boosting Classifier,0.8044,0.8951,0.8286,0.7977,0.8127,0.6083,0.6092,0.538
rf,Random Forest Classifier,0.8038,0.8893,0.7805,0.8268,0.8028,0.6079,0.6092,0.47
xgboost,Extreme Gradient Boosting,0.803,0.8945,0.7949,0.8158,0.8051,0.6059,0.6064,0.913
et,Extra Trees Classifier,0.8007,0.8806,0.7731,0.8265,0.7988,0.6017,0.6032,0.442
ada,Ada Boost Classifier,0.7961,0.8806,0.8447,0.7767,0.8091,0.591,0.5937,0.267
lr,Logistic Regression,0.787,0.8788,0.8193,0.7769,0.7975,0.5732,0.5743,1.404
ridge,Ridge Classifier,0.7747,0.0,0.751,0.7973,0.7732,0.5498,0.551,0.013
lda,Linear Discriminant Analysis,0.7745,0.8597,0.7506,0.7972,0.773,0.5495,0.5507,0.038


In [None]:
blender = blend_models(top4_model)
tmp_pred = pycaret.classification.predict_model(blender, data=test_df)
submission = pd.DataFrame({
    'PassengerId': sub_id,
    'Transported': tmp_pred['Label']
})

In [84]:
submission.to_csv('output/submission_pycaret_blender.csv', index=False) # 0.79752

In [85]:
# choose the best 5 model, use GridSearchCV to tune parameters，then stack 5 model

In [3]:
Final_RF_model = RandomForestClassifier(max_depth = 10,
                                        min_samples_leaf = 19,
                                        min_samples_split = 7,
                                        n_estimators = 300)

Final_LGB_model = LGBMClassifier(learning_rate = 0.15, 
                                 n_estimators =400, 
                                 num_leaves = 17)

Final_CAT_model = CatBoostClassifier(depth=7, 
                                     n_estimators=300, 
                                     l2_leaf_reg=7, 
                                     learning_rate = 0.15)

Fianl_GB_model = GradientBoostingClassifier(learning_rate=0.15,
                                            max_depth=3,
                                           n_estimators=400)

In [None]:
stacking_model = StackingClassifier(estimators=
            [('RF', Final_RF_model), ('LGBM', Final_LGB_model), 
             ('CAT', Final_CAT_model), ('GB', Fianl_GB_model)])
stacking_model.fit(tr, tg)

In [None]:
stack_pred = stacking_model.predict(test_df)
stack_pred = stack_pred.astype('bool')

In [102]:
submission = pd.DataFrame({
    'PassengerId': sub_id,
    'Transported': stack_pred
})

In [105]:
submission.to_csv('output/submission_stack.csv', index=False) # 0.80009