In [33]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [6]:
full_df = train_df.append(test_df, ignore_index=True) 

In [7]:
train_df.shape, test_df.shape

((8693, 14), (4277, 13))

In [8]:
full_df.shape

(12970, 14)

In [9]:
full_df['HomePlanet'] = full_df['HomePlanet'].fillna('Earth') 
full_df['CryoSleep'] = full_df['CryoSleep'].fillna(True) 

In [10]:
full_df.loc[full_df.Cabin.notna(), 'Cabin_beck'] = \
            full_df.loc[full_df.Cabin.notna(), 'Cabin'].astype(str).map(lambda x: x.split('/')[0])
full_df.loc[full_df.Cabin.notna(), 'Cabin_side'] = \
            full_df.loc[full_df.Cabin.notna(), 'Cabin'].astype(str).map(lambda x: x.split('/')[2])

In [11]:
full_df['Cabin_beck'] = full_df['Cabin_beck'].fillna('F')
full_df['Cabin_side'] = full_df['Cabin_side'].fillna('P')

In [12]:
full_df['Destination'] = full_df['Destination'].fillna('TRAPPIST-1e')

In [14]:
full_df['Age'].fillna(full_df['Age'].mean(), inplace=True)

In [15]:
full_df['VIP'] = full_df['VIP'].fillna(False)

In [16]:
full_df['RoomService'].fillna(full_df.RoomService.mode()[0], inplace=True)
full_df['FoodCourt'].fillna(full_df.FoodCourt.mode()[0], inplace=True)
full_df['ShoppingMall'].fillna(full_df.ShoppingMall.mode()[0], inplace=True)
full_df['Spa'].fillna(full_df.Spa.mode()[0], inplace=True)
full_df['VRDeck'].fillna(full_df.VRDeck.mode()[0], inplace=True)

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
le1 = LabelEncoder()
le1.fit(full_df['HomePlanet'])
full_df['HomePlanet'] = le1.transform(full_df['HomePlanet'])

In [19]:
le2 = LabelEncoder()
le2.fit(full_df['CryoSleep'])
full_df['CryoSleep'] = le2.transform(full_df['CryoSleep'])

In [20]:
le3 = LabelEncoder()
le3.fit(full_df['Destination'])
full_df['Destination'] = le3.transform(full_df['Destination'])

In [21]:
le4 = LabelEncoder()
le4.fit(full_df['VIP'])
full_df['VIP'] = le4.transform(full_df['VIP'])

In [22]:
le5 = LabelEncoder()
le5.fit(full_df['Cabin_beck'])
full_df['Cabin_beck'] = le5.transform(full_df['Cabin_beck'])

In [23]:
le6 = LabelEncoder()
le6.fit(full_df['Cabin_side'])
full_df['Cabin_side'] = le6.transform(full_df['Cabin_side'])

In [24]:
FEATURES = ["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService", \
            "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Cabin_beck", "Cabin_side"]
TARGET = ["Transported"]

In [25]:
train_X = full_df[full_df.Transported.notnull()][FEATURES]
train_y = full_df[full_df.Transported.notnull()][TARGET]
train_y = train_y.astype(int).values.ravel()

test_X = full_df[full_df.Transported.isnull()][FEATURES]

<font color=black size=5 face=雅黑>**Modeling**</font>

In [26]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

In [29]:
sfolder = StratifiedKFold(n_splits=5)
classifiers = []
classifiers.append(SVC())
classifiers.append(RandomForestClassifier())
classifiers.append(GradientBoostingClassifier())
classifiers.append(DecisionTreeClassifier())


cv_results =[]
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, train_X, train_y,
                                     scoring='accuracy', cv=sfolder, n_jobs=-1))

In [40]:
cv_means = []
cv_stds = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_stds.append(cv_result.std())

cvResDf = pd.DataFrame({ 'cv_algorithm': ['SVC','RandomForestClassifier',
                                  'GradientBoostingClassifier','DecisionTreeClassifier'],
                       'cv_mean': cv_means,
                        'cv_std': cv_stds,})
cvResDf

Unnamed: 0,cv_algorithm,cv_mean,cv_std
0,SVC,0.786268,0.012855
1,RandomForestClassifier,0.78661,0.008227
2,GradientBoostingClassifier,0.795355,0.009276
3,DecisionTreeClassifier,0.740713,0.012499


In [41]:
import warnings
warnings.filterwarnings('ignore')


# choose best estimators ---> GradientBoostingClassifier
# first tune the number of estimator, because sometimes the best paramter of learning_rate, max_depth, etc would 
# depend on the number of estimator
params_test1 = {'n_estimators':[20, 40, 60, 80, 100, 120, 140, 160, 200]}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                  min_samples_leaf=20,max_depth=8, subsample=0.8,random_state=10), 
                        param_grid=params_test1, scoring='roc_auc', cv=5)
gsearch1.fit(train_X, train_y)

GridSearchCV(cv=5,
             estimator=GradientBoostingClassifier(max_depth=8,
                                                  min_samples_leaf=20,
                                                  min_samples_split=300,
                                                  random_state=10,
                                                  subsample=0.8),
             param_grid={'n_estimators': [20, 40, 60, 80, 100, 120, 140, 160,
                                          200]},
             scoring='roc_auc')

In [42]:
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 60}, 0.8865234515438265)

In [43]:
# After the best paramter of number of estimator is decided, tune learning_rate, max_depth and min_sample_split
params_test2 = {'learning_rate': [0.06, 0.08, 0.1, 0.12, 0.14, 0.16],
                'max_depth': range(3, 14, 2),
                'min_samples_split': range(100, 801, 200)}
gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=60, subsample=0.8,random_state=10), 
                        param_grid=params_test2, scoring='roc_auc', cv=5)
gsearch2.fit(train_X, train_y)

GridSearchCV(cv=5,
             estimator=GradientBoostingClassifier(n_estimators=60,
                                                  random_state=10,
                                                  subsample=0.8),
             param_grid={'learning_rate': [0.06, 0.08, 0.1, 0.12, 0.14, 0.16],
                         'max_depth': range(3, 14, 2),
                         'min_samples_split': range(100, 801, 200)},
             scoring='roc_auc')

In [44]:
gsearch2.best_params_, gsearch2.best_score_

({'learning_rate': 0.12, 'max_depth': 7, 'min_samples_split': 300},
 0.8869924854438354)

In [45]:
gmb = GradientBoostingClassifier(n_estimators=60, learning_rate=0.12, max_depth=7, 
                                 min_samples_split=300, subsample=0.8,random_state=10)
gmb.fit(train_X, train_y)
gmb_pred = gmb.predict(test_X)

In [49]:
gmb_output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Transported': gmb_pred})
gmb_output['Transported'] = gmb_output['Transported'].astype(bool)
gmb_output.to_csv('output/gmb_submission.csv', index=False)

<font color=black size=5 face=雅黑>**LightGBM**</font>

In [53]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [54]:
from sklearn.model_selection import train_test_split

lgb_train_X, lgb_valid_X, lgb_train_y, lgb_valid_y = \
          train_test_split(train_X, train_y, test_size=0.2, stratify=train_y)

In [55]:
lgb_train = lgb.Dataset(lgb_train_X, lgb_train_y)
lgb_val = lgb.Dataset(lgb_valid_X, lgb_valid_y)


In [62]:
lgb_params = {
    'objective': 'binary',
    'seed': 42,
    'metric': 'auc',
    'learning_rate': 0.1,
    'num_leaves': 40,
    'verbose':-1,
    'deterministic': True
    
}

In [63]:
lgbm = lgb.train(lgb_params, lgb_train, valid_sets=lgb_val, 
                 num_boost_round=1000, early_stopping_rounds=100, verbose_eval=50)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.869956
[100]	valid_0's auc: 0.867482
Early stopping, best iteration is:
[48]	valid_0's auc: 0.870153


In [73]:
lgbm_pred = lgbm.predict(test_X)
lgbm_pred2 = np.where(lgbm_pred>=0.5, 1, 0)

In [75]:
lgbm_output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Transported': lgbm_pred2})
lgbm_output['Transported'] = lgbm_output['Transported'].astype(bool)
lgbm_output.to_csv('output/lgbm_submission.csv', index=False)

In [69]:
# first tune learning_rate and n_estimators together

estimator = LGBMClassifier(objective='binary', seed=42, metric ='auc', num_leaves=40)
param_gird1 = {
    'learning_rate': [0.04, 0.06, 0.08, 1.0, 1.2, 1.4],
    'n_estimators': [20, 40, 60, 80]
}
gsearch_lgbm1 = GridSearchCV(estimator=estimator, param_grid=param_gird1)
gsearch_lgbm1.fit(train_X, train_y)

GridSearchCV(estimator=LGBMClassifier(metric='auc', num_leaves=40,
                                      objective='binary', seed=42),
             param_grid={'learning_rate': [0.04, 0.06, 0.08, 1.0, 1.2, 1.4],
                         'n_estimators': [20, 40, 60, 80]})

In [70]:
gsearch_lgbm1.best_params_, gsearch_lgbm1.best_score_

({'learning_rate': 0.06, 'n_estimators': 60}, 0.8006464437652155)

In [None]:
estimator = LGBMClassifier(objective='binary', seed=42, metric ='auc', learning_rate=0.06, n_estimators=60)
param_gird2 = {
    'num_leaves': range(30, 80, 10),
    'feature_fraction': [0.7, 0.8, 0.9],
    'bagging_fraction': [0.7, 0.8, 0.9],
    'bagging_freq': [3, 4, 5, 6]
}
gsearch_lgbm2 = GridSearchCV(estimator=estimator, param_grid=param_gird2)
gsearch_lgbm2.fit(train_X, train_y)

In [72]:
gsearch_lgbm2.best_params_, gsearch_lgbm2.best_score_

({'bagging_fraction': 0.7,
  'bagging_freq': 5,
  'feature_fraction': 0.7,
  'num_leaves': 30},
 0.8038672146671069)

In [76]:
lgbm2 = LGBMClassifier(objective='binary', seed=42, metric ='auc', learning_rate=0.06, 
                       n_estimators=60, num_leaves=30, bagging_fraction=0.7, bagging_freq=5,
                       feature_fraction=0.7)
lgbm2.fit(train_X, train_y)



LGBMClassifier(bagging_fraction=0.7, bagging_freq=5, feature_fraction=0.7,
               learning_rate=0.06, metric='auc', n_estimators=60, num_leaves=30,
               objective='binary', seed=42)

In [77]:
lgbm2_pred = lgbm2.predict(test_X)
lgbm2_pred2 = np.where(lgbm2_pred>=0.5, 1, 0)

In [78]:
lgbm2_output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Transported': lgbm2_pred2})
lgbm2_output['Transported'] = lgbm2_output['Transported'].astype(bool)
lgbm2_output.to_csv('output/lgbm2_submission.csv', index=False)