In [54]:
import numpy as np
import pandas as pd
import random
import os
import warnings
# Silence every warning category so the cell outputs stay compact.
warnings.filterwarnings('ignore')

# Read the training split and the test split from the local input directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('input/train.csv', 'input/test.csv'))

In [55]:
def fix_seed(seed):
    """Seed the stdlib and NumPy global random generators and export PYTHONHASHSEED.

    Args:
        seed: Integer seed handed to ``random.seed`` and ``np.random.seed``;
            its string form is stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by the whole notebook.
SEED = 42
fix_seed(SEED)

In [59]:
# Share of rows with Transported == True; a value near 0.5 means there is no class-imbalance problem.
int((train_df['Transported'] == True).sum()) / len(train_df)

0.5036236051995858

In [60]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [61]:
# Impute missing HomePlanet / CryoSleep entries with fixed default values.
for col_name, default_value in (('HomePlanet', 'Earth'), ('CryoSleep', True)):
    full_df[col_name] = full_df[col_name].fillna(default_value)

In [62]:
# Split "a/b/c"-style Cabin strings: the first part goes to Cabin_beck, the third to Cabin_side.
# Rows without a Cabin value are left missing in both new columns.
has_cabin = full_df['Cabin'].notna()
cabin_parts = full_df.loc[has_cabin, 'Cabin'].astype(str).map(lambda cabin: cabin.split('/'))
full_df.loc[has_cabin, 'Cabin_beck'] = cabin_parts.map(lambda parts: parts[0])
full_df.loc[has_cabin, 'Cabin_side'] = cabin_parts.map(lambda parts: parts[2])

In [63]:
# Fixed fallback values for rows that had no Cabin
# (presumably the most frequent categories -- TODO confirm).
for cabin_col, fallback in {'Cabin_beck': 'F', 'Cabin_side': 'P'}.items():
    full_df[cabin_col] = full_df[cabin_col].fillna(fallback)

In [64]:
# Missing destinations default to 'TRAPPIST-1e'.
full_df.loc[full_df['Destination'].isna(), 'Destination'] = 'TRAPPIST-1e'

In [65]:
# Impute missing ages with the mean age over the combined train+test frame.
# Assign the result back instead of using inplace=True on a column selection:
# that is chained assignment and does not update full_df under pandas Copy-on-Write.
full_df['Age'] = full_df['Age'].fillna(full_df['Age'].mean())

In [66]:
# Passengers with an unknown VIP flag are treated as non-VIP.
# Assigned back explicitly; inplace=True on a column selection is chained
# assignment and does not update full_df under pandas Copy-on-Write.
full_df['VIP'] = full_df['VIP'].fillna(False)

In [67]:
# Impute each spending column with its own most frequent value.
# The result is assigned back explicitly: inplace=True on a column selection is
# chained assignment and does not update full_df under pandas Copy-on-Write.
for spend_col in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    full_df[spend_col] = full_df[spend_col].fillna(full_df[spend_col].mode()[0])

In [70]:
from sklearn.preprocessing import LabelEncoder

In [71]:
# Work on an independent copy so the imputed full_df is kept unencoded.
full_data = full_df.copy(deep=True)

In [72]:
# Categorical columns that are replaced by integer codes.
columns = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Cabin_beck", "Cabin_side"]
for column in columns:
    # A fresh encoder is fitted per column, on train and test rows together.
    labelencoder = LabelEncoder().fit(full_data[column])
    full_data[column] = labelencoder.transform(full_data[column])

In [74]:
# Model inputs and the label column.
FEATURES = ["HomePlanet", "CryoSleep", "Destination", "Age", "VIP", "RoomService",
            "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Cabin_beck", "Cabin_side"]
TARGET = ["Transported"]
FEATURES_TARGET = FEATURES + TARGET

# Rows with a label are the training set; rows without one are the test set.
is_labelled = full_data["Transported"].notna()

# Select rows and columns in one .loc call and copy, so the assignment below
# writes to an independent frame instead of a chained-indexing result
# (which raises SettingWithCopyWarning).
train = full_data.loc[is_labelled, FEATURES_TARGET].copy()
train["Transported"] = train["Transported"].astype(int)

X_test = full_data.loc[~is_labelled, FEATURES].copy()

In [78]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [None]:
folds = train.copy()
# shuffle=True randomizes the row order before the data is cut into five stratified parts.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# split() yields positional indices, so the fold ids are written by position.
# The previous label-based `.loc[val_index, ...]` was only correct while the
# index happened to be 0..n-1.
fold_ids = np.full(len(folds), -1, dtype=int)
for n, (_, val_index) in enumerate(fold.split(folds, folds["Transported"])):
    fold_ids[val_index] = n
folds["fold"] = fold_ids

# Sanity check: class counts per fold.
folds.groupby(['fold', 'Transported']).size()

1. K-fold cross-validation with LightGBM

In [None]:
import lightgbm as lgb
import os
# Export KMP_DUPLICATE_LIB_OK=TRUE for this process
# (presumably to tolerate a duplicate OpenMP runtime -- TODO confirm it is still needed).
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})

In [103]:
scores = []        # hold-out accuracy of each fold
scores_best = []
y_pred_pro = []    # test-set probabilities predicted by each fold's model

for fold_id in range(5):

    # Re-seed before every fold so each model starts from the same RNG state.
    fix_seed(SEED)

    # Fold `fold_id` is held out for validation; the other folds are used for training.
    is_val = folds.fold == fold_id
    p_train = folds[~is_val].reset_index(drop=True)
    p_val = folds[is_val].reset_index(drop=True)

    dtrain = lgb.Dataset(p_train[FEATURES], p_train[TARGET])
    dvalid = lgb.Dataset(p_val[FEATURES], p_val[TARGET])

    params = dict(objective='binary',
                  seed=42,
                  metric='auc',
                  learning_rate=0.05,
                  num_leaves=40,
                  verbose=-1,
                  deterministic=True)

    lgbm = lgb.train(params, dtrain, num_boost_round=1000, valid_sets=dvalid,
                     early_stopping_rounds=100, verbose_eval=50)

    # Hold-out accuracy with the probabilities thresholded at 0.5.
    val_proba = lgbm.predict(p_val[FEATURES])
    oof_pred = np.where(val_proba >= 0.5, 1, 0)
    score = accuracy_score(p_val[TARGET], oof_pred)
    scores.append(score)

    # Checked: passing num_iteration=lgbm.best_iteration to predict() is not needed
    # here; scoring the hold-out fold with and without it gave the same accuracy.

    y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)
    y_pred_pro.append(y_pred)


Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.891947
[100]	valid_0's auc: 0.894789
[150]	valid_0's auc: 0.893187
[200]	valid_0's auc: 0.892006
Early stopping, best iteration is:
[104]	valid_0's auc: 0.894801
Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.878752
[100]	valid_0's auc: 0.88506
[150]	valid_0's auc: 0.884762
[200]	valid_0's auc: 0.884697
Early stopping, best iteration is:
[138]	valid_0's auc: 0.885426
Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.88681
[100]	valid_0's auc: 0.890884
[150]	valid_0's auc: 0.889636
Early stopping, best iteration is:
[93]	valid_0's auc: 0.891048
Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc: 0.892081
[100]	valid_0's auc: 0.896471
[150]	valid_0's auc: 0.895156
Early stopping, best iteration is:
[90]	valid_0's auc: 0.896829
Training until validation scores don't improve for 100 rounds
[50]	valid_0's auc

In [120]:
# Average the per-fold test probabilities, then threshold the AVERAGE at 0.5.
# The earlier version thresholded `y_pred` (the last fold's prediction only),
# which silently discarded the ensemble computed on the line above.
y_pred_all = np.mean(y_pred_pro, axis=0)
y_pred_all = y_pred_all >= 0.5

output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Transported': y_pred_all})
output.to_csv('output/lightgbm_submission_ver02.csv', index=False)

2. Optuna: hyperparameter optimization for LightGBM

In [122]:
import optuna.integration.lightgbm as lgbo

In [None]:
# Tune on a single split: fold 0 is the validation set, the other folds are the training set.
is_val = folds.fold == 0
p_train = folds[~is_val].reset_index(drop=True)
p_val = folds[is_val].reset_index(drop=True)

dtrain = lgbo.Dataset(p_train[FEATURES], p_train[TARGET])
dvalid = lgbo.Dataset(p_val[FEATURES], p_val[TARGET])

# Fixed settings handed to the tuner.
params = dict([
    ('objective', 'binary'),
    ('seed', 42),
    ('metric', "auc"),
    ('verbose', -1),
    ('deterministic', True),
])

booster = lgbo.LightGBMTuner(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=dvalid,
    early_stopping_rounds=100,
    verbose_eval=50,
    optuna_seed=42,
)
booster.run()

# Keep the best booster found by the tuner.
lgbm_model = booster.get_best_booster()

In [126]:
# Display the parameter set stored on the booster returned by the tuner.
lgbm_model.params

{'objective': 'binary',
 'seed': 42,
 'metric': 'auc',
 'verbose': -1,
 'deterministic': True,
 'feature_pre_filter': False,
 'lambda_l1': 0.0,
 'lambda_l2': 0.0,
 'num_leaves': 31,
 'feature_fraction': 0.7,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'num_iterations': 1000,
 'early_stopping_round': 100}

In [128]:
# Accuracy of the tuned booster on the validation fold, thresholding probabilities at 0.5.
val_proba = lgbm_model.predict(p_val[FEATURES])
oof_pred = np.where(val_proba >= 0.5, 1, 0)
score = accuracy_score(p_val[TARGET], oof_pred)
score

0.8142610695802185

In [None]:
# Test-set probabilities from the tuned booster.
y_pred = lgbm_model.predict(data=X_test)

In [131]:
# Threshold the probabilities at 0.5 to get boolean labels, then write the submission file.
y_pred = y_pred >= 0.5
output = test_df[['PassengerId']].assign(Transported=y_pred)
output.to_csv('output/lightgbm_submission_ver02_optuna.csv', index=False)

3. K-fold cross-validation with Optuna

In [136]:
import optuna

In [143]:
def objective_fold(num):
    """Build an Optuna objective that tunes LightGBM on one cross-validation fold.

    Args:
        num: Value of ``folds.fold`` to hold out for validation; every other
            row of ``folds`` is used for training.

    Returns:
        A callable that takes an Optuna trial, trains LightGBM with the
        parameters sampled by that trial and returns the hold-out accuracy
        (probabilities thresholded at 0.5).

    Raises:
        ValueError: If no row of ``folds`` belongs to fold ``num``.
    """
    is_val = folds["fold"] == num
    if not is_val.any():
        raise ValueError(f"fold {num!r} does not exist in `folds`")

    # The split does not depend on the trial, so the frames are sliced only once.
    p_train = folds[~is_val].reset_index(drop=True)
    p_val = folds[is_val].reset_index(drop=True)

    def objective(trial):
        """Train with the trial's sampled parameters and score the held-out fold."""
        # The suggest_* calls keep their original names, ranges and order, so
        # `trial.params` has the same keys as before.
        lgbm_params = {
            'objective': 'binary',
            'seed': 42,
            # Early stopping needs a validation metric to monitor; with
            # 'None' there is nothing to evaluate on the validation set.
            'metric': 'auc',
            'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
            'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
            'learning_rate': trial.suggest_uniform('learning_rate', 0, 0.3),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 5, 500),
            'num_leaves': trial.suggest_int('num_leaves', 5, 1000),
            'num_iterations': trial.suggest_int('num_iterations', 5, 1000),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 5, 500),
            'max_depth': trial.suggest_int('max_depth', 5, 100),
            'verbosity': -1,
            'deterministic': True,
        }
        # Hand the sampled round count to lgb.train explicitly instead of
        # passing both a params entry and a conflicting num_boost_round.
        num_boost_round = lgbm_params.pop('num_iterations')

        # Fresh Dataset objects are built for every trial, as before.
        dtrain = lgb.Dataset(p_train[FEATURES], p_train[TARGET])
        dvalid = lgb.Dataset(p_val[FEATURES], p_val[TARGET])

        # Bug fix: train with the parameters sampled for THIS trial. The
        # earlier code passed the unrelated global `params`, so every trial
        # trained the same model and the search had no effect.
        model = lgb.train(lgbm_params, dtrain, num_boost_round=num_boost_round,
                          valid_sets=dvalid, early_stopping_rounds=100, verbose_eval=50)

        oof_pred = np.where(model.predict(p_val[FEATURES]) >= 0.5, 1, 0)
        return accuracy_score(p_val[TARGET], oof_pred)

    return objective

In [None]:
scores = []      # hold-out accuracy of each fold's final model
all_preds = []   # test-set probabilities predicted by each fold's final model

for fold in range(5):
    # Independent, seeded hyperparameter search for this fold.
    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective_fold(fold), n_trials=100)
    trial = study.best_trial

    lgbm_params = {
        'objective': 'binary',
        'seed': 42,
        # Early stopping needs a validation metric to monitor; with 'None'
        # there is nothing to evaluate on the validation set.
        'metric': 'auc',
        'verbose': -1,
        'deterministic': True,
    }
    # Overlay the best trial's sampled values on the fixed settings.
    lgbm_params.update(trial.params)
    # Pass the tuned round count through num_boost_round rather than as a
    # params entry that conflicts with it.
    num_boost_round = lgbm_params.pop('num_iterations', 1000)

    p_train = folds[folds["fold"] != fold].reset_index(drop=True)
    p_val = folds[folds["fold"] == fold].reset_index(drop=True)

    dtrain = lgb.Dataset(p_train[FEATURES], p_train[TARGET])
    dvalid = lgb.Dataset(p_val[FEATURES], p_val[TARGET])

    # Bug fix: train with the tuned `lgbm_params`. The earlier code passed the
    # unrelated global `params`, so the result of the search was thrown away.
    model = lgb.train(lgbm_params, dtrain, num_boost_round=num_boost_round,
                      valid_sets=dvalid, early_stopping_rounds=100, verbose_eval=50)

    oof_pred = model.predict(p_val[FEATURES])
    oof_pred = np.where(oof_pred >= 0.5, 1, 0)
    score = accuracy_score(p_val[TARGET], oof_pred)
    scores.append(score)

    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    all_preds.append(y_pred)

In [156]:
# Average the per-fold probabilities, threshold at 0.5 and write the submission file.
all_pred = np.mean(all_preds, axis=0)
y_pred = all_pred >= 0.5
output = test_df[['PassengerId']].assign(Transported=y_pred)
output.to_csv('output/lightgbm_submission_ver02_optuna_kfold.csv', index=False)