# Binary Classification with a Bank Churn Dataset4
(アンサンブル学習)
## 1. ライブラリ

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

## 2. データ整理
### 2.1. データ取入れ

In [2]:
# Binary Classification with a Bank Churn Dataset のデータセット
dir_path = '/kaggle/input/playground-series-s4e1/'
test = pd.read_csv(dir_path + 'test.csv')       # テストデータ
train = pd.read_csv(dir_path + 'train.csv')     # 訓練データ
samp_sub = pd.read_csv(dir_path + 'sample_submission.csv')

### 2.2. 型変換

In [3]:
# 国データを整数データに変換するカテゴリ
Geography_cat = {'France'  : 0,
                 'Spain'   : 1,
                 'Germany' : 2}
# 性別データを整数データに変換するカテゴリ
Gender_cat = {'Female' : 0,
              'Male'   : 1}

# 全データの型変換
for data in [train, test]:
    data.Geography = data.Geography.map(Geography_cat)
    data.Gender = data.Gender.map(Gender_cat)

### 2.3. スケーリング

In [4]:
for data in [train, test]:
    data.CreditScore = (data.CreditScore-data.CreditScore.min())/(data.CreditScore.max()-data.CreditScore.min())
    data.Geography = data.Geography/2
    data.Age = data.Age/100
    data.Tenure = data.Tenure/10
    data.Balance = (data.Balance-data.Balance.min())/(data.Balance.max()-data.Balance.min())
    data.NumOfProducts = data.NumOfProducts/4
    data.EstimatedSalary = (data.EstimatedSalary-data.EstimatedSalary.min())/(data.EstimatedSalary.max()-data.EstimatedSalary.min())

## 3. testデータ予測関数

In [5]:
basic_features = ['CreditScore', 'Geography', 'Gender', 'Age',
                  'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
                  'IsActiveMember', 'EstimatedSalary']
def predictor(model, features=basic_features, train=train, test=test):
    model.fit(train[features], train.Exited)       #モデルの学習
    prediction = model.predict(test[features])     # テストデータの予測
    output = pd.DataFrame({'id':test.id, 'Exited':prediction})   # 提出データの整理
    output.to_csv('submission.csv', index=False)   # 出力ファイルの生成

## 4. 各モデルの最適化
（アンサンブル学習の準備）
### 4.1. RandomForestClassifier

In [6]:
# rfc1 = RandomForestClassifier(criterion='gini', n_jobs=-1)
# predictor(rfc1)
# private score:0.74357

# rfc2 = RandomForestClassifier(criterion='entropy', n_jobs=-1)
# predictor(rfc2)
# private score:0.74269

# rfc3 = RandomForestClassifier(criterion='log_loss', n_jobs=-1)
# predictor(rfc3)
# private score:0.74420

# rfc4 = RandomForestClassifier(criterion='log_loss', max_depth=3, n_jobs=-1)
# predictor(rfc4)
# private score:0.62036

# rfc5 = RandomForestClassifier(criterion='log_loss', max_depth=5, n_jobs=-1)
# predictor(rfc5)
# private score:0.69232

# rfc6 = RandomForestClassifier(criterion='log_loss', max_depth=7, n_jobs=-1)
# predictor(rfc6)
# private score:0.72262

# rfc7 = RandomForestClassifier(criterion='log_loss', max_depth=9, n_jobs=-1)
# predictor(rfc7)
# private score:0.73689

# rfc8 = RandomForestClassifier(criterion='log_loss', max_depth=11, n_jobs=-1)
# predictor(rfc8)
# private score:0.74317

# rfc9 = RandomForestClassifier(criterion='log_loss', max_depth=13, n_jobs=-1)
# predictor(rfc9)
# private score:0.74734

# rfc10 = RandomForestClassifier(criterion='log_loss', max_depth=15, n_jobs=-1)
# predictor(rfc10)
# private score:0.74767

# rfc11 = RandomForestClassifier(criterion='log_loss', max_depth=17, n_jobs=-1)
# predictor(rfc11)
# private score:0.74794

# rfc12 = RandomForestClassifier(criterion='log_loss', max_depth=19, n_jobs=-1)
# predictor(rfc12)
# private score:0.74688

# rfc13 = RandomForestClassifier(criterion='log_loss', max_depth=21, n_jobs=-1)
# predictor(rfc13)
# private score:0.74588

# rfc14 = RandomForestClassifier(criterion='log_loss', max_depth=30, n_jobs=-1)
# predictor(rfc14)
# private score:0.74387

# rfc15 = RandomForestClassifier(criterion='log_loss', max_depth=40, n_jobs=-1)
# predictor(rfc15)
# private score:0.74412

# rfc16 = RandomForestClassifier(criterion='log_loss', max_depth=50, n_jobs=-1)
# predictor(rfc16)
# private score:0.74277

# rfc17 = RandomForestClassifier(criterion='log_loss', max_depth=16, n_jobs=-1)
# predictor(rfc17)
# private score:0.74689

# rfc18 = RandomForestClassifier(criterion='log_loss', max_depth=18, n_jobs=-1)
# predictor(rfc18)
# private score:0.74724

# rfc19 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=2, n_jobs=-1)
# predictor(rfc19)
# private score:0.74350

# rfc20 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=3, n_jobs=-1)
# predictor(rfc20)
# private score:0.74831

# rfc21 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=4, n_jobs=-1)
# predictor(rfc21)
# private score:0.74878

# rfc22 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_jobs=-1)
# predictor(rfc22)
# private score:0.74940

# rfc23 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=6, n_jobs=-1)
# predictor(rfc23)
# private score:0.74834

# rfc24 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=7, n_jobs=-1)
# predictor(rfc24)
# private score:0.74802

# rfc25 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=8, n_jobs=-1)
# predictor(rfc25)
# private score:0.74786

# rfc26 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=100, n_jobs=-1)
# predictor(rfc26)
# private score:0.74839

# rfc27 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=200, n_jobs=-1)
# predictor(rfc27)
# private score:0.74893

# rfc28 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=400, n_jobs=-1)
# predictor(rfc28)
# private score:0.74854

# rfc29 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=700, n_jobs=-1)
# predictor(rfc29)
# private score:0.74887

# rfc30 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=1000, n_jobs=-1)
# predictor(rfc30)
# private score:0.74840

# rfc31 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=1500, n_jobs=-1)
# predictor(rfc31)
# private score:0.74899

# rfc32 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=2000, n_jobs=-1)
# predictor(rfc32)
# private score:0.74925

# rfc33 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=4000, n_jobs=-1)
# predictor(rfc33)
# private score:0.74875

# rfc34 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=7000, n_jobs=-1)
# predictor(rfc34)
# private score:0.74897

# rfc35 = RandomForestClassifier(criterion='log_loss', max_depth=17, max_features=5, n_estimators=10000, n_jobs=-1)
# predictor(rfc35)
# private score:0.74838

# 最適パラメータ
rfc_par = {'criterion':'log_loss',
           'max_depth':17,
           'max_features':5,
           'n_estimators':2000,
           'n_jobs':-1}

### 4.2. KNeighborsClassifier

In [7]:
# knc40 = KNeighborsClassifier(weights='distance', p=1, n_jobs=-1)
# predictor(knc40)
# private score:0.71134

# knc41 = KNeighborsClassifier(weights='distance', p=2, n_jobs=-1)
# predictor(knc41)
# private score:0.71606

# knc42 = KNeighborsClassifier(weights='uniform', p=1, n_jobs=-1)
# predictor(knc42)
# private score:0.71145

# knc43 = KNeighborsClassifier(weights='uniform', p=2, n_jobs=-1)
# predictor(knc43)
# private score:0.71616

# knc44 = KNeighborsClassifier(n_neighbors=10, weights='uniform', p=2, n_jobs=-1)
# predictor(knc44)
# private score:0.69526

# knc45 = KNeighborsClassifier(n_neighbors=20, weights='uniform', p=2, n_jobs=-1)
# predictor(knc45)
# private score:0.69426

# knc46 = KNeighborsClassifier(n_neighbors=30, weights='uniform', p=2, n_jobs=-1)
# predictor(knc46)
# private score:0.69132

# knc47 = KNeighborsClassifier(n_neighbors=40, weights='uniform', p=2, n_jobs=-1)
# predictor(knc47)
# private score:0.68839

# 最適パラメータ
knc_par = {'n_neighbors':5,
           'weights':'uniform',
           'p':2,
           'n_jobs':-1}

### 4.3. LogisticRegression

In [8]:
# lr48 = LogisticRegression(tol=0.00001, solver='sag', n_jobs=-1)
# predictor(lr48)
# private score:0.66611

# lr49 = LogisticRegression(tol=0.00003, solver='sag', n_jobs=-1)
# predictor(lr49)
# private score:0.66613

# lr50 = LogisticRegression(tol=0.0001, solver='sag', n_jobs=-1)
# predictor(lr50)
# private score:0.66611

# lr51 = LogisticRegression(tol=0.0003, solver='sag', n_jobs=-1)
# predictor(lr51)
# private score:0.66612

# lr52 = LogisticRegression(tol=0.001, solver='sag', n_jobs=-1)
# predictor(lr52)
# private score:0.66726

# lr53 = LogisticRegression(tol=0.003, solver='sag', n_jobs=-1)
# predictor(lr53)
# private score:0.66494

# lr54 = LogisticRegression(tol=0.01, solver='sag', n_jobs=-1)
# predictor(lr54)
# private score:0.66440

# lr55 = LogisticRegression(tol=0.03, solver='sag', n_jobs=-1)
# predictor(lr55)
# private score:0.66921

# lr56 = LogisticRegression(tol=0.1, solver='sag', n_jobs=-1)
# predictor(lr56)
# private score:0.69621

# lr57 = LogisticRegression(tol=0.3, solver='sag', n_jobs=-1)
# predictor(lr57)
# private score:0.63410

# 最適パラメータ
lr_par = {'tol':0.1,
          'solver':'sag',
          'n_jobs':-1}

#### 4.4. MLPClassifier

In [9]:
# mlp58 = MLPClassifier(activation='identity', solver='lbfgs')
# predictor(mlp58)
# private score:0.66663

# mlp59 = MLPClassifier(activation='logistic', solver='lbfgs')
# predictor(mlp59)
# private score:0.72274

# mlp60 = MLPClassifier(activation='tanh', solver='lbfgs')
# predictor(mlp60)
# private score:0.73592

# mlp61 = MLPClassifier(activation='relu', solver='lbfgs')
# predictor(mlp61)
# private score:0.74314

# mlp66 = MLPClassifier(activation='identity', solver='sgd')
# predictor(mlp66)
# private score:0.66851

# mlp67 = MLPClassifier(activation='logistic', solver='sgd')
# predictor(mlp67)
# private score:0.66535

# mlp68 = MLPClassifier(activation='tanh', solver='sgd')
# predictor(mlp68)
# private score:0.66503

# mlp69 = MLPClassifier(activation='relu', solver='sgd')
# predictor(mlp69)
# private score:0.73543

# mlp70 = MLPClassifier(activation='identity', solver='adam')
# predictor(mlp70)
# private score:0.67357

# mlp71 = MLPClassifier(activation='logistic', solver='adam')
# predictor(mlp71)
# private score:0.73711

# mlp72 = MLPClassifier(activation='tanh', solver='adam')
# predictor(mlp72)
# private score:0.74035

# mlp73 = MLPClassifier(activation='relu', solver='adam')
# predictor(mlp73)
# private score:0.74780

# mlp74 = MLPClassifier(hidden_layer_sizes=(50), activation='relu', solver='adam')
# predictor(mlp74)
# private score:0.74696

# mlp75 = MLPClassifier(hidden_layer_sizes=(100), activation='relu', solver='adam')
# predictor(mlp75)
# private score:0.73301

# mlp76 = MLPClassifier(hidden_layer_sizes=(200), activation='relu', solver='adam')
# predictor(mlp76)
# private score:0.74563

# mlp77 = MLPClassifier(hidden_layer_sizes=(300), activation='relu', solver='adam')
# predictor(mlp77)
# private score:0.71652

# mlp78 = MLPClassifier(hidden_layer_sizes=(400), activation='relu', solver='adam')
# predictor(mlp78)
# private score:0.74475

# mlp79 = MLPClassifier(hidden_layer_sizes=(50, 100), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp79)
# private score:0.76331

# mlp80 = MLPClassifier(hidden_layer_sizes=(100, 50), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp80)
# private score:0.70116

# mlp81 = MLPClassifier(hidden_layer_sizes=(100, 100), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp81)
# private score:0.72391

# mlp82 = MLPClassifier(hidden_layer_sizes=(200, 200), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp82)
# private score:0.69070

# mlp83 = MLPClassifier(hidden_layer_sizes=(300, 300), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp83)
# private score:0.76499

# mlp84 = MLPClassifier(hidden_layer_sizes=(400, 400), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp84)
# private score:0.79350

# mlp85 = MLPClassifier(hidden_layer_sizes=(100, 100, 100), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp85)
# private score:0.73743

# mlp86 = MLPClassifier(hidden_layer_sizes=(150, 150, 150), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp86)
# private score:0.77377

# mlp87 = MLPClassifier(hidden_layer_sizes=(200, 200, 200), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp87)
# private score:0.73910

# mlp88 = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, activation='relu', solver='adam')
# predictor(mlp88)
# private score:0.76808

# mlp89 = MLPClassifier(hidden_layer_sizes=(100, 100), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp89)
# private score:0.79043

# mlp90 = MLPClassifier(hidden_layer_sizes=(150, 150), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp90)
# private score:0.75756

# mlp91 = MLPClassifier(hidden_layer_sizes=(200, 200), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp91)
# private score:0.77556

# mlp92 = MLPClassifier(hidden_layer_sizes=(300, 300), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp92)
# private score:0.79298

# mlp93 = MLPClassifier(hidden_layer_sizes=(400, 400), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp93)
# private score:0.50000

# mlp94 = MLPClassifier(hidden_layer_sizes=(100, 100, 100), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp94)
# private score:0.50000

# mlp95 = MLPClassifier(hidden_layer_sizes=(200, 200, 200), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp95)
# private score:0.50000

# mlp96 = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp96)
# private score:0.50000

# mlp97 = MLPClassifier(hidden_layer_sizes=(400, 400, 400), learning_rate='adaptive', learning_rate_init=0.1, activation='logistic', solver='adam')
# predictor(mlp97)
# private score:0.50000

# 最適パラメータ
mlp_par = {'hidden_layer_sizes':(400, 400),
           'learning_rate':'adaptive',
           'learning_rate_init':0.1,
           'activation':'relu',
           'solver':'adam'}

### 4.5. CatBoostClassifier

In [10]:
# cbc98 = CatBoostClassifier(bootstrap_type='Bayesian', grow_policy='SymmetricTree')
# predictor(cbc98)
# private score:0.75177

# cbc99 = CatBoostClassifier(bootstrap_type='Bayesian', grow_policy='Depthwise')
# predictor(cbc99)
# private score:0.75409

# cbc100 = CatBoostClassifier(bootstrap_type='Bayesian', grow_policy='Lossguide')
# predictor(cbc100)
# private score:0.75484

# cbc101 = CatBoostClassifier(bootstrap_type='Bernoulli', grow_policy='SymmetricTree')
# predictor(cbc101)
# private score:0.75113

# cbc102 = CatBoostClassifier(bootstrap_type='Bernoulli', grow_policy='Depthwise')
# predictor(cbc102)
# private score:0.75382

# cbc103 = CatBoostClassifier(bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc103)
# private score:0.75564

# cbc104 = CatBoostClassifier(bootstrap_type='MVS', grow_policy='SymmetricTree')
# predictor(cbc104)
# private score:0.75123

# cbc105 = CatBoostClassifier(bootstrap_type='MVS', grow_policy='Depthwise')
# predictor(cbc105)
# private score:0.75338

# cbc106 = CatBoostClassifier(bootstrap_type='MVS', grow_policy='Lossguide')
# predictor(cbc106)
# private score:0.75555

# cbc112 = CatBoostClassifier(iterations=100, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc112)
# private score:0.75593

# cbc113 = CatBoostClassifier(iterations=300, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc113)
# private score:0.75380

# cbc114 = CatBoostClassifier(iterations=500, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc114)
# private score:0.75408

# cbc115 = CatBoostClassifier(iterations=700, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc115)
# private score:0.75439

# cbc116 = CatBoostClassifier(iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc116)
# private score:0.75565

# cbc117 = CatBoostClassifier(iterations=1000, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc117)
# private score:0.75564

# cbc118 = CatBoostClassifier(iterations=1100, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc118)
# private score:0.75506

# cbc119 = CatBoostClassifier(iterations=1200, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc119)
# private score:0.75497

# cbc120 = CatBoostClassifier(learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc120)
# private score:0.75591

# cbc121 = CatBoostClassifier(learning_rate=0.07, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc121)
# private score:0.75500

# cbc122 = CatBoostClassifier(learning_rate=0.09, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc122)
# private score:0.75482

# cbc123 = CatBoostClassifier(learning_rate=0.1, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc123)
# private score:0.75435

# cbc124 = CatBoostClassifier(learning_rate=0.11, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc124)
# private score:0.75513

# cbc125 = CatBoostClassifier(learning_rate=0.13, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc125)
# private score:0.75373

# cbc126 = CatBoostClassifier(learning_rate=0.15, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc126)
# private score:0.75400

# cbc127 = CatBoostClassifier(learning_rate=0.03, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc127)
# private score:0.75507

# cbc128 = CatBoostClassifier(learning_rate=0.02, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc128)
# private score:0.75457

# cbc129 = CatBoostClassifier(learning_rate=0.01, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc129)
# private score:0.75358

# cbc130 = CatBoostClassifier(learning_rate=0.005, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc130)
# private score:0.74833

# cbc131 = CatBoostClassifier(depth=3, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc131)
# private score:0.75096

# cbc132 = CatBoostClassifier(depth=4, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc132)
# private score:0.75275

# cbc133 = CatBoostClassifier(depth=5, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc133)
# private score:0.75425

# cbc134 = CatBoostClassifier(depth=6, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc134)
# private score:0.75591

# cbc135 = CatBoostClassifier(depth=7, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc135)
# private score:0.75670

# cbc136 = CatBoostClassifier(depth=8, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc136)
# private score:0.75568

# cbc137 = CatBoostClassifier(depth=9, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc137)
# private score:0.75700

# cbc138 = CatBoostClassifier(depth=10, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc138)
# private score:0.75665

# cbc139 = CatBoostClassifier(depth=11, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc139)
# private score:0.75593

# cbc140 = CatBoostClassifier(depth=12, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc140)
# private score:0.75749

# cbc141 = CatBoostClassifier(depth=13, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc141)
# private score:0.75681

# cbc142 = CatBoostClassifier(depth=14, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc142)
# private score:0.75679

# cbc143 = CatBoostClassifier(model_shrink_mode='Decreasing', model_shrink_rate=0.01, depth=14, learning_rate=0.05, iterations=900, bootstrap_type='Bernoulli', grow_policy='Lossguide')
# predictor(cbc143)
# private score:0.76066

# 最適パラメータ
cbc_par = {'model_shrink_mode':'Decreasing',
           'model_shrink_rate':0.01,
           'depth':14,
           'learning_rate':0.05,
           'iterations':900,
           'bootstrap_type':'Bernoulli',
           'grow_policy':'Lossguide'}

## 5. アンサンブル学習
### 5.1. アンサンブル学習の関数
#### 5.1.1. バギング関数

In [11]:
def bagging(models, weights, features=basic_features, train=train, test=test, train_ratio=0.7, use_proba=False):
    """
    models: 予測に使うモデルのリスト
    weights: 各モデルの重み
    features: 特徴量の指定(trainの特徴量)
    train: 訓練データ
    test: テストデータ
    train_ratio: 各モデルの学習に使うデータ数がtrainデータに占める割合
    use_proba: 予測確率の使用
    """
    if len(models) != len(weights): # モデル数と重みの数が合わない場合にエラーメッセージを出す
        print("error:weights is not correspond to models\n")
        return 0
    else:
        sum_weight = sum(weights) # 全重みの計算
        prediction = np.zeros(len(test))
        for e, model in enumerate(models):
            # 訓練データの生成
            data = train.sample(int(train_ratio*len(train)))
            # モデルの学習
            model.fit(data[features], data.Exited)
            # モデルの予測結果（予測確率）
            if use_proba:
                prediction = prediction + model.predict_proba(test[features])[:,1]*(weights[e]/sum_weight)
            else:
                prediction = prediction + model.predict(test[features])*(weights[e]/sum_weight)
        # 予測結果の統合
        prediction = (prediction >= 0.5) * 1
        # 出力ファイルの生成
        output = pd.DataFrame({'id':test.id, 'Exited':prediction})
        output.to_csv('submission.csv', index=False)

#### 5.1.2. スタッキング関数

In [12]:
def stacking(models, final_model, features=basic_features, train=train, test=test, train_ratio=0.7, use_proba=False):
    """
    models: 予測モデルのリスト
    final_model: 統合モデル
    features: 特徴量の指定(trainの特徴量)
    train: 訓練データ
    test: テストデータ
    train_ratio: 各モデルの学習に使うデータ数がtrainデータに占める割合
    use_proba: 予測確率の使用
    """
    val_predictions = pd.DataFrame()   # 統合モデルの訓練に使うデータ(予測モデルの予測結果(予測確率))
    tes_predictions = pd.DataFrame()   # 予測に使うデータ(予測モデルの予測結果(予測確率))
    tra, val = train_test_split(train, test_size=0.3)   # 訓練データと検証データ(予測モデルと統合モデルそれぞれの訓練データ)の生成
    for e, model in enumerate(models):
        # 予測モデルの学習
        model.fit(tra[features], tra.Exited)
        # 予測モデルの予測結果(予測確率)
        if use_proba:
            val_predictions[str(e)] = pd.DataFrame(model.predict_proba(val[features])[:,1])
            tes_predictions[str(e)] = pd.DataFrame(model.predict_proba(test[features])[:,1])
        else:
            val_predictions[str(e)] = pd.DataFrame(model.predict(val[features]))
            tes_predictions[str(e)] = pd.DataFrame(model.predict(test[features]))
    # 統合モデルの学習
    final_model.fit(val_predictions, val.Exited)
    # 統合モデルの予測結果
    prediction = final_model.predict(tes_predictions)
    # 出力ファイルの生成
    output = pd.DataFrame({'id':test.id, 'Exited':prediction})
    output.to_csv('submission.csv', index=False)

### 5.2. アンサンブル学習の実行
#### 5.2.0. 最適化されたモデル

In [13]:
rfc = RandomForestClassifier(criterion='log_loss',max_depth=17,max_features=5,n_estimators=2000,n_jobs=-1)
knc = KNeighborsClassifier(n_neighbors=5,weights='uniform',p=2,n_jobs=-1)
lr = LogisticRegression(tol=0.1,solver='sag',n_jobs=-1)
mlp = MLPClassifier(hidden_layer_sizes=(400, 400),learning_rate='adaptive',learning_rate_init=0.1,activation='relu',solver='adam')
cbc = CatBoostClassifier(model_shrink_mode='Decreasing',model_shrink_rate=0.01,depth=14,learning_rate=0.05,iterations=900,bootstrap_type='Bernoulli',grow_policy='Lossguide')

#### 5.2.1. バギング

In [14]:
# 149
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,1,1]
# bagging(models=models, weights=weights)
# private score:0.75087

# 152
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,1,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.74533

# 154
# models = [rfc, knc, lr, mlp, cbc]
# weights = [2,1,1,1,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.74256

# 155
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,2,1,1,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.73030

# 156
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,2,1,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.72659

# 157
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,2,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.75761

# 158
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,1,2]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.74158

# 159
# models = [rfc, knc, lr, mlp, cbc]
# weights = [2,1,1,3,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.73485

# 160
# models = [rfc, knc, lr, mlp, cbc]
# weights = [2,1,1,1,1]
# bagging(models=models, weights=weights)
# private score:0.75007

# 161
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,2,1,1,1]
# bagging(models=models, weights=weights)
# private score:0.74392

# 162
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,2,1,1]
# bagging(models=models, weights=weights)
# private score:0.74194

# 163
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,2,1]
# bagging(models=models, weights=weights)
# private score:0.74829

# 164
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,1,2]
# bagging(models=models, weights=weights)
# private score:0.75720

# 165
# models = [rfc, knc, lr, mlp, cbc]
# weights = [3,1,1,1,1]
# bagging(models=models, weights=weights)
# private score:0.75075

# 166
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,3,1,1,1]
# bagging(models=models, weights=weights)
# private score:0.72171

# 167
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,3,1,1]
# bagging(models=models, weights=weights)
# private score:0.70617

# 168
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,3,1]
# bagging(models=models, weights=weights)
# private score:0.73809

# 169
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,1,3]
# bagging(models=models, weights=weights)
# private score:0.75053

# 170
# models = [rfc, knc, lr, mlp, cbc]
# weights = [3,1,1,1,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.74893

# 173
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,3,1,1,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.72921

# 174
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,3,1,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.71056

# 175
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,3,1]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.69119

# 176
# models = [rfc, knc, lr, mlp, cbc]
# weights = [1,1,1,1,3]
# bagging(models=models, weights=weights, use_proba=True)
# private score:0.73896

#### 5.2.2. スタッキング

In [15]:
# 151
# models = [rfc, knc, lr, mlp, cbc]
# final_model = LogisticRegression(tol=0.1,solver='sag',n_jobs=-1)
# stacking(models=models, final_model=final_model)
# private score:0.75012

# 153
# models = [rfc, knc, lr, mlp, cbc]
# final_model = LogisticRegression(tol=0.1, solver='sag', n_jobs=-1)
# stacking(models=models, final_model=final_model, use_proba=True)
# private score:0.74708

# 177
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model)
# private score:0.75442

# 178
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, use_proba=True)
# private score:0.74537

# 179
# models = [rfc, knc, lr, mlp, cbc]
# final_model = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
# stacking(models=models, final_model=final_model)
# private score:0.75243

# 180
# models = [rfc, knc, lr, mlp, cbc]
# final_model = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
# stacking(models=models, final_model=final_model, use_proba=True)
# private score:0.74466

# 181
# models = [rfc, knc, lr, mlp, cbc]
# final_model = KNeighborsClassifier(n_neighbors=20, n_jobs=-1)
# stacking(models=models, final_model=final_model)
# private score:0.74572

# 182
# models = [rfc, knc, lr, mlp, cbc]
# final_model = KNeighborsClassifier(n_neighbors=20, n_jobs=-1)
# stacking(models=models, final_model=final_model, use_proba=True)
# private score:0.74126

# 185
# models = [rfc, knc, lr, mlp, cbc]
# final_model = CatBoostClassifier(model_shrink_mode='Decreasing', model_shrink_rate=0.01, learning_rate=0.05, iterations=900)
# stacking(models=models, final_model=final_model)
# private score:0.76014

# 186
# models = [rfc, knc, lr, mlp, cbc]
# final_model = CatBoostClassifier(model_shrink_mode='Decreasing', model_shrink_rate=0.01, learning_rate=0.05, iterations=900)
# stacking(models=models, final_model=final_model, use_proba=True)
# private score:0.75106

# 187
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.5)
# private score:0.75692

# 188
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.6)
# private score:0.76661

# 189
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.7)
# private score:0.74842

# 190
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.8)
# private score:0.76150

# 191
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.9)
# private score:0.72628

# 193
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.5, use_proba=True)
# private score:0.74024

# 194
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.6, use_proba=True)
# private score:0.74166

# 195
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.7, use_proba=True)
# private score:0.74101

# 196
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.8, use_proba=True)
# private score:0.76083

# 197
# models = [rfc, knc, lr, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.9, use_proba=True)
# private score:0.76313

# 198
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.5, use_proba=True)
# private score:0.76572

# 199
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.6, use_proba=True)
# private score:0.77386

# 200
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.7, use_proba=True)
# private score:0.68458

# 201
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.8, use_proba=True)
# private score:0.77575

# 202
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.9, use_proba=True)
# private score:0.75827

# 203
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.5)
# private score:0.74186

# 204
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.6)
# private score:0.75978

# 205
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.7)
# private score:0.75451

# 206
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.8)
# private score:0.74787

# 207
# models = [rfc, mlp, cbc]
# final_model = MLPClassifier(hidden_layer_sizes=(300, 300, 300), learning_rate='adaptive', learning_rate_init=0.1, max_iter=1000)
# stacking(models=models, final_model=final_model, train_ratio=0.9)
# private score:0.76459