# Ensemble의 Ensemble

In [28]:
import os
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
# 현재경로 확인
os.getcwd()

'C:\\Users\\Hongjin\\Desktop\\김다윗\\개인\\01. 공부\\Git\\ML_code\\03_앙상블기법'

### Data Loading

In [3]:
data = pd.read_csv('../00_Data/otto_train.csv')
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
'''
id: 고유 아이디
feat_1 ~ feat_93: 설명변수
target: 타겟변수 (1~9)
'''

'\nid: 고유 아이디\nfeat_1 ~ feat_93: 설명변수\ntarget: 타겟변수 (1~9)\n'

In [5]:
nCar = data.shape[0] # 데이터 개수
nVar = data.shape[1] # 변수 개수
print('nCar: %d' % nCar, 'nVar: %d' % nVar )

nCar: 61878 nVar: 95


### 의미없는 변수 제거

In [6]:
data = data.drop(['id'], axis=1)

### 타겟변수 카테고리화

In [7]:
mapping_dict = {'Class_1':1,
                'Class_2':2,
                'Class_3':3,
                'Class_4':4,
                'Class_5':5,
                'Class_6':6,
                'Class_7':7,
                'Class_8':8,
                'Class_9':9,}

after_mapping_target = data['target'].apply(lambda x:mapping_dict[x])

In [8]:
after_mapping_target[:5]

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

### 학습 및 평가 데이터

In [9]:
feature_columns = list(data.columns.difference(['target']))
X = data[feature_columns]
y = after_mapping_target
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42) # 학습데이터와 평가데이터의 비율을 8:2 로 분할| 
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) # 데이터 개수 확인

(49502, 93) (12376, 93) (49502,) (12376,)


## XGBoost

In [12]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.1-py3-none-win_amd64.whl (89.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.1


In [13]:
%%time
import xgboost as xgb

xgb_dtrain = xgb.DMatrix(data=train_x, label=train_y) # 학습데이터를 XGBoost 모델에 맞게 변환
xgb_dtest = xgb.DMatrix(data=test_x) # 평가데이터도 변환

xgb_param = {'max_depth':10, # 트리 깊이
             'learning_rate':0.01, # step size
             'n_estimators':100, # 트리 생성 개수
             'objective':'multi:softmax', # 목적함수
             'num_class':len(set(train_y))+1} # num_class 보다 1 커야함

xgb_model = xgb.train(params=xgb_param, dtrain=xgb_dtrain) # 학습
xgb_model_predict = xgb_model.predict(xgb_dtest) # 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict)*100), '%')

Parameters: { "n_estimators" } are not used.

Accuracy: 76.67 %
Wall time: 6.24 s


## LightGBM

In [14]:
!pip install lightgbm



In [17]:
%%time
import lightgbm as lgb

lgb_dtrain = lgb.Dataset(data=train_x, label=train_y) # 학습데이터를 LightGBM 모델에 맞게 변환

lgb_param = {'max_depth':10, # 트리 깊이
             'learning_rate':0.01, # Step size
             'n_estimators':100, # 트리 생성 개수
             'objective':'multiclass', # 목적함수
             'num_class':len(set(train_y))+1} # num_class 보다 1 커야함

lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain) # 학습
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis=1) # 예측
print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict)*100), '%')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Wall time: 3.59 s


## Catboost

In [18]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.1.1-cp39-none-win_amd64.whl (74.0 MB)
Collecting plotly
  Downloading plotly-5.11.0-py2.py3-none-any.whl (15.3 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.1.0-py3-none-any.whl (23 kB)
Installing collected packages: tenacity, plotly, catboost
Successfully installed catboost-1.1.1 plotly-5.11.0 tenacity-8.1.0


In [20]:
%%time
import catboost as cb

cb_dtrain = cb.Pool(data=train_x, label=train_y) # 학습데이터를 Catboost 모델에 맞게 변환

cb_param = {'max_depth':10, # 트리 깊이
            'learning_rate':0.01, # Step size
            'n_estimators':100, # 트리 생성 개수
            'eval_metric':'Accuracy', # 평가 척도
            'loss_function':'MultiClass'} # 손실함수, 목적함수

cb_model = cb.train(pool=cb_dtrain, params=cb_param) # 학습
cb_model_predict = np.argmax(cb_model.predict(test_x), axis=1) + 1 # 예측, Softmax의 결과값 중 가장 큰 값의 Label로 예측, 인덱스의 순서를 맞추기 위해 +1
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict)*100), '%')

0:	learn: 0.5907034	total: 531ms	remaining: 52.5s
1:	learn: 0.6356107	total: 957ms	remaining: 46.9s
2:	learn: 0.6411256	total: 1.5s	remaining: 48.4s
3:	learn: 0.6480344	total: 1.95s	remaining: 46.9s
4:	learn: 0.6508222	total: 2.41s	remaining: 45.8s
5:	learn: 0.6499939	total: 2.87s	remaining: 45s
6:	learn: 0.6507818	total: 3.31s	remaining: 44s
7:	learn: 0.6548422	total: 3.74s	remaining: 43s
8:	learn: 0.6559533	total: 4.16s	remaining: 42.1s
9:	learn: 0.6560947	total: 4.57s	remaining: 41.1s
10:	learn: 0.6568421	total: 5s	remaining: 40.5s
11:	learn: 0.6588219	total: 5.47s	remaining: 40.1s
12:	learn: 0.6592259	total: 5.91s	remaining: 39.5s
13:	learn: 0.6611248	total: 6.32s	remaining: 38.8s
14:	learn: 0.6625591	total: 6.73s	remaining: 38.1s
15:	learn: 0.6631853	total: 7.16s	remaining: 37.6s
16:	learn: 0.6639328	total: 7.58s	remaining: 37s
17:	learn: 0.6668821	total: 8s	remaining: 36.4s
18:	learn: 0.6669630	total: 8.37s	remaining: 35.7s
19:	learn: 0.6675286	total: 8.77s	remaining: 35.1s
20:	l

## Ensemble의 Ensemble

In [24]:
%%time
import random

sample = 10
bagging_predict_result = []
for _ in range(sample):
    data_index = [data_index for data_index in range(train_x.shape[0])]
    random_index = np.random.choice(data_index, train_x.shape[0])
    
    lgb_dtrain = lgb.Dataset(data=train_x.iloc[random_index, :], label=train_y.iloc[random_index])
    lgb_param = {'max_depth':14,
                 'learning_rate':0.01,
                 'n_estimators':500,
                 'objective':'regression'}
    
    lgb_model = lgb.train(params=lgb_param, train_set=lgb_dtrain)
    predict1 = lgb_model.predict(test_x)
    bagging_predict_result.append(predict1)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2966
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score 4.858248
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2961
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score 4.831239
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2958
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score 4.842229
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can

In [25]:
bagging_predict = []
for idx_test in range(test_x.shape[0]):
    temp_predict = []
    for idx in range(len(bagging_predict_result)):
        temp_predict.append(bagging_predict_result[idx][idx_test])
    bagging_predict.append(np.mean(temp_predict))

In [30]:
print("Accuracy: %.2f" % (accuracy_score(test_y, cb_model_predict)*100), '%')

Accuracy: 69.64 %
