In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

import numpy as np
import random
from tqdm import tqdm
from glob import glob

## 1. 데이터 불러오기

In [2]:
path = './data/'

train = pd.read_csv(path + 'train_features.csv')
train_label = pd.read_csv(path + 'train_labels.csv')
test = pd.read_csv(path + 'test_features.csv')
submission = pd.read_csv(path + 'sample_submission.csv')

## 3. 데이터 전처리

In [3]:
features = ['id', 'acc_x', 'acc_y', 'acc_z', 'gy_x', 'gy_y', 'gy_z']
ml_train = train[features].groupby('id').agg(['max', 'min', 'mean'])
ml_test = test[features].groupby('id').agg(['max', 'min', 'mean'])

In [4]:
ml_train.shape, ml_test.shape

((3125, 18), (782, 18))

In [5]:
ml_label = train_label['label']

## 4. ML 모델링

In [6]:
import gc

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [7]:
# 모델 1번: xgboost

def build_xgboost(split_num, train, target, test, rnd):
    
    params = {
                'colsample_bytree': 0.7,
                'subsample': 0.8,
                'eta': 0.04,
                'max_depth': 12,
                'eval_metric':'mlogloss',
                'objective':'multi:softprob',
                'num_class':61,
                }
    
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((train.shape[0], 61)), np.zeros((test.shape[0], 61))
    
    skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=233*rnd)
    for train_idx, val_idx in skf.split(train, target):

        # split train, validation set
        X = train.iloc[train_idx]
        y = target.iloc[train_idx]
        valid_x = train.iloc[val_idx]
        valid_y = target.iloc[val_idx]

        d_train = xgb.DMatrix(X, y)
        d_valid = xgb.DMatrix(valid_x, valid_y)
        d_temp = xgb.DMatrix(valid_x)
        d_test = xgb.DMatrix(test)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        
        #run traning
        model = xgb.train(params, d_train, 2000, watchlist, 
                        early_stopping_rounds=50,
                        verbose_eval=100)

        # save feat
        train_pred[val_idx] = model.predict(d_temp)
        test_pred += model.predict(d_test)/split_num
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred, test_pred

xgb_train1, xgb_test1 = build_xgboost(5, ml_train, ml_label, ml_test, 1)
xgb_train2, xgb_test2 = build_xgboost(5, ml_train, ml_label, ml_test, 2)

[0]	train-mlogloss:3.62825	valid-mlogloss:3.67101
[100]	train-mlogloss:0.28305	valid-mlogloss:1.21516
[200]	train-mlogloss:0.08127	valid-mlogloss:1.10142
[300]	train-mlogloss:0.04497	valid-mlogloss:1.09137
[361]	train-mlogloss:0.03647	valid-mlogloss:1.08973
------------------
[0]	train-mlogloss:3.61803	valid-mlogloss:3.65704
[100]	train-mlogloss:0.27679	valid-mlogloss:1.28499
[200]	train-mlogloss:0.07877	valid-mlogloss:1.19083
[300]	train-mlogloss:0.04396	valid-mlogloss:1.18798
[319]	train-mlogloss:0.04090	valid-mlogloss:1.18980
------------------
[0]	train-mlogloss:3.62390	valid-mlogloss:3.64505
[100]	train-mlogloss:0.28280	valid-mlogloss:1.18239
[200]	train-mlogloss:0.08126	valid-mlogloss:1.08809
[299]	train-mlogloss:0.04555	valid-mlogloss:1.08363
------------------
[0]	train-mlogloss:3.62030	valid-mlogloss:3.65042
[100]	train-mlogloss:0.28112	valid-mlogloss:1.21359
[200]	train-mlogloss:0.08108	valid-mlogloss:1.11948
[300]	train-mlogloss:0.04497	valid-mlogloss:1.11898
[326]	train-mlo

In [8]:
# 모델 2번: catboost

def build_catboost(split_num, train, target, test, rnd):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((train.shape[0], 61)), np.zeros((test.shape[0], 61))

    skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=233*rnd)
    for train_idx, val_idx in skf.split(train, target):

        # split train, validation set
        X = train.iloc[train_idx]
        y = target.iloc[train_idx]
        valid_x = train.iloc[val_idx]
        valid_y = target.iloc[val_idx]

        model = cb.CatBoostClassifier(iterations=1500,
                                      learning_rate=0.01,
                                      l2_leaf_reg=3.5,
                                      depth=8,
                                      rsm=0.98,
                                      loss_function= 'MultiClass',
                                      eval_metric='AUC',
                                      use_best_model=True,
                                      random_seed=42,
                                      verbose=50)

        model.fit(X, y,
                  eval_set=(valid_x, valid_y),
                  early_stopping_rounds=30)
        
        # save feat
        train_pred[val_idx] = model.predict(valid_x)
        test_pred += model.predict(test)/split_num
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred, test_pred

catboost_train1, catboost_test1 = build_catboost(5, ml_train, ml_label, ml_test, 1)
catboost_train2, catboost_test2 = build_catboost(5, ml_train, ml_label, ml_test, 2)

0:	test: 0.8788202	best: 0.8788202 (0)	total: 546ms	remaining: 13m 38s
50:	test: 0.9751737	best: 0.9751737 (50)	total: 22.4s	remaining: 10m 37s
100:	test: 0.9781930	best: 0.9781930 (100)	total: 44.2s	remaining: 10m 11s
150:	test: 0.9802737	best: 0.9802737 (150)	total: 1m 5s	remaining: 9m 48s
200:	test: 0.9810318	best: 0.9811934 (196)	total: 1m 27s	remaining: 9m 26s
250:	test: 0.9817006	best: 0.9818524 (238)	total: 1m 49s	remaining: 9m 4s
300:	test: 0.9824387	best: 0.9824387 (300)	total: 2m 11s	remaining: 8m 42s
350:	test: 0.9826683	best: 0.9827424 (348)	total: 2m 33s	remaining: 8m 20s
400:	test: 0.9832590	best: 0.9834020 (398)	total: 2m 54s	remaining: 7m 58s
450:	test: 0.9836716	best: 0.9837902 (431)	total: 3m 16s	remaining: 7m 37s
500:	test: 0.9843569	best: 0.9843569 (500)	total: 3m 38s	remaining: 7m 15s
550:	test: 0.9846588	best: 0.9848202 (539)	total: 3m 59s	remaining: 6m 52s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9848201669
bestIteration = 539

Shrink m

100:	test: 0.9783585	best: 0.9784273 (95)	total: 42.7s	remaining: 9m 51s
150:	test: 0.9792417	best: 0.9794221 (133)	total: 1m 3s	remaining: 9m 29s
200:	test: 0.9804051	best: 0.9804151 (197)	total: 1m 25s	remaining: 9m 9s
250:	test: 0.9814011	best: 0.9814011 (250)	total: 1m 46s	remaining: 8m 48s
300:	test: 0.9818473	best: 0.9821082 (295)	total: 2m 7s	remaining: 8m 27s
350:	test: 0.9824234	best: 0.9824234 (350)	total: 2m 28s	remaining: 8m 6s
400:	test: 0.9830334	best: 0.9830731 (393)	total: 2m 49s	remaining: 7m 44s
450:	test: 0.9833647	best: 0.9834558 (425)	total: 3m 10s	remaining: 7m 23s
500:	test: 0.9839403	best: 0.9840245 (496)	total: 3m 32s	remaining: 7m 3s
550:	test: 0.9842076	best: 0.9842622 (549)	total: 3m 53s	remaining: 6m 42s
600:	test: 0.9847087	best: 0.9847662 (594)	total: 4m 14s	remaining: 6m 21s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.9848129314
bestIteration = 608

Shrink model to first 609 iterations.
------------------
0:	test: 0.8798594	best: 

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [13]:
# # https://dacon.io/competitions/official/235689/codeshare/2347?page=1&dtype=recent&ptype=pub
# # 데이터 증강을 통해 과접합을 줄여보자 - DACON.

# x_train = []

# for uid in tqdm(train['id'].unique()):
#     temp = np.array(train[train['id'] == uid].iloc[:,2:], np.float32).T
#     x_train.append(temp)

# x_train = np.array(x_train, np.float32)
# x_train = x_train[:,:,:,np.newaxis]

# y_train = train_label['label']

# x_test = []

# for uid in tqdm(test['id'].unique()):
#     temp = np.array(test[test['id'] == uid].iloc[:,2:], np.float32).T
#     x_test.append(temp)

# x_test = np.array(x_test, np.float32)
# x_test = x_test[:,:,:,np.newaxis]

In [14]:
def aug(data, shift):
    shift_data = np.roll(data, shift, axis=2)
    return shift_data

# # 데이터 증강
# shift_data = []
# shift_label = []
# for n in tqdm(range(100)):
#     shifted = aug(x_train, n*6)
#     shift_data.append(shifted)
#     shift_label.append(y_train)

In [15]:
x_train = np.array(train.iloc[:,2:]).reshape(-1, 600, 6, 1)
y_train = tf.keras.utils.to_categorical(train_label['label'])
x_test = np.array(test.iloc[:,2:]).reshape(-1, 600, 6, 1)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

(3125, 600, 6, 1)
(3125, 61)
(782, 600, 6, 1)


In [16]:
# 모델 3번: lstm

# def build_lstm(split_num, train, target, test, rnd):
#     # return train pred prob and test pred prob 
#     train_pred, test_pred = np.zeros((x_train.shape[0], 61)), np.zeros((x_test.shape[0], 61))
    
#     es = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
#     mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     for train_idx, val_idx in mskf.split(train, target):

#         # split train, validation set
#         X = train[train_idx]
#         y = target[train_idx]
#         valid_x = train[val_idx]
#         valid_y = target[val_idx]

#         #가벼운 모델 생성
#         model = Sequential()
#         model.add(LSTM(32, input_shape=(600,6)))
#         model.add(Dense(128, activation='relu'))
#         model.add(Dense(61, activation='softmax'))

#         model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

#         model.fit(X,y,
#                   epochs=100,
#                   batch_size=64,
#                   validation_data=[valid_x,valid_y],
#                   callbacks=[es]
#                  )
        
#         # save feat
#         train_pred[val_idx] = model.predict(valid_x)
#         test_pred += model.predict(test)/split_num
        
#         # release
#         del model
#         gc.collect()
#         print('------------------')
        
#     return train_pred, test_pred

# lstm_train1, lstm_test1 = build_lstm(5, x_train, y_train, x_test, 1)
# lstm_train2, lstm_test2 = build_lstm(5, x_train, y_train, x_test, 2)

In [17]:
# 모델 4번: cnn

def build_cnn(split_num, train, target, test, rnd):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((x_train.shape[0], 61)), np.zeros((x_test.shape[0], 61))

    es = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, val_idx in mskf.split(train, target):

        # split train, validation set
        X = train[train_idx]
        y = target[train_idx]
        valid_x = train[val_idx]
        valid_y = target[val_idx]

        #가벼운 모델 생성
        model = Sequential()

        model.add(Conv2D(input_shape = (x_train.shape[1], x_train.shape[2], x_train.shape[3]), filters = 64, kernel_size = (3,3), strides = (1,1), padding = 'same'))
        model.add(Activation('relu'))
        model.add(Conv2D(filters = 512, kernel_size = (3,3), strides = (1,1), padding = 'same'))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size = (2,2)))
        model.add(Conv2D(filters = 256, kernel_size = (3,3), strides = (1,1), padding = 'same'))
        model.add(Activation('relu'))
        model.add(Conv2D(filters = 256, kernel_size = (3,3), strides = (1,1), padding = 'same'))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size = (2,2)))

        # prior layer should be flattend to be connected to dense layers
        model.add(Flatten())
        # dense layer with 50 neurons
        model.add(Dense(128, activation = 'relu'))
        # final layer with 3 neurons to classify the instances
        model.add(Dense(61, activation = 'softmax'))

        adam = optimizers.Adam(lr = 0.001)
        model.compile(optimizer = adam, metrics = ['accuracy'], loss = 'categorical_crossentropy')
        model.fit(X,y, epochs=100, batch_size=128, validation_split=0.2)
        
        # save feat
        train_pred[val_idx] = model.predict(valid_x)
        test_pred += model.predict(test)/split_num
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred, test_pred

cnn_train1, cnn_test1 = build_cnn(5, x_train, y_train, x_test, 1)
cnn_train2, cnn_test2 = build_cnn(5, x_train, y_train, x_test, 2)



Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100


Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100


Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100


Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100


Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100


Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100


Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------
Train on 2000 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100


Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100


Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
------------------


In [18]:
# # 모델 5번: transformer
# # https://www.tensorflow.org/tutorials/text/transformer

# class Transformer(tf.keras.Model):
#     def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
#                target_vocab_size, pe_input, pe_target, rate=0.1):
#         super(Transformer, self).__init__()

#         self.tokenizer = Encoder(num_layers, d_model, num_heads, dff, 
#                                input_vocab_size, pe_input, rate)

#         self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
#                                target_vocab_size, pe_target, rate)

#         self.final_layer = tf.keras.layers.Dense(target_vocab_size)

#     def call(self, inp, tar, training, enc_padding_mask, 
#            look_ahead_mask, dec_padding_mask):

#         enc_output = self.tokenizer(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

#         # dec_output.shape == (batch_size, tar_seq_len, d_model)
#         dec_output, attention_weights = self.decoder(
#             tar, enc_output, training, look_ahead_mask, dec_padding_mask)

#         final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

#         return final_output, attention_weights


# def build_transformer(split_num, train, target, test, rnd):
#     # return train pred prob and test pred prob 
#     train_pred, test_pred = np.zeros((train.shape[0], 1)), np.zeros((test.shape[0], 1))

#     skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=233*rnd)
#     for train_idx, val_idx in skf.split(train, target):

#         # split train, validation set
#         X = train[train_idx]
#         y = target[train_idx]
#         valid_x = train[val_idx]
#         valid_y = target[val_idx]

#         #가벼운 모델 생성
#         model = Sequential()
#         model.add(LSTM(32, input_shape=(600,6)))
#         model.add(Dense(128, activation='relu'))
#         model.add(Dense(61, activation='softmax'))

#         model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

#         model.fit(X,y, epochs=30, batch_size=128, validation_split=0.2)
        
#         # save feat
#         submission.iloc[:,1:]=model.predict(test_X)
#         train_pred[val_idx] = model.predict(valid_x).reshape(-1,1)
#         test_pred += model.predict(test).reshape(-1,1)/split_num
        
#         # release
#         del model
#         gc.collect()
#         print('------------------')
        
#     return train_pred, test_pred

# transformer_train1, transformer_test1 = build_transformer(5, train, train_y, test, 1)
# transformer_train2, transformer_test2 = build_transformer(5, train, train_y, test, 2)

In [19]:
xgb1_onehot = np.argmax(xgb_train1, axis=1).reshape(-1,1)
xgb2_onehot = np.argmax(xgb_train2, axis=1).reshape(-1,1)
catboost1_onehot = np.argmax(catboost_train1, axis=1).reshape(-1,1)
catboost2_onehot = np.argmax(catboost_train2, axis=1).reshape(-1,1)
cnn1_onehot = np.argmax(cnn_train1, axis=1).reshape(-1,1)
cnn2_onehot = np.argmax(cnn_train2, axis=1).reshape(-1,1)

xgb1_onehot_test = np.argmax(xgb_test1, axis=1).reshape(-1,1)
xgb2_onehot_test = np.argmax(xgb_test2, axis=1).reshape(-1,1)
catboost1_onehot_test = np.argmax(catboost_test1, axis=1).reshape(-1,1)
catboost2_onehot_test = np.argmax(catboost_test2, axis=1).reshape(-1,1)
cnn1_onehot_test = np.argmax(cnn_test1, axis=1).reshape(-1,1)
cnn2_onehot_test = np.argmax(cnn_test2, axis=1).reshape(-1,1)

train_final = np.hstack([xgb1_onehot, xgb2_onehot,
                         catboost1_onehot, catboost2_onehot,
#                          lstm_train1, lstm_train2,
                         cnn1_onehot, cnn2_onehot])

test_final = np.hstack([xgb1_onehot_test, xgb2_onehot_test,
                        catboost1_onehot_test, catboost2_onehot_test,
#                         lstm_test1, lstm_test2,
                        cnn1_onehot_test, cnn2_onehot_test])

print(train_final.shape)
print(test_final.shape)

# https://m.blog.naver.com/PostView.nhn?blogId=wideeyed&logNo=221343373342&proxyReferer=https:%2F%2Fwww.google.com%2F

(3125, 6)
(782, 6)


In [27]:
# 최종 앙상블

def ensemble_xgb(split_num, train, target, test):

    test_pred = np.zeros((test.shape[0], 61))
    
    params = {
                'colsample_bytree': 0.7,
                'subsample': 0.8,
                'eta': 0.04,
                'max_depth': 12,
                'eval_metric':'mlogloss',
                'objective':'multi:softprob',
                'num_class':61,
                }
    
    skf = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=2021)
    for train_idx, val_idx in skf.split(train, target):

        # split train, validation set
        X = train[train_idx]
        y = target.iloc[train_idx]
        valid_x = train[val_idx]
        valid_y = target.iloc[val_idx]
        
        d_train = xgb.DMatrix(X, y)
        d_valid = xgb.DMatrix(valid_x, valid_y)
        d_test = xgb.DMatrix(test)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        
        #run traning
        model = xgb.train(params, d_train, 2000, watchlist, 
                        early_stopping_rounds=50,
                        verbose_eval=100)

        # save feat
        test_pred += model.predict(d_test)/split_num
        
        # release
        del model
        gc.collect()
        print('------------------')
    
    sample_submssion = pd.read_csv(path + 'sample_submission.csv')
    sample_submssion.iloc[:,1:] = test_pred
    sample_submssion.to_csv("ensemble.csv", index = False)
    
    print(sample_submssion)
    
ensemble_xgb(5, train_final, ml_label, test_final)

[0]	train-mlogloss:3.59783	valid-mlogloss:3.60179
[100]	train-mlogloss:1.23838	valid-mlogloss:1.48832
[173]	train-mlogloss:1.14565	valid-mlogloss:1.49086
------------------
[0]	train-mlogloss:3.60263	valid-mlogloss:3.61267
[100]	train-mlogloss:1.22994	valid-mlogloss:1.52272
[176]	train-mlogloss:1.13704	valid-mlogloss:1.52951
------------------
[0]	train-mlogloss:3.60738	valid-mlogloss:3.61075
[100]	train-mlogloss:1.25926	valid-mlogloss:1.40497
[195]	train-mlogloss:1.15417	valid-mlogloss:1.40005
------------------
[0]	train-mlogloss:3.60012	valid-mlogloss:3.60772
[100]	train-mlogloss:1.25102	valid-mlogloss:1.47219
[190]	train-mlogloss:1.15331	valid-mlogloss:1.46661
------------------
[0]	train-mlogloss:3.60298	valid-mlogloss:3.59853
[100]	train-mlogloss:1.26126	valid-mlogloss:1.40048
[180]	train-mlogloss:1.16569	valid-mlogloss:1.40216
------------------
       id         0         1         2         3         4         5  \
0    3125  0.007827  0.005372  0.001812  0.003146  0.001994  0