In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)


PATH = './data/'  # data 저장 위치

In [3]:
# 데이터 설명을 확인하면
# ueser_id가 10000부터 24999까지 총 15000개가 연속적으로 존재.
train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_err.head()

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,10000,20201101025616,model_3,05.15.2138,15,1
1,10000,20201101030309,model_3,05.15.2138,12,1
2,10000,20201101030309,model_3,05.15.2138,11,1
3,10000,20201101050514,model_3,05.15.2138,16,1
4,10000,20201101050515,model_3,05.15.2138,4,0


In [4]:
train_err.shape

(16554663, 6)

In [5]:
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

In [6]:
print(np.sort(train_err.errtype.unique()))
# errtype이 1부터 42까지 29를 제외한 41개가 존재.

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 30 31 32 33 34 35 36 37 38 39 40 41 42]


In [7]:
# user_id와 errtype만을 사용하여 데이터 셋 생성
# 모든 일자에 대해서 errtype별 발생 건수를 count
# pandas의 groupby를 활용할 경우 큰 연산 자원이 필요.
# numpy로 placeholder를 만들어 구현함.
id_error = train_err[['user_id','errtype']].values
id_error

array([[10000,    15],
       [10000,    12],
       [10000,    11],
       ...,
       [24999,     4],
       [24999,     4],
       [24999,    15]], dtype=int64)

In [8]:
error = np.zeros((train_user_number,42))
error

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
for person_idx, err in tqdm(id_error):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - train_user_id_min,err - 1] += 1
error.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:41<00:00, 396887.18it/s]


(15000, 42)

In [10]:
error[0]

array([  0.,   0.,   8., 104.,   0.,   1.,   1.,   0.,   0.,   7.,  15.,
        16.,   1.,  10.,  59.,  61.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,  32.,   1.,   1.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.])

### model_nm, fwver encoding

##### model

In [11]:
id_model = train_err[['user_id','model_nm']]
id_model

Unnamed: 0,user_id,model_nm
0,10000,model_3
1,10000,model_3
2,10000,model_3
3,10000,model_3
4,10000,model_3
...,...,...
16554658,24999,model_3
16554659,24999,model_3
16554660,24999,model_3
16554661,24999,model_3


In [12]:
le = preprocessing.LabelEncoder()
model_pre = le.fit_transform(train_err.model_nm)
model_pre

array([3, 3, 3, ..., 3, 3, 3])

In [13]:
id_model['model_nm'] = model_pre
id_model

Unnamed: 0,user_id,model_nm
0,10000,3
1,10000,3
2,10000,3
3,10000,3
4,10000,3
...,...,...
16554658,24999,3
16554659,24999,3
16554660,24999,3
16554661,24999,3


In [14]:
np.sort(id_model.model_nm.unique())
# model = 1~8번까지 존재

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [15]:
id_model = id_model.values
id_model

array([[10000,     3],
       [10000,     3],
       [10000,     3],
       ...,
       [24999,     3],
       [24999,     3],
       [24999,     3]], dtype=int64)

In [16]:
error_model = np.zeros((train_user_number,8))
error_model

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
for person_idx, err in tqdm(id_model):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error_model[person_idx - train_user_id_min,err - 1] += 1
error_model.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:42<00:00, 392112.78it/s]


(15000, 8)

In [18]:
error_model

array([[   0.,    0.,  317., ...,    0.,    0.,    0.],
       [   0., 2365.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,  306., ...,    0.,    0.,    0.],
       ...,
       [   0.,    0.,    0., ...,    0.,    0.,  826.],
       [   0.,    0.,    0., ...,    0.,    0.,  155.],
       [   0.,    0.,  570., ...,    0.,    0.,    0.]])

In [19]:
error_sum = np.concatenate((error, error_model), axis = 1)
error_sum

array([[  0.,   0.,   8., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   2., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,   0.,   0., 826.],
       [  0.,   0.,   0., ...,   0.,   0., 155.],
       [  0.,   0.,   4., ...,   0.,   0.,   0.]])

##### y 값

In [20]:
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
train_prob

Unnamed: 0,user_id,time
0,19224,20201102200000
1,23664,20201116140000
2,15166,20201114130000
3,12590,20201108210000
4,15932,20201103210000
...,...,...
5424,20167,20201125120000
5425,16270,20201110120000
5426,19114,20201106230000
5427,21505,20201104110000


In [21]:
problem = np.zeros(15000)
problem

array([0., 0., 0., ..., 0., 0., 0.])

In [22]:
# error와 동일한 방법으로 person_idx - 10000 위치에 
# person_idx의 problem이 한 번이라도 발생했다면 1
# 없다면 0
problem[train_prob.user_id.unique()-10000] = 1 
problem.shape

(15000,)

In [23]:
problem

array([0., 1., 0., ..., 1., 1., 0.])

##### test 분리 

In [40]:
# 변수 이름 변경
# error  -> train_x
# problem-> train_y

train_x = error_sum
train_y = problem
print(train_x.shape)
print(train_y.shape)

(15000, 50)
(15000,)


In [32]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p)
    return "pr_auc", score, True

In [33]:
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정

params =      {
                'boosting_type' : 'dart',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }

In [48]:
# np.arange(start = 400, stop = 600 , step = 50)

array([400, 450, 500, 550])

In [49]:
# l = [150, 200, 250, 600, 650]

In [225]:
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
# k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# for n in tqdm(l):


#     for train_idx, val_idx in k_fold.split(train_x):

#         # split train, validation set
#         X = train_x[train_idx]
#         y = train_y[train_idx]
#         valid_x = train_x[val_idx]
#         valid_y = train_y[val_idx]

#         d_train= lgb.Dataset(X, y)
#         d_val  = lgb.Dataset(valid_x, valid_y)

#         #run traning
#         model = lgb.train(
#                             params,
#                             train_set       = d_train,
#                             num_boost_round = n,
#                             valid_sets      = d_val,
#                             feval           = f_pr_auc,
#                             verbose_eval = 0,
#                             early_stopping_rounds = 5
#                            )

#         # cal valid prediction
#         valid_prob = model.predict(valid_x)
#         valid_pred = np.where(valid_prob > threshold, 1, 0)

#         # cal scores
#         recall    = recall_score(    valid_y, valid_pred)
#         precision = precision_score( valid_y, valid_pred)
#         auc_score = roc_auc_score(   valid_y, valid_prob)

#         # append scores
#         models.append(model)
#         recalls.append(recall)
#         precisions.append(precision)
#         auc_scores.append(auc_score)
        
#         score_d[n] = np.mean(auc_scores)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7541
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7693
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
[LightGBM] [Info] Number of positive: 4718, number of negative: 7282
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7798

 20%|████████████████▊                                                                   | 1/5 [00:04<00:19,  5.00s/it]

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7541
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7693
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
[LightGBM] [Info] Number of positive: 4718, number of negative: 7282
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

 40%|█████████████████████████████████▌                                                  | 2/5 [00:12<00:17,  5.73s/it]

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7541
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7693
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
[LightGBM] [Info] Number of positive: 4718, number of negative: 7282
You can set `force_col_wise

 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:22<00:13,  6.88s/it]

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7541
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7693
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
[LightGBM] [Info] Number of positive: 4718, number of negative: 7282
You can set `force_row_wise

 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:51<00:13, 13.58s/it]

[LightGBM] [Info] Number of positive: 3633, number of negative: 8367
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7541
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302750 -> initscore=-0.834237
[LightGBM] [Info] Start training from score -0.834237
[LightGBM] [Info] Number of positive: 4828, number of negative: 7172
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7693
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.402333 -> initscore=-0.395752
[LightGBM] [Info] Start training from score -0.395752
[LightGBM] [Info] Number of positive: 4718, number of negative: 7282
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7798

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:22<00:00, 16.60s/it]


In [226]:
max(score_d.values())

0.8079543516978503

In [227]:
score_d

{150: 0.8071308751756886,
 200: 0.8077724503233598,
 250: 0.8079543516978503,
 600: 0.8073895265360036,
 650: 0.8067686019501515}

In [222]:
auc_scores

[0.8036902116237389,
 0.8068422913719944,
 0.7845992568586623,
 0.8151718353470154,
 0.8153475342416716]

250  or 600

0.8063485034160338

In [134]:
#-------------------------------------------------------------------------------------
# 5 Kfold cross validation
k_fold = KFold(n_splits=7, shuffle=True, random_state=6)

for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)

    #run traning
    model = lgb.train(
                            params,
                            train_set       = d_train,
                            num_boost_round = 20,
                            valid_sets      = d_val,
                            feval           = f_pr_auc,
                            verbose_eval = 0,
                            early_stopping_rounds = 3
                           )

    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)

    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

[LightGBM] [Info] Number of positive: 4323, number of negative: 8534
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5881
[LightGBM] [Info] Number of data points in the train set: 12857, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.336237 -> initscore=-0.680109
[LightGBM] [Info] Start training from score -0.680109
[LightGBM] [Info] Number of positive: 4288, number of negative: 8569
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5880
[LightGBM] [Info] Number of data points in the train set: 12857, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333515 -> initscore=-0.692331
[LightGBM] [Info] Start training from score -0.692331
[LightGBM] [Info] Number of positive: 4287, number of negative: 8570
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5689

In [123]:
print(np.mean(auc_scores))

0.8108926210733043


In [99]:
auc_scores

[0.8000399501871104,
 0.8264022387841968,
 0.8061527660211649,
 0.8045059948176667,
 0.7968155017335345,
 0.819000304947552,
 0.8294736891202343,
 0.7987866173296331,
 0.8020197940115702,
 0.8105490059401139,
 0.7987651947861746,
 0.812827725927041,
 0.8142075466929513,
 0.8217793664641657,
 0.805806106967249,
 0.8021831986746127,
 0.8305743243243242,
 0.799828398384926,
 0.807185287379501,
 0.8145815238966551,
 0.8013592482667107,
 0.8180562775233182,
 0.8134618437998014,
 0.8062633019154758,
 0.8290371038863124,
 0.8006310403080066,
 0.8162853331345052,
 0.8019319404203864,
 0.8158710854363028,
 0.8146717749381067,
 0.8064633823399602,
 0.8289542827115081,
 0.79885627013436]

In [93]:
index_num = 0
index_list = []
for n in auc_scores:
    if n > 0.8 : 
        index_list.append(index_num)
    index_num += 1

In [50]:
index_list

[0, 1, 2, 3, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 18]

##### feature를 
    dart = 0.803999 (num_boost_round = 50)
    
    dart = 0.804435 (num_boost_round = 110)
    
    dart = 0.804692 (num_boost_round = 120)
    dart = 0.804931 (num_boost_round = 125)
    dart = 0.804945 (num_boost_round = 126)    
    dart = 0.804966 (num_boost_round = 127)
    
    
    
    dart = 0.804340 (num_boost_round = 130)
    
    dart = 0.802653 (num_boost_round = 300) 
    dart = 0.802423 (num_boost_round = 500)
    dart = 0.797607 (num_boost_round = 800)
    dart = 0.793358 (num_boost_round = 1000)
    dart = 0.787656 (num_boost_round = 1500)
    dart = 0.782559 (num_boost_round = 2000)
    gbdt = 0.791413
    goss = 0.790277


In [51]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_err.head()

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,model_1,04.16.3553,31,1
1,30000,20201101030227,model_1,04.16.3553,33,2
2,30000,20201101030228,model_1,04.16.3553,15,1
3,30000,20201101030256,model_1,04.16.3553,22,1
4,30000,20201101030300,model_1,04.16.3553,11,1


In [52]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [53]:
id_error = test_err[['user_id','errtype']].values
id_error

array([[30000,    31],
       [30000,    33],
       [30000,    15],
       ...,
       [44998,    15],
       [44998,    16],
       [44998,    31]], dtype=int64)

In [54]:
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:41<00:00, 397544.01it/s]

(14999, 42)





In [55]:
test_x

array([[  0.,   0.,   0., ...,  92.,   0.,   2.],
       [  0.,   0.,   3., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ..., 113.,   1.,   2.],
       ...,
       [  0.,   0.,   0., ..., 329.,   2.,   3.],
       [  0.,   0.,   0., ...,  30.,  49.,   0.],
       [  0.,   0.,   0., ..., 269.,   0.,   0.]])

##### 추가 전처리

In [56]:
le.fit(train_err.model_nm)


model_pre = le.transform(test_err.model_nm)
model_pre

array([1, 1, 1, ..., 1, 1, 1])

In [57]:
test_err['model_nm'] = model_pre
test_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
0,30000,20201101030227,1,04.16.3553,31,1
1,30000,20201101030227,1,04.16.3553,33,2
2,30000,20201101030228,1,04.16.3553,15,1
3,30000,20201101030256,1,04.16.3553,22,1
4,30000,20201101030300,1,04.16.3553,11,1
...,...,...,...,...,...,...
16532643,44998,20201130210050,1,04.16.3553,40,0
16532644,44998,20201130211831,1,04.16.3553,31,1
16532645,44998,20201130211832,1,04.16.3553,15,1
16532646,44998,20201130212259,1,04.16.3553,16,1


# 수정

In [58]:
error_model = np.zeros((test_user_number,8))
error_model

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [59]:
id_model = test_err[['user_id','model_nm']].values
id_model

array([[30000,     1],
       [30000,     1],
       [30000,     1],
       ...,
       [44998,     1],
       [44998,     1],
       [44998,     1]], dtype=int64)

In [60]:
for person_idx, err in tqdm(id_model):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
error_model.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:41<00:00, 401517.08it/s]


(14999, 8)

In [61]:
error_model

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [62]:
test_sum = np.concatenate((test_x, error_model), axis = 1)
test_sum

array([[2687.,   63.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,  287., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       ...,
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [ 873.,    0.,    0., ...,    0.,    0.,    0.]])

In [107]:
test_sum.shape

(14999, 50)

In [108]:
models[-2].predict(test_sum)

array([0.88258815, 0.19763027, 0.32618359, ..., 0.51142378, 0.84056571,
       0.32423258])

In [109]:
# 예측
pred_y_list = []
pred_y = models[0].predict(test_sum)
pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [110]:
len(models)

40

In [135]:
pred_y_list = []
for model in models:
    pred_y = model.predict(test_sum)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [136]:
pred_ensemble

array([[0.84931011],
       [0.21076116],
       [0.29187697],
       ...,
       [0.45002092],
       [0.83425092],
       [0.37587512]])

In [137]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [138]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.849310
1,30001,0.210761
2,30002,0.291877
3,30003,0.716275
4,30004,0.637930
...,...,...
14994,44994,0.374836
14995,44995,0.315276
14996,44996,0.450021
14997,44997,0.834251


In [139]:
sample_submssion.to_csv("features_modle20.csv", index = False)