# 라이브러리 불러오기

In [None]:
! pip install category_encoders
! pip install optuna



In [None]:
# basic
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import os
import category_encoders as ce
import optuna
import warnings

In [None]:
! pip install catboost



In [None]:
# sklearn
import sklearn
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

In [None]:
# model
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, StackingRegressor

In [None]:
# Mount Google Drive (Colab) at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the competition CSVs (train, test, sample submission) from the Drive folder
path = '/content/drive/MyDrive/BOAZ/물류 유통량 예측 경진대회/'
train, test, submission = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows of the training data
train.head()

Unnamed: 0,index,송하인_격자공간고유번호,수하인_격자공간고유번호,물품_카테고리,운송장_건수
0,0,5011000595017300,2871000192069300,음반,3
1,1,4148000690043300,5011000264024400,문화컨텐츠,3
2,2,5011000078068400,1120000007005400,농산물,3
3,3,4127100048006400,5011000587019400,기타식품,7
4,4,5011000078068400,2823700010076300,농산물,3


In [None]:
# Preview the first rows of the test data (same columns as train, minus the target)
test.head()

Unnamed: 0,index,송하인_격자공간고유번호,수하인_격자공간고유번호,물품_카테고리
0,0,4167000577042200,5011000435014100,선케어
1,1,1156000009012200,5011000172034400,구강위생용품
2,2,4122000363057300,5011000361097300,캠핑
3,3,5011000436041400,2826000084036400,아웃도어가구
4,4,4150000241065200,5011000169044300,분유/이유식/아기간식


In [None]:
# Column dtypes and non-null counts of the training data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31684 entries, 0 to 31683
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         31684 non-null  int64 
 1   송하인_격자공간고유번호  31684 non-null  int64 
 2   수하인_격자공간고유번호  31684 non-null  int64 
 3   물품_카테고리       31684 non-null  object
 4   운송장_건수        31684 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 1.2+ MB


In [None]:
# Column dtypes and non-null counts of the test data
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         7920 non-null   int64 
 1   송하인_격자공간고유번호  7920 non-null   int64 
 2   수하인_격자공간고유번호  7920 non-null   int64 
 3   물품_카테고리       7920 non-null   object
dtypes: int64(3), object(1)
memory usage: 247.6+ KB


In [None]:
# Count missing values per column in the training data
train.isnull().sum()

index           0
송하인_격자공간고유번호    0
수하인_격자공간고유번호    0
물품_카테고리         0
운송장_건수          0
dtype: int64

# 데이터 전처리

대회 데이터가 재가공되었기 때문에 칼럼명을 변경해준다.

In [None]:
# Rename the Korean column names of the training data to short English codes
train = train.rename(columns={
    '송하인_격자공간고유번호': 'SEND_SPG_INNB',   # sender grid ID
    '수하인_격자공간고유번호': 'REC_SPG_INNB',    # receiver grid ID
    '물품_카테고리': 'DL_GD_MCLS_NM',            # product category
    '운송장_건수': 'INVC_CONT',                  # invoice count (target)
})

train

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_MCLS_NM,INVC_CONT
0,0,5011000595017300,2871000192069300,음반,3
1,1,4148000690043300,5011000264024400,문화컨텐츠,3
2,2,5011000078068400,1120000007005400,농산물,3
3,3,4127100048006400,5011000587019400,기타식품,7
4,4,5011000078068400,2823700010076300,농산물,3
...,...,...,...,...,...
31679,31679,4471000290087200,5011000213073200,스포츠잡화,3
31680,31680,1129000014045300,5011000319087100,스마트디바이스,4
31681,31681,1129000014045300,5011000263065200,스마트디바이스,6
31682,31682,4127300065073100,5011000264061200,지갑,7


In [None]:
# Rename the Korean column names of the test data to short English codes
# (the test data has no target column)
test = test.rename(columns={
    '송하인_격자공간고유번호': 'SEND_SPG_INNB',   # sender grid ID
    '수하인_격자공간고유번호': 'REC_SPG_INNB',    # receiver grid ID
    '물품_카테고리': 'DL_GD_MCLS_NM',            # product category
})

test

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_MCLS_NM
0,0,4167000577042200,5011000435014100,선케어
1,1,1156000009012200,5011000172034400,구강위생용품
2,2,4122000363057300,5011000361097300,캠핑
3,3,5011000436041400,2826000084036400,아웃도어가구
4,4,4150000241065200,5011000169044300,분유/이유식/아기간식
...,...,...,...,...
7915,7915,5011000266051200,4623000417038100,농산물
7916,7916,1154500001098300,5011000264055100,문화컨텐츠
7917,7917,5013000610049100,1147000018091400,농산물
7918,7918,5013000610049100,3117000039026100,농산물


[국토연구원 데이터](https://www.bigdata-region.kr/#/dataset/0ad3c882-f7ee-4faf-970d-00c53cb65a84)를 참고한 결과, 격자공간고유번호 칼럼을 가공해야 한다.


---


격자공간고유번호의 1~5자리: 시군구

격자공간고유번호의 1~10자리: 격자공간명

격자공간고유번호의 1~2자리: 지역 

을 나타낸다.

In [None]:
# str10 (grid-cell name): keep the first 10 digits of each grid ID as an integer.
# The same derivation is applied to the sender and receiver columns of both frames.
for frame in (train, test):
    for col in ('SEND_SPG_INNB', 'REC_SPG_INNB'):
        frame[col + '_str10'] = frame[col].astype(str).str[:10].astype(int)

# Collect every 10-digit prefix that occurs in train, on either the sender or
# the receiver side. The mapping cell that follows enumerates `ssi10` to build
# compact integer codes, so sender and receiver share one code space.
ssi10 = set(train.SEND_SPG_INNB_str10)
rsi10 = set(train.REC_SPG_INNB_str10)
ssi10.update(rsi10)

"\nSEND_SPG_INNB_str10_index = []\nfor i in list(ssi10.difference(ssi10_t)):\n    train = train.drop(train[train['SEND_SPG_INNB_str10'] == i].index,axis='index')\n    train = train.drop(train[train['REC_SPG_INNB_str10'] == i].index,axis='index')\n    print(len(set(train.SEND_SPG_INNB_str10).difference(set(test.SEND_SPG_INNB_str10))))\n"

In [None]:
# str10 mapping: give every prefix collected from train a compact integer code
dictionary_str10 = {prefix: code for code, prefix in enumerate(ssi10)}

# The dictionary only knows prefixes that occur in train. Series.map returns NaN
# for any other value (and silently turns the column into float), so prefixes
# that appear only in test are encoded as -1 to keep the columns integer and
# NaN-free.
# NOTE: this cell overwrites the raw prefixes with their codes, so it must be
# run exactly once after the extraction cell.
for frame in (train, test):
    for col in ('SEND_SPG_INNB_str10', 'REC_SPG_INNB_str10'):
        frame[col] = frame[col].map(dictionary_str10).fillna(-1).astype(int)

In [None]:
# str5 (si-gun-gu / district): keep the first 5 digits of each grid ID as an integer.
# The same derivation is applied to the sender and receiver columns of both frames.
for frame in (train, test):
    for col in ('SEND_SPG_INNB', 'REC_SPG_INNB'):
        frame[col + '_str5'] = frame[col].astype(str).str[:5].astype(int)

# Collect every 5-digit prefix that occurs in train, on either the sender or
# the receiver side. The mapping cell that follows enumerates `ssi5` to build
# compact integer codes, so sender and receiver share one code space.
ssi5 = set(train.SEND_SPG_INNB_str5)
rsi5 = set(train.REC_SPG_INNB_str5)
ssi5.update(rsi5)

"\nSEND_SPG_INNB_str5_index = []\nfor i in list(ssi5.difference(ssi5_t)):\n    train = train.drop(train[train['SEND_SPG_INNB_str5'] == i].index,axis='index')\n    train = train.drop(train[train['REC_SPG_INNB_str5'] == i].index,axis='index')\n    print(len(set(train.SEND_SPG_INNB_str5).difference(set(test.SEND_SPG_INNB_str5))))\n"

In [None]:
# str5 mapping: give every district prefix collected from train a compact integer code
dictionary_str5 = {prefix: code for code, prefix in enumerate(ssi5)}

# The dictionary only knows prefixes that occur in train. Series.map returns NaN
# for any other value (and silently turns the column into float), so prefixes
# that appear only in test are encoded as -1 to keep the columns integer and
# NaN-free.
# NOTE: this cell overwrites the raw prefixes with their codes, so it must be
# run exactly once after the extraction cell.
for frame in (train, test):
    for col in ('SEND_SPG_INNB_str5', 'REC_SPG_INNB_str5'):
        frame[col] = frame[col].map(dictionary_str5).fillna(-1).astype(int)

In [None]:
# str2 (region): keep the first 2 digits of each grid ID as an integer.
# The same derivation is applied to the sender and receiver columns of both frames.
for frame in (train, test):
    for col in ('SEND_SPG_INNB', 'REC_SPG_INNB'):
        frame[col + '_str2'] = frame[col].astype(str).str[:2].astype(int)

# Collect every 2-digit prefix that occurs in train, on either the sender or
# the receiver side; `ssi2` ends up holding the union of both.
ssi2 = set(train.SEND_SPG_INNB_str2)
rsi2 = set(train.REC_SPG_INNB_str2)
ssi2.update(rsi2)

In [None]:
# str2 mapping: give every region prefix collected from train a compact integer code
dictionary_str2 = {prefix: code for code, prefix in enumerate(ssi2)}

# The dictionary only knows prefixes that occur in train. Series.map returns NaN
# for any other value (and silently turns the column into float), so prefixes
# that appear only in test are encoded as -1 to keep the columns integer and
# NaN-free.
# NOTE: this cell overwrites the raw prefixes with their codes, so it must be
# run exactly once after the extraction cell.
for frame in (train, test):
    for col in ('SEND_SPG_INNB_str2', 'REC_SPG_INNB_str2'):
        frame[col] = frame[col].map(dictionary_str2).fillna(-1).astype(int)

In [None]:
# Preview train with the new str10 / str5 / str2 code columns
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_MCLS_NM,INVC_CONT,SEND_SPG_INNB_str10,REC_SPG_INNB_str10,SEND_SPG_INNB_str5,REC_SPG_INNB_str5,SEND_SPG_INNB_str2,REC_SPG_INNB_str2
0,0,5011000595017300,2871000192069300,음반,3,4564,8249,240,6,16,3
1,1,4148000690043300,5011000264024400,문화컨텐츠,3,7842,4312,123,240,8,16
2,2,5011000078068400,1120000007005400,농산물,3,4197,6052,240,241,16,0
3,3,4127100048006400,5011000587019400,기타식품,7,509,4560,84,240,8,16
4,4,5011000078068400,2823700010076300,농산물,3,4197,3451,240,134,16,3


기존과 달리 격자공간고유번호를 자릿수(2·5·10자리) 단위로 잘라, 각 구간에 지역·시군구·격자공간명의 의미를 부여하였다.

In [None]:
# Target-encode the product category (DL_GD_MCLS_NM) against the target INVC_CONT
# using category_encoders' TargetEncoder.
encoder = ce.target_encoder.TargetEncoder(cols=['DL_GD_MCLS_NM'])
# The encoder is fitted on the training rows only ...
encoder.fit(train['DL_GD_MCLS_NM'],train['INVC_CONT'])
# ... and then applied to both frames, overwriting the original string column.
# NOTE(review): the train rows are transformed by an encoder fitted on those same
# rows, so target information can leak into this feature — consider out-of-fold
# encoding if the column is ever fed to a model. In this notebook the column is
# dropped again when train_all / test_all are built.
train['DL_GD_MCLS_NM'] = encoder.transform(train['DL_GD_MCLS_NM'])
test['DL_GD_MCLS_NM'] = encoder.transform(test['DL_GD_MCLS_NM'])

In [None]:
# Preview train after target encoding (DL_GD_MCLS_NM is now numeric)
train.head()

Unnamed: 0,index,SEND_SPG_INNB,REC_SPG_INNB,DL_GD_MCLS_NM,INVC_CONT,SEND_SPG_INNB_str10,REC_SPG_INNB_str10,SEND_SPG_INNB_str5,REC_SPG_INNB_str5,SEND_SPG_INNB_str2,REC_SPG_INNB_str2
0,0,5011000595017300,2871000192069300,4.092014,3,4564,8249,240,6,16,3
1,1,4148000690043300,5011000264024400,4.976169,3,7842,4312,123,240,8,16
2,2,5011000078068400,1120000007005400,4.322081,3,4197,6052,240,241,16,0
3,3,4127100048006400,5011000587019400,6.088825,7,509,4560,84,240,8,16
4,4,5011000078068400,2823700010076300,4.322081,3,4197,3451,240,134,16,3


1 ~ 10자리, 1 ~ 2자리 기준으로 나누어 생성한 변수들은 실험 결과에 따라 제거하였으며

최종적으로 1 ~ 5자리 (시군구) 만 사용하였다.

In [None]:
# Build the modelling frames: drop the raw grid IDs, the encoded product
# category, and the str10 / str2 codes, so that only the 5-digit (district)
# codes remain as features.
dropped_cols = [
    'SEND_SPG_INNB', 'REC_SPG_INNB', 'DL_GD_MCLS_NM',
    'SEND_SPG_INNB_str10', 'REC_SPG_INNB_str10',
    'SEND_SPG_INNB_str2', 'REC_SPG_INNB_str2',
]
train_all = train.drop(columns=dropped_cols)
test_all = test.drop(columns=dropped_cols)

# Target and feature matrix for the scikit-learn style models; 'index' is
# excluded from the features.
y = train_all['INVC_CONT']
X = train_all.drop(columns=['INVC_CONT', 'index'])
# Select the test features in exactly the column order used for training
X_test = test_all[X.columns]

# RMSE helper
def rmse(y_pred, y_test):
    """Return the root mean squared error between predictions and true values.

    Note the argument order: predictions first, ground truth second.
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

In [None]:
pip install --use-deprecated=legacy-resolver pycaret[full]



# 1) pycaret 사용하기

In [None]:
# Import only the PyCaret functions this notebook calls instead of `import *`,
# so names defined earlier in the notebook (pd, np, rmse, ...) cannot be shadowed.
from pycaret.regression import (
    setup, compare_models, blend_models, predict_model, finalize_model,
)

# Initialise the PyCaret regression experiment on the reduced training frame.
# session_id fixes the random seed; 'index' is excluded from the features.
exp_101 = setup(data=train_all,
                target='INVC_CONT',
                session_id=123,
                ignore_features=['index'])

Unnamed: 0,Description,Value
0,session_id,123
1,Target,INVC_CONT
2,Original Data,"(31684, 4)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(22178, 2)"


In [None]:
# Compare PyCaret's candidate regressors sorted by RMSE and keep the best three (n_select=3)
best_3_l = compare_models(sort='RMSE', n_select=3) 

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,2.4269,44.4362,6.5552,0.024,0.4379,0.4607,0.955
lightgbm,Light Gradient Boosting Machine,2.4448,44.6198,6.5723,0.0172,0.4445,0.4646,0.266
catboost,CatBoost Regressor,2.4392,44.9017,6.6078,-0.0007,0.444,0.4622,4.854
xgboost,Extreme Gradient Boosting,2.4513,45.3101,6.6396,-0.0123,0.4485,0.4636,3.093
en,Elastic Net,2.4482,46.5635,6.6655,0.0079,0.4416,0.4663,0.016
br,Bayesian Ridge,2.4481,46.5635,6.6655,0.0079,0.4417,0.4663,0.015
lasso,Lasso Regression,2.4479,46.5637,6.6655,0.0079,0.4415,0.4663,0.028
ridge,Ridge Regression,2.4485,46.5635,6.6656,0.0078,0.4417,0.4664,0.015
lar,Least Angle Regression,2.4485,46.5635,6.6656,0.0078,0.4417,0.4664,0.018
lr,Linear Regression,2.4485,46.5635,6.6656,0.0078,0.4417,0.4664,0.479


상위 3개의 모델(gbr, lightgbm, catboost)을 가지고 블렌딩을 진행한다.

In [None]:
blended_l = blend_models(estimator_list= best_3_l, fold=5, optimize='RMSE') # blend the three selected top models
# Score the blend; no data is passed, so PyCaret presumably evaluates on its own hold-out split — confirm
pred_holdout = predict_model(blended_l)
# Finalize the blend (presumably refits on all of the data given to setup — confirm against PyCaret docs)
final_blended_model_l = finalize_model(blended_l)
# Predict the test frame with the finalized blend
pred_esb_l = predict_model(final_blended_model_l, test_all) 

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.5175,51.6345,7.1857,0.1343,0.4445,0.4372
1,2.4052,32.6817,5.7168,-0.125,0.4446,0.4967
2,2.3884,32.97,5.7419,-0.0738,0.4384,0.4592
3,2.414,60.6572,7.7883,0.0936,0.4372,0.455
4,2.4764,45.933,6.7774,0.0498,0.4469,0.4708
Mean,2.4403,44.7753,6.642,0.0158,0.4423,0.4638
SD,0.0487,10.8282,0.8117,0.0991,0.0038,0.0197


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,2.3799,40.0487,6.3284,0.0605,0.4364,0.4589


In [None]:
# Write the blend predictions (the 'Label' column of the prediction frame),
# rounded to the nearest whole number, into the submission frame
submission['운송장_건수'] = np.round(pred_esb_l['Label'])
submission

Unnamed: 0,index,운송장_건수
0,0,5.0
1,1,6.0
2,2,5.0
3,3,5.0
4,4,6.0
...,...,...
7915,7915,4.0
7916,7916,5.0
7917,7917,4.0
7918,7918,4.0


In [None]:
# Save the PyCaret blend submission to the working directory (row index not written)
submission.to_csv('pycaret_blend3.csv' ,index = False) 

# 2) Stacking 사용하기

성능이 좋게 나온 상위 4개의 모델과 앙상블 모델인 ExtraTreesRegressor, BaggingRegressor, RandomForestRegressor 을 사용해서 stacking을 진행한다.

In [None]:
# Base learners for the stack. The scikit-learn ensembles are seeded so that
# fitting the stack gives the same result on every run.
model1 = GradientBoostingRegressor(random_state=42) # PyCaret rank 1
# `metrics` is not a documented XGBRegressor argument, and passing the Python
# `rmse` function there does not select a metric; XGBoost's evaluation metric is
# chosen with `eval_metric`.
model2 = XGBRegressor(learning_rate = 0.1, eval_metric = 'rmse', random_state=42) # PyCaret rank 4
model3 = RandomForestRegressor(random_state=42)
model4 = BaggingRegressor(random_state=42)
model5 = LGBMRegressor() # PyCaret rank 2
model6 = ExtraTreesRegressor(random_state=42)
# verbose=0 stops CatBoost from printing one log line per boosting iteration,
# which otherwise floods the cell output while the stack is being fitted.
model7 = CatBoostRegressor(learning_rate = 0.1, bootstrap_type = 'Bernoulli', verbose = 0) # PyCaret rank 3

In [None]:
# (name, estimator) pairs handed to StackingRegressor as its base learners
estimators = [
    ('gbr', model1),   # GradientBoostingRegressor
    ('xgb', model2),   # XGBRegressor
    ('rfr', model3),   # RandomForestRegressor
    ('br', model4),    # BaggingRegressor
    ('lgb', model5),   # LGBMRegressor
    ('etr', model6),   # ExtraTreesRegressor
    ('cat', model7),   # CatBoostRegressor
]

In [None]:
# Stack the seven base learners; model1 (GradientBoostingRegressor) is passed
# again as the final estimator that combines their predictions
stackingmodel = StackingRegressor(estimators=estimators,final_estimator=model1)
# Fit on the full training features; cv is left at its default
stackingmodel.fit(X, y)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
0:	learn: 6.9177378	total: 3.57ms	remaining: 3.57s
1:	learn: 6.8696693	total: 7.37ms	remaining: 3.68s
2:	learn: 6.8666637	total: 10.1ms	remaining: 3.35s
3:	learn: 6.8352354	total: 12.9ms	remaining: 3.22s
4:	learn: 6.7955373	total: 15.7ms	remaining: 3.12s
5:	learn: 6.7666821	total: 18.5ms	remaining: 3.06s
6:	learn: 6.7428600	total: 21.3ms	remaining: 3.02s
7:	learn: 6.7185797	total: 24.1ms	remaining: 2.99s
8:	learn: 6.6983566	total: 26.9ms	remaining: 2.96s
9:	learn: 6.6875347	total: 29.7ms	remaining: 2.94s
10:	learn: 6.6840701	total: 32.3ms	remaining: 2.91s
11:	learn: 6.6696194	total: 35.1ms	remaining: 2.89s
12:	learn: 6.6650024	total: 37.9ms	remaining: 2.88s
13:	learn: 6.6530655	total: 40.9ms	remaining: 2.88s
14:	learn: 6.6519731	total: 43.5ms	remaining: 2.85s
15:	learn: 6.6420426	total: 46.2ms	remaining: 2.84s
16:	learn: 6.6412190	total: 49ms	remaining: 2.83s
17:	learn: 6.6377918	total: 52.1ms	remaining: 2.84s
18:	learn: 6.6337047	total

StackingRegressor(cv=None,
                  estimators=[('gbr',
                               GradientBoostingRegressor(alpha=0.9,
                                                         ccp_alpha=0.0,
                                                         criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=0.1,
                                                         loss='ls', max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
            

In [None]:
# Training-set evaluation: predict on the same X the stack was fitted on
stack_pred_tr=stackingmodel.predict(X)
stack_pred_tr

array([4.62045291, 5.49053768, 4.62045291, ..., 4.62045291, 5.27611401,
       5.0061265 ])

In [None]:
# RMSE on the training data (in-sample, so not a held-out estimate of test error)
rmse(stack_pred_tr,y)

6.498361063421981

In [None]:
# Predict the target for the test features
stack_pred = stackingmodel.predict(X_test)
stack_pred

array([6.00057624, 5.57694734, 5.2231473 , ..., 4.32456405, 4.11458712,
       6.9479899 ])

In [None]:
# Re-read the sample submission file so the stacking predictions are written
# into a fresh frame rather than the one already filled by the PyCaret blend
submission = pd.read_csv(path+'sample_submission.csv')

Unnamed: 0,SEND_SPG_INNB_str5,REC_SPG_INNB_str5
0,171,240
1,83,240
2,78,240
3,240,142
4,128,240
...,...,...
7915,240,45
7916,81,240
7917,243,65
7918,243,105


In [None]:
# Write the stacking predictions, rounded to the nearest whole number, into the submission frame
submission['운송장_건수'] = np.round(stack_pred)
submission

Unnamed: 0,index,운송장_건수
0,0,6.0
1,1,6.0
2,2,5.0
3,3,5.0
4,4,6.0
...,...,...
7915,7915,5.0
7916,7916,6.0
7917,7917,4.0
7918,7918,4.0


In [None]:
# Save the stacking submission to the working directory (row index not written)
submission.to_csv('stack_pred7.csv',index=False)