In [1]:
# PyCaret 라이브러리 설치
!pip install pycaret
!pip install markupsafe==2.0.1
!pip install catboost


# 베이지안 탐색 라이브러리
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-2.3.10-py3-none-any.whl (320 kB)
[K     |████████████████████████████████| 320 kB 7.7 MB/s 
[?25hCollecting pyod
  Downloading pyod-1.0.4.tar.gz (134 kB)
[K     |████████████████████████████████| 134 kB 56.1 MB/s 
Collecting lightgbm>=2.3.1
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 50.3 MB/s 
Collecting numba<0.55
  Downloading numba-0.54.1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.8 MB/s 
Collecting mlxtend>=0.17.0
  Downloading mlxtend-0.20.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 51.3 MB/s 
[?25hCollecting spacy<2.4.0
  Downloading spacy-2.3.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.4 MB)
[K     |██████████████████████████████

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe==2.0.1
  Downloading MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (31 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.1.1
    Uninstalling MarkupSafe-2.1.1:
      Successfully uninstalled MarkupSafe-2.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires markupsafe~=2.1.1, but you have markupsafe 2.0.1 which is incompatible.[0m
Successfully installed markupsafe-2.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     

In [2]:
import os

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt

# Default size for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize']=(10,10)
# NOTE(review): 'AppleGothic' is a macOS font, while the cells below mount Google Drive
# through google.colab — presumably the font is not installed there; confirm or pick a
# font that exists in the runtime.
plt.rcParams['font.family']='AppleGothic'

import warnings
# Suppresses every warning for the rest of the session (deprecations included).
warnings.filterwarnings(action='ignore')

In [3]:
def read_csv_by_dir(path, index_col=None):
    """Read every ``.csv`` file directly inside ``path`` and stack them row-wise.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).
    index_col : int, str or None, optional
        Forwarded to ``pandas.read_csv`` for every file.

    Returns
    -------
    pandas.DataFrame
        All CSV contents concatenated along the rows, files taken in sorted
        file-name order. An empty DataFrame when the directory holds no CSV file.

    Notes
    -----
    Entries that do not end in ``.csv`` are ignored. Previously the concat ran
    for every directory entry, so a non-CSV entry appended the previously read
    frame a second time (or raised ``NameError`` when it came first).
    """
    # os.listdir gives no ordering guarantee; sorting makes the row order reproducible.
    csv_names = sorted(name for name in os.listdir(path) if name.endswith('.csv'))
    frames = [pd.read_csv(os.path.join(path, name), index_col=index_col)
              for name in csv_names]
    if not frames:
        return pd.DataFrame()
    # One concat over the collected frames instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0)

In [4]:
from google.colab import drive
drive.mount('/content/drive')  # only needed when the data lives on Google Drive

# Root folder that holds the rainfall CSVs, the water CSVs and the sample submission.
path = '/content/drive/MyDrive/dacon/water_level/data/'

# os.path.join copes with the trailing '/' in `path`; joining with '/' by hand
# produced a doubled separator ('.../data//rf_data').
_df_rf_raw = read_csv_by_dir(os.path.join(path, 'rf_data'), index_col=0)

_df_water_raw = read_csv_by_dir(os.path.join(path, 'water_data'), index_col=0)

# The first CSV column becomes the index for all three frames.
_submission_raw = pd.read_csv(os.path.join(path, 'sample_submission.csv'), index_col=0)

Mounted at /content/drive


In [5]:
# Preserve the loaded raw frames: all further processing happens on copies.
df_rf = _df_rf_raw.copy()
df_water = _df_water_raw.copy()
submission = _submission_raw.copy()

# Attach an ad-hoc `name` attribute to each copy; check_datetime() prints it.
df_rf.name, df_water.name, submission.name = "rain_data", "water_data", "submission"

In [6]:
def index_to_datetime(df,format):
    """Parse the index of ``df`` into datetimes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds timestamp strings. It is modified directly.
    format : str
        ``strftime``-style pattern handed to ``pandas.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, now carrying the parsed index.
    """
    parsed_index = pd.to_datetime(df.index, format=format)
    df.index = parsed_index
    return df

In [7]:
# The three frames are parsed with one and the same timestamp pattern.
ts_format = '%Y-%m-%d %H:%M'

df_rf = index_to_datetime(df=df_rf, format=ts_format)
df_water = index_to_datetime(df=df_water, format=ts_format)
submission = index_to_datetime(df=submission, format=ts_format)

In [8]:
# Order every frame by its datetime index.
# The sort is done in place so the same objects are kept — the `name` attribute set on
# them earlier is read by check_datetime() below.
df_rf.sort_index(inplace=True)
df_water.sort_index(inplace=True)
submission.sort_index(inplace=True)

In [9]:
# Check the time span covered by a frame
def check_datetime(df):
    """Print ``df.name`` followed by the first and the last index label of ``df``.

    The frame is expected to be sorted by its index already, so the two labels
    are the start and the end of the covered period.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame carrying a ``name`` attribute and a non-empty index.

    Returns
    -------
    None

    Raises
    ------
    AttributeError
        If ``df`` has no ``name`` attribute.
    IndexError
        If ``df`` has no rows.
    """
    print(df.name)
    # Read the index directly; filtering columns by dtype first is irrelevant to the
    # index and only built throwaway frames.
    print(df.index[0])
    print(df.index[-1])
    return None

# Print name, first timestamp and last timestamp for each of the three frames.
check_datetime(df_rf)
check_datetime(df_water)
check_datetime(submission)

rain_data
2012-05-01 00:00:00
2022-07-18 23:50:00
water_data
2012-05-01 00:00:00
2022-07-18 23:50:00
submission
2022-06-01 00:00:00
2022-07-18 23:50:00


In [10]:
# Put rainfall and water columns side by side; rows are matched on the datetime index.
data = pd.concat((df_rf,df_water),axis=1)

In [11]:
# (rows, columns) of the combined frame.
data.shape

(276336, 17)

In [12]:
# Shift data and target by one step (because past data has to be used).
# NOTE(review): every column is moved together here, so no lag between features and
# target is created by this cell — confirm this matches the intended "shift".
# reset_index(drop=True) discards the datetime index; rows are identified by position from here on.
_data = data.reset_index(drop=True)

# Relabel the rows 1..N instead of 0..N-1.
_data.index += 1
tot=_data.sort_index()
# Drop the first and the last row.
tot=tot.iloc[1:-1]

In [13]:
# Missing values per column, before any imputation.
tot.isna().sum()

rf_10184100         0
rf_10184110         0
rf_10184140         0
swl               743
inf               743
sfw               743
ecpc              743
tototf            743
tide_level       4927
wl_1018662         59
fw_1018662      16380
wl_1018680         59
fw_1018680     196848
wl_1018683         59
fw_1018683       1279
wl_1019630         59
fw_1019630         59
dtype: int64

In [14]:
# The sample submission's columns are the water-level targets to predict.
pred_col = submission.columns
pred_col

Index(['wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630'], dtype='object')

In [15]:
# Replace every missing value with its column mean, in place.
# NOTE(review): the mean is taken over all rows of `tot`, including the rows that are
# later split off as test_data, and the target columns are filled as well — confirm
# this is acceptable, or compute the statistics on the training rows only.
tot.fillna(tot.mean(),inplace=True)

In [16]:
# Hold out as many trailing rows as the submission has; everything before is training data.
# NOTE(review): `tot` lost its last row in the shift cell, so these trailing rows
# presumably sit one time step before the submission timestamps — confirm the alignment.
n_holdout = len(submission)
train_data = tot.iloc[:-n_holdout, :]
test_data = tot.iloc[-n_holdout:, :]

In [17]:
# Missing values per column in the held-out rows (after the mean fill above).
test_data.isna().sum()

rf_10184100    0
rf_10184110    0
rf_10184140    0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     0
wl_1018662     0
fw_1018662     0
wl_1018680     0
fw_1018680     0
wl_1018683     0
fw_1018683     0
wl_1019630     0
fw_1019630     0
dtype: int64

In [18]:
# (rows, columns) of the held-out rows; compare the row count with submission.shape.
test_data.shape

(6912, 17)

In [19]:
# (rows, columns) of the sample submission, for comparison with test_data.shape.
submission.shape

(6912, 4)

In [21]:
# Experiment 1: target wl_1018662. The other three target columns are removed from the
# inputs; the remaining columns act as features.
# NOTE(review): setup() is called without any time-aware split options, so the rows are
# presumably split at random — for sequential data that lets neighbouring time steps
# land on both sides of the split; confirm (the recorded fold scores show R2 = 1.0).
from pycaret.regression import *

s1 = setup(
    data=train_data.drop(columns=pred_col.drop('wl_1018662')),
    target='wl_1018662',
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,wl_1018662
2,Original Data,"(269422, 14)"
3,Missing Values,False
4,Numeric Features,13
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(215537, 12)"


In [None]:
# s1_top3 = compare_models(n_select = 3, fold=5, sort="RMSE", exclude=["knn", "ada", "gbr"])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0764,0.2319,0.4798,1.0,0.0014,0.0002,34.504
rf,Random Forest Regressor,0.0796,0.416,0.6398,0.9999,0.0014,0.0002,67.444
dt,Decision Tree Regressor,0.0948,0.5678,0.7514,0.9999,0.0018,0.0003,0.888
catboost,CatBoost Regressor,0.4793,2.3223,1.4893,0.9996,0.0027,0.0013,33.578
lightgbm,Light Gradient Boosting Machine,0.4325,2.8046,1.6598,0.9995,0.0029,0.0012,1.574
gbr,Gradient Boosting Regressor,1.873,11.4008,3.376,0.998,0.0092,0.0055,38.64
knn,K Neighbors Regressor,2.717,33.5287,5.7883,0.9942,0.0135,0.0075,47.574
ada,AdaBoost Regressor,13.8927,288.2311,16.9721,0.9504,0.055,0.045,24.126
ridge,Ridge Regression,25.2594,1534.7052,39.1711,0.7359,0.0946,0.0718,0.064
lar,Least Angle Regression,25.2596,1534.7052,39.1711,0.7359,0.0946,0.0718,0.072


In [None]:
# tuned_model1 = [tune_model(i, choose_better=True) for i in s1_top3]

# blended_model1 = blend_models(estimator_list=tuned_model1, optimize='rmse', choose_better=True)

INFO:logs:Initializing tune_model()
INFO:logs:tune_model(estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=123, verbose=0, warm_start=False), fold=None, round=4, n_iter=10, custom_grid=None, optimize=R2, custom_scorer=None, search_library=scikit-learn, search_algorithm=None, early_stopping=False, early_stopping_max_iters=10, choose_better=True, fit_kwargs=None, groups=None, return_tuner=False, verbose=True, tuner_verbose=True, display=None, return_train_score=False, kwargs={})
INFO:logs:Checking exceptions
INFO:logs:Preparing display monitor


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


INFO:logs:Copying training dataset
INFO:logs:Checking base model
INFO:logs:Base model : Extra Trees Regressor
INFO:logs:Declaring metric variables
INFO:logs:Defining Hyperparameters
INFO:logs:Tuning with n_jobs=-1
INFO:logs:Initializing RandomizedSearchCV


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.7min


KeyboardInterrupt: ignored

In [22]:
# Train a single Extra Trees regressor ('et') under the current setup (s1).
# Despite the variable name no blending happens here; the name is what the
# prediction cell further down refers to.
blended_model1 = create_model('et')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0702,0.1846,0.4297,1.0,0.0013,0.0002
1,0.075,0.2,0.4472,1.0,0.0014,0.0002
2,0.0756,0.2139,0.4625,1.0,0.0015,0.0002
3,0.0749,0.2622,0.512,1.0,0.0017,0.0002
4,0.0793,0.28,0.5292,1.0,0.0015,0.0002
5,0.0773,0.228,0.4775,1.0,0.0014,0.0002
6,0.0726,0.2171,0.4659,1.0,0.0013,0.0002
7,0.0683,0.134,0.366,1.0,0.0011,0.0002
8,0.0714,0.2119,0.4603,1.0,0.0014,0.0002
9,0.0691,0.1671,0.4088,1.0,0.0012,0.0002


In [23]:
# Experiment 2: target wl_1018680. The other three target columns are removed from the
# inputs; all preprocessing options match the other experiments.
from pycaret.regression import *

s2 = setup(
    data=train_data.drop(columns=pred_col.drop('wl_1018680')),
    target='wl_1018680',
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,wl_1018680
2,Original Data,"(269422, 14)"
3,Missing Values,False
4,Numeric Features,13
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(215537, 12)"


In [24]:
# s2_top3 = compare_models(n_select = 3, fold=5, sort="RMSE", exclude=["knn", "ada", "gbr"])

In [None]:
# tuned_model2 = [tune_model(i, choose_better=True) for i in s2_top3]

# blended_model2 = blend_models(estimator_list=tuned_model2, optimize='rmse', choose_better=True)

In [25]:
# Train a single Extra Trees regressor ('et') under the current setup (s2).
# Despite the variable name no blending happens here.
blended_model2 = create_model('et')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.6316,17.9667,4.2387,0.9959,0.0129,0.0082
1,2.6311,17.7177,4.2092,0.9961,0.0128,0.0082
2,2.5796,17.0974,4.1349,0.9963,0.0127,0.0081
3,2.6224,17.5985,4.1951,0.9958,0.0129,0.0082
4,2.6373,18.046,4.2481,0.996,0.0129,0.0082
5,2.6331,18.1438,4.2596,0.996,0.0129,0.0082
6,2.6247,18.2126,4.2676,0.996,0.013,0.0082
7,2.6073,17.6378,4.1997,0.996,0.0128,0.0081
8,2.6212,17.8954,4.2303,0.9963,0.0129,0.0082
9,2.629,17.7847,4.2172,0.9962,0.0129,0.0082


In [26]:
# Experiment 3: target wl_1018683. The other three target columns are removed from the
# inputs; all preprocessing options match the other experiments.
from pycaret.regression import *

s3 = setup(
    data=train_data.drop(columns=pred_col.drop('wl_1018683')),
    target='wl_1018683',
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,wl_1018683
2,Original Data,"(269422, 14)"
3,Missing Values,False
4,Numeric Features,13
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(215537, 12)"


In [None]:
# s3_top3 = compare_models(n_select = 3, fold=5, sort="RMSE", exclude=["knn", "ada", "gbr"])

In [None]:
# tuned_model3 = [tune_model(i, choose_better=True) for i in s3_top3]

# blended_model3 = blend_models(estimator_list=tuned_model3, optimize='rmse', choose_better=True)

In [27]:
# Train a single Extra Trees regressor ('et') under the current setup (s3).
# Despite the variable name no blending happens here.
blended_model3 = create_model('et')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.5979,5.7916,2.4066,0.9985,0.0074,0.005
1,1.5802,5.6342,2.3736,0.9986,0.0073,0.005
2,1.6202,6.0782,2.4654,0.9985,0.0076,0.0051
3,1.6036,5.9138,2.4318,0.9984,0.0075,0.0051
4,1.6023,5.8959,2.4281,0.9986,0.0074,0.005
5,1.6009,5.9097,2.431,0.9985,0.0074,0.005
6,1.6024,5.8577,2.4203,0.9986,0.0074,0.0051
7,1.5935,5.7189,2.3914,0.9985,0.0074,0.005
8,1.6009,5.8613,2.421,0.9986,0.0074,0.005
9,1.614,5.9733,2.444,0.9986,0.0075,0.0051


In [28]:
# Experiment 4: target wl_1019630. The other three target columns are removed from the
# inputs; all preprocessing options match the other experiments.
from pycaret.regression import *

s4 = setup(
    data=train_data.drop(columns=pred_col.drop('wl_1019630')),
    target='wl_1019630',
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,wl_1019630
2,Original Data,"(269422, 14)"
3,Missing Values,False
4,Numeric Features,13
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(215537, 12)"


In [None]:
# s4_top3 = compare_models(n_select = 3, fold=5, sort="RMSE", exclude=["knn", "ada", "gbr"])

In [None]:
# tuned_model4 = [tune_model(i, choose_better=True) for i in s4_top3]

# blended_model4 = blend_models(estimator_list=tuned_model4, optimize='rmse', choose_better=True)

In [29]:
# Train a single Extra Trees regressor ('et') under the current setup (s4).
# Despite the variable name no blending happens here.
blended_model4 = create_model('et')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.003,0.0011,0.0326,1.0,0.0001,0.0
1,0.0032,0.0012,0.0341,1.0,0.0001,0.0
2,0.0037,0.0052,0.0723,1.0,0.0001,0.0
3,0.003,0.0009,0.0301,1.0,0.0001,0.0
4,0.0033,0.0016,0.0399,1.0,0.0001,0.0
5,0.0032,0.0015,0.0392,1.0,0.0001,0.0
6,0.0035,0.0013,0.0359,1.0,0.0001,0.0
7,0.0028,0.0011,0.0333,1.0,0.0001,0.0
8,0.0031,0.0011,0.0336,1.0,0.0001,0.0
9,0.0045,0.0073,0.0856,1.0,0.0002,0.0


In [30]:
# Score the held-out rows with each of the four models, in target order.
# NOTE(review): predict_model receives only the estimator and the data, so whatever
# preprocessing it applies must come from state outside this cell — presumably the most
# recent setup() call (s4); confirm that this is acceptable for models 1-3.
prediction1, prediction2, prediction3, prediction4 = [
    predict_model(model, data=test_data)
    for model in (blended_model1, blended_model2, blended_model3, blended_model4)
]

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,334.4892,120638.6968,347.3308,-11413.6109,5.7921,0.0311


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,323.4727,111201.4456,333.4688,-10520.6756,5.7626,0.0169


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,319.4388,107866.3319,328.4301,-10205.1133,5.7514,0.0121


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,302.9451,95524.9271,309.0711,-9037.3923,5.7029,0.0


In [31]:
# Copy each model's 'Label' column into the matching target column of the submission
# frame. `.values` strips the index, so rows are matched purely by position.
for _pos, _pred in enumerate((prediction1, prediction2, prediction3, prediction4)):
    _submission_raw[pred_col[_pos]] = _pred['Label'].values

In [32]:
# Write the filled submission frame (index included) to Google Drive.
_submission_raw.to_csv('/content/drive/MyDrive/dacon/water_level/submission_et_test.csv')

In [33]:
# (rows, columns) of the frame returned by predict_model for the first target.
prediction1.shape

(6912, 18)

In [34]:
# (rows, columns) of the submission frame after the predictions were written into it.
_submission_raw.shape

(6912, 4)

In [35]:
# Missing values per target column in the submission frame.
_submission_raw.isna().sum()

wl_1018662    0
wl_1018680    0
wl_1018683    0
wl_1019630    0
dtype: int64

In [36]:
# Display the filled submission frame.
_submission_raw

Unnamed: 0_level_0,wl_1018662,wl_1018680,wl_1018683,wl_1019630
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-06-01 00:00,278.700012,274.860012,273.58,270.299988
2022-06-01 00:10,278.700012,274.280012,273.76,269.299988
2022-06-01 00:20,280.700012,270.770012,272.05,267.299988
2022-06-01 00:30,283.700012,273.650012,273.12,264.299988
2022-06-01 00:40,284.700012,272.720012,271.17,261.299988
...,...,...,...,...
2022-07-18 23:10,287.700012,290.070012,287.47,303.299988
2022-07-18 23:20,287.700012,294.850012,298.79,306.299988
2022-07-18 23:30,286.700012,298.420012,300.54,308.299988
2022-07-18 23:40,298.700012,303.150012,307.89,310.299988


In [None]:
# 리더보드 점수: 4.5210107544