In [1]:
# Install the PyCaret library
!pip install pycaret
# NOTE(review): this pin is applied after the pycaret install; the recorded output shows
# pip swapping MarkupSafe 2.1.1 for 2.0.1 and reporting a dependency-resolver error —
# confirm the pin is still required.
!pip install markupsafe==2.0.1
!pip install catboost


# Bayesian search library (used via search_library='scikit-optimize' when tuning)
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe~=2.1.1
  Using cached MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.0.1
    Uninstalling MarkupSafe-2.0.1:
      Successfully uninstalled MarkupSafe-2.0.1
Successfully installed markupsafe-2.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe==2.0.1
  Using cached MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (31 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.1.1
    Uninstalling MarkupSafe-2.1.1:
      Successfully uninstalled MarkupSafe-2.1.1
[31mERROR: pip's dependency resolver does not cu

In [2]:
import os

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt

# Default figure size applied to every matplotlib figure in this notebook.
plt.rcParams['figure.figsize']=(10,10)
# NOTE(review): 'AppleGothic' appears to be a macOS font, while this notebook mounts
# Google Drive through google.colab further down — confirm the font exists in that runtime.
plt.rcParams['font.family']='AppleGothic'

import warnings
# Suppress every warning for the rest of the session (this also hides pandas/matplotlib warnings).
warnings.filterwarnings(action='ignore')

In [3]:
def read_csv_by_dir(path, index_col=None):
    """Load every ``.csv`` file directly inside ``path`` into one DataFrame.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).
    index_col : optional
        Forwarded unchanged to ``pd.read_csv`` for every file.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all CSV files, or an empty DataFrame when the
        directory contains no CSV file.
    """
    # Only CSV entries are read. Previously the concat ran for every directory
    # entry, so a non-CSV file re-appended the previous frame (duplicated rows)
    # or raised UnboundLocalError when it was listed first.
    # Sorting makes the row order independent of the OS listing order.
    frames = [
        pd.read_csv('/'.join([path, name]), index_col=index_col)
        for name in sorted(os.listdir(path))
        if name.endswith('.csv')
    ]
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0)

In [4]:
from google.colab import drive
# Mount Google Drive (only needed when the data lives on Google Drive).
drive.mount('/content/drive')

path = '/content/drive/MyDrive/dacon/water_level/data/'

# Every CSV is read with its first column as the index.
_df_rf_raw = read_csv_by_dir(f'{path}/rf_data', index_col=0)
_df_water_raw = read_csv_by_dir(f'{path}/water_data', index_col=0)
_submission_raw = pd.read_csv(f'{path}/sample_submission.csv', index_col=0)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Keep the raw frames untouched: all later steps work on copies.
df_rf, df_water, submission = (
    _df_rf_raw.copy(),
    _df_water_raw.copy(),
    _submission_raw.copy(),
)

# Attach a plain `.name` attribute to each copy; check_datetime prints it as a label.
df_rf.name, df_water.name, submission.name = "rain_data", "water_data", "submission"

In [6]:
def index_to_datetime(df, format):
    """Parse ``df``'s index into datetimes using ``format``.

    The index is replaced on the frame that was passed in (no copy is made),
    and that same frame is returned.
    """
    parsed_index = pd.to_datetime(df.index, format=format)
    df.index = parsed_index
    return df

In [7]:
# All three frames share the same timestamp layout in their index.
timestamp_format = '%Y-%m-%d %H:%M'

df_rf = index_to_datetime(df=df_rf, format=timestamp_format)
df_water = index_to_datetime(df=df_water, format=timestamp_format)
submission = index_to_datetime(df=submission, format=timestamp_format)

In [8]:
# Order each frame chronologically. Sorting is done in place so the objects
# (and the `.name` attribute set on them) stay the same.
for frame in (df_rf, df_water, submission):
    frame.sort_index(inplace=True)

In [9]:
# Check the time span covered by a frame
def check_datetime(df):
    """Print ``df.name`` followed by the first and last index values of ``df``."""
    first_stamp = df.index[0]
    last_stamp = df.index[-1]
    for item in (df.name, first_stamp, last_stamp):
        print(item)

# Print label, first timestamp and last timestamp for each frame.
for frame in (df_rf, df_water, submission):
    check_datetime(frame)

rain_data
2012-05-01 00:00:00
2022-07-18 23:50:00
water_data
2012-05-01 00:00:00
2022-07-18 23:50:00
submission
2022-06-01 00:00:00
2022-07-18 23:50:00


In [10]:
# Put rainfall and water columns side by side, aligned on the shared datetime index.
data = pd.concat([df_rf, df_water], axis='columns')

In [11]:
# Sanity check: (rows, columns) of the merged frame.
data.shape

(276336, 17)

In [12]:
# Shift data and target by one step (because past data has to be used).
# NOTE(review): the steps below only relabel the row index (0..N-1 -> 1..N) for ALL
# columns at once and then drop the first and last rows; features and targets remain
# on the same row, so no lag between them is created here — confirm this is intended.
_data = data.reset_index(drop=True)
_data.index = _data.index + 1

tot = _data.sort_index().iloc[1:-1]

In [13]:
# Number of missing values per column before imputation.
tot.isna().sum()

rf_10184100         0
rf_10184110         0
rf_10184140         0
swl               743
inf               743
sfw               743
ecpc              743
tototf            743
tide_level       4927
wl_1018662         59
fw_1018662      16380
wl_1018680         59
fw_1018680     196848
wl_1018683         59
fw_1018683       1279
wl_1019630         59
fw_1019630         59
dtype: int64

In [14]:
# Columns to predict: taken from the submission frame's column names.
pred_col = submission.columns
pred_col

Index(['wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630'], dtype='object')

In [15]:
# Replace every NaN with its column mean.
# NOTE(review): the means are computed over all rows of `tot`, including the trailing
# rows that are split off as test_data afterwards — confirm this is acceptable.
column_means = tot.mean()
tot = tot.fillna(column_means)

In [16]:
# The last len(submission) rows are held out as the test period; everything before is training data.
n_test = len(submission)
train_data = tot.iloc[:-n_test]
test_data = tot.iloc[-n_test:]

In [17]:
# Confirm no missing values remain in the held-out rows.
test_data.isna().sum()

rf_10184100    0
rf_10184110    0
rf_10184140    0
swl            0
inf            0
sfw            0
ecpc           0
tototf         0
tide_level     0
wl_1018662     0
fw_1018662     0
wl_1018680     0
fw_1018680     0
wl_1018683     0
fw_1018683     0
wl_1019630     0
fw_1019630     0
dtype: int64

In [18]:
# Row count here should match the submission frame's row count.
test_data.shape

(6912, 17)

In [19]:
# (rows, columns) of the submission frame, for comparison with test_data.
submission.shape

(6912, 4)

In [20]:
from pycaret.regression import *

# Experiment 1: target wl_1018662. The other three columns listed in pred_col are
# dropped from the training frame before it is handed to PyCaret.
target_s1 = 'wl_1018662'
s1 = setup(
    data=train_data.drop(columns=pred_col.drop(target_s1)),
    target=target_s1,
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,wl_1018662
2,Original Data,"(269422, 14)"
3,Missing Values,False
4,Numeric Features,13
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(215537, 12)"


In [21]:
# Keep the three best candidates ranked by RMSE (5-fold CV); knn, ada and gbr are not evaluated.
s1_top3 = compare_models(
    exclude=["knn", "ada", "gbr"],
    fold=5,
    sort="RMSE",
    n_select=3,
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0764,0.2319,0.4798,1.0,0.0014,0.0002,34.504
rf,Random Forest Regressor,0.0796,0.416,0.6398,0.9999,0.0014,0.0002,67.444
dt,Decision Tree Regressor,0.0948,0.5678,0.7514,0.9999,0.0018,0.0003,0.888
catboost,CatBoost Regressor,0.4793,2.3223,1.4893,0.9996,0.0027,0.0013,33.578
lightgbm,Light Gradient Boosting Machine,0.4325,2.8046,1.6598,0.9995,0.0029,0.0012,1.574
gbr,Gradient Boosting Regressor,1.873,11.4008,3.376,0.998,0.0092,0.0055,38.64
knn,K Neighbors Regressor,2.717,33.5287,5.7883,0.9942,0.0135,0.0075,47.574
ada,AdaBoost Regressor,13.8927,288.2311,16.9721,0.9504,0.055,0.045,24.126
ridge,Ridge Regression,25.2594,1534.7052,39.1711,0.7359,0.0946,0.0718,0.064
lar,Least Angle Regression,25.2596,1534.7052,39.1711,0.7359,0.0946,0.0718,0.072


In [None]:
# Tune each shortlisted model with Bayesian search from scikit-optimize.
# Fix: the list used to be bound to the misspelled name `tuned_mode11`, so the
# blend_models call below raised NameError for `tuned_model1`.
tuned_model1 = [
    tune_model(
        model,
        search_library='scikit-optimize',
        search_algorithm='bayesian',
        choose_better=True,
    )
    for model in s1_top3
]

# Blend the tuned models into a single estimator for wl_1018662.
blended_model1 = blend_models(estimator_list=tuned_model1, optimize='rmse', choose_better=True)

INFO:logs:Initializing tune_model()
INFO:logs:tune_model(estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=-1, oob_score=False,
                    random_state=123, verbose=0, warm_start=False), fold=None, round=4, n_iter=10, custom_grid=None, optimize=R2, custom_scorer=None, search_library=scikit-optimize, search_algorithm=bayesian, early_stopping=False, early_stopping_max_iters=10, choose_better=True, fit_kwargs=None, groups=None, return_tuner=False, verbose=True, tuner_verbose=True, display=None, return_train_score=False, kwargs={})
INFO:logs:Checking exceptions
INFO:logs:Preparing display monitor


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


INFO:logs:Copying training dataset
INFO:logs:Checking base model
INFO:logs:Base model : Extra Trees Regressor
INFO:logs:Declaring metric variables
INFO:logs:Defining Hyperparameters
INFO:logs:Tuning with n_jobs=-1
INFO:logs:Initializing skopt.BayesSearchCV


Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
from pycaret.regression import *

# Experiment 2: target wl_1018680. The other three columns listed in pred_col are
# dropped from the training frame before it is handed to PyCaret.
target_s2 = 'wl_1018680'
s2 = setup(
    data=train_data.drop(columns=pred_col.drop(target_s2)),
    target=target_s2,
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

In [None]:
# Keep the three best candidates ranked by RMSE (5-fold CV); knn, ada and gbr are not evaluated.
s2_top3 = compare_models(
    exclude=["knn", "ada", "gbr"],
    fold=5,
    sort="RMSE",
    n_select=3,
)

In [None]:
# Tune each shortlisted model with Bayesian search from scikit-optimize.
# Fix: the list used to be bound to the misspelled name `tuned_mode12`, so the
# blend_models call below raised NameError for `tuned_model2`.
tuned_model2 = [
    tune_model(
        model,
        search_library='scikit-optimize',
        search_algorithm='bayesian',
        choose_better=True,
    )
    for model in s2_top3
]

# Blend the tuned models into a single estimator for wl_1018680.
blended_model2 = blend_models(estimator_list=tuned_model2, optimize='rmse', choose_better=True)

In [None]:
from pycaret.regression import *

# Experiment 3: target wl_1018683. The other three columns listed in pred_col are
# dropped from the training frame before it is handed to PyCaret.
target_s3 = 'wl_1018683'
s3 = setup(
    data=train_data.drop(columns=pred_col.drop(target_s3)),
    target=target_s3,
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

In [None]:
# Keep the three best candidates ranked by RMSE (5-fold CV); knn, ada and gbr are not evaluated.
s3_top3 = compare_models(
    exclude=["knn", "ada", "gbr"],
    fold=5,
    sort="RMSE",
    n_select=3,
)

In [None]:
# Tune each shortlisted model with Bayesian search from scikit-optimize.
# Fix: the list used to be bound to the misspelled name `tuned_mode13`, so the
# blend_models call below raised NameError for `tuned_model3`.
tuned_model3 = [
    tune_model(
        model,
        search_library='scikit-optimize',
        search_algorithm='bayesian',
        choose_better=True,
    )
    for model in s3_top3
]

# Blend the tuned models into a single estimator for wl_1018683.
blended_model3 = blend_models(estimator_list=tuned_model3, optimize='rmse', choose_better=True)

In [None]:
from pycaret.regression import *

# Experiment 4: target wl_1019630. The other three columns listed in pred_col are
# dropped from the training frame before it is handed to PyCaret.
target_s4 = 'wl_1019630'
s4 = setup(
    data=train_data.drop(columns=pred_col.drop(target_s4)),
    target=target_s4,
    train_size=0.8,
    normalize=True,
    normalize_method='robust',
    transformation=True,
    silent=True,
    session_id=123,
)

In [None]:
# Keep the three best candidates ranked by RMSE (5-fold CV); knn, ada and gbr are not evaluated.
s4_top3 = compare_models(
    exclude=["knn", "ada", "gbr"],
    fold=5,
    sort="RMSE",
    n_select=3,
)

In [None]:
# Tune each shortlisted model with Bayesian search from scikit-optimize.
# Fix: the list used to be bound to the misspelled name `tuned_mode14`, so the
# blend_models call below raised NameError for `tuned_model4`.
tuned_model4 = [
    tune_model(
        model,
        search_library='scikit-optimize',
        search_algorithm='bayesian',
        choose_better=True,
    )
    for model in s4_top3
]

# Blend the tuned models into a single estimator for wl_1019630.
blended_model4 = blend_models(estimator_list=tuned_model4, optimize='rmse', choose_better=True)

In [None]:
# Score the held-out rows with each blended model, in target order 1..4.
prediction1, prediction2, prediction3, prediction4 = (
    predict_model(blended, data=test_data)
    for blended in (blended_model1, blended_model2, blended_model3, blended_model4)
)

In [None]:
# Copy each model's predicted values into the matching submission column.
# Fix: predict_model returns a DataFrame (the input columns plus a prediction
# column), so assigning the whole frame to one column fails. Its integer index
# also does not match the submission's datetime index, so the values are
# assigned by position instead of by label.
for target_col, predicted in zip(
    pred_col, (prediction1, prediction2, prediction3, prediction4)
):
    # PyCaret 2.x names the prediction column 'Label'; later releases use 'prediction_label'.
    label_col = 'Label' if 'Label' in predicted.columns else 'prediction_label'
    # Positional assignment requires one prediction per submission row.
    assert len(predicted) == len(submission), "prediction/submission length mismatch"
    submission[target_col] = predicted[label_col].to_numpy()

In [None]:
# NOTE(review): index=False leaves the datetime index out of this file — confirm that
# a submission without the timestamp column is intended.
submission.to_csv('/content/drive/MyDrive/dacon/water_level/submission_test.csv', index=False)

In [None]:
# Write the submission with its datetime index kept as the first column.
submission.to_csv('/content/drive/MyDrive/dacon/water_level/submission_test2.csv')