## 1. Setup

In [1]:
import os
os.environ["PYTHONHASHSEED"] = str(42)
import sys
sys.path.append("C:\\Miniconda\\envs\\kogas_env1\\Lib\\site-packages")
import shutil
import datetime
import random as rnd
from glob import glob
import gc
import optuna
from optuna import Trial, create_study
from optuna.samplers import TPESampler

import numpy as np
from numpy import random as np_rnd
import pandas as pd
from scipy.stats import linregress
from scipy.stats import trim_mean

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, QuantileTransformer, PowerTransformer
from sklearn import metrics as skl_merics
from sklearn.decomposition import PCA

from sklearn import linear_model as lm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import RegressorChain, MultiOutputRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import ExtraTreeRegressor

import lightgbm as lgb
import xgboost as xgb
import catboost as cat

from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
# import seaborn as sns

import warnings

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

# Cap the number of rows pandas prints for a frame.
pd.set_option("display.max_rows", 50)

# Make matplotlib draw text with the NanumSquare font loaded from a local TTF file,
# and render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
font_path = "C:\\Users\\kogas\\Desktop\\nanum-square\\NanumSquareR.ttf"
font_name = font_manager.FontProperties(fname=font_path).get_name()
plt.rcParams["axes.unicode_minus"] = False
rc("font", family=font_name)

In [2]:
!python --version

Python 3.8.13


In [3]:
def seed_everything(seed=42):
    """Seed the global random generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG, and stores the
    seed in the ``PYTHONHASHSEED`` environment variable.

    Note: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so setting
    it here only affects processes launched afterwards.

    Args:
        seed: Integer seed applied to every generator.
    """
    np_rnd.seed(seed)
    rnd.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

def diff(x1, x2):
    """Return the items of ``x1`` that do not occur in ``x2``.

    The order of ``x1`` is kept and duplicates in ``x1`` are preserved.
    Items of ``x2`` must be hashable because they are collected into a set.

    Args:
        x1: Iterable whose items are filtered.
        x2: Iterable of items to exclude.

    Returns:
        A new list with the surviving items of ``x1``.
    """
    excluded = set(x2)
    kept = []
    for item in x1:
        if item not in excluded:
            kept.append(item)
    return kept

def softmax(x, multiplier=2.0):
    """Softmax of ``x * multiplier``, normalised along axis 0.

    The global maximum of the scaled input is subtracted before
    exponentiation to keep ``np.exp`` from overflowing. For inputs with more
    than one dimension the shift uses the maximum over all elements while the
    normalising sum is taken along axis 0 only.

    Args:
        x: NumPy array (or any object supporting ``*`` and NumPy ufuncs).
        multiplier: Factor applied to ``x`` before the softmax; larger values
            sharpen the resulting distribution.

    Returns:
        Array of the same shape as ``x`` whose entries sum to 1 along axis 0.
    """
    scaled = x * multiplier
    weights = np.exp(scaled - np.max(scaled))
    normaliser = weights.sum(axis=0)
    return weights / normaliser

In [4]:
class CFG:
    """Notebook-wide configuration constants."""

    # Debug switch; how it is consumed is not visible in this part of the notebook.
    debug = False
    # Length of the training window: 120 steps (presumably 10 years of monthly rows; TODO confirm).
    train_timesteps = 10 * 12
    # Length of the test horizon, in steps.
    test_timesteps = 168
    # Presumably the explained-variance cut-off used when applying PCA; TODO confirm against usage.
    pca_threshold = 0.9

In [5]:
# Root folder of the notebook project. The string ends with a path separator, and the
# loading cell below prepends another one when it appends '\\dataset\\...'.
folder_path = "C:\\Users\\kogas\\Desktop\\jupyter_root_folder\\YJ_notebooks\\"
# Same location without escaping: C:\Users\kogas\Desktop\jupyter_root_folder\YJ_notebooks
# Seed the global RNGs with the default seed (42) before any data work.
seed_everything()

**data loading**

In [6]:
# Locations of the three core CSV inputs inside the project's "dataset" folder.
supply_csv = folder_path + '\\dataset\\월별공급량및비중.csv'
indust_csv = folder_path + '\\dataset\\제조업 부가가치(분기별).csv'
commer_csv = folder_path + '\\dataset\\상업용 상대가격(기준=2015).csv'

# Monthly supply: civil/industrial volumes, their total and their shares.
df_supply = pd.read_csv(supply_csv)
df_supply.columns = ["year", "month", "target_civil", "target_ind", "total", "weight_civil", "weight_ind"]

# Quarterly series stored as "qva".
df_indust = pd.read_csv(indust_csv)
df_indust.columns = ["year", "quarter", "qva"]

# Monthly relative price together with the gas and oil prices.
df_commer = pd.read_csv(commer_csv)
df_commer.columns = ["year", "month", "relative_price", "gas_price", "oil_price"]

**시간 관련 feature 입력**

In [7]:
# Quarter label -> the three calendar months it covers.
quarter_dic = {
    "Q1": [1, 2, 3],
    "Q2": [4, 5, 6],
    "Q3": [7, 8, 9],
    "Q4": [10, 11, 12],
}
# Inverse lookup (month number -> quarter label), used to attach a quarter
# column to the tables that only carry a month.
month_dic = {
    month_no: quarter_label
    for quarter_label, month_list in quarter_dic.items()
    for month_no in month_list
}

In [8]:
# Monthly tables get a quarter label looked up from their month number.
df_supply["quarter"] = df_supply["month"].map(lambda month_no: month_dic[month_no])
df_commer["quarter"] = df_commer["month"].map(lambda month_no: month_dic[month_no])
# The quarterly table gets the list of months of its quarter (one list per row).
df_indust["month"] = df_indust["quarter"].map(lambda quarter_label: quarter_dic[quarter_label])

In [9]:
# Store the calendar key columns as 32-bit integers.
df_supply = df_supply.astype({"year": "int32", "month": "int32"})
df_supply.head()

Unnamed: 0,year,month,target_civil,target_ind,total,weight_civil,weight_ind,quarter
0,1996,1,605519.0,83809.0,689328.0,0.87842,0.12158,Q1
1,1996,2,566323.0,70427.0,636750.0,0.8894,0.1106,Q1
2,1996,3,477514.0,62652.0,540166.0,0.88401,0.11599,Q1
3,1996,4,337794.0,47050.0,384844.0,0.87774,0.12226,Q2
4,1996,5,184522.0,30709.0,215231.0,0.85732,0.14268,Q2


In [10]:
# Turn each quarterly row into three monthly rows (one per entry of the month list)
# and give every month one third of the quarterly value.
# Re-running this cell divides "qva" by 3 again.
df_indust = df_indust.explode("month")
df_indust["qva"] = df_indust["qva"] / 3
df_indust = df_indust.astype({"year": "int32", "month": "int32"})
df_indust.head()

Unnamed: 0,year,quarter,qva,month
0,1996,Q1,12183.433333,1
0,1996,Q1,12183.433333,2
0,1996,Q1,12183.433333,3
1,1996,Q2,12384.133333,4
1,1996,Q2,12384.133333,5


In [11]:
# Store the calendar key columns as 32-bit integers.
df_commer = df_commer.astype({"year": "int32", "month": "int32"})
df_commer.head()

Unnamed: 0,year,month,relative_price,gas_price,oil_price,quarter
0,1996,1,0.97,26.94,27.86,Q1
1,1996,2,0.93,26.94,29.04,Q1
2,1996,3,0.96,26.94,27.99,Q1
3,1996,4,0.94,26.94,28.74,Q2
4,1996,5,0.92,26.94,29.18,Q2


In [12]:
# Start from the (year, quarter, month) keys of the supply table and left-join
# every source table onto them, one after another.
merge_keys = ["year", "quarter", "month"]
df_full = df_supply[merge_keys]
concat_list = [df_supply, df_indust, df_commer]
for source_df in concat_list:
    df_full = df_full.merge(source_df, on=merge_keys, how="left")

In [13]:
# "weight_ind" and "total" are removed; "weight_civil" and both targets stay.
df_full = df_full.drop(columns=["weight_ind", "total"])

In [14]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            300 non-null    int32  
 1   quarter         300 non-null    object 
 2   month           300 non-null    int32  
 3   target_civil    300 non-null    float64
 4   target_ind      300 non-null    float64
 5   weight_civil    300 non-null    float64
 6   qva             300 non-null    float64
 7   relative_price  300 non-null    float64
 8   gas_price       300 non-null    float64
 9   oil_price       300 non-null    float64
dtypes: float64(7), int32(2), object(1)
memory usage: 23.4+ KB


In [15]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18


## 외부데이터 로딩

In [16]:
# Placeholder registry for external data sets; every slot starts out empty (None).
external_data = dict.fromkeys(["tmper", "hum", "eura_snow", "neg_north", "sea_ice"])

**온도**

In [17]:
# Daily temperature records of the listed cities, restricted to 1996-01-01 .. 2020-12-31.
target_cities = ["서울", "인천", "대전", "대구", "광주", "울산", "부산"]
tmp_df = []

for csv_path in glob("C:\\Users\\kogas\\Desktop\\external_data\\20220930\\기온\\*"):
    # The first two characters of the file name are compared with the city list.
    if csv_path.split("\\")[-1][:2] not in target_cities:
        continue
    # The first column of each file is discarded before the columns are renamed.
    tmp = pd.read_csv(csv_path, encoding="cp949").iloc[:, 1:]
    tmp.columns = ["지점명", "일시", "평균기온", "최고기온", "최고기온시각", "최저기온", "최저기온시각", "일교차"]
    tmp = tmp[["지점명", "일시", "평균기온", "최고기온", "최저기온", "일교차"]]
    tmp["일시"] = pd.to_datetime(tmp["일시"])
    tmp["year"] = tmp["일시"].dt.year
    tmp["month"] = tmp["일시"].dt.month
    in_period = (tmp["일시"] >= pd.to_datetime("1996-01-01")) & (tmp["일시"] <= pd.to_datetime("2020-12-31"))
    tmp = tmp.loc[in_period]
    tmp_df.append(tmp)

# Stack all cities into one long frame with a fresh row index.
tmp_df = pd.concat(tmp_df, axis=0, ignore_index=True)

In [18]:
# Monthly temperature features, written into df_full by matching (year, month) keys
# rather than by row position, so the result does not depend on the row order of df_full.
temp_cols = ["평균기온", "최고기온", "최저기온", "일교차"]
key_cols = ["year", "month"]

# Use integer keys on both sides so the key lookup does not depend on the dtype
# that .dt.year / .dt.month produced when the station files were parsed.
temp_daily = tmp_df.astype({"year": "int64", "month": "int64"})
target_index = pd.MultiIndex.from_frame(df_full[key_cols].astype("int64"))

# Column prefix -> monthly aggregate. The value columns are selected explicitly so the
# station-name and date columns are never passed to mean()/std().
temp_monthly = {
    "전국평균_": temp_daily.groupby(key_cols)[temp_cols].mean(),
    "전국표준편차_": temp_daily.groupby(key_cols)[temp_cols].std(),
    "서울_": temp_daily.loc[temp_daily["지점명"] == "서울"].groupby(key_cols)[temp_cols].mean(),
    "부산_": temp_daily.loc[temp_daily["지점명"] == "부산"].groupby(key_cols)[temp_cols].mean(),
}
for prefix, monthly in temp_monthly.items():
    # Fail loudly when a month of df_full has no weather data instead of writing NaN.
    missing = target_index.difference(monthly.index)
    if len(missing) > 0:
        raise ValueError(f"{prefix}: no temperature data for {len(missing)} (year, month) rows of df_full")
    df_full[[prefix + col for col in temp_cols]] = monthly.reindex(target_index).values

In [19]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,전국표준편차_최저기온,전국표준편차_일교차,서울_평균기온,서울_최고기온,서울_최저기온,서울_일교차,부산_평균기온,부산_최고기온,부산_최저기온,부산_일교차
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,4.00243,3.127946,-2.187097,2.329032,-5.609677,7.93871,3.770968,8.512903,-0.36129,8.874194
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,4.438573,3.207891,-1.589655,3.182759,-5.472414,8.655172,3.437931,8.765517,-0.834483,9.6
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,3.203499,3.40194,4.880645,9.451613,1.022581,8.429032,7.854839,12.425806,3.758065,8.667742
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,4.484583,3.596942,10.233333,15.143333,5.853333,9.29,12.23,17.386667,8.183333,9.203333
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,2.636438,3.648928,18.358065,24.196774,13.283871,10.912903,17.983871,22.683871,14.312903,8.370968


**습도**

In [20]:
# Daily humidity records of the listed cities, restricted to 1996-01-01 .. 2020-12-31.
target_cities = ["서울", "인천", "대전", "대구", "광주", "울산", "부산"]
tmp_df = []

for csv_path in glob("C:\\Users\\kogas\\Desktop\\external_data\\20221008\\습도\\*"):
    # The first two characters of the file name are compared with the city list.
    if csv_path.split("\\")[-1][:2] not in target_cities:
        continue
    # The first column of each file is discarded before the columns are renamed.
    tmp = pd.read_csv(csv_path, encoding="cp949").iloc[:, 1:]
    tmp.columns = ["지점명", "일시", "평균습도", "최저습도"]
    tmp["일시"] = pd.to_datetime(tmp["일시"])
    tmp["year"] = tmp["일시"].dt.year
    tmp["month"] = tmp["일시"].dt.month
    in_period = (tmp["일시"] >= pd.to_datetime("1996-01-01")) & (tmp["일시"] <= pd.to_datetime("2020-12-31"))
    tmp = tmp.loc[in_period]
    tmp_df.append(tmp)

# Stack all cities into one long frame with a fresh row index.
tmp_df = pd.concat(tmp_df, axis=0, ignore_index=True)

In [21]:
tmp

Unnamed: 0,지점명,일시,평균습도,최저습도,year,month
0,인천,1996-01-01,35.3,26.0,1996.0,1.0
1,인천,1996-01-02,48.8,26.0,1996.0,1.0
2,인천,1996-01-03,47.5,23.0,1996.0,1.0
3,인천,1996-01-04,43.8,30.0,1996.0,1.0
4,인천,1996-01-05,73.5,54.0,1996.0,1.0
...,...,...,...,...,...,...
9127,인천,2020-12-27,61.3,40.0,2020.0,12.0
9128,인천,2020-12-28,84.1,67.0,2020.0,12.0
9129,인천,2020-12-29,71.0,49.0,2020.0,12.0
9130,인천,2020-12-30,48.8,38.0,2020.0,12.0


In [22]:
# Monthly humidity features, written into df_full by matching (year, month) keys
# rather than by row position, so the result does not depend on the row order of df_full.
hum_cols = ["평균습도", "최저습도"]
key_cols = ["year", "month"]

# The humidity frames carry year/month as floats (see the displayed frame above);
# cast both sides to integers so the key lookup matches exactly.
hum_daily = tmp_df.astype({"year": "int64", "month": "int64"})
target_index = pd.MultiIndex.from_frame(df_full[key_cols].astype("int64"))

# Column prefix -> monthly aggregate. The value columns are selected explicitly so the
# station-name and date columns are never passed to mean()/std().
hum_monthly = {
    "전국평균_": hum_daily.groupby(key_cols)[hum_cols].mean(),
    "전국표준편차_": hum_daily.groupby(key_cols)[hum_cols].std(),
    "서울_": hum_daily.loc[hum_daily["지점명"] == "서울"].groupby(key_cols)[hum_cols].mean(),
    "부산_": hum_daily.loc[hum_daily["지점명"] == "부산"].groupby(key_cols)[hum_cols].mean(),
}
for prefix, monthly in hum_monthly.items():
    # Fail loudly when a month of df_full has no weather data instead of writing NaN.
    missing = target_index.difference(monthly.index)
    if len(missing) > 0:
        raise ValueError(f"{prefix}: no humidity data for {len(missing)} (year, month) rows of df_full")
    df_full[[prefix + col for col in hum_cols]] = monthly.reindex(target_index).values

In [23]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,부산_최저기온,부산_일교차,전국평균_평균습도,전국평균_최저습도,전국표준편차_평균습도,전국표준편차_최저습도,서울_평균습도,서울_최저습도,부산_평균습도,부산_최저습도
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,-0.36129,8.874194,52.41659,34.43318,18.130323,17.436562,50.487097,31.096774,43.629032,31.129032
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,-0.834483,9.6,51.459113,31.270936,15.450475,12.886423,47.213793,27.310345,46.496552,30.172414
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,3.758065,8.667742,57.628571,34.414747,15.621801,15.624452,53.76129,27.709677,55.432258,38.645161
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,8.183333,9.203333,52.87619,30.719048,17.869661,15.956943,52.31,29.033333,51.77,34.6
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,14.312903,8.370968,62.302765,37.824885,12.39256,13.480326,57.848387,32.193548,65.793548,46.129032


**소비매출 데이터**

In [24]:
# Retail sales table (cp949-encoded CSV). The last expression displays the
# total number of missing cells across the whole frame.
tmp_df = pd.read_csv("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\kosis_korea_retail_sale.csv", encoding="cp949")
tmp_df.isna().sum().sum()

0

In [25]:
tmp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   날짜      300 non-null    object 
 1   총합      300 non-null    float64
 2   내구재     300 non-null    float64
 3   준내구재    300 non-null    float64
 4   비내구재    300 non-null    float64
dtypes: float64(4), object(1)
memory usage: 11.8+ KB


In [26]:
tmp_df.head()

Unnamed: 0,날짜,총합,내구재,준내구재,비내구재
0,199601월,5.6,10.8,6.6,2.2
1,199602월,16.6,7.2,13.3,24.2
2,199603월,12.2,5.4,10.4,17.8
3,199604월,10.4,7.8,6.9,14.6
4,199605월,15.4,21.7,9.3,14.8


In [27]:
retail_cols = ["총합", "내구재", "준내구재", "비내구재"]

# "날짜" holds strings such as "199601월": four-digit year, two-digit month, then "월".
tmp_df["날짜"] = tmp_df["날짜"].apply(lambda x: datetime.datetime.strptime(x, "%Y%m월"))
tmp_df["year"] = tmp_df["날짜"].dt.year
tmp_df["month"] = tmp_df["날짜"].dt.month

# Copy the retail columns into df_full by matching (year, month) keys instead of by
# row position, so a reordered or longer source table cannot shift the values.
retail_by_month = tmp_df.astype({"year": "int64", "month": "int64"}).set_index(["year", "month"])[retail_cols]
target_index = pd.MultiIndex.from_frame(df_full[["year", "month"]].astype("int64"))
missing = target_index.difference(retail_by_month.index)
if len(missing) > 0:
    raise ValueError(f"retail sales: no data for {len(missing)} (year, month) rows of df_full")
df_full[retail_cols] = retail_by_month.reindex(target_index).values

In [28]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,전국표준편차_평균습도,전국표준편차_최저습도,서울_평균습도,서울_최저습도,부산_평균습도,부산_최저습도,총합,내구재,준내구재,비내구재
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,18.130323,17.436562,50.487097,31.096774,43.629032,31.129032,5.6,10.8,6.6,2.2
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,15.450475,12.886423,47.213793,27.310345,46.496552,30.172414,16.6,7.2,13.3,24.2
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,15.621801,15.624452,53.76129,27.709677,55.432258,38.645161,12.2,5.4,10.4,17.8
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,17.869661,15.956943,52.31,29.033333,51.77,34.6,10.4,7.8,6.9,14.6
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,12.39256,13.480326,57.848387,32.193548,65.793548,46.129032,15.4,21.7,9.3,14.8


**수출입 데이터**

In [29]:
# 수출입 데이터
tmp_df = pd.read_csv("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\kosis_korea_trade_balance.csv")
tmp_df.columns = ["날짜", "수출", "수출_yoy", "수입", "수입_yoy", "trade_balance"]
tmp_df.isna().sum().sum()

0

In [30]:
tmp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   날짜             301 non-null    object 
 1   수출             301 non-null    int64  
 2   수출_yoy         301 non-null    float64
 3   수입             301 non-null    int64  
 4   수입_yoy         301 non-null    float64
 5   trade_balance  301 non-null    int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 14.2+ KB


In [31]:
tmp_df.head()

Unnamed: 0,날짜,수출,수출_yoy,수입,수입_yoy,trade_balance
0,199512월,118,9.9,118,11.3,0
1,199601월,99,27.8,120,34.4,-21
2,199602월,99,17.2,115,16.1,-16
3,199603월,117,16.9,121,4.0,-4
4,199604월,106,4.7,127,14.3,-20


In [32]:
# NOTE(review): the raw "수입" (import) level is not copied, only its yoy change — confirm this is intended.
trade_cols = ["수출", "수출_yoy", "수입_yoy", "trade_balance"]

# "날짜" holds strings such as "199512월": four-digit year, two-digit month, then "월".
tmp_df["날짜"] = tmp_df["날짜"].apply(lambda x: datetime.datetime.strptime(x, "%Y%m월"))
tmp_df["year"] = tmp_df["날짜"].dt.year
tmp_df["month"] = tmp_df["날짜"].dt.month

# The source table starts one month before df_full (1995-12). Selecting rows by their
# (year, month) key drops that extra row without hard-coding its position.
trade_by_month = tmp_df.astype({"year": "int64", "month": "int64"}).set_index(["year", "month"])[trade_cols]
target_index = pd.MultiIndex.from_frame(df_full[["year", "month"]].astype("int64"))
missing = target_index.difference(trade_by_month.index)
if len(missing) > 0:
    raise ValueError(f"trade balance: no data for {len(missing)} (year, month) rows of df_full")
df_full[trade_cols] = trade_by_month.reindex(target_index).values

In [33]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,부산_평균습도,부산_최저습도,총합,내구재,준내구재,비내구재,수출,수출_yoy,수입_yoy,trade_balance
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,43.629032,31.129032,5.6,10.8,6.6,2.2,99.0,27.8,34.4,-21.0
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,46.496552,30.172414,16.6,7.2,13.3,24.2,99.0,17.2,16.1,-16.0
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,55.432258,38.645161,12.2,5.4,10.4,17.8,117.0,16.9,4.0,-4.0
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,51.77,34.6,10.4,7.8,6.9,14.6,106.0,4.7,14.3,-20.0
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,65.793548,46.129032,15.4,21.7,9.3,14.8,112.0,5.5,7.3,-14.0


In [34]:
# Rescale the Busan mean-humidity column by 1/100.
# NOTE(review): no other humidity column (national, Seoul, Busan minimum) is rescaled in
# the visible cells — confirm this asymmetry is intended. The cell is also not
# idempotent: every re-run divides the column by 100 again.
df_full["부산_평균습도"] /= 100

**GDP 데이터**

In [35]:
# GDP table. The columns are renamed positionally to a period label, the nominal GDP
# and "gdp_real_gwr"; the last expression displays the total number of missing cells.
tmp_df = pd.read_csv("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\kosis_korea_gdp.csv")
tmp_df.columns = ["날짜", "gdp_nominal", "gdp_real_gwr"]
tmp_df.isna().sum().sum()

0

In [36]:
tmp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   날짜            104 non-null    object 
 1   gdp_nominal   104 non-null    object 
 2   gdp_real_gwr  104 non-null    float64
dtypes: float64(1), object(2)
memory usage: 2.6+ KB


In [37]:
tmp_df.head()

Unnamed: 0,날짜,gdp_nominal,gdp_real_gwr
0,19951/4,98054.3,9.6
1,19952/4,108417.9,10.0
2,19953/4,113779.6,10.8
3,19954/4,116736.9,8.2
4,19961/4,112294.5,8.1


In [38]:
# "날짜" holds strings such as "19951/4": a four-digit year followed by "<quarter>/4".
# The quarter digit is therefore the third character from the end.
tmp_df["quarter"] = tmp_df["날짜"].apply(lambda x: "Q" + x[-3])
tmp_df["날짜"] = tmp_df["날짜"].apply(lambda x: datetime.datetime.strptime(x[:4], "%Y"))
tmp_df["year"] = tmp_df["날짜"].dt.year
# Anchor every quarter at its last month (Q1 -> 3, Q2 -> 6, Q3 -> 9, Q4 -> 12).
# This is derived from the parsed quarter label, so it stays correct for any number
# of rows and for tables that do not start at Q1.
tmp_df["month"] = tmp_df["quarter"].str[1:].astype("int64") * 3

In [39]:
tmp_df

Unnamed: 0,날짜,gdp_nominal,gdp_real_gwr,quarter,year,month
0,1995-01-01,98054.3,9.6,Q1,1995,3
1,1995-01-01,108417.9,10.0,Q2,1995,6
2,1995-01-01,113779.6,10.8,Q3,1995,9
3,1995-01-01,116736.9,8.2,Q4,1995,12
4,1996-01-01,112294.5,8.1,Q1,1996,3
...,...,...,...,...,...,...
99,2019-01-01,499505.6,2.6,Q4,2019,12
100,2020-01-01,460085.6,1.5,Q1,2020,3
101,2020-01-01,475980.0,-2.5,Q2,2020,6
102,2020-01-01,496340.3,-0.9,Q3,2020,9


In [40]:
# Month-start scaffold covering 1996-01 .. 2020-12, with year and month key columns.
month_starts = pd.date_range("1996-01", "2020-12", freq="MS")
merge_df = pd.DataFrame({"날짜": month_starts})
merge_df = merge_df.assign(
    year=merge_df["날짜"].dt.year,
    month=merge_df["날짜"].dt.month,
)

In [41]:
merge_df

Unnamed: 0,날짜,year,month
0,1996-01-01,1996,1
1,1996-02-01,1996,2
2,1996-03-01,1996,3
3,1996-04-01,1996,4
4,1996-05-01,1996,5
...,...,...,...
295,2020-08-01,2020,8
296,2020-09-01,2020,9
297,2020-10-01,2020,10
298,2020-11-01,2020,11


In [42]:
gdp_cols = ["gdp_nominal", "gdp_real_gwr"]

# Quarterly rows are keyed on the last month of their quarter, so after the left join
# only those months carry values.
merge_df = merge_df.merge(tmp_df[["year", "month"] + gdp_cols], on=["year", "month"], how="left")

# Nominal GDP arrives as text; strip thousands separators, convert to float32 and
# spread the quarterly amount evenly over three months.
nominal_text = merge_df["gdp_nominal"].apply(lambda raw: str(raw).replace(",", ""))
merge_df["gdp_nominal"] = nominal_text.astype("float32") / 3

# Convert the percentage rate g into the per-month rate that compounds to g over three months.
growth_factor = 1 + merge_df["gdp_real_gwr"] / 100
merge_df["gdp_real_gwr"] = ((growth_factor ** (1/3)) - 1) * 100

# Back-fill the months before each quarter end, then forward-fill any trailing gap.
# The values are copied into df_full by row position.
df_full[gdp_cols] = merge_df[gdp_cols].bfill().ffill().values
del merge_df

In [43]:
df_full

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,총합,내구재,준내구재,비내구재,수출,수출_yoy,수입_yoy,trade_balance,gdp_nominal,gdp_real_gwr
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,5.6,10.8,6.6,2.2,99.0,27.8,34.4,-21.0,37431.500000,2.630213
1,1996,Q1,2,566323.0,70427.0,0.88940,12183.433333,0.93,26.94,29.04,...,16.6,7.2,13.3,24.2,99.0,17.2,16.1,-16.0,37431.500000,2.630213
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,12.2,5.4,10.4,17.8,117.0,16.9,4.0,-4.0,37431.500000,2.630213
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,10.4,7.8,6.9,14.6,106.0,4.7,14.3,-20.0,40387.167969,2.630213
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,15.4,21.7,9.3,14.8,112.0,5.5,7.3,-14.0,40387.167969,2.630213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2020,Q3,8,398661.0,457970.0,0.46538,40582.500000,0.86,72.93,84.71,...,0.2,13.8,-11.3,-2.2,395.0,-10.3,-15.6,37.0,165446.765625,-0.300905
296,2020,Q3,9,380235.0,525619.0,0.41975,40582.500000,0.80,65.24,81.26,...,4.1,19.4,-12.0,3.7,478.0,7.1,1.7,84.0,165446.765625,-0.300905
297,2020,Q4,10,616682.0,600890.0,0.50649,41580.133333,0.76,59.16,78.13,...,0.1,9.6,-2.2,-3.8,448.0,-3.9,-5.6,57.0,169440.109375,-0.300905
298,2020,Q4,11,1093747.0,665901.0,0.62157,41580.133333,0.75,58.72,78.64,...,-1.2,13.0,-11.1,-3.8,458.0,3.9,-1.9,58.0,169440.109375,-0.300905


**Global Energy Price**

In [44]:
# Global Energy Price column taken from the combined weather/GEP file.
# The value is assigned as a DataFrame, not as raw values.
gep_source = pd.read_csv("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\Weather_And_GEP.csv")
df_full[["GEP"]] = gep_source[["GEP"]]

In [45]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,내구재,준내구재,비내구재,수출,수출_yoy,수입_yoy,trade_balance,gdp_nominal,gdp_real_gwr,GEP
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,10.8,6.6,2.2,99.0,27.8,34.4,-21.0,37431.5,2.630213,50.014366
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,7.2,13.3,24.2,99.0,17.2,16.1,-16.0,37431.5,2.630213,52.249518
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,5.4,10.4,17.8,117.0,16.9,4.0,-4.0,37431.5,2.630213,52.340496
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,7.8,6.9,14.6,106.0,4.7,14.3,-20.0,40387.167969,2.630213,52.923612
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,21.7,9.3,14.8,112.0,5.5,7.3,-14.0,40387.167969,2.630213,50.163488


**글로벌 수심별 수온편차**

In [46]:
# Parse the comma-separated text file by hand: the first line is the header, and the
# last comma-separated field of every line (the one holding the line break) is dropped.
# All cells are kept as strings.
with open("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\global_수심별수온편차.txt", "r", encoding="cp949") as f:
    rawdata = f.readlines()

header_fields = rawdata[0].split(",")[:-1]
record_fields = [line.split(",")[:-1] for line in rawdata[1:]]
tmp_df = pd.DataFrame(record_fields, columns=header_fields)

In [47]:
tmp_df

Unnamed: 0,연도,국외 수온(수온밑) 편차(0-100m),국외 수온(수온밑) 편차(0-700m),국외 수온(수온밑) 편차(0-2000m)
0,1955,-0.133,-0.034,-0.013
1,1956,-0.123,-0.028,-0.011
2,1957,-0.090,-0.049,-0.024
3,1958,-0.027,-0.016,-0.010
4,1959,-0.071,-0.023,-0.013
...,...,...,...,...
61,2016,0.444,0.148,0.079
62,2017,0.405,0.168,0.089
63,2018,0.383,0.177,0.091
64,2019,0.456,0.187,0.096


In [48]:
depth_cols = ["글로벌_수심수온편차_0to100", "글로벌_수심수온편차_0to700", "글로벌_수심수온편차_0to2000"]
depth_mean_col = "글로벌_수심수온편차_평균"
depth_std_col = "글로벌_수심수온편차_표준편차"

# Keep the years from 1996 on and key every yearly record on December.
tmp_df["연도"] = tmp_df["연도"].astype("int")
tmp_df = tmp_df.loc[tmp_df["연도"] >= 1996]
tmp_df["month"] = 12
tmp_df = tmp_df.rename(
    columns={
        "연도": "year",
        "국외 수온(수온밑) 편차(0-100m)": "글로벌_수심수온편차_0to100",
        "국외 수온(수온밑) 편차(0-700m)": "글로벌_수심수온편차_0to700",
        "국외 수온(수온밑) 편차(0-2000m)": "글로벌_수심수온편차_0to2000",
    }
)

# After the left join only the December rows carry values (still as text), hence the cast.
df_full = df_full.merge(tmp_df[["year", "month"] + depth_cols], on=["year", "month"], how="left")
df_full[depth_cols] = df_full[depth_cols].astype("float32")

# Row-wise mean and standard deviation across the three depth ranges.
df_full[depth_mean_col] = df_full[depth_cols].mean(axis=1).values
df_full[depth_std_col] = df_full[depth_cols].std(axis=1).values

# Interpolate between the December anchors, then back-/forward-fill the edges.
filled_cols = depth_cols + [depth_mean_col, depth_std_col]
df_full[filled_cols] = df_full[filled_cols].interpolate().bfill().ffill().values

In [49]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,수입_yoy,trade_balance,gdp_nominal,gdp_real_gwr,GEP,글로벌_수심수온편차_0to100,글로벌_수심수온편차_0to700,글로벌_수심수온편차_0to2000,글로벌_수심수온편차_평균,글로벌_수심수온편차_표준편차
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,34.4,-21.0,37431.5,2.630213,50.014366,0.093,0.044,0.019,0.052,0.037643
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,16.1,-16.0,37431.5,2.630213,52.249518,0.093,0.044,0.019,0.052,0.037643
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,4.0,-4.0,37431.5,2.630213,52.340496,0.093,0.044,0.019,0.052,0.037643
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,14.3,-20.0,40387.167969,2.630213,52.923612,0.093,0.044,0.019,0.052,0.037643
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,7.3,-14.0,40387.167969,2.630213,50.163488,0.093,0.044,0.019,0.052,0.037643


**글로벌 연평균 해양열용량**

In [50]:
# Parse the comma-separated text file by hand: the first line is the header, and the
# last comma-separated field of every line (the one holding the line break) is dropped.
# All cells are kept as strings.
with open("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\global_연평균_해양열용량.txt", "r", encoding="cp949") as f:
    rawdata = f.readlines()

header_fields = rawdata[0].split(",")[:-1]
record_fields = [line.split(",")[:-1] for line in rawdata[1:]]
tmp_df = pd.DataFrame(record_fields, columns=header_fields)

In [51]:
tmp_df.replace("", np.nan).dropna()

Unnamed: 0,연도,국외 해양열용량 편차(0-700m),(0-2000m)
50,2005,8.412,10.171
51,2006,10.43,12.638
52,2007,9.478,12.394
53,2008,10.052,13.257
54,2009,10.126,13.431
55,2010,10.367,14.537
56,2011,10.869,15.394
57,2012,10.941,16.187
58,2013,12.601,18.65
59,2014,13.261,20.11


In [52]:
heat_col = "글로벌_해양열용량편차_0to700"

# Keep the years from 1996 on and key every yearly record on December.
tmp_df["연도"] = tmp_df["연도"].astype("int")
tmp_df = tmp_df.loc[tmp_df["연도"] >= 1996]
tmp_df["month"] = 12
tmp_df = tmp_df.rename(
    columns={
        "연도": "year",
        "국외 해양열용량 편차(0-700m)": "글로벌_해양열용량편차_0to700",
    }
)

# Only the 0-700m series is merged; the December rows carry its values (still as text).
df_full = df_full.merge(tmp_df[["year", "month", heat_col]], on=["year", "month"], how="left")
df_full[[heat_col]] = df_full[[heat_col]].astype("float32")

# Interpolate between the December anchors, then back-/forward-fill the edges.
df_full[[heat_col]] = df_full[[heat_col]].interpolate().bfill().ffill().values

In [53]:
df_full

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,trade_balance,gdp_nominal,gdp_real_gwr,GEP,글로벌_수심수온편차_0to100,글로벌_수심수온편차_0to700,글로벌_수심수온편차_0to2000,글로벌_수심수온편차_평균,글로벌_수심수온편차_표준편차,글로벌_해양열용량편차_0to700
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,-21.0,37431.500000,2.630213,50.014366,0.093000,0.044000,0.019000,0.052000,0.037643,4.544000
1,1996,Q1,2,566323.0,70427.0,0.88940,12183.433333,0.93,26.94,29.04,...,-16.0,37431.500000,2.630213,52.249518,0.093000,0.044000,0.019000,0.052000,0.037643,4.544000
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,-4.0,37431.500000,2.630213,52.340496,0.093000,0.044000,0.019000,0.052000,0.037643,4.544000
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,-20.0,40387.167969,2.630213,52.923612,0.093000,0.044000,0.019000,0.052000,0.037643,4.544000
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,-14.0,40387.167969,2.630213,50.163488,0.093000,0.044000,0.019000,0.052000,0.037643,4.544000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2020,Q3,8,398661.0,457970.0,0.46538,40582.500000,0.86,72.93,84.71,...,37.0,165446.765625,-0.300905,93.227186,0.442667,0.185667,0.096667,0.241667,0.179670,17.591333
296,2020,Q3,9,380235.0,525619.0,0.41975,40582.500000,0.80,65.24,81.26,...,84.0,165446.765625,-0.300905,91.297555,0.441000,0.185500,0.096750,0.241083,0.178730,17.574501
297,2020,Q4,10,616682.0,600890.0,0.50649,41580.133333,0.76,59.16,78.13,...,57.0,169440.109375,-0.300905,95.402857,0.439333,0.185333,0.096833,0.240500,0.177790,17.557667
298,2020,Q4,11,1093747.0,665901.0,0.62157,41580.133333,0.75,58.72,78.64,...,58.0,169440.109375,-0.300905,100.895411,0.437667,0.185167,0.096917,0.239917,0.176850,17.540833


**글로벌 해수면 높이 및 온도편차 (데이터 부재로 merge 불가)**

In [54]:
# with open("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\global_해수면높이_온도편차.txt", "r", encoding="cp949") as f:
#     rawdata = f.readlines()
#     tmp_df = pd.DataFrame([i.split(",")[:-1] for i in rawdata[1:]], columns=rawdata[0].split(",")[:-1])

In [55]:
# tmp_df.head()

**글로벌 및 한국 해양표층ph, CO2농도 (데이터 부재로 merge 불가)**

In [56]:
# with open("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\korea&global_해양표층ph_co2농도_해양분압.txt", "r", encoding="cp949") as f:
#     rawdata = f.readlines()
#     tmp_df = pd.DataFrame([i.split(",")[:-1] for i in rawdata[1:]], columns=rawdata[0].split(",")[:-1])

In [57]:
# tmp_df.replace("", np.nan).dropna()

**한국 3면 연평균 표층염분 (각 데이터 내 분산이 너무 작아서 drop)**

In [58]:
# with open("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\korea_3면_연평균_표층염분.txt", "r", encoding="cp949") as f:
#     rawdata = f.readlines()
#     tmp_df = pd.DataFrame([i.split(",")[:-1] for i in rawdata[1:]], columns=rawdata[0].split(",")[:-1])

In [59]:
# tmp_df.replace("", np.nan).dropna()

**한국 바다3면 평균수온**

In [60]:
# Yearly temperature of the three seas around Korea (cp949-encoded, comma-separated).
# The last expression displays the total number of missing cells; per the info() output
# below, all of them sit in the empty "Unnamed: 4" column.
tmp_df = pd.read_csv("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\korea_3면온도.txt", sep=",", encoding="cp949")
tmp_df.isna().sum().sum()

53

In [61]:
tmp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   연도          53 non-null     int64  
 1   동해          53 non-null     float64
 2   남해          53 non-null     float64
 3   서해          53 non-null     float64
 4   Unnamed: 4  0 non-null      float64
dtypes: float64(4), int64(1)
memory usage: 2.2 KB


In [62]:
tmp_df.head()

Unnamed: 0,연도,동해,남해,서해,Unnamed: 4
0,1968,15.9,17.9,14.4,
1,1969,15.8,18.0,14.3,
2,1970,16.3,18.0,13.9,
3,1971,15.6,18.5,14.0,
4,1972,15.8,18.7,15.2,


In [63]:
sea_cols = ["해수면온도_동해", "해수면온도_남해", "해수면온도_서해"]
sea_mean_col = "해수면온도_3면평균"
sea_std_col = "해수면온도_3면표준편차"

# Keep the years from 1996 on and key every yearly record on December.
tmp_df["연도"] = tmp_df["연도"].astype("int")
tmp_df = tmp_df.loc[tmp_df["연도"] >= 1996].reset_index(drop=True)
tmp_df["month"] = 12
tmp_df = tmp_df.rename(
    columns={"연도": "year", "동해": "해수면온도_동해", "남해": "해수면온도_남해", "서해": "해수면온도_서해"}
)
df_full = df_full.merge(tmp_df[["year", "month"] + sea_cols], on=["year", "month"], how="left")

# Row-wise mean and standard deviation across the three seas.
df_full[sea_mean_col] = df_full[sea_cols].mean(axis=1)
df_full[sea_std_col] = df_full[sea_cols].std(axis=1)

# Interpolate between the December anchors, then back-/forward-fill the edges.
filled_cols = sea_cols + [sea_mean_col, sea_std_col]
df_full[filled_cols] = df_full[filled_cols].interpolate().bfill().ffill().values

In [64]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,글로벌_수심수온편차_0to700,글로벌_수심수온편차_0to2000,글로벌_수심수온편차_평균,글로벌_수심수온편차_표준편차,글로벌_해양열용량편차_0to700,해수면온도_동해,해수면온도_남해,해수면온도_서해,해수면온도_3면평균,해수면온도_3면표준편차
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,0.044,0.019,0.052,0.037643,4.544,16.0,19.3,15.4,16.9,2.1
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,0.044,0.019,0.052,0.037643,4.544,16.0,19.3,15.4,16.9,2.1
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,0.044,0.019,0.052,0.037643,4.544,16.0,19.3,15.4,16.9,2.1
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,0.044,0.019,0.052,0.037643,4.544,16.0,19.3,15.4,16.9,2.1
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,0.044,0.019,0.052,0.037643,4.544,16.0,19.3,15.4,16.9,2.1


**한국 연평균 표층염분 (각 데이터 내 분산이 너무 작아서 drop)**

In [65]:
# with open("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\korea_연평균_표층염분.txt", "r", encoding="cp949") as f:
#     rawdata = f.readlines()
#     tmp_df = pd.DataFrame([i.split(",")[:-1] for i in rawdata[1:]], columns=rawdata[0].split(",")[:-1])

In [66]:
# tmp_df.replace("", np.nan).dropna()

**한국 연평균 지표온도**

In [67]:
# Parse the comma-separated text file by hand: the first line is the header, and the
# last comma-separated field of every line (the one holding the line break) is dropped.
# All cells are kept as strings.
with open("C:\\Users\\kogas\\Desktop\\external_data\\20221006\\korea_지표온도.txt", "r", encoding="cp949") as f:
    rawdata = f.readlines()

header_fields = rawdata[0].split(",")[:-1]
record_fields = [line.split(",")[:-1] for line in rawdata[1:]]
tmp_df = pd.DataFrame(record_fields, columns=header_fields)

In [68]:
tmp_df.head()

Unnamed: 0,연도,SF_TMP_SUF,SF_TMP_0.05m,SF_TMP_0.1m,SF_TMP_0.2m,SF_TMP_0.3m,SF_TMP_0.5m,SF_TMP_1m,SF_TMP_1.5m,SF_TMP_3m,SF_TMP_5m
0,1973,14.4,14.3,14.4,14.5,14.5,14.9,14.8,14.7,14.8,14.8
1,1974,13.1,13.1,13.2,13.4,13.4,13.9,13.8,13.9,14.2,14.5
2,1975,14.4,14.4,14.5,14.7,14.6,14.9,14.8,14.7,14.7,14.6
3,1976,13.4,13.5,13.7,13.8,13.8,14.1,14.3,14.1,14.5,14.7
4,1977,14.1,14.3,14.4,14.5,14.5,14.6,14.6,14.4,14.5,14.4


In [69]:
ground_cols = ["SF_TMP_SUF", "SF_TMP_1m", "SF_TMP_5m"]
ground_mean_col = "SF_TMP_고도평균"
ground_std_col = "SF_TMP_고도표준편차"

# Keep the years from 1996 on and key every yearly record on December.
tmp_df["연도"] = tmp_df["연도"].astype("int")
tmp_df = tmp_df.loc[tmp_df["연도"] >= 1996].reset_index(drop=True)
tmp_df["month"] = 12
tmp_df = tmp_df.rename(columns={"연도": "year"})

# Only three of the SF_TMP_* columns are merged; their values arrive as text, hence the cast.
df_full = df_full.merge(tmp_df[["year", "month"] + ground_cols], on=["year", "month"], how="left")
df_full[ground_cols] = df_full[ground_cols].astype("float32")

# Row-wise mean and standard deviation across the three selected columns.
df_full[ground_mean_col] = df_full[ground_cols].mean(axis=1)
df_full[ground_std_col] = df_full[ground_cols].std(axis=1)

# Interpolate between the December anchors, then back-/forward-fill the edges.
filled_cols = [ground_mean_col, ground_std_col] + ground_cols
df_full[filled_cols] = df_full[filled_cols].interpolate().bfill().ffill().values

In [70]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,해수면온도_동해,해수면온도_남해,해수면온도_서해,해수면온도_3면평균,해수면온도_3면표준편차,SF_TMP_SUF,SF_TMP_1m,SF_TMP_5m,SF_TMP_고도평균,SF_TMP_고도표준편차
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,16.0,19.3,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,16.0,19.3,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,16.0,19.3,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,16.0,19.3,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,16.0,19.3,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064


**한국 해수면높이 및 온도편차**

In [71]:
# Sea-level height / temperature-anomaly table (cp949-encoded, comma-separated).
tmp_df = pd.read_csv(
    "C:\\Users\\kogas\\Desktop\\external_data\\20221006\\korea_해수면높이_온도편차.txt",
    sep=",",
    encoding="cp949",
)
# Total number of missing cells in the table.
tmp_df.isna().to_numpy().sum()

80

In [72]:
tmp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   연도           53 non-null     int64  
 1   국내 해수면 높이    31 non-null     float64
 2   국내 해수면온도 편차  53 non-null     float64
 3   국내 평균기온 편차   48 non-null     float64
 4   Unnamed: 4   0 non-null      float64
dtypes: float64(4), int64(1)
memory usage: 2.2 KB


In [73]:
tmp_df.head()

Unnamed: 0,연도,국내 해수면 높이,국내 해수면온도 편차,국내 평균기온 편차,Unnamed: 4
0,1968,,-1.0,,
1,1969,,-1.1,,
2,1970,,-1.1,,
3,1971,,-1.1,,
4,1972,,-0.6,,


In [74]:
# Keep records from 1996 onwards and pin each annual record to month 12
# so that it joins onto the monthly frame via (year, month).
tmp_df = (
    tmp_df.loc[tmp_df["연도"] >= 1996]
    .reset_index(drop=True)
    .assign(month=12)
    .rename({"연도": "year"}, axis=1)
)

sea_level_cols = ["국내 해수면 높이", "국내 해수면온도 편차"]
df_full = df_full.merge(tmp_df[["year", "month"] + sea_level_cols], on=["year", "month"], how="left")

# Interpolate between the merged anchor rows, then back-/forward-fill the edges.
df_full[sea_level_cols] = df_full[sea_level_cols].interpolate().bfill().ffill().values

In [75]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,해수면온도_서해,해수면온도_3면평균,해수면온도_3면표준편차,SF_TMP_SUF,SF_TMP_1m,SF_TMP_5m,SF_TMP_고도평균,SF_TMP_고도표준편차,국내 해수면 높이,국내 해수면온도 편차
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,15.4,16.9,2.1,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2


**유라시아 눈덮임**

In [76]:
# Eurasia snow-cover table.
tmp_df = pd.read_csv(
    "C:\\Users\\kogas\\Desktop\\external_data\\20221011\\eurasia_snow_cover.csv"
)
# Total number of missing cells in the table.
tmp_df.isna().to_numpy().sum()

0

In [77]:
# Positional assignment (no key-based join): this relies on the 1996-2020 rows of
# the snow-cover table having the same count and order as the rows of df_full.
snow_in_range = tmp_df["year"].between(1996, 2020)
df_full["eurasia_snow_cover"] = tmp_df.loc[snow_in_range, "eurasia_snow_cover"].to_numpy()

In [78]:
df_full["eurasia_snow_cover"].tail(20)

280     8503458
281     1742463
282      314917
283      158258
284     1574142
285    12767594
286    21974364
287    26303926
288    28869248
289    26760873
290    22095745
291    15198641
292     7574219
293     1123065
294      141046
295       71622
296      581980
297    10956669
298    21592232
299    27203120
Name: eurasia_snow_cover, dtype: int64

In [79]:
# df_full = df_full.drop("eurasia_snow_cover", axis=1)

**음의 북극진동 (2020년도에 na값이 있어 활용이 불가능하다고 판단하여 drop)**

In [80]:
# Arctic-oscillation table.
tmp_df = pd.read_csv(
    "C:\\Users\\kogas\\Desktop\\external_data\\20221011\\arctic_oscillation.csv"
)
# Total number of missing cells in the table.
tmp_df.isna().to_numpy().sum()

11

In [81]:
# Take every column after the first for the 1996-2020 rows and unroll the block
# row by row into one long float32 vector, assigned positionally to df_full.
# presumably one row per year and one column per month — verify against the file
ao_in_range = tmp_df["Year"].between(1996, 2020)
ao_block = tmp_df.loc[ao_in_range].iloc[:, 1:]
df_full["arctic_oscillation"] = ao_block.to_numpy(dtype="float32").reshape(-1)

In [82]:
df_full["arctic_oscillation"]

0     -1.200
1      0.163
2     -1.483
3     -1.525
4     -0.226
       ...  
295      NaN
296      NaN
297      NaN
298      NaN
299      NaN
Name: arctic_oscillation, Length: 300, dtype: float32

In [83]:
# The series ends in NaN values (see the output above), so the feature is discarded.
df_full = df_full.drop(columns=["arctic_oscillation"])

**해빙**

In [84]:
# Sea-ice index table.
tmp_df = pd.read_csv(
    "C:\\Users\\kogas\\Desktop\\external_data\\20221011\\sea_ice_index_monthly_data_by_year_nh_extend.csv"
)
# Total number of missing cells in the table.
tmp_df.isna().to_numpy().sum()

15

In [85]:
# Take every column after the first for the 1996-2020 rows and unroll the block
# row by row into one long float32 vector, assigned positionally to df_full.
# presumably one row per year and one column per month — verify against the file
ice_in_range = tmp_df["Date"].between(1996, 2020)
ice_block = tmp_df.loc[ice_in_range].iloc[:, 1:]
df_full["sea_ice_index"] = ice_block.to_numpy(dtype="float32").reshape(-1)

In [86]:
df_full["sea_ice_index"].isna().sum()

0

**인구 및 인구증가율**

In [87]:
# Population / population-growth table; columns are renamed to ASCII identifiers
# and every record is pinned to month 12 for the later (year, month) join.
tmp_df = pd.read_csv(
    "C:\\Users\\kogas\\Desktop\\external_data\\20221011\\korea_pop_popGR.csv"
)
tmp_df.columns = ["year", "korea_pop", "korea_pop_gr"]
tmp_df = tmp_df.assign(month=12)

In [88]:
def _comma_number_to_float(text):
    """Convert a numeric string that may contain commas into a float."""
    return float(text.replace(",", ""))


tmp_df["korea_pop"] = tmp_df["korea_pop"].apply(_comma_number_to_float)
# A lone "-" marks a missing growth rate and becomes NaN.
tmp_df["korea_pop_gr"] = tmp_df["korea_pop_gr"].apply(
    lambda raw: np.nan if raw == "-" else _comma_number_to_float(raw)
)

# Pre-compute the growth rate averaged to a monthly basis:
# the compounded monthly rate (%) equivalent to the annual rate (%).
annual_growth_factor = 1 + tmp_df["korea_pop_gr"] / 100
tmp_df["korea_pop_gr"] = ((annual_growth_factor ** (1/12)) - 1) * 100

In [89]:
tmp_df

Unnamed: 0,year,korea_pop,korea_pop_gr,month
0,1960,25012.0,,12
1,1961,25766.0,0.244193,12
2,1962,26513.0,0.235265,12
3,1963,27262.0,0.228766,12
4,1964,27984.0,0.215755,12
...,...,...,...,...
106,2066,39575.0,-0.103925,12
107,2067,39086.0,-0.103925,12
108,2068,38603.0,-0.103925,12
109,2069,38126.0,-0.103925,12


In [90]:
# Join the December-anchored population records; the growth-rate column is
# merged but deliberately discarded again, so only korea_pop is kept.
df_full = (
    df_full
    .merge(tmp_df, on=["year", "month"], how="left")
    .drop("korea_pop_gr", axis=1)
)
# Interpolate between the anchor rows, then back-/forward-fill the edges.
df_full["korea_pop"] = df_full["korea_pop"].interpolate().bfill().ffill()

In [91]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,SF_TMP_SUF,SF_TMP_1m,SF_TMP_5m,SF_TMP_고도평균,SF_TMP_고도표준편차,국내 해수면 높이,국내 해수면온도 편차,eurasia_snow_cover,sea_ice_index,korea_pop
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2,29460995,14.181,45525.0
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2,28220029,15.155,45525.0
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2,25649162,15.123,45525.0
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2,19248027,14.216,45525.0
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,14.1,14.8,15.4,14.766667,0.65064,-2.2,-0.2,10579547,13.094,45525.0


In [92]:
df_full.isna().sum().sum()

0

**글로벌 지역별 천연가스 생산량**

In [93]:
# IEA natural-gas production by region; every record is pinned to month 12
# for the later (year, month) join.
tmp_df = pd.read_csv(
    "C:\\Users\\kogas\\Desktop\\external_data\\20221013\\IEA-world-natural-gas-production-by-region-1973-2020.csv"
).assign(month=12)

In [94]:
tmp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   year                         48 non-null     int64  
 1   OECD                         48 non-null     float64
 2   Non-OECD Europe and Eurasia  48 non-null     float64
 3   Non-OECD Middle East         48 non-null     float64
 4   Non-OECD Asia (Incl. China)  48 non-null     float64
 5   Non-OECD Africa              48 non-null     float64
 6   Non-OECD Americas            48 non-null     float64
 7   month                        48 non-null     int64  
dtypes: float64(6), int64(2)
memory usage: 3.1 KB


In [95]:
tmp_df.head()

Unnamed: 0,year,OECD,Non-OECD Europe and Eurasia,Non-OECD Middle East,Non-OECD Asia (Incl. China),Non-OECD Africa,Non-OECD Americas,month
0,1973,878.132,273.215973,25.38823,17.846117,9.727192,19.820409,12
1,1974,869.226941,299.100838,26.240363,23.240464,10.163994,20.886714,12
2,1975,835.413464,330.620955,28.452371,26.652415,11.823694,20.857936,12
3,1976,845.208315,366.720748,30.49586,31.422759,13.665009,22.042461,12
4,1977,859.031635,394.468522,33.564242,38.400386,14.072013,23.62038,12


In [96]:
# Every column except the join keys is a regional production series.
region_cols = diff(tmp_df.columns, ["year", "month"])

# Divide the per-record values by 12 before attaching them to the monthly frame.
tmp_df[region_cols] = tmp_df[region_cols] / 12

df_full = df_full.merge(tmp_df, on=["year", "month"], how="left")
# Only the month-12 rows receive a value from the merge; copy it backwards over
# the preceding months, then forwards for anything left at the end.
df_full[region_cols] = df_full[region_cols].bfill().ffill()

In [97]:
df_full.head(20)

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,국내 해수면온도 편차,eurasia_snow_cover,sea_ice_index,korea_pop,OECD,Non-OECD Europe and Eurasia,Non-OECD Middle East,Non-OECD Asia (Incl. China),Non-OECD Africa,Non-OECD Americas
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,-0.2,29460995,14.181,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,...,-0.2,28220029,15.155,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,-0.2,25649162,15.123,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,-0.2,19248027,14.216,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,-0.2,10579547,13.094,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
5,1996,Q2,6,116416.0,28534.0,0.80315,12384.133333,0.94,26.94,28.73,...,-0.2,5154921,12.084,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
6,1996,Q3,7,109474.0,31117.0,0.77867,12597.466667,0.86,26.94,31.19,...,-0.2,1221807,10.164,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
7,1996,Q3,8,94736.0,28188.0,0.77069,12597.466667,0.83,26.94,32.55,...,-0.2,252123,8.18,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
8,1996,Q3,9,105143.0,36977.0,0.73982,12597.466667,0.81,26.94,33.11,...,-0.2,2532924,7.583,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697
9,1996,Q4,10,237249.0,61954.0,0.79294,12790.8,0.79,26.94,34.17,...,-0.2,10029304,9.161,45525.0,89.020506,60.104264,11.95892,16.729374,7.252439,5.58697


**KOSIS 데이터 (2000년 1월부터 시작하는 데이터가 없음)**

In [98]:
# Folder whose files are scanned by the KOSIS loading loop below.
folder_path_fixed = 'C:\\Users\\kogas\\Desktop\\external_data\\20221105\\external_data_KOSIS\\result_fixed'

In [99]:
# Collect every KOSIS table whose first cell (row 0, column 0), read as text,
# is "2000" followed by "01" — presumably a YYYYMM period starting in Jan 2000.
kosis_df = []
for csv_path in glob(folder_path_fixed + "\\*"):
    tmp_df = pd.read_csv(csv_path)
    first_period = str(tmp_df.iloc[0, 0])
    starts_in_2000 = first_period[:4] == "2000"
    starts_in_january = first_period[4:] == "01"
    if starts_in_2000 and starts_in_january:
        kosis_df.append(tmp_df)

In [100]:
print(len(kosis_df))

0


**전력 데이터 (데이터가 듬성듬성 비어있어서 사용 어려움 판단)**

In [101]:
# Loaded for inspection only: the section heading judges this data too sparse to use,
# and the only follow-up cell in this section is commented out.
tmp_df = pd.read_csv("C:\\Users\\kogas\\Desktop\\external_data\\20221105\\kogas_ind_higCorrr.csv", encoding="cp949")

In [102]:
# tmp_df.dropna()[:30]

In [103]:
# Column roles used by the rest of the notebook.
target_vars = ["target_civil", "target_ind"]
target_name = "target_ind"      # the target column used for the multi-step labels
nontrain_vars = ["year"]        # dropped from the model inputs later on
bin_vars = []
cat_vars = ["quarter", "month"]

# Everything that is not excluded, binary or categorical counts as numeric
# (this includes both target columns).
non_numeric_vars = nontrain_vars + bin_vars + cat_vars
num_vars = diff(df_full.columns, non_numeric_vars)

# Sanity check: every column has been assigned to exactly one role.
assert len(nontrain_vars) + len(cat_vars) + len(num_vars) == len(df_full.columns)

## Feature Engineering

- version 0 : qva normalizing feature 추가 (기준년도=2015)
- version 1 : short, intm, long에 대해 mean to std 수치 절대값 0.2 이상 feature 선정 (유사 feature는 주관적 판단 하에 drop)
- version 2 : version 0 + 시간 feature engineering (sin, cos)
- version 3 : version 0 + 시간 feature engineering (sin, cos, linear, 계절, decomposition, lag)

In [104]:
# Feature-engineering variant in use; the meaning of each version number is listed in the
# markdown just above. NOTE(review): no cell in this part of the notebook reads this value.
feature_version = 3

**qva normalizing by 2015**

In [105]:
# Normalise value added (qva).
# Measured value added includes inflation, so it does not show whether real
# output actually grew. Each quarter's qva is therefore divided by that quarter's
# GEP level and multiplied by the 2015 GEP level of the same quarter, i.e. it is
# expressed at base-year (2015) prices.
# NOTE(review): the original note described "median" prices, but the code
# aggregates GEP with mean() — confirm which one is intended.
quarter_groups = df_full.groupby(["year", "quarter"])
qva_quarter_sum = quarter_groups["qva"].sum()
gep_quarter_mean = quarter_groups["GEP"].mean()
qva_at_2015_prices = (qva_quarter_sum / gep_quarter_mean) * gep_quarter_mean.loc[2015]

# Spread each quarterly figure evenly over three consecutive rows.
# assumes df_full is sorted by (year, quarter) with exactly three rows per quarter — verify
tmp_list = []
for quarter_value in qva_at_2015_prices.values:
    tmp_list.extend([quarter_value / 3] * 3)
df_full["qva_norm2015"] = tmp_list

**lag features**

In [106]:
# lags = 12
# for lag in range(lags):
#     tmp = df_full[num_vars].shift(lag + 1).reset_index(drop=True)
#     tmp.columns = ["lag" + str(lag) + "_" + str(i) for i in tmp.columns]
#     df_full = pd.concat([df_full, tmp], axis=1)

# df_full = df_full.dropna().reset_index(drop=True)

In [107]:
# df_full 

**time feature engineering**

In [108]:
# Durations in seconds; a year is taken as 365.2425 days.
day_secs = 24 * 60 * 60  # hours * minutes * seconds
year_secs = (365.2425) * day_secs
month_secs = (365.2425 / 12) * day_secs

In [109]:
# One month-end timestamp per month, from 1 January of the first year value in
# df_full to 31 December of the last one, converted to POSIX seconds.
# NOTE(review): the generated datetimes are timezone-naive, and
# datetime.timestamp() interprets naive values in the machine's local timezone —
# confirm that this dependence is acceptable.
year_values = df_full["year"].unique()
month_ends = pd.date_range(str(year_values[0]) + "-01-01", str(year_values[-1]) + "-12-31", freq="M")
timestamp_s = month_ends.map(datetime.datetime.timestamp)

In [110]:
# Cyclical encoding of the time of year: angle of each timestamp on a
# one-year circle, stored as its sine and cosine.
year_angle = timestamp_s * (2*np.pi / year_secs)
df_full["year_sin"] = np.sin(year_angle)
df_full["year_cos"] = np.cos(year_angle)

In [111]:
# Season index from the month: 12, 1, 2 -> 0; 3-5 -> 1; 6-8 -> 2; 9-11 -> 3.
df_full["season"] = df_full["month"] % 12 // 3
# Month expressed as a fraction of the year (1/12 ... 1.0).
df_full["year_linear"] = df_full["month"] / 12

# Register "season" as a categorical feature. The guard keeps this cell idempotent:
# re-running it no longer appends a duplicate entry, which would break the
# column-count assertion that follows.
if "season" not in cat_vars:
    cat_vars.append("season")

In [112]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Seasonal decomposition of the target with a 12-step period, once per model type;
# the trend and seasonal components are stored as new columns.
# NOTE(review): the decomposition is fitted on the whole target series, including
# the rows that are later sliced off as the test period — confirm this is intended.
for model_kind, trend_col, seasonal_col in (
    ("additive", "decompose_add_trend", "decompose_add_seasonal"),
    ("multiplicative", "decompose_mul_trend", "decompose_mul_seasonal"),
):
    decompose = seasonal_decompose(df_full[target_name], period=12, model=model_kind)
    df_full[trend_col] = decompose.trend
    df_full[seasonal_col] = decompose.seasonal

# Drops EVERY column that contains at least one NaN. In the frame displayed below
# only the *_seasonal columns remain, i.e. both *_trend columns are removed here.
df_full = df_full.dropna(axis=1)

In [113]:
df_full

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,...,Non-OECD Asia (Incl. China),Non-OECD Africa,Non-OECD Americas,qva_norm2015,year_sin,year_cos,season,year_linear,decompose_add_seasonal,decompose_mul_seasonal
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,...,16.729374,7.252439,5.586970,30260.037031,0.488907,0.872336,0,0.083333,108133.477431,1.290050
1,1996,Q1,2,566323.0,70427.0,0.88940,12183.433333,0.93,26.94,29.04,...,16.729374,7.252439,5.586970,30260.037031,0.846681,0.532100,0,0.166667,15607.746528,1.058273
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,...,16.729374,7.252439,5.586970,30260.037031,0.999614,0.027790,1,0.250000,37483.218750,1.100490
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,...,16.729374,7.252439,5.586970,32502.439281,0.883136,-0.469116,1,0.333333,-37907.937500,0.898585
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,...,16.729374,7.252439,5.586970,32502.439281,0.522022,-0.852932,1,0.416667,-65100.467014,0.838177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2020,Q3,8,398661.0,457970.0,0.46538,40582.500000,0.86,72.93,84.71,...,40.632612,20.352585,11.590668,51104.807447,-0.860643,-0.509208,2,0.666667,-74481.814236,0.792579
296,2020,Q3,9,380235.0,525619.0,0.41975,40582.500000,0.80,65.24,81.26,...,40.632612,20.352585,11.590668,51104.807447,-0.999835,-0.018180,3,0.750000,-58018.388889,0.849025
297,2020,Q4,10,616682.0,600890.0,0.50649,41580.133333,0.76,59.16,78.13,...,40.632612,20.352585,11.590668,39683.501254,-0.870241,0.492626,3,0.833333,11754.335069,1.030351
298,2020,Q4,11,1093747.0,665901.0,0.62157,41580.133333,0.75,58.72,78.64,...,40.632612,20.352585,11.590668,39683.501254,-0.513800,0.857910,3,0.916667,62554.937500,1.162817


In [114]:
# Refresh the numeric-column list after feature engineering and re-check that
# every column is assigned to exactly one role.
non_numeric_vars = nontrain_vars + bin_vars + cat_vars
num_vars = diff(df_full.columns, non_numeric_vars)
assert len(nontrain_vars) + len(cat_vars) + len(num_vars) == len(df_full.columns)

## select features

In [115]:
# # 1. vanilla
# df_full = df_full[["year", "quarter", "month", "target_civil", "target_ind", "weight_civil", "qva", "relative_price", "gas_price", "oil_price"]]

# target_vars = ["target_civil", "target_ind"]
# target_name = "target_ind"
# nontrain_vars = ["year"]
# cat_vars = ["quarter", "month"]
# bin_vars = []
# num_vars = diff(df_full.columns, nontrain_vars + bin_vars + cat_vars)
# assert len(df_full.columns) == len(nontrain_vars) + len(cat_vars) + len(num_vars)

In [116]:
# # 2. vanilla + 시간 FE
# df_full = df_full[["year", "quarter", "month", "target_civil", "target_ind", "weight_civil", "qva", "relative_price", "gas_price", "oil_price",
#                   "season", "year_cos", "year_linear"]]

# target_vars = ["target_civil", "target_ind"]
# target_name = "target_ind"
# nontrain_vars = ["year"]
# cat_vars = ["quarter", "month", "season"]
# bin_vars = []
# num_vars = diff(df_full.columns, nontrain_vars + bin_vars + cat_vars)
# assert len(df_full.columns) == len(nontrain_vars) + len(cat_vars) + len(num_vars)

In [117]:
# 3. vanilla + time feature engineering + temperature, humidity
base_cols = ["year", "quarter", "month", "target_civil", "target_ind", "weight_civil", "qva", "relative_price", "gas_price", "oil_price"]
time_cols = ["season", "year_cos", "year_linear"]
weather_cols = ["서울_최고기온", "부산_평균습도"]
df_full = df_full[base_cols + time_cols + weather_cols]

# Column roles for the selected feature set.
target_vars = ["target_civil", "target_ind"]
target_name = "target_ind"
nontrain_vars = ["year"]
cat_vars = ["quarter", "month", "season"]
bin_vars = []

# Everything that is not excluded, binary or categorical counts as numeric.
non_numeric_vars = nontrain_vars + bin_vars + cat_vars
num_vars = diff(df_full.columns, non_numeric_vars)
assert len(nontrain_vars) + len(cat_vars) + len(num_vars) == len(df_full.columns)

In [118]:
df_full.head()

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,season,year_cos,year_linear,서울_최고기온,부산_평균습도
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,0,0.872336,0.083333,2.329032,0.43629
1,1996,Q1,2,566323.0,70427.0,0.8894,12183.433333,0.93,26.94,29.04,0,0.5321,0.166667,3.182759,0.464966
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,1,0.02779,0.25,9.451613,0.554323
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,1,-0.469116,0.333333,15.143333,0.5177
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,1,-0.852932,0.416667,24.196774,0.657935


In [119]:
df_full

Unnamed: 0,year,quarter,month,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,season,year_cos,year_linear,서울_최고기온,부산_평균습도
0,1996,Q1,1,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,0,0.872336,0.083333,2.329032,0.436290
1,1996,Q1,2,566323.0,70427.0,0.88940,12183.433333,0.93,26.94,29.04,0,0.532100,0.166667,3.182759,0.464966
2,1996,Q1,3,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,1,0.027790,0.250000,9.451613,0.554323
3,1996,Q2,4,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,1,-0.469116,0.333333,15.143333,0.517700
4,1996,Q2,5,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,1,-0.852932,0.416667,24.196774,0.657935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2020,Q3,8,398661.0,457970.0,0.46538,40582.500000,0.86,72.93,84.71,2,-0.509208,0.666667,29.325806,0.823613
296,2020,Q3,9,380235.0,525619.0,0.41975,40582.500000,0.80,65.24,81.26,3,-0.018180,0.750000,25.573333,0.740300
297,2020,Q4,10,616682.0,600890.0,0.50649,41580.133333,0.76,59.16,78.13,3,0.492626,0.833333,19.490323,0.578323
298,2020,Q4,11,1093747.0,665901.0,0.62157,41580.133333,0.75,58.72,78.64,3,0.857910,0.916667,12.550000,0.518267


In [120]:
# Snapshot of the selected features for EDA.
# NOTE(review): df_eda is reassigned at the top of the next cell (with a
# year/quarter/month index), so this copy appears to be redundant.
df_eda = df_full.copy()

**create data pipeline**

In [121]:
# Keep an un-encoded copy, indexed by (year, quarter, month), for EDA.
df_eda = df_full.copy().set_index(["year", "quarter", "month"])

# One-hot encode the categorical columns as a dense array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# this call is tied to the scikit-learn version of the current environment.
ohe = OneHotEncoder(sparse=False)
cat_encoded = ohe.fit_transform(df_full[cat_vars])

# Output column names: "<variable>_<category>", in the encoder's own order.
cat_oh_vars = [
    var_name + "_" + str(category)
    for var_name, categories in zip(cat_vars, ohe.categories_)
    for category in categories
]

# Swap the raw categorical columns for their one-hot columns. The encoded frame
# gets a fresh default index, so this concat assumes df_full has one as well.
df_full = pd.concat(
    [df_full.drop(cat_vars, axis=1), pd.DataFrame(cat_encoded, columns=cat_oh_vars)],
    axis=1,
)

In [122]:
# Multi-step targets: for every row t, column "<target>_t<k>" holds the target at
# t + k for k = 1 ... H. H comes from CFG.test_timesteps instead of the previously
# hard-coded 168 (the same value), so horizon and test size are configured in one place.
forecast_horizon = CFG.test_timesteps
df_full_y = pd.concat(
    [
        df_full[[target_name]].shift(-step).rename({target_name: target_name + "_t" + str(step)}, axis=1)
        for step in range(1, forecast_horizon + 1)
    ],
    axis=1,
).dropna()  # rows without a complete H-step future are removed

# Features for the same rows: everything except the last H rows.
df_full_x = df_full.iloc[:-forecast_horizon]

# Guard against silent feature/target misalignment (e.g. NaNs inside the target).
assert df_full_x.index.equals(df_full_y.index), "features and multi-step targets are misaligned"

In [123]:
df_full_x

Unnamed: 0,year,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,year_cos,year_linear,...,month_7,month_8,month_9,month_10,month_11,month_12,season_0,season_1,season_2,season_3
0,1996,605519.0,83809.0,0.87842,12183.433333,0.97,26.94,27.86,0.872336,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1996,566323.0,70427.0,0.88940,12183.433333,0.93,26.94,29.04,0.532100,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1996,477514.0,62652.0,0.88401,12183.433333,0.96,26.94,27.99,0.027790,0.250000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1996,337794.0,47050.0,0.87774,12384.133333,0.94,26.94,28.74,-0.469116,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1996,184522.0,30709.0,0.85732,12384.133333,0.92,26.94,29.18,-0.852932,0.416667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,2006,309326.0,269467.0,0.53443,26060.900000,0.69,75.57,109.87,-0.518138,0.666667,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
128,2006,314777.0,316123.0,0.49893,26060.900000,0.78,82.68,106.38,-0.028585,0.750000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
129,2006,412591.0,315971.0,0.56631,25996.200000,0.82,82.68,100.38,0.483542,0.833333,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
130,2006,940748.0,419670.0,0.69151,25996.200000,0.78,76.69,98.30,0.852516,0.916667,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [124]:
# Test-period features: the last CFG.test_timesteps rows (previously hard-coded as 168,
# the same value), kept in the config so the split is defined in a single place.
df_test_x = df_full.iloc[-CFG.test_timesteps:]

In [125]:
df_test_x

Unnamed: 0,year,target_civil,target_ind,weight_civil,qva,relative_price,gas_price,oil_price,year_cos,year_linear,...,month_7,month_8,month_9,month_10,month_11,month_12,season_0,season_1,season_2,season_3
132,2007,1645363.0,461726.0,0.78087,26793.766667,0.76,73.60,97.08,0.869525,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
133,2007,1304625.0,350887.0,0.78805,26793.766667,0.76,73.60,97.25,0.541787,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
134,2007,1242387.0,419078.0,0.74777,26793.766667,0.71,71.55,100.61,0.039266,0.250000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
135,2007,831041.0,345166.0,0.70654,27398.633333,0.69,71.55,103.96,-0.458945,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
136,2007,483763.0,308941.0,0.61027,27398.633333,0.70,74.89,107.27,-0.846881,0.416667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2020,398661.0,457970.0,0.46538,40582.500000,0.86,72.93,84.71,-0.509208,0.666667,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
296,2020,380235.0,525619.0,0.41975,40582.500000,0.80,65.24,81.26,-0.018180,0.750000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
297,2020,616682.0,600890.0,0.50649,41580.133333,0.76,59.16,78.13,0.492626,0.833333,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
298,2020,1093747.0,665901.0,0.62157,41580.133333,0.75,58.72,78.64,0.857910,0.916667,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [126]:
# Remove the columns that must not be fed to the model from both feature frames.
df_full_x = df_full_x.drop(columns=nontrain_vars)
df_test_x = df_test_x.drop(columns=nontrain_vars)

## optuna helper function

In [127]:
# optuna function
def optuna_objective_function(trial: "Trial", fold, train_x, train_y, train_groups, val_x, val_y, val_groups, categoIdx,
                              model_name, output_container, ntrees=1000, eta=1e-2, direction="minimize", TF=True, target_scaler=None):
    """Optuna objective: fit one candidate model and return its validation MAPE.

    The search space and estimator are selected by ``model_name``. Regressors are
    wrapped in a ``RegressorChain`` and fitted on ``(train_x, train_y)``; the fitted
    model then predicts ``val_x`` and is scored against ``val_y``.

    Parameters
    ----------
    trial : optuna Trial used for the ``suggest_*`` calls.
    fold : fold number; used as ``random_state`` for most estimators and chains.
    train_x, train_y : training features and targets.
    val_x, val_y : validation features and targets. ``val_y`` must be on the same
        scale as ``train_y`` (i.e. log1p-transformed when ``TF`` is true).
    train_groups, val_groups, target_scaler : unused; kept for signature compatibility.
    categoIdx : categorical feature indices; only forwarded in the "CAT_ORD" branch.
    model_name : one of "LGB_RF", "LGB_GOSS", "LGB_GBM", "XGB_GBT", "CAT_GBM",
        "CAT_ORD", "ElasticNet", "CalibratedClassifier_SVM", "KNN", "MLP", "ExtraTree".
    output_container : dict with keys "model", "pred", "score", "best_ntrees";
        overwritten in place whenever this trial beats the stored "score".
    ntrees, eta : number of boosting rounds and learning rate for the boosted models.
    direction : "minimize" or anything else (treated as maximize) for the bookkeeping.
    TF : when true, predictions and ``val_y`` are mapped back with ``expm1`` before scoring.

    Returns
    -------
    float : mean absolute percentage error on the validation data (uniform average
        over outputs).

    Raises
    ------
    ValueError : if ``model_name`` is not supported. (Previously ``-1`` was returned,
        which a minimizing study would have recorded as the best score.)
    """
    supported_models = (
        "LGB_RF", "LGB_GOSS", "LGB_GBM", "XGB_GBT", "CAT_GBM", "CAT_ORD",
        "ElasticNet", "CalibratedClassifier_SVM", "KNN", "MLP", "ExtraTree",
    )
    if model_name not in supported_models:
        raise ValueError(f"unknown model_name: {model_name!r}")

    best_ntrees = None

    # Shared categorical grid for regularisation-like parameters:
    # 150 values in [1e-3, 1) followed by 50 values in [1, 100].
    reg_grid = list(np.linspace(1e-3, 1.0, num=150, endpoint=False)) + list(np.linspace(1.0, 1e+2, num=50, endpoint=True))

    if model_name == "LGB_RF":
        # LightGBM random-forest mode; uses a tenth of the boosting budget.
        tuning_params = {
            "num_leaves": trial.suggest_categorical("num_leaves", [pow(2, i) - 1 for i in [4, 5, 6, 7, 8]]),
            "subsample": trial.suggest_float("subsample", 0.5, 0.8, step=0.1),
            "subsample_freq": trial.suggest_int("subsample_freq", 1, 10, step=1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8, step=0.1),
            "reg_alpha": trial.suggest_categorical("reg_alpha", reg_grid),
            "reg_lambda": trial.suggest_categorical("reg_lambda", reg_grid),
            "min_child_weight": trial.suggest_categorical("min_child_weight", reg_grid),
            "min_child_samples": trial.suggest_int("min_child_samples", 1, 51, step=2),
            "min_gain_to_split": trial.suggest_categorical("min_gain_to_split", reg_grid),
        }
        model = lgb.LGBMRegressor(boosting_type="rf", objective="mae",
                                  n_estimators=int(ntrees / 10), device_type="cpu",
                                  random_state=fold, verbose=-1, **tuning_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)
    elif model_name == "LGB_GOSS":
        # GOSS sampling: top_rate / other_rate replace the usual subsample setting.
        tuning_params = {
            "num_leaves": trial.suggest_categorical("num_leaves", [pow(2, i) - 1 for i in [4, 5, 6, 7, 8]]),
            "top_rate": trial.suggest_float("top_rate", 0.2, 0.5, step=0.1),
            "other_rate": trial.suggest_float("other_rate", 0.2, 0.5, step=0.1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8, step=0.1),
            "reg_alpha": trial.suggest_categorical("reg_alpha", reg_grid),
            "reg_lambda": trial.suggest_categorical("reg_lambda", reg_grid),
            "min_child_weight": trial.suggest_categorical("min_child_weight", reg_grid),
            "min_child_samples": trial.suggest_int("min_child_samples", 1, 51, step=2),
            "min_gain_to_split": trial.suggest_categorical("min_gain_to_split", reg_grid),
        }
        model = lgb.LGBMRegressor(boosting_type="goss", objective="mae",
                                  n_estimators=ntrees, learning_rate=eta, device_type="cpu",
                                  random_state=fold, verbose=-1, **tuning_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)
    elif model_name == "LGB_GBM":
        tuning_params = {
            "num_leaves": trial.suggest_categorical("num_leaves", [pow(2, i) - 1 for i in [4, 5, 6, 7, 8]]),
            "subsample": trial.suggest_float("subsample", 0.5, 0.8, step=0.1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8, step=0.1),
            "reg_lambda": trial.suggest_categorical("reg_lambda", reg_grid),
            "min_child_weight": trial.suggest_categorical("min_child_weight", reg_grid),
            "min_child_samples": trial.suggest_int("min_child_samples", 1, 51, step=2),
            "min_gain_to_split": trial.suggest_categorical("min_gain_to_split", reg_grid),
        }
        model = lgb.LGBMRegressor(boosting_type="gbdt", objective="mae",
                                  n_estimators=ntrees, learning_rate=eta, device_type="cpu",
                                  random_state=42, verbose=-1, **tuning_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)
    elif model_name == "XGB_GBT":
        tuning_params = {
            "max_depth": trial.suggest_categorical("max_depth", [4, 5, 6, 7, 8]),
            "subsample": trial.suggest_float("subsample", 0.5, 0.8, step=0.1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8, step=0.1),
            "reg_lambda": trial.suggest_categorical("reg_lambda", reg_grid),
            "min_child_weight": trial.suggest_categorical("min_child_weight", reg_grid),
            "gamma": trial.suggest_categorical("gamma", reg_grid),
        }
        model = xgb.XGBRegressor(booster="gbtree", objective="reg:squarederror",
                                 n_estimators=ntrees, learning_rate=eta,
                                 random_state=fold, verbosity=0, **tuning_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)
    elif model_name == "CAT_GBM":
        tuning_params = {
            "max_depth": trial.suggest_categorical("max_depth", [4, 6, 8]),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.1, 1.0, step=0.1),
            # rsm = colsample_bylevel (not supported for GPU)
            "rsm": trial.suggest_float("rsm", 0.5, 0.8, step=0.1),
            "random_strength": trial.suggest_categorical("random_strength", [0.01, 0.05, 0.1, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]),
            "reg_lambda": trial.suggest_categorical("reg_lambda", reg_grid),
            "min_child_samples": trial.suggest_int("min_child_samples", 1, 10 if CFG.debug else 51, step=2),
        }
        model = cat.CatBoostRegressor(boosting_type="Plain", loss_function="RMSE",
                                      n_estimators=ntrees, learning_rate=eta, task_type="CPU", bootstrap_type="Bayesian",
                                      verbose=False, random_state=fold, **tuning_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)
    elif model_name == "CAT_ORD":
        # NOTE(review): this branch is a binary-classification leftover
        # (CatBoostClassifier, Logloss, scale_pos_weight) inside a regression
        # objective; confirm before enabling it.
        tuning_params = {
            "learning_rate": trial.suggest_categorical("learning_rate", [5e-3]),
            "max_depth": trial.suggest_categorical("max_depth", [4, 6, 8]),
            "random_strength": trial.suggest_categorical("random_strength", [0.01, 0.1, 1.0, 2.0, 3.0]),
            "reg_lambda": trial.suggest_categorical("reg_lambda", reg_grid),
            "scale_pos_weight": trial.suggest_categorical("scale_pos_weight", reg_grid),
        }
        model = cat.CatBoostClassifier(boosting_type="Ordered", loss_function="Logloss", eval_metric="Logloss",
                                       n_estimators=ntrees, task_type="CPU", bootstrap_type="Bayesian",
                                       verbose=False, random_state=fold, **tuning_params)
        model.fit(train_x, train_y, cat_features=categoIdx,
                  eval_set=[(val_x, val_y)], early_stopping_rounds=int(ntrees * 0.2), use_best_model=True,
                  verbose=False)
        best_ntrees = model.best_iteration_
    elif model_name == "ElasticNet":
        # The Optuna parameter is registered under the name "C" but feeds `alpha`;
        # the name is kept so stored study parameters stay comparable.
        tuner_params = {
            "alpha": trial.suggest_categorical("C", reg_grid),
            "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0, step=0.05)
        }
        model = lm.ElasticNet(**tuner_params, random_state=fold)
        model = RegressorChain(model, cv=None, random_state=42)
        model.fit(train_x, train_y)
    elif model_name == "CalibratedClassifier_SVM":
        # NOTE(review): `svm` and `CalibratedClassifierCV` are not among the imports
        # of the setup cell; confirm they are imported before using this branch.
        tuner_params = {
            "C": trial.suggest_categorical("C", reg_grid),
            "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf"]),
        }
        model_base = svm.SVC(max_iter=100, random_state=fold, **tuner_params)
        model_base.fit(train_x, train_y)

        model = CalibratedClassifierCV(base_estimator=model_base, method='sigmoid', cv="prefit", n_jobs=-1)
        model.fit(val_x, val_y)
    elif model_name == "KNN":
        # The Optuna parameter is registered under the name "kernel" but feeds `weights`.
        tuner_params = {
            "n_neighbors": trial.suggest_int("n_neighbors", 1, 12, step=1),
            "weights": trial.suggest_categorical("kernel", ["uniform", "distance"]),
        }
        model = KNeighborsRegressor(n_jobs=None, **tuner_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)
    elif model_name == "MLP":
        hidden_layer_units = trial.suggest_categorical("hidden_layer_units", [8, 16, 32, 64])
        hidden_layer_depth = trial.suggest_categorical("hidden_layer_depth", [1, 2, 3, 4])
        tuner_params = {
            "activation": "relu",
            "solver": "adam",
            "learning_rate": "constant",
            "learning_rate_init": 5e-4,
            "hidden_layer_sizes": tuple([hidden_layer_units] * hidden_layer_depth),
            # `step` is passed by keyword, matching every other suggest_int call here.
            "max_iter": trial.suggest_int("max_iter", 5, 100, step=5),
            "batch_size": trial.suggest_categorical("batch_size", [1, 4, 8]),
            "early_stopping": False,
            "shuffle": True,
            "random_state": 42,
        }
        model = MLPRegressor(**tuner_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)
    elif model_name == "ExtraTree":
        tuning_params = {
            "max_depth": trial.suggest_categorical("max_depth", [4, 5, 6, 7, 8, 9, 19]),
            "max_features": trial.suggest_float("max_features", 0.5, 0.8, step=0.1),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 51, step=2),
        }
        model = ExtraTreeRegressor(splitter="best", random_state=42, **tuning_params)
        model = RegressorChain(model, cv=None, random_state=fold)
        model.fit(train_x, train_y)

    # Score on the validation data that was passed in, rather than on a
    # notebook-global index, so the objective depends only on its arguments.
    pred = model.predict(val_x)
    pred = np.expm1(pred) if TF else pred
    truth = np.asarray(val_y)
    truth = np.expm1(truth) if TF else truth
    optuna_score = skl_merics.mean_absolute_percentage_error(truth, pred, multioutput="uniform_average")

    # Keep the best model seen so far in the caller-owned container.
    if direction == "minimize":
        improved = optuna_score < output_container["score"]
    else:
        improved = optuna_score > output_container["score"]
    if improved:
        if best_ntrees is not None:
            print("best_ntrees :", best_ntrees)
        output_container["model"] = model
        output_container["pred"] = pred
        output_container["score"] = optuna_score
        output_container["best_ntrees"] = best_ntrees
    return optuna_score


import operator
class Optuna_EarlyStoppingCallback(object):
    """Optuna callback that stops a study after a run of non-improving trials.

    After every trial the study's best value is compared with the best value this
    callback has seen. If it has not improved for ``early_stopping_rounds``
    consecutive calls, ``study.stop()`` is requested.
    """

    def __init__(self, early_stopping_rounds: int, direction: str = "minimize") -> None:
        """Configure the patience and the comparison direction.

        Args:
            early_stopping_rounds: number of consecutive non-improving calls tolerated.
            direction: "minimize" or "maximize".

        Raises:
            ValueError: if ``direction`` is neither "minimize" nor "maximize".
        """
        self.early_stopping_rounds = early_stopping_rounds

        # Number of consecutive calls without an improvement of the best value.
        self._iter = 0

        if direction == "minimize":
            self._operator = operator.lt
            self._score = np.inf
        elif direction == "maximize":
            self._operator = operator.gt
            self._score = -np.inf
        else:
            # The exception was previously constructed but never raised, which left
            # the instance without _operator/_score and failed later in __call__.
            raise ValueError(f"invalid direction: {direction}")

    def __call__(self, study: "optuna.Study", trial: "optuna.Trial") -> None:
        """Update the patience counter and stop the study once it is exhausted."""
        if self._operator(study.best_value, self._score):
            self._iter = 0
            self._score = study.best_value
        else:
            self._iter += 1

        if self._iter >= self.early_stopping_rounds:
            study.stop()

## Modeling - baseline

In [128]:
# Models trained in this run. "Linear" (and "BayesianRidge") are fitted directly;
# every other name is tuned through optuna_objective_function.
model_name_list = [
    "Linear",
    "ElasticNet",
    "LGB_RF",
    "LGB_GOSS",
    "XGB_GBT",
    "CAT_GBM",
    "KNN",
    "MLP",
]

# TF: apply log1p to the targets (and expm1 to predictions); tuning: forwarded to do_fold_training.
TF = False
tuning = True

# Output folder for this experiment; the notebook is copied into it for provenance.
architecture_name = f"fv{feature_version}_noTF_vanillaTimeFE_4folds3test_allmodels_minmax_try1"
architecture_root_path = folder_path + "architectures\\" + architecture_name + "\\"
if not os.path.exists(architecture_root_path):
    os.makedirs(architecture_root_path)
shutil.copy(
    folder_path + "YJ_modeling_target168.ipynb",
    architecture_root_path + "YJ_modeling_target168.ipynb",
)

'C:\\Users\\kogas\\Desktop\\jupyter_root_folder\\YJ_notebooks\\architectures\\fv3_noTF_vanillaTimeFE_4folds3test_allmodels_minmax_try1\\YJ_modeling_target168.ipynb'

### Training

**learning parameter setting**

In [129]:
def do_fold_training(fold, train_idx, valid_idx, TF=True, apply_pca=False, tuning=False):
    """Train, validate and forecast one time-series fold for the current ``model_name``.

    Reads the notebook globals ``model_name``, ``df_full_x``, ``df_full_y``,
    ``df_test_x``, ``num_vars``, ``n_folds``, ``test_size``, ``ntrees``, ``eta`` and
    ``CFG``. Writes into the globals ``valid_pred`` (rows of this fold),
    ``test_pred`` (adds this fold's share of the fold average) and ``metric_list``
    (appends MAE and MAPE).

    Args:
        fold: fold number; forwarded to the Optuna objective and used as sampler seed.
        train_idx, valid_idx: positional row indices into ``df_full_x`` / ``df_full_y``.
        TF: when true, targets are log1p-transformed for fitting and predictions are
            mapped back with expm1.
        apply_pca: when true, the scaled numeric columns are replaced by the leading
            principal components that pass ``CFG.pca_threshold`` explained variance.
        tuning: currently unused; kept for signature compatibility.

    Raises:
        RuntimeError: if the Optuna search finishes without any fitted model.
    """
    # Every model family currently uses min-max scaling of the numeric columns.
    scaler = MinMaxScaler()

    train_x = df_full_x.iloc[train_idx].reset_index(drop=True)
    scaler.fit(train_x[num_vars])

    pca, n_comp = None, None
    if apply_pca:
        pca = PCA(16)
        pca.fit(scaler.transform(train_x[num_vars]))
        explained = pca.explained_variance_ratio_.cumsum()
        above_threshold = np.where(explained > CFG.pca_threshold)[0]
        # Fall back to all fitted components if the threshold is never exceeded.
        n_comp = above_threshold[0] + 1 if len(above_threshold) else len(explained)

    def _prepare(frame):
        """Scale the numeric columns of ``frame`` (and project them when PCA is on)."""
        scaled = scaler.transform(frame[num_vars])
        if apply_pca:
            components = pd.DataFrame(pca.transform(scaled)[:, :n_comp])
            return pd.concat([frame.drop(num_vars, axis=1).reset_index(drop=True), components], axis=1)
        frame[num_vars] = scaled
        return frame

    train_x = _prepare(train_x)
    train_y = df_full_y.iloc[train_idx].reset_index(drop=True)
    train_y = np.log1p(train_y) if TF else train_y

    valid_x = _prepare(df_full_x.iloc[valid_idx].reset_index(drop=True))
    valid_y = df_full_y.iloc[valid_idx].reset_index(drop=True)
    valid_y = np.log1p(valid_y) if TF else valid_y

    if model_name not in ["Linear", "BayesianRidge"]:
        output_container = {"model": None, "pred": None, "score": np.inf, "best_ntrees": None}
        optuna_direction = 'minimize'
        optuna_trials = 3 if CFG.debug else 300
        # Per-fold time budget in seconds (at least one minute).
        optuna_timeout = max(60, int(6 * 3600 / n_folds / (3)))
        optuna_earlyStopping = Optuna_EarlyStoppingCallback(max(1, int(optuna_trials * 0.2)), direction=optuna_direction)
        # Seed the sampler: its random state is separate from the global seeds set by
        # seed_everything(), so an unseeded sampler makes the search non-reproducible.
        optuna_study = create_study(direction=optuna_direction, sampler=TPESampler(seed=fold))
        optuna_study.optimize(
            lambda trial: optuna_objective_function(
                trial, fold,
                train_x=train_x,
                train_y=train_y,
                train_groups=None,
                val_x=valid_x,
                val_y=valid_y,
                val_groups=None,
                categoIdx=None, model_name=model_name, output_container=output_container,
                ntrees=ntrees, eta=eta, direction=optuna_direction, TF=TF
            ),
            n_jobs=1, n_trials=optuna_trials, timeout=optuna_timeout,
            callbacks=[optuna_earlyStopping],
        )

        chain_model = output_container["model"]
        if chain_model is None:
            raise RuntimeError(f"Optuna search produced no fitted model for {model_name!r} (fold {fold})")
    elif model_name == "BayesianRidge":
        base_model = lm.BayesianRidge()
        chain_model = RegressorChain(base_model, cv=None, random_state=42)
        chain_model.fit(train_x, train_y)
    else:
        base_model = lm.LinearRegression()
        chain_model = RegressorChain(base_model, cv=None, random_state=42)
        chain_model.fit(train_x, train_y)

    # Validation predictions, stored at this fold's rows of the global buffer.
    tmp_pred = chain_model.predict(valid_x)
    tmp_pred = np.expm1(tmp_pred) if TF else tmp_pred
    valid_pred[(valid_idx - (df_full_x.shape[0] - (n_folds * test_size))), :] = tmp_pred

    metric_list["MAE"].append(skl_merics.mean_absolute_error(df_full_y.iloc[valid_idx].values, tmp_pred))
    metric_list["MAPE"].append(skl_merics.mean_absolute_percentage_error(df_full_y.iloc[valid_idx].values, tmp_pred, multioutput="uniform_average"))

    # Test forecast; each fold contributes 1 / n_folds to the global average.
    test_x = _prepare(df_test_x.copy())
    tmp_pred = chain_model.predict(test_x)
    tmp_pred = np.expm1(tmp_pred) if TF else tmp_pred

    test_pred[:] += tmp_pred / n_folds

In [None]:
# Boosting budget and learning rate read by do_fold_training / the Optuna objective.
ntrees = 100 if CFG.debug else 1000
eta = 1e-2
target_name = "target_ind"

# Expanding-window CV: n_folds validation blocks of test_size rows at the end of the data.
n_folds = 4
test_size = 3

model_output_dic = dict.fromkeys(model_name_list)

for model_name in model_name_list:
    seed_everything()

    # Fresh per-model buffers; do_fold_training fills them through the global names.
    metric_list = {metric_name: [] for metric_name in ("MAE", "MAPE")}
    kfolds_spliter = TimeSeriesSplit(n_folds, max_train_size=CFG.train_timesteps, test_size=test_size)
    valid_pred = np.zeros(shape=(n_folds * test_size, CFG.test_timesteps))
    test_pred = np.zeros(shape=(CFG.test_timesteps, CFG.test_timesteps))

    for fold, (train_idx, valid_idx) in enumerate(kfolds_spliter.split(df_full_x)):
        if CFG.debug:
            # Debug runs only use the first 60 rows of each index array.
            train_idx = train_idx[:60]
            valid_idx = valid_idx[:60]
        print(valid_idx)
        # Row positions of this fold inside valid_pred.
        print((valid_idx - (df_full_x.shape[0] - (n_folds * test_size))))
        do_fold_training(fold, train_idx, valid_idx, TF=TF, apply_pca=False, tuning=tuning)
        gc.collect()

    model_output_dic[model_name] = {
        "valid_pred": valid_pred,
        "metric_list": metric_list,
        "test_pred": test_pred,
    }

## Summarize score

In [None]:
# Mean MAE / MAPE over the folds of each model, in model_output_dic order.
mae_list = [np.nanmean(model_output["metric_list"]["MAE"]) for model_output in model_output_dic.values()]
mape_list = [np.nanmean(model_output["metric_list"]["MAPE"]) for model_output in model_output_dic.values()]

score_table = pd.DataFrame({"MAE": mae_list, "MAPE": mape_list}, index=model_name_list)
# Summary rows are computed over the model rows only (the slice excludes "average").
score_table.loc["average"] = score_table.iloc[:len(model_name_list)].mean(axis=0)
score_table.loc["std"] = score_table.iloc[:len(model_name_list)].std(axis=0)
display(score_table)
# NOTE(review): index=False drops the model / average / std row labels from the CSV; confirm this is intended.
score_table.to_csv(architecture_root_path + "target_ind_score_table.csv", index=False)

## Get score weight & ensemble prediction

In [None]:
# Inverse-MAPE weights over the model rows of score_table, normalised to sum to one.
ensemble_weight = (1 / score_table["MAPE"].iloc[:len(model_name_list)]).pipe(
    lambda inverse_mape: inverse_mape / np.sum(inverse_mape)
)
display(ensemble_weight)

# Weighted sum of the per-model test predictions.
ensemble_test_pred = np.stack([
    model_output["test_pred"] * ensemble_weight.loc[name]
    for name, model_output in model_output_dic.items()
]).sum(axis=0)
# Forecast produced from the last test row.
last_ensemble_test_pred = ensemble_test_pred[-1]

### Visualize validation predictions

In [None]:
# Weighted sum of the per-model validation predictions (same weights as the test ensemble).
ensemble_valid_pred = np.stack([
    model_output["valid_pred"] * ensemble_weight.loc[name]
    for name, model_output in model_output_dic.items()
]).sum(axis=0)

In [None]:
ensemble_valid_pred.shape

In [None]:
# Quarterly mean of the observed industrial target from 1997 on.
# It is identical for every figure, so it is computed once outside the loop.
actual_quarterly = (
    df_eda.loc[df_eda["year"] >= 1997]
    .groupby(["year", "quarter"])["target_ind"]
    .mean()
)
actual_labels = [str(key[0]) + "-" + str(key[1]) for key in actual_quarterly.index.values]

for i in range(ensemble_valid_pred.shape[0]):
    plt.figure(figsize=(16, 8))

    # Window of CFG.test_timesteps rows ending i rows before the end of df_eda.
    # .copy() so adding the prediction column never writes into a slice of df_eda.
    tmp_df = df_eda.iloc[-(CFG.test_timesteps + i) : (None if i == 0 else -i)].copy()
    tmp_df["target"] = ensemble_valid_pred[-(i + 1)]
    pred_quarterly = tmp_df.groupby(["year", "quarter"])["target"].mean()

    plt.plot(
        actual_labels,
        actual_quarterly.values,
        color="orange", label="true value"
    )

    plt.plot(
        [str(key[0]) + "-" + str(key[1]) for key in pred_quarterly.index.values],
        pred_quarterly.values,
        color="grey", linestyle="--", marker="o", mfc="green", label="pred value"
    )

    plt.title("validation set prediction fold " + str(i), pad=20, fontsize=14, weight="bold")
    plt.xticks(rotation=90)

    plt.legend()
    plt.show()
    

In [None]:
# threshold_pct_change = 0.2

# tmp = pd.Series(last_ensemble_test_pred)
# for idx, value in enumerate(tmp.pct_change().abs() > threshold_pct_change):
#     if value:
#         max_pct_change = (tmp.iloc[idx] / np.abs(tmp.iloc[idx])) * (threshold_pct_change) 
#         tmp.iloc[idx] = tmp.iloc[idx-1] * (1+max_pct_change)
# last_ensemble_test_pred = tmp

In [None]:
last_ensemble_test_pred[:5]

In [None]:
pd.Series(last_ensemble_test_pred).plot()

In [None]:
# Mirror the prediction matrix left-to-right, then take the median along the main
# diagonal and each diagonal below it (offsets 0, -1, ...), rounded to 3 decimals.
fliped_output = np.fliplr(ensemble_test_pred)
period_ensemble_test_pred = [
    round(np.median(fliped_output.diagonal(-offset)), 3)
    for offset in range(fliped_output.shape[0])
]

In [None]:
# correct overshoting
threshold_pct_change = 0.2

tmp = pd.Series(period_ensemble_test_pred)
for idx, value in enumerate(tmp.pct_change().abs() > threshold_pct_change):
    if value:
        max_pct_change = (tmp.iloc[idx] / np.abs(tmp.iloc[idx])) * (threshold_pct_change) 
        tmp.iloc[idx] = tmp.iloc[idx-1] * (1+max_pct_change)
period_ensemble_test_pred = tmp

In [None]:
# Peek at the first five values of the period-ensemble prediction.
period_ensemble_test_pred[0:5]

In [None]:
# Wrap the period-ensemble prediction in a Series and draw its line plot.
pd.Series(data=period_ensemble_test_pred).plot(kind="line")

## Submission

In [None]:
# Build the "last sample" submission: IND comes from the last-sample ensemble
# prediction, CIVIL from the CIVIL column of ma_by_month.csv; the result is
# written next to the architecture outputs with a "_lastSample" suffix.
df_submission = pd.read_csv(folder_path + 'dataset\\submission_sample.csv')

# Fail loudly on a length mismatch: a pandas Series assigned to a column is
# aligned on its index, which would silently leave NaN or drop values.
if len(last_ensemble_test_pred) != len(df_submission):
    raise ValueError(
        "last_ensemble_test_pred has " + str(len(last_ensemble_test_pred))
        + " values but the submission template has " + str(len(df_submission)) + " rows"
    )

df_submission["IND"] = last_ensemble_test_pred
df_submission["CIVIL"] = pd.read_csv(folder_path + 'architectures\\civil\\ma_by_month.csv')["CIVIL"]
df_submission.to_csv(architecture_root_path + architecture_name + "_lastSample.csv", index=False)

In [None]:
# Show the first five rows of the submission that was just written.
df_submission.head(n=5)

In [None]:
# Quarterly view of the "last sample" submission next to the historical series.
df_result_viz = df_submission.copy()
df_result_viz.columns = df_result_viz.columns.str.lower()
df_result_viz["quarter"] = df_result_viz["month"].apply(lambda x: month_dic[x])

# Aggregate each series once, selecting the plotted column before .mean() so
# the other columns are not averaged only to be thrown away.
# NOTE(review): the history filter treats level 0 of df_eda's index as the
# year - confirm df_eda is indexed that way at this point of the notebook.
history_quarterly = (
    df_eda.loc[df_eda.index.get_level_values(0) >= 2010]
    .groupby(["year", "quarter"])["target_ind"]
    .mean()
)
forecast_quarterly = df_result_viz.groupby(["year", "quarter"])["ind"].mean()

plt.figure(figsize=(20, 9))
plt.plot(
    [str(year) + "-" + str(quarter) for year, quarter in history_quarterly.index.values],
    history_quarterly.values,
    marker="o", mfc="orange", label="true value"
)
plt.plot(
    [str(year) + "-" + str(quarter) for year, quarter in forecast_quarterly.index.values],
    forecast_quarterly.values,
    color="grey", linestyle="--", marker="o", mfc="green", label="pred value"
)
plt.title("forecasting on industrial natural gas demand (last sample)", pad=20, fontsize=14, weight="bold")
plt.xticks(rotation=90)
# Same curve labels as the validation-fold plots so the figure stands alone.
plt.legend()
plt.show()

In [None]:
# Build the "period ensemble" submission: IND comes from the period-ensemble
# prediction, CIVIL from the CIVIL column of ma_by_month.csv; the result is
# written as <architecture_name>.csv next to the architecture outputs.
df_submission = pd.read_csv(folder_path + 'dataset\\submission_sample.csv')

# Fail loudly on a length mismatch: a pandas Series assigned to a column is
# aligned on its index, which would silently leave NaN or drop values.
if len(period_ensemble_test_pred) != len(df_submission):
    raise ValueError(
        "period_ensemble_test_pred has " + str(len(period_ensemble_test_pred))
        + " values but the submission template has " + str(len(df_submission)) + " rows"
    )

df_submission["IND"] = period_ensemble_test_pred
df_submission["CIVIL"] = pd.read_csv(folder_path + 'architectures\\civil\\ma_by_month.csv')["CIVIL"]
df_submission.to_csv(architecture_root_path + architecture_name + ".csv", index=False)

In [None]:
# Show the first five rows of the submission that was just written.
df_submission.head(n=5)

In [None]:
# Quarterly view of the "period ensemble" submission next to the historical series.
df_result_viz = df_submission.copy()
df_result_viz.columns = df_result_viz.columns.str.lower()
df_result_viz["quarter"] = df_result_viz["month"].apply(lambda x: month_dic[x])

# Aggregate each series once, selecting the plotted column before .mean() so
# the other columns are not averaged only to be thrown away.
# NOTE(review): the history filter treats level 0 of df_eda's index as the
# year - confirm df_eda is indexed that way at this point of the notebook.
history_quarterly = (
    df_eda.loc[df_eda.index.get_level_values(0) >= 2010]
    .groupby(["year", "quarter"])["target_ind"]
    .mean()
)
forecast_quarterly = df_result_viz.groupby(["year", "quarter"])["ind"].mean()

plt.figure(figsize=(20, 9))
plt.plot(
    [str(year) + "-" + str(quarter) for year, quarter in history_quarterly.index.values],
    history_quarterly.values,
    marker="o", mfc="orange", label="true value"
)
plt.plot(
    [str(year) + "-" + str(quarter) for year, quarter in forecast_quarterly.index.values],
    forecast_quarterly.values,
    color="grey", linestyle="--", marker="o", mfc="green", label="pred value"
)
plt.title("forecasting on industrial natural gas demand (period ensemble)", pad=20, fontsize=14, weight="bold")
plt.xticks(rotation=90)
# Same curve labels as the validation-fold plots so the figure stands alone.
plt.legend()
plt.show()