In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.1 colorlog-6.8.0 optuna-3.5.0


# Library

In [3]:
import pandas as pd
from prophet import Prophet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import optuna
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Forecasting the Independent Variables with Prophet

In [4]:
# Forecast every explanatory variable with its own Prophet model and store
# the forecasts in one CSV; later cells use them as model inputs.
var_list = ['최저기온', '일교차', '평균습도', '일사합', '강수량']
FIRST_VALID_ROW = 4749   # '일사합' is null for every row up to index 4748
FORECAST_DAYS = 358      # number of daily steps forecast past the end of train.csv

# Loading and cleaning do not depend on the variable being forecast, so they
# are done once here instead of re-reading the CSV on every loop iteration.
train_df = pd.read_csv('train.csv')
train_df['강수량'] = train_df['강수량'].fillna(0)   # assumption: missing precipitation means no rain
train_df = train_df.iloc[FIRST_VALID_ROW:].reset_index(drop=True)
# Fill the remaining gaps by linear interpolation. Only numeric columns are
# interpolated: the raw '일시' strings are object dtype, and interpolating a
# frame that contains object columns is deprecated in recent pandas releases.
numeric_cols = train_df.select_dtypes(include='number').columns
train_df[numeric_cols] = train_df[numeric_cols].interpolate()
train_df['일시'] = pd.to_datetime(train_df['일시'])

forecast_columns = []
for var in var_list:
    if var == '강수량':
        # Bin precipitation into four ordinal levels:
        # 0: < 3, 1: [3, 15), 2: [15, 30), 3: >= 30
        rain = train_df['강수량']
        observed = np.where(rain < 3, 0,
                            np.where(rain < 15, 1,
                                     np.where(rain < 30, 2, 3)))
    else:
        observed = train_df[var].to_numpy()

    # Each variable gets its own scaler so the forecast can be mapped back
    # to the variable's original units after prediction.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(np.asarray(observed, dtype=float).reshape(-1, 1))
    prophet_df = pd.DataFrame({'ds': train_df['일시'], 'y': scaled.ravel()})

    # Fit the model
    prophet_model = Prophet(growth='linear',
                            weekly_seasonality=False,
                            changepoint_prior_scale=0.1,
                            holidays_prior_scale=0.01,
                            seasonality_prior_scale=10,
                            seasonality_mode='multiplicative',
                            changepoint_range=0.9)
    prophet_model.add_country_holidays(country_name='KR')
    prophet_model.fit(prophet_df)

    # Predict, keeping only the rows that lie beyond the training history
    future = prophet_model.make_future_dataframe(periods=FORECAST_DAYS, freq='d')
    forecast = prophet_model.predict(future).tail(FORECAST_DAYS)
    yhat = scaler.inverse_transform(forecast['yhat'].to_numpy().reshape(-1, 1)).ravel()

    # The date column is identical for every variable, so store it only once.
    if not forecast_columns:
        forecast_columns.append(forecast['ds'].rename('일시'))
    forecast_columns.append(pd.Series(yhat, index=forecast.index, name=var))

# Assemble the result once instead of growing a DataFrame inside the loop.
result_df = pd.concat(forecast_columns, axis=1)
result_df.to_csv('프로펫_독립변수_20231229.csv', index=False, encoding='utf-8')

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp0rt_kmww/fqjs4znc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp0rt_kmww/7dgymj23.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=51914', 'data', 'file=/tmp/tmp0rt_kmww/fqjs4znc.json', 'init=/tmp/tmp0rt_kmww/7dgymj23.json', 'output', 'file=/tmp/tmp0rt_kmww/prophet_modelaha67j88/prophet_model-20240103015420.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
01:54:20 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
01:54:36 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp0r

# Preprocess

In [6]:
# Build the training frame (observed features + target '평균기온') and scale
# both it and the Prophet-forecast features with one shared MinMaxScaler.
result_df = pd.read_csv('프로펫_독립변수_20231229.csv', encoding='utf-8')
result_df = result_df.set_index('일시')

df = pd.read_csv('train.csv')
df['강수량'] = df['강수량'].fillna(0)     # assumption: missing precipitation means no rain
df = df.iloc[4749:].reset_index(drop=True)   # '일사합' is null for every row up to index 4748
# Linear interpolation for the remaining gaps. Only numeric columns are
# interpolated: the raw '일시' strings are object dtype, and interpolating a
# frame that contains object columns is deprecated in recent pandas releases.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].interpolate()
df['일시'] = pd.to_datetime(df['일시'])
df = df.set_index('일시')
# Bin precipitation into four ordinal levels, using the same thresholds that
# were applied before forecasting it: 0: < 3, 1: [3, 15), 2: [15, 30), 3: >= 30
df['강수량'] = np.where(df['강수량'] < 3, 0,
                       np.where((df['강수량'] >= 3) & (df['강수량'] < 15), 1,
                       np.where((df['강수량'] >= 15) & (df['강수량'] < 30), 2,
                       3)))

# Explicit copy: the scaled values assigned below must land in an independent
# frame, not in a slice of the full training frame (chained assignment).
df = df[var_list + ['평균기온']].copy()

# Fit the scaler on the observed features only, then apply the same
# transformation to the forecast features. The target is left unscaled.
scaler = MinMaxScaler()
cols = df.drop(columns=['평균기온']).columns
df[cols] = scaler.fit_transform(df[cols])
result_df[cols] = scaler.transform(result_df[cols])

# Model Tuning

In [7]:
# Hyperparameter tuning with Optuna
X = df.drop(columns=['평균기온'])
y = df['평균기온']
# NOTE(review): this is a shuffled random split; for time-ordered data a
# chronological hold-out may give a more honest validation score - confirm.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


def objective(trial):
    """Train an XGBRegressor with trial-suggested parameters and score it.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Mean absolute error of the fitted model on the validation split
        (lower is better).
    """
    # `suggest_float` replaces `suggest_uniform`, which is deprecated in
    # Optuna 3.x; the sampled ranges are unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 150),
        'subsample': trial.suggest_float('subsample', 0.7, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.03),
    }

    model = XGBRegressor(**params, random_state=42)
    model.fit(X_train, y_train)

    pred = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, pred)

    return mae


# Seed the sampler so that re-running the search proposes the same trials.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

print(study.best_params)

[I 2024-01-03 01:56:19,945] A new study created in memory with name: no-name-aae3be3c-5774-4b3f-b6f5-f18e6cd12842
[I 2024-01-03 01:56:20,170] Trial 0 finished with value: 4.993228218965324 and parameters: {'n_estimators': 101, 'subsample': 0.767812363031168, 'colsample_bytree': 0.5564326752993933, 'learning_rate': 0.01116441246903377}. Best is trial 0 with value: 4.993228218965324.
[I 2024-01-03 01:56:20,414] Trial 1 finished with value: 4.597958919944386 and parameters: {'n_estimators': 133, 'subsample': 0.8852677807242659, 'colsample_bytree': 0.5614239601114784, 'learning_rate': 0.010049289086199687}. Best is trial 1 with value: 4.597958919944386.
[I 2024-01-03 01:56:20,614] Trial 2 finished with value: 3.543348923205841 and parameters: {'n_estimators': 102, 'subsample': 0.808521861003493, 'colsample_bytree': 0.5207265345734038, 'learning_rate': 0.018567045827231024}. Best is trial 2 with value: 3.543348923205841.
[I 2024-01-03 01:56:20,971] Trial 3 finished with value: 2.08451480764

{'n_estimators': 150, 'subsample': 0.8887333227278845, 'colsample_bytree': 0.659207158548348, 'learning_rate': 0.029983763113953597}


# Predicting the Target (평균기온) with XGBoost

In [12]:
# Final model. These hyperparameters are the ones marked "best" and belong to
# the submission file Prophet_XGBRegressor_Optuna_231229.csv.
# Alternative: model = XGBRegressor(**study.best_params)
model = XGBRegressor(
    n_estimators=150,
    subsample=0.8654298963125641,
    learning_rate=0.02997152666801559,
    colsample_bytree=0.650938788194843,
)

# Train on the whole history: every column except the target is a feature.
model.fit(df.loc[:, df.columns != '평균기온'], df.loc[:, '평균기온'])

# Predict from the Prophet-forecast features and attach the calendar dates.
pred = model.predict(result_df)
sub = pd.DataFrame({
    '일시': pd.date_range(start='2023-01-01', end='2023-12-24', freq='D'),
    '평균기온': pred,
})
sub.to_csv('Prophet_XGBRegressor_Optuna_231229.csv', index=False, encoding='utf-8')

In [13]:
# Display the submission frame (columns '일시' and '평균기온') as the cell output.
sub

Unnamed: 0,일시,평균기온
0,2023-01-01,-0.662653
1,2023-01-02,-0.751370
2,2023-01-03,-0.767618
3,2023-01-04,-1.003180
4,2023-01-05,-1.042971
...,...,...
353,2023-12-20,0.153806
354,2023-12-21,0.153806
355,2023-12-22,0.109041
356,2023-12-23,-0.092392
