In [None]:
# Colab-only: mount Google Drive at /content/drive; DATA_PATH below points
# into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all visible CUDA devices), and switches cuDNN to deterministic
    kernels with autotuning disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU, not just the current one; a no-op when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, which defeats
    # the deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Folder on the mounted Drive that holds the CSV files read by this notebook.
DATA_PATH = "/content/drive/MyDrive/코랩파일들/제주 특산물 가격예측 AI/data/"
# Seed handed to reset_seeds().
SEED = 42

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
!pip install darts

Collecting darts
  Downloading darts-0.26.0-py3-none-any.whl (784 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m784.8/784.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting nfoursid>=1.0.0 (from darts)
  Downloading nfoursid-1.0.1-py3-none-any.whl (16 kB)
Collecting pmdarima>=1.8.0 (from darts)
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=0.9.5 (from darts)
  Downloading pyod-1.1.1.tar.gz (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.4/159.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting shap>=0.40.0 (from darts)
  Downloading shap-0.43.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (532 

In [None]:
from darts.datasets import WeatherDataset
from darts.models import DLinearModel

In [None]:
# Fix all RNG seeds once, before any data is loaded or any model is built.
reset_seeds(SEED)

In [None]:
# Read the train and test CSVs from DATA_PATH.
train_df = pd.read_csv(f'{DATA_PATH}train.csv')
test_df = pd.read_csv(f'{DATA_PATH}test.csv')

In [None]:
# Split the timestamp into year, month and day columns; the holiday rules
# further down select dates through these columns.
# pd.to_datetime accepts both the raw 'YYYY-MM-DD' strings and an already
# parsed datetime column, so this cell can be re-run after the weekday cells
# have converted `timestamp` (string slicing would raise a TypeError then).
for frame in (train_df, test_df):
    stamps = pd.to_datetime(frame['timestamp'])
    frame['year'] = stamps.dt.year
    frame['month'] = stamps.dt.month
    frame['day'] = stamps.dt.day

In [None]:
# Parse the training timestamps once, store them back as datetimes and derive
# the weekday number (Monday = 0 ... Sunday = 6).
train_df['timestamp'] = date = pd.to_datetime(train_df['timestamp'])
train_df['weekday'] = date.dt.weekday

In [None]:
# Same conversion for the test frame: datetime timestamps plus the weekday
# number (Monday = 0 ... Sunday = 6).
test_df['timestamp'] = date = pd.to_datetime(test_df['timestamp'])
test_df['weekday'] = date.dt.weekday

In [None]:
# NOTE: despite its name, `holiday` is 0 on a holiday/Sunday and 1 on an
# ordinary day. Sundays (weekday == 6) are the baseline; the following cells
# add specific dates to the training frame.
# Vectorized comparison instead of a row-wise DataFrame.apply.
train_df['holiday'] = (train_df['weekday'] != 6).astype(int)
test_df['holiday'] = (test_df['weekday'] != 6).astype(int)

In [None]:
# Chuseok dates flagged as holidays (holiday = 0) in the training data,
# keyed by (year, month) -> days of that month.
chuseok_days = {
    (2022, 9): [10, 11, 12],
    (2021, 9): [20, 21, 22, 23],
    (2020, 10): [1, 2, 3],
    (2019, 9): [13, 14],
}
for (year, month), days in chuseok_days.items():
    # A boolean mask addresses the rows directly; no need to round-trip
    # through the index labels of a filtered copy.
    on_chuseok = (
        (train_df['year'] == year)
        & (train_df['month'] == month)
        & train_df['day'].isin(days)
    )
    train_df.loc[on_chuseok, 'holiday'] = 0


In [None]:
# Further dates flagged as holidays (holiday = 0) in the training data,
# keyed by (year, month) -> days of that month.
# NOTE(review): these look like the Lunar New Year (Seollal) periods — confirm.
extra_holiday_days = {
    (2023, 1): [23, 24],
    (2022, 1): [31],
    (2022, 2): [1, 2, 3],
    (2021, 2): [12, 13],
    (2020, 1): [25, 27],
    (2019, 2): [5, 6, 7],
}
for (year, month), days in extra_holiday_days.items():
    # A boolean mask addresses the rows directly; no need to round-trip
    # through the index labels of a filtered copy.
    on_holiday = (
        (train_df['year'] == year)
        & (train_df['month'] == month)
        & train_df['day'].isin(days)
    )
    train_df.loc[on_holiday, 'holiday'] = 0

In [None]:
# 1 and 2 January of every year are flagged as holidays (holiday = 0) too.
on_new_year = (train_df['month'] == 1) & train_df['day'].isin([1, 2])
train_df.loc[on_new_year, 'holiday'] = 0

In [None]:
from darts.timeseries import TimeSeries

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
# Early-stopping callback: watches "val_loss" (lower is better) with
# patience=20 and min_delta=0.002.
# NOTE(review): no trainer in the visible cells receives this callback, so it
# currently has no effect — confirm whether it should be passed through
# pl_trainer_kwargs["callbacks"].
my_stopper = EarlyStopping(
    monitor="val_loss",
    patience=20,
    min_delta=0.002,
    mode='min',
)

In [None]:
from darts.models import NLinearModel

In [None]:
idx = 0
for i in tqdm(train_df['item'].unique()):
  for c in train_df['corporation'].unique():
    for l in train_df['location'].unique():
      test_tmp = test_df[(test_df['item'] == f'{i}')&(test_df['corporation'] == f'{c}')&(test_df['location'] == f'{l}')]
      # test_tmp = test_tmp[test_tmp['holiday'] != 0]

      if len(test_tmp) == 0:
        pass

      else:
        tmp = train_df[(train_df['item'] == f'{i}')&(train_df['corporation'] == f'{c}')&(train_df['location'] == f'{l}')]
        # tmp = tmp[tmp['holiday'] !=0]
        tmp2 = tmp[['timestamp','price(원/kg)']]
        tmp2.columns = ['ds','y']
        train = tmp2[:-int(len(tmp2)*0.2)]
        target = tmp2[-int(len(tmp2)*0.2):]
        train_t = TimeSeries.from_dataframe(train,time_col='ds')['y']
        target_t = TimeSeries.from_dataframe(target,time_col='ds')['y']

        model = NLinearModel(
            input_chunk_length=120,
            output_chunk_length=28,
            n_epochs=60,
            loss_fn = RMSELoss(),
            optimizer_kwargs = {'lr': 1e-3},
            model_name = f'dansama_{idx}',
            batch_size = 32,
            random_state= 42,
            save_checkpoints = True,
            # normalize = True,
            pl_trainer_kwargs = {"accelerator": "gpu", "devices": -1} # , "callbacks": [my_stopper]

        )
        model.fit(train_t,val_series = target_t,verbose = True)
        model_loaded = NLinearModel.load_from_checkpoint(f'dansama_{idx}', best=True, map_location="cpu")
        idx+=1
        pred = model_loaded.predict(28,TimeSeries.from_dataframe(tmp2[-120:],time_col='ds')['y'],verbose = False)
        test_df.loc[test_tmp.index,'answer'] = pred.all_values().reshape(-1)

In [None]:
from datetime import datetime, timedelta
idx = 0
for i in tqdm(train_df['item'].unique()):
  for c in train_df['corporation'].unique():
    for l in train_df['location'].unique():
      test_tmp = test_df[(test_df['item'] == f'{i}')&(test_df['corporation'] == f'{c}')&(test_df['location'] == f'{l}')]
      test_tmp = test_tmp[test_tmp['holiday'] != 0]

      if len(test_tmp) == 0:
        pass

      else:
        tmp = train_df[(train_df['item'] == f'{i}')&(train_df['corporation'] == f'{c}')&(train_df['location'] == f'{l}')]
        tmp = tmp[tmp['holiday'] !=0]
        tmp['timestamp'] = [tmp['timestamp'].iloc[0]+timedelta(days=x) for x in range(len(tmp))]
        tmp2 = tmp[['timestamp','price(원/kg)']]
        tmp2.columns = ['ds','y']
        train = tmp2[:-int(len(tmp2)*0.2)]
        target = tmp2[-int(len(tmp2)*0.2):]
        train_t = TimeSeries.from_dataframe(train,time_col='ds')['y']
        target_t = TimeSeries.from_dataframe(target,time_col='ds')['y']

        model = NLinearModel(
            input_chunk_length=120,
            output_chunk_length=24,
            n_epochs=80,
            loss_fn = RMSELoss(),
            optimizer_kwargs = {'lr': 5e-4},
            model_name = f'dansama_{idx}',
            batch_size = 32,
            random_state= 42,
            save_checkpoints = True,
            normalize = True,

            pl_trainer_kwargs = {"accelerator": "gpu", "devices": -1},# , "callbacks": [my_stopper]

        )
        model.fit(train_t,val_series = target_t,verbose = False)
        model_loaded = NLinearModel.load_from_checkpoint(f'dansama_{idx}', best=True, map_location="cpu")
        idx+=1
        pred = model_loaded.predict(24,TimeSeries.from_dataframe(tmp2[-120:],time_col='ds')['y'],verbose = False)
        test_df.loc[test_tmp.index,'answer2'] = pred.all_values().reshape(-1)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
idx = 0
for i in tqdm(train_df['item'].unique()):
  for c in train_df['corporation'].unique():
    for l in train_df['location'].unique():
      test_tmp = test_df[(test_df['item'] == f'{i}')&(test_df['corporation'] == f'{c}')&(test_df['location'] == f'{l}')]
      # test_tmp = test_tmp[test_tmp['holiday'] != 0]

      if len(test_tmp) == 0:
        pass

      else:
        tmp = train_df[(train_df['item'] == f'{i}')&(train_df['corporation'] == f'{c}')&(train_df['location'] == f'{l}')]
        # tmp = tmp[tmp['holiday'] !=0]
        tmp2 = tmp[['timestamp','price(원/kg)']]
        tmp2.columns = ['ds','y']
        tmp2.loc[tmp2.index,'y'] = scaler.fit_transform(tmp2[['y']])
        train = tmp2[:-int(len(tmp2)*0.2)]
        target = tmp2[-int(len(tmp2)*0.2):]
        train_t = TimeSeries.from_dataframe(train,time_col='ds')['y']
        target_t = TimeSeries.from_dataframe(target,time_col='ds')['y']

        model = NLinearModel(
            input_chunk_length=120,
            output_chunk_length=28,
            n_epochs=60,
            loss_fn = RMSELoss(),
            optimizer_kwargs = {'lr': 1e-3},
            model_name = f'dansama_{idx}',
            batch_size = 32,
            random_state= 42,
            save_checkpoints = True,
            # normalize = True,
            pl_trainer_kwargs = {"accelerator": "gpu", "devices": -1} # , "callbacks": [my_stopper]

        )
        model.fit(train_t,val_series = target_t,verbose = False)
        model_loaded = NLinearModel.load_from_checkpoint(f'dansama_{idx}', best=True, map_location="cpu")
        idx+=1
        pred = model_loaded.predict(28,TimeSeries.from_dataframe(tmp2[-120:],time_col='ds')['y'],verbose = False)
        test_df.loc[test_tmp.index,'answer3'] = pred.all_values().reshape(-1)
        test_df.loc[test_tmp.index,'answer3'] = scaler.inverse_transform(test_df.loc[test_tmp.index,['answer3']])

In [None]:
idx = 0
for i in tqdm(train_df['item'].unique()):
  for c in train_df['corporation'].unique():
    for l in train_df['location'].unique():
      test_tmp = test_df[(test_df['item'] == f'{i}')&(test_df['corporation'] == f'{c}')&(test_df['location'] == f'{l}')]
      test_tmp = test_tmp[test_tmp['holiday'] != 0]

      if len(test_tmp) == 0:
        pass

      else:
        tmp = train_df[(train_df['item'] == f'{i}')&(train_df['corporation'] == f'{c}')&(train_df['location'] == f'{l}')]
        tmp = tmp[tmp['holiday'] !=0]
        tmp['timestamp'] = [tmp['timestamp'].iloc[0]+timedelta(days=x) for x in range(len(tmp))]
        tmp2 = tmp[['timestamp','price(원/kg)']]
        tmp2.columns = ['ds','y']
        tmp2.loc[tmp2.index,'y'] = scaler.fit_transform(tmp2[['y']])
        train = tmp2[:-int(len(tmp2)*0.2)]
        target = tmp2[-int(len(tmp2)*0.2):]
        train_t = TimeSeries.from_dataframe(train,time_col='ds')['y']
        target_t = TimeSeries.from_dataframe(target,time_col='ds')['y']

        model = NLinearModel(
            input_chunk_length=120,
            output_chunk_length=24,
            n_epochs=80,
            loss_fn = RMSELoss(),
            optimizer_kwargs = {'lr': 5e-4},
            model_name = f'dansama_{idx}',
            batch_size = 32,
            random_state= 42,
            save_checkpoints = True,
            normalize = True,

            pl_trainer_kwargs = {"accelerator": "gpu", "devices": -1},# , "callbacks": [my_stopper]

        )
        model.fit(train_t,val_series = target_t,verbose = False)
        model_loaded = NLinearModel.load_from_checkpoint(f'dansama_{idx}', best=True, map_location="cpu")
        idx+=1
        pred = model_loaded.predict(24,TimeSeries.from_dataframe(tmp2[-120:],time_col='ds')['y'],verbose = False)
        test_df.loc[test_tmp.index,'answer4'] = pred.all_values().reshape(-1)
        test_df.loc[test_tmp.index,'answer4'] = scaler.inverse_transform(test_df.loc[test_tmp.index,['answer4']])

In [None]:
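# Optional manual cleanup: uncomment to delete /content/darts_logs
# (presumably where Darts wrote the checkpoints and logs of the models above).
# import shutil
# shutil.rmtree("/content/darts_logs")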

In [None]:
# model_loaded = DLinearModel.load_from_checkpoint(f'dansama_0', best=True, map_location="cpu")
# pred = model_loaded.predict(28,TimeSeries.from_dataframe(tmp2[-120:],time_col='ds')['y'])
# test_df.loc[test_tmp.index,'answer'] = pred.all_values().reshape(-1)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
# Load the submission template from DATA_PATH; its 'answer' column is
# overwritten in the cells below.
submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [None]:
# Equal-weight blend of the four forecast columns.
# answer2/answer4 are never written for holiday rows, so those rows are NaN
# here until the next cell sets them to 0.
# NOTE(review): the assignment aligns on index labels — assumes submission
# and test_df share the same row order/index; confirm.
member_cols = ['answer', 'answer2', 'answer3', 'answer4']
blend = test_df[member_cols[0]]
for col in member_cols[1:]:
    blend = blend + test_df[col]
submission['answer'] = blend / 4
submission

In [None]:
# Rows flagged as holidays (holiday == 0) in the test frame get an answer of 0;
# the affected rows are displayed afterwards.
holiday_idx = test_df.index[(test_df['holiday'] == 0).to_numpy()]
submission.loc[holiday_idx, 'answer'] = 0
submission.loc[holiday_idx]

Unnamed: 0,ID,answer
1,TG_A_J_20230305,0.0
8,TG_A_J_20230312,0.0
15,TG_A_J_20230319,0.0
22,TG_A_J_20230326,0.0
29,TG_A_S_20230305,0.0
...,...,...
1058,CB_F_J_20230326,0.0
1065,RD_F_J_20230305,0.0
1072,RD_F_J_20230312,0.0
1079,RD_F_J_20230319,0.0


In [None]:
# Inspect rows whose blended answer is negative.
submission.loc[submission['answer'] < 0]

Unnamed: 0,ID,answer
326,CR_B_J_20230322,-287.057783
334,CR_B_J_20230330,-7.428426
392,CR_D_S_20230304,-1.147227
395,CR_D_S_20230307,-0.542519
396,CR_D_S_20230308,-0.632864
...,...,...
933,BC_C_S_20230313,-2.748989
940,BC_C_S_20230320,-0.468378
941,BC_C_S_20230321,-5.943059
942,BC_C_S_20230322,-7.875756


In [None]:
# Clamp negative answers to 0, then re-check: the displayed frame should be
# empty.
is_negative = submission['answer'] < 0
submission.loc[is_negative, 'answer'] = 0
submission.loc[submission['answer'] < 0]

Unnamed: 0,ID,answer


In [None]:
# Write the final submission to the current working directory, without the
# index column.
# NOTE(review): 'N_linaer' looks like a typo for 'N_linear'; left unchanged in
# case anything downstream expects this exact file name.
submission.to_csv('N_linaer.csv', index=False)