In [None]:
!pip install pmdarima -q
!pip install statsmodels -q

In [10]:
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import warnings
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
# Ignore every warning from here on.
# NOTE(review): the filter is global, so warnings raised by later cells are hidden too.
warnings.simplefilter(action='ignore')

In [2]:
# Settings shared by the rest of the notebook.
CFG = dict(
    TRAIN_WINDOW_SIZE=90,  # train on 90 days of history
    PREDICT_SIZE=21,       # predict 21 days ahead
    EPOCHS=5,
    LEARNING_RATE=1e-4,
    BATCH_SIZE=4096,
    SEED=41,
)

In [3]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and CUDA), exports the seed
    as the ``PYTHONHASHSEED`` environment variable, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed value.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick convolution algorithms per run,
    # which works against reproducibility, so it is switched off here.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the seed

In [27]:
# Read the training table and drop the columns that are not used below.
train_data = (
    pd.read_csv('./new_train.csv')
    .drop(columns=['Unnamed: 0', 'ID', '제품'])
)
train_data

Unnamed: 0,대분류,중분류,소분류,브랜드,Avg_price,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,7325.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,26333.750000,0,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,10853.492063,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,4791.666667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,4921.780492,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,B002-C001-0003,B002-C002-0008,B002-C003-0042,B002-03799,1888.169643,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15886,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,22157.082261,0,0,0,0,0,...,0,0,0,3,0,2,4,1,1,3
15887,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,11712.896203,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15888,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,13600.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [28]:
# Disabled per-row min-max scaling. The code below is wrapped in a string literal,
# so running this cell executes none of it and only echoes the text as the cell value.
'''
# Data Scaling
scale_max_dict = {}
scale_min_dict = {}

for idx in tqdm(range(len(train_data))):
    maxi = np.max(train_data.iloc[idx,5:])
    mini = np.min(train_data.iloc[idx,5:])

    if maxi == mini :
        train_data.iloc[idx,5:] = 0
    else:
        train_data.iloc[idx,5:] = (train_data.iloc[idx,5:] - mini) / (maxi - mini)

    scale_max_dict[idx] = maxi
    scale_min_dict[idx] = mini
'''

'\n# Data Scaling\nscale_max_dict = {}\nscale_min_dict = {}\n\nfor idx in tqdm(range(len(train_data))):\n    maxi = np.max(train_data.iloc[idx,5:])\n    mini = np.min(train_data.iloc[idx,5:])\n\n    if maxi == mini :\n        train_data.iloc[idx,5:] = 0\n    else:\n        train_data.iloc[idx,5:] = (train_data.iloc[idx,5:] - mini) / (maxi - mini)\n\n    scale_max_dict[idx] = maxi\n    scale_min_dict[idx] = mini\n'

In [29]:
# Display the frame again; the scaling cell above is disabled, so nothing has changed it.
train_data

Unnamed: 0,대분류,중분류,소분류,브랜드,Avg_price,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,7325.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,26333.750000,0,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,10853.492063,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,4791.666667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,4921.780492,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,B002-C001-0003,B002-C002-0008,B002-C003-0042,B002-03799,1888.169643,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15886,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,22157.082261,0,0,0,0,0,...,0,0,0,3,0,2,4,1,1,3
15887,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,11712.896203,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15888,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,13600.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [30]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])

In [31]:
# Swap rows and columns so that every original row becomes a column.
# NOTE(review): the name is rebound in place, so running this cell twice flips the frame back.
train_data = train_data.T

In [32]:
# Grid search over ARIMA orders (p, d, q), each taken from 0..3, printing the
# AIC of every order that can be fitted.
p = d = q = range(0, 4)
pdq = list(itertools.product(p, d, q))

# Column 0 of the transposed frame from row 5 onward; rows 0-4 hold the four
# category columns and Avg_price. Selected once because it does not change
# between iterations.
first_series = train_data[5:][0]

for param in pdq:
    try:
        model = sm.tsa.statespace.SARIMAX(first_series, order=param)
        # `trend` is a SARIMAX constructor option; passing it to fit() does not
        # add a constant to the model, so fit() is called without it. To include
        # a constant, pass trend='c' to SARIMAX(...) instead.
        model_fit = model.fit()
        print(param, model_fit.aic)
    except Exception as err:
        # Best-effort search: report the order that failed and carry on.
        # Catching Exception (not a bare `except:`) lets KeyboardInterrupt through.
        print(param, 'failed:', repr(err))
        continue


(0, 0, 0) 1886.5509356671446
(0, 0, 1) 1809.7945263694714
(0, 0, 2) 1740.100486527273
(0, 0, 3) 1710.6042879247166
(0, 1, 0) 1868.3139937381093
(0, 1, 1) 1663.6084462904187
(0, 1, 2) 1664.9343807878465
(0, 1, 3) 1649.912663848785
(0, 2, 0) 2375.3416914246304
(0, 2, 1) 1873.3669293914706
(0, 2, 2) 1672.1487215225902
(0, 2, 3) 1673.6679644294707
(0, 3, 0) 2931.6541165092603
(0, 3, 1) 2379.2720578586436
(0, 3, 2) 1884.5769941314197
(0, 3, 3) 1689.977623602003
(1, 0, 0) 1746.595970505307
(1, 0, 1) 1660.6181913465703
(1, 0, 2) 1662.613624009207
(1, 0, 3) 1651.1838067751005
(1, 1, 0) 1721.8819276543518
(1, 1, 1) 1664.446686675994
(1, 1, 2) 1665.1299709070588
(1, 1, 3) 1650.552428255494
(1, 2, 0) 2058.532476983762
(1, 2, 1) 1728.1001354546293
(1, 2, 2) 1673.3291991558813
(1, 2, 3) 1673.6912783999637
(1, 3, 0) 2507.5875085678363
(1, 3, 1) 2064.2301523682327
(1, 3, 2) 1741.2481171189688
(1, 3, 3) 1691.5167892826548
(2, 0, 0) 1679.1018177030448
(2, 0, 1) 1662.6099985387625
(2, 0, 2) 1662.2792955

In [33]:
# Fit ARIMA(2, 2, 2) on column 0 of the transposed frame (rows 5 onward) and
# print the fit summary. `sm` comes from the import cell at the top of the notebook.
model = sm.tsa.statespace.SARIMAX(train_data[5:][0], order=(2,2,2))
# `trend` is a SARIMAX constructor option (no trend term by default), not a
# fit() option, so fit() is called without it; the fitted model has no constant.
model_fit = model.fit()
print(model_fit.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                  459
Model:               SARIMAX(2, 2, 2)   Log Likelihood                -825.549
Date:                Wed, 23 Aug 2023   AIC                           1661.097
Time:                        22:32:22   BIC                           1681.720
Sample:                    01-01-2022   HQIC                          1669.220
                         - 04-04-2023                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.1489      0.040      3.751      0.000       0.071       0.227
ar.L2          0.2074      0.032      6.435      0.000       0.144       0.271
ma.L1         -1.9195      0.820     -2.341      0.0

In [34]:
# Forecast the configured horizon (CFG['PREDICT_SIZE'] steps) past the end of the
# fitted series instead of repeating the number 21 here.
aa = model_fit.forecast(CFG['PREDICT_SIZE'])

In [35]:
# Display the forecast values.
aa

2023-04-05    0.031622
2023-04-06    0.036376
2023-04-07    0.043686
2023-04-08    0.045805
2023-04-09    0.047681
2023-04-10    0.048444
2023-04-11    0.048992
2023-04-12    0.049276
2023-04-13    0.049476
2023-04-14    0.049609
2023-04-15    0.049715
2023-04-16    0.049802
2023-04-17    0.049882
2023-04-18    0.049956
2023-04-19    0.050028
2023-04-20    0.050099
2023-04-21    0.050168
2023-04-22    0.050238
2023-04-23    0.050307
2023-04-24    0.050376
2023-04-25    0.050445
Freq: D, Name: predicted_mean, dtype: float64