In [1]:
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import warnings
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
warnings.simplefilter(action='ignore')

In [2]:
# Notebook-wide settings, collected in one mapping.
CFG = dict(
    TRAIN_WINDOW_SIZE=90,  # train on 90 days of history
    PREDICT_SIZE=21,       # forecast 21 days ahead
    EPOCHS=5,
    LEARNING_RATE=1e-4,
    BATCH_SIZE=4096,
    SEED=41,
)

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick algorithms per run, which
    # works against the deterministic setting above, so it is disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the seed

In [4]:
# Read the training table, then drop the 'Unnamed: 0', 'ID' and '제품' columns.
train_data = pd.read_csv('./new_train.csv')
train_data = train_data.drop(columns=['Unnamed: 0', 'ID', '제품'])
train_data

Unnamed: 0,대분류,중분류,소분류,브랜드,Avg_price,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,7325.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,26333.750000,0,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,10853.492063,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,4791.666667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,4921.780492,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,B002-C001-0003,B002-C002-0008,B002-C003-0042,B002-03799,1888.169643,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15886,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,22157.082261,0,0,0,0,0,...,0,0,0,3,0,2,4,1,1,3
15887,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,11712.896203,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15888,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,13600.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [5]:
# Disabled per-row min-max scaling: the code below sits inside a string literal,
# so it is never executed and the cell only echoes the string.
'''
# Data Scaling
scale_max_dict = {}
scale_min_dict = {}

for idx in tqdm(range(len(train_data))):
    maxi = np.max(train_data.iloc[idx,5:])
    mini = np.min(train_data.iloc[idx,5:])

    if maxi == mini :
        train_data.iloc[idx,5:] = 0
    else:
        train_data.iloc[idx,5:] = (train_data.iloc[idx,5:] - mini) / (maxi - mini)

    scale_max_dict[idx] = maxi
    scale_min_dict[idx] = mini
'''

'\n# Data Scaling\nscale_max_dict = {}\nscale_min_dict = {}\n\nfor idx in tqdm(range(len(train_data))):\n    maxi = np.max(train_data.iloc[idx,5:])\n    mini = np.min(train_data.iloc[idx,5:])\n\n    if maxi == mini :\n        train_data.iloc[idx,5:] = 0\n    else:\n        train_data.iloc[idx,5:] = (train_data.iloc[idx,5:] - mini) / (maxi - mini)\n\n    scale_max_dict[idx] = maxi\n    scale_min_dict[idx] = mini\n'

In [6]:
# Replace each categorical column with integer codes. One encoder object is
# re-fitted per column, so afterwards it holds the classes of the last column.
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']
label_encoder = LabelEncoder()

for col in categorical_columns:
    train_data[col] = label_encoder.fit(train_data[col]).transform(train_data[col])

In [7]:
# Flip the table so each original row becomes a column.
train_data = train_data.T

In [8]:
# Pick the ARIMA (p, d, q) order for the first series (column 0) by lowest AIC,
# then refit with that order and forecast CFG['PREDICT_SIZE'] days ahead.
p = d = q = range(0, 4)
pdq = list(itertools.product(p, d, q))
min_a = float('inf')
min_param = None

# After the transpose each column is one series. The first five rows hold the
# encoded category columns and Avg_price, so the daily values start at row 5.
first_series = train_data.iloc[5:][0]

for param in pdq:
    try:
        model = sm.tsa.statespace.SARIMAX(first_series, order=param)
        # `trend` is a SARIMAX constructor option, not a documented fit()
        # parameter, so it is no longer passed to fit().
        model_fit = model.fit(disp=False)
        if model_fit.aic < min_a:
            min_a = model_fit.aic
            min_param = param
    except Exception:
        # Skip orders that cannot be estimated for this series. Unlike a bare
        # `except:`, this lets KeyboardInterrupt / SystemExit propagate.
        continue

if min_param is None:
    # Fail with a clear message instead of a NameError further down.
    raise RuntimeError('No ARIMA order could be fitted for series 0')

model = sm.tsa.statespace.SARIMAX(first_series, order=min_param)
model_fit = model.fit(disp=False)
aa = model_fit.forecast(CFG['PREDICT_SIZE'])

In [None]:
def _select_order_by_aic(series, orders):
    """Return the order in `orders` with the lowest AIC on `series`.

    Orders whose fit raises are skipped. Returns None when no order could be
    fitted at all.
    """
    best_order, best_aic = None, float('inf')
    for order in orders:
        try:
            fitted = sm.tsa.statespace.SARIMAX(series, order=order).fit(disp=False)
            if fitted.aic < best_aic:
                best_order, best_aic = order, fitted.aic
        except Exception:
            # Unlike a bare `except:`, KeyboardInterrupt still stops the run.
            continue
    return best_order


p = d = q = range(0, 3)
pdq = list(itertools.product(p, d, q))

# This notebook forecasts series 0..7944; series 0 is already held in `aa`.
N_SERIES = 7945

# Start from the series-0 forecast only, so re-running this cell does not keep
# appending to a frame produced by an earlier run.
forecasts = [aa.iloc[:, 0] if isinstance(aa, pd.DataFrame) else aa]

for idx in tqdm(range(1, N_SERIES)):
    # Daily values start at row 5 of the transposed frame.
    series = train_data.iloc[5:][idx]
    # The order is searched on THIS series, keeping the best AIC over all orders.
    min_param = _select_order_by_aic(series, pdq)
    if min_param is None:
        # Explicit fallback instead of silently reusing the previous series' order.
        min_param = (0, 0, 0)
    model = sm.tsa.statespace.SARIMAX(series, order=min_param)
    # `trend` is a SARIMAX constructor option, not a documented fit()
    # parameter, so it is no longer passed to fit().
    model_fit = model.fit(disp=False)
    bb = model_fit.forecast(CFG['PREDICT_SIZE'])
    forecasts.append(bb)

# Concatenate once at the end; concatenating inside the loop copies the
# growing frame on every iteration.
aa = pd.concat(forecasts, axis=1)

  0%|          | 0/7944 [00:00<?, ?it/s]

In [9]:
# Display the forecasts currently held in `aa`.
aa

2023-04-05    0.018659
2023-04-06    0.014484
2023-04-07    0.019607
2023-04-08    0.021115
2023-04-09    0.016273
2023-04-10    0.022094
2023-04-11    0.018454
2023-04-12    0.018502
2023-04-13    0.021558
2023-04-14    0.017358
2023-04-15    0.020477
2023-04-16    0.019809
2023-04-17    0.018059
2023-04-18    0.021014
2023-04-19    0.018480
2023-04-20    0.019413
2023-04-21    0.020302
2023-04-22    0.018294
2023-04-23    0.020272
2023-04-24    0.019289
2023-04-25    0.018942
Freq: D, Name: predicted_mean, dtype: float64

In [12]:
# Relabel the forecast columns 0..n-1 and flip the frame so each row is one
# series. The count comes from `aa` itself, so the labels always match its width.
cols = list(range(aa.shape[1]))
aa.columns = cols
result = aa.transpose()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [11]:
# Load the sample submission, which is filled in further down, and preview it.
submit = pd.read_csv('./sample_submission.csv')
submit.head()

2023-04-05    0.018659
2023-04-06    0.014484
2023-04-07    0.019607
2023-04-08    0.021115
2023-04-09    0.016273
2023-04-10    0.022094
2023-04-11    0.018454
2023-04-12    0.018502
2023-04-13    0.021558
2023-04-14    0.017358
2023-04-15    0.020477
2023-04-16    0.019809
2023-04-17    0.018059
2023-04-18    0.021014
2023-04-19    0.018480
2023-04-20    0.019413
2023-04-21    0.020302
2023-04-22    0.018294
2023-04-23    0.020272
2023-04-24    0.019289
2023-04-25    0.018942
Freq: D, Name: predicted_mean, dtype: float64

In [14]:
# Copy the forecasts into every column of the submission except the first.
# The raw array is assigned so values are placed purely by position, without
# involving the row/column labels carried by `result`.
# Only the first len(result) rows are written.
submit.iloc[:len(result), 1:] = result.to_numpy()
submit.head()

NameError: name 'result' is not defined

In [None]:
# Write the filled submission to CSV without the DataFrame index column.
submit.to_csv('./arima_front_0_to_7944.csv', index=False)