In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import warnings
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import platform

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Bare expression so the notebook displays the selected device.
device

In [None]:
# Experiment configuration shared by the cells below.
CFG = {
    'TRAIN_WINDOW_SIZE':90, # train on 90 days of history
    'PREDICT_SIZE':21, # predict the following 21 days
    'EPOCHS':10,
    'LEARNING_RATE':1e-5,
    'BATCH_SIZE':2048,
    'SEED':41
}

# Base directory for CSV inputs/outputs: ./data/ under the current working directory.
PATH = os.getcwd() + '/data/'
# Location of the feature-engineered data, chosen by operating system.
# NOTE(review): both branches are machine-specific absolute paths; the notebook will not run elsewhere without editing them.
if platform.system() == 'Darwin':
    LOADPATH = '/Users/a1r/Desktop/DL/timeseries_new_data/'
else:
    LOADPATH = '/home/a1r/바탕화면/DL/timeseries_new_data/'

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports PYTHONHASHSEED, and configures cuDNN
    for reproducible results.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of Python processes started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Request deterministic cuDNN kernels and disable the auto-tuner, whose
    # kernel choice can differ from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the seed for reproducibility

## Dataset

In [None]:
# Submission template.
submit = pd.read_csv(PATH + 'sample_submission.csv')
# Feature-engineered training table, read from the machine-specific LOADPATH.
new_train = pd.read_csv(LOADPATH + 'train_fe.csv', low_memory=False)
# Order rows by ID and then date: later cells slice each ID's rows positionally
# and rely on every ID forming one contiguous, date-ordered block.
new_train = new_train.sort_values(by = ['ID', 'date']).reset_index(drop = True)
# Original (un-engineered) training table.
origin_train = pd.read_csv(PATH + 'train.csv')

### About Features
#### New_Train : train_data after feature engineering
O : 학습에 사용될 Features
X : 학습에서 drop할 Features

* [O] sales - 제품의 일별 판매량  => **Target**
* [X] ID - 제품 ID
* [X] 제품 - 제품 코드
* [O] 대분류 - 제품의 대분류
* [O] 중분류 - 제품의 중분류
* [O] 소분류 - 제품의 소분류
* [O] 브랜드 - 제품의 브랜드
* [X] date - 제품의 판매 날짜
    *  `23.02.23 ~ 23.03.28` : 약 92.65%의 상품이 이 기간동안 0임을 알 수 있음
* [O] quarter - 제품의 판매 분기 (1, 2, 3, 4)분기 존재
* [O] day_name - 제품의 판매 요일
* [O] keyword - 정규화된 제품 브랜드의 키워드 언급 횟수 : 브랜드의 인지도로 판단
* [X] price - 제품의 판매 가격(₩) : data_FE에서 drop되어 학습에는 사용되지 않음
* [O] event - 해당 날짜의 event 유무 : data_FE에서 binary로 변환 (원본 값이 "0"이면 0, 그 외에는 1)

In [None]:
def data_FE(df, drop_start = '2023-02-23', drop_periods = 34):
    """Build the model-ready table from the feature-engineered training frame.

    Steps, in order:
      1. Remove every row whose `date` falls inside the excluded window
         (`drop_periods` consecutive days starting at `drop_start`).
      2. Binarise `event`: the value "0" becomes 0, anything else becomes 1.
      3. Binarise `day_name`: 1 for Saturday/Sunday, 0 otherwise.
      4. Keep only the model columns, drop duplicate rows and reset the index.

    The input frame is never modified, so the calling cell can be re-run
    safely. (If `event` were converted in place, a second run would no longer
    find the string "0" and would mark every row as an event.)

    Args:
        df: Frame containing at least the columns ID, 대분류, 중분류, 소분류,
            브랜드, day_name, quarter, keyword, event, sales and date.
        drop_start: First date (YYYY-MM-DD) of the excluded window.
        drop_periods: Number of consecutive days in the excluded window.

    Returns:
        A new DataFrame with the model columns in a fixed order and a fresh
        0..n-1 index.

    Raises:
        KeyError: If any of the required columns is missing from `df`.
    """
    columns = ['ID', '대분류', '중분류', '소분류', '브랜드', 'day_name', 'quarter', 'keyword', 'event', 'sales', 'date']

    # Build the excluded dates once, in the same YYYY-MM-DD text form as the `date` column.
    drop_dates = pd.date_range(drop_start, periods = drop_periods).strftime('%Y-%m-%d')

    # A single vectorised mask replaces one query()/drop() pass per date.
    keep_rows = ~df['date'].astype(str).isin(drop_dates)

    # Selecting the model columns also discards every other column
    # (제품, year, month, day, day_of_week, price, ...).
    df_enc = df.loc[keep_rows, columns].copy()

    # Event flag: compare as text so both the string "0" and a numeric 0 mean "no event".
    df_enc['event'] = (df_enc['event'].astype(str) != '0').astype(np.int16)

    # Day name -> weekend flag (category -> binary).
    df_enc['day_name'] = df_enc['day_name'].isin(['Saturday', 'Sunday']).astype(np.int64)

    return df_enc.drop_duplicates().reset_index(drop = True)

# Build the model-ready table from the feature-engineered training data.
train_enc = data_FE(new_train)

In [None]:
# Label Encoding

# Columns to integer-encode.
# NOTE(review): data_FE already turns day_name into a 0/1 flag, so encoding it again looks redundant; confirm.
col = ['대분류', '중분류', '소분류', '브랜드', 'day_name']
# One encoder object is refit for each column, so after the loop it only
# holds the classes of the last column and cannot invert the earlier ones.
encoder = LabelEncoder()

for c in col:
    train_enc[c] = encoder.fit_transform(train_enc[c])

# Preview the first rows of the encoded table.
train_enc.head(3)

#### Preprocessing

In [None]:
# Min-max scaling of `sales`, computed separately for each ID
def MinMaxscaler(df):
    """Scale `sales` into [0, 1] independently for every ID.

    For each ID, sales become (sales - min) / (max - min). A series whose
    minimum equals its maximum is set to 0 to avoid a division by zero.

    Groups are taken from the `ID` column and the target from the `sales`
    column, so the function does not depend on a fixed number of rows per ID
    or on the position of the sales column.

    Args:
        df: Frame with at least the columns `ID` and `sales`.

    Returns:
        A tuple (scaled_df, min_arr, max_arr):
          - scaled_df: copy of `df` with `sales` replaced by the scaled values;
            the input frame is left untouched.
          - min_arr / max_arr: per-ID minimum and maximum of the original
            sales, ordered by first appearance of each ID in `df`.
    """
    # sort=False keeps IDs in order of first appearance, i.e. the order of the
    # contiguous per-ID blocks when the frame is sorted by ID.
    sales_by_id = df.groupby('ID', sort = False)['sales']
    group_min = sales_by_id.transform('min')
    group_max = sales_by_id.transform('max')
    denom = group_max - group_min

    # Constant series (denom == 0) are forced to 0 instead of NaN.
    scaled = ((df['sales'] - group_min) / denom).where(denom != 0, 0.0)

    df_scaled = df.copy()
    df_scaled['sales'] = scaled

    min_arr = sales_by_id.min().to_numpy()
    max_arr = sales_by_id.max().to_numpy()

    return df_scaled, min_arr, max_arr

In [None]:
# Scale sales per ID; the per-ID min/max arrays are discarded here.
train_enc, _, _ = MinMaxscaler(train_enc)

## Make Train & Test Dataset to Array

In [None]:
# Design notes kept from the original implementation:
# - A first version based on pandas.DataFrame.query() was too slow to be usable.
# - Converting to numpy alone made no difference; the repeated query() calls were the bottleneck.
# - Rows are ordered by ID and then date, so each ID is one contiguous block that can be
#   sliced positionally, which reduced the run time dramatically.

def make_train_data(data, train_size = CFG['TRAIN_WINDOW_SIZE'], predict_size = CFG['PREDICT_SIZE']):
    '''
    Build (input window, target window) pairs for training.

    For every ID, a window of `train_size + predict_size` consecutive days is
    slid over its rows one day at a time. The first `train_size` days of each
    window (columns at positions 1..9) form the input, and the `sales` values
    (position 9) of the last `predict_size` days form the target.

    data : long-format training table (one row per ID and date), ordered by ID
           and then date, with the feature columns at positions 1..9 and sales
           at position 9
    train_size : number of days used as model input => 90 days by default
    predict_size : number of days to predict => 21 days by default

    Returns (input_data, target_data), float16 arrays of shape
    (num_id * windows_per_id, train_size, 9) and
    (num_id * windows_per_id, predict_size), where
    windows_per_id = num_date - (train_size + predict_size) + 1.

    Raises ValueError when the table is not a complete ID x date grid or when
    there are fewer dates than one full window.
    '''
    window_size = train_size + predict_size         # 90 + 21 = 111 with the defaults
    num_id = data.ID.nunique()
    num_date = data.date.nunique()
    # Number of windows that actually fit in one ID's history. The output
    # arrays are sized with exactly this count so that every row is written
    # and none is left as uninitialised np.empty memory.
    windows_per_id = num_date - window_size + 1

    if len(data) != num_id * num_date:
        raise ValueError(
            f'expected {num_id} IDs x {num_date} dates = {num_id * num_date} rows, got {len(data)}'
        )
    if windows_per_id < 1:
        raise ValueError(
            f'need at least {window_size} dates per ID to build one window, got {num_date}'
        )

    # Convert only the feature columns, once, directly to the output dtype.
    # NOTE(review): float16 represents integers exactly only up to 2048, so large label codes get rounded; confirm this is acceptable.
    values = data.iloc[:, 1:10].to_numpy(dtype = np.float16)
    features = values.reshape(num_id, num_date, values.shape[1])

    input_data = np.empty((num_id * windows_per_id, train_size, values.shape[1]), dtype = np.float16)
    target_data = np.empty((num_id * windows_per_id, predict_size), dtype = np.float16)

    for id_idx in tqdm(range(num_id)):
        block = features[id_idx]                    # (num_date, 9) rows of one ID
        offset = id_idx * windows_per_id
        for j in range(windows_per_id):
            input_data[offset + j] = block[j:j + train_size]
            target_data[offset + j] = block[j + train_size:j + window_size, -1]  # sales

    return input_data, target_data

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']): # 90
    '''
    Build the model inputs used to predict the test period.

    For every ID, the last `train_size` rows (columns at positions 1..9) are
    taken as the input window.

    data : long-format training table (one row per ID and date), ordered by ID
           and then date, with the feature columns at positions 1..9
    train_size : number of most recent days fed to the model (same as the
                 training input length)

    Returns a float16 array of shape (num_id, train_size, 9).

    Raises ValueError when the table is not a complete ID x date grid or when
    `train_size` is not between 1 and the number of dates.
    '''
    num_id = data.ID.nunique()
    num_date = data.date.nunique()

    if len(data) != num_id * num_date:
        raise ValueError(
            f'expected {num_id} IDs x {num_date} dates = {num_id * num_date} rows, got {len(data)}'
        )
    if not 1 <= train_size <= num_date:
        raise ValueError(
            f'train_size must be between 1 and the number of dates ({num_date}), got {train_size}'
        )

    # Rows per ID come from the data (num_date) instead of a fixed constant.
    values = data.iloc[:, 1:10].to_numpy(dtype = np.float16)
    features = values.reshape(num_id, num_date, values.shape[1])

    # copy() returns an independent, contiguous array rather than a view.
    return features[:, -train_size:, :].copy()

In [None]:
# Build the sliding-window training arrays and the inference inputs from train_enc.
train_input, train_target = make_train_data(train_enc)
test_input = make_predict_data(train_enc)

# Persist the arrays in the working directory (np.save appends ".npy").
# NOTE(review): 'train_input_weeke' looks like a typo for 'train_input_week' (compare the
# two names below); confirm before renaming, since other code may load the file by this name.
np.save(f'train_input_weeke', train_input)
np.save(f'train_target_week', train_target)
np.save(f'test_input_week', test_input)

##### 대분류 별로 MinMaxScaling

In [None]:
# Split the table by the label-encoded top-level category (대분류), codes 0 to 4.
# NOTE(review): train_enc has already been min-max scaled per ID by this point when the
# notebook is run top to bottom, so these subsets carry scaled sales; confirm that is intended.
train_enc_0 = train_enc.query('대분류 == 0')
train_enc_1 = train_enc.query('대분류 == 1')
train_enc_2 = train_enc.query('대분류 == 2')
train_enc_3 = train_enc.query('대분류 == 3')
train_enc_4 = train_enc.query('대분류 == 4')

train_enc_large = [train_enc_0, train_enc_1, train_enc_2, train_enc_3, train_enc_4]

In [None]:
# Scale each category subset per ID and keep the per-ID min/max arrays.
# NOTE(review): if the subsets come from an already-scaled train_enc, these min/max values
# presumably describe the scaled series rather than the original sales range; verify before
# using them to invert predictions.
train_enc_0, min_0, max_0 = MinMaxscaler(train_enc_0)
train_enc_1, min_1, max_1 = MinMaxscaler(train_enc_1)
train_enc_2, min_2, max_2 = MinMaxscaler(train_enc_2)
train_enc_3, min_3, max_3 = MinMaxscaler(train_enc_3)
train_enc_4, min_4, max_4 = MinMaxscaler(train_enc_4)

# np.save('MIN_0', min_0)
# np.save('MAX_0', max_0)

# np.save('MIN_1', min_1)
# np.save('MAX_1', max_1)

# np.save('MIN_2', min_2)
# np.save('MAX_2', max_2)

# np.save('MIN_3', min_3)
# np.save('MAX_3', max_3)

# np.save('MIN_4', min_4)
# np.save('MAX_4', max_4)

# Rebuild the list so it refers to the scaled subsets.
train_enc_large = [train_enc_0, train_enc_1, train_enc_2, train_enc_3, train_enc_4]

In [None]:
# Build and save the window arrays for each category subset; the file suffix
# is the subset's position in train_enc_large (np.save appends ".npy").
for i, data in enumerate(tqdm(train_enc_large)):
    train_input, train_target = make_train_data(data)
    test_input = make_predict_data(data)

    np.save(f'train_input_{i}', train_input)
    np.save(f'train_target_{i}', train_target)
    np.save(f'test_input_{i}', test_input)

- 대분류: 0
    - train_input.shape: (1217064, 90, 9)
    - train_target.shape: (1217064, 21)
- 대분류: 1
    - train_input.shape: (3538142, 90, 9)
    - train_target.shape: (3538142, 21)
- 대분류: 2
    - train_input.shape: (143412, 90, 9)
    - train_target.shape: (143412, 21)

In [None]:
# Rebuild the arrays for the full table, overwriting the per-category arrays left in
# train_input / train_target / test_input by the loop over train_enc_large.
# NOTE(review): this repeats the make_train_data / make_predict_data calls made on train_enc
# in an earlier cell; consider reusing or caching that result instead of recomputing it.
train_input, train_target = make_train_data(train_enc)
test_input = make_predict_data(train_enc)

# np.save('train_input', train_input)
# np.save('train_target', train_target)
# np.save('test_input', test_input)

## EDA by Large Class

In [None]:
# ID_num = 3768

# Load the table stored as train_data_0.csv in the working directory.
# presumably this is the 대분류 == 0 subset; verify where the file was written.
train_enc_0 = pd.read_csv(os.getcwd() + '/train_data_0.csv')
# Sort by date; reset_index() without drop=True keeps the previous index as an 'index' column.
train_enc_0 = train_enc_0.sort_values(by = 'date').reset_index()
# Bare expression so the notebook displays the frame.
train_enc_0

In [None]:
train_0 = train_enc_0.copy()

# Walk the date-sorted table in 425 consecutive chunks of 3768 rows.
# presumably each chunk is one date (3768 IDs per date, 425 dates); TODO confirm.
for i in range(425):
    # Mean keyword value and total sales of the current chunk.
    key_mean = np.mean(train_enc_0.iloc[i*3768: (i+1) * 3768].keyword)
    sales_sum = sum(train_enc_0.iloc[i*3768: (i+1) * 3768].sales)
    # Columns are addressed by position: 10 receives the sales total, 8 the keyword mean.
    # NOTE(review): assumes positions 10 and 8 are 'sales' and 'keyword' in the loaded CSV; verify.
    train_0.iloc[i*3768:(i+1) * 3768, 10] = np.repeat(sales_sum, 3768)
    train_0.iloc[i*3768:(i+1) * 3768, 8] = np.repeat(key_mean, 3768)

# Remove the per-product columns and collapse the now-duplicated rows.
drop_col = ['index', 'ID', '중분류', '소분류', '브랜드']
train_0 = train_0.drop(drop_col, axis = 1).drop_duplicates().reset_index(drop = True)

## Output Blending

In [None]:
# Load the four saved model outputs (two GRU, two LSTM) that will be blended.
gru_1 = pd.read_csv(PATH + 'GRU_1e_4_drop(0.4).csv')
lstm_1 = pd.read_csv(PATH + 'lstm_first.csv')
gru_2 = pd.read_csv(PATH + 'GRU_2_layers_1e_4.csv')
lstm_2 = pd.read_csv(PATH + 'lstm_2Layers_0820.csv')

In [None]:
# Start from a copy of one output so its first column and layout are reused for the blend.
blend_all = lstm_1.copy()

In [None]:
# Sum the four outputs over every column after the first one.
blend = gru_1.iloc[:, 1:] + gru_2.iloc[:, 1:] + lstm_1.iloc[:, 1:] + lstm_2.iloc[:, 1:]
# Average the four models, round (np.around rounds halves to the nearest even value) and cast to int.
blend_all.iloc[:, 1:] = np.around(blend/4).astype(int)

In [None]:
# Write the blended output.
# NOTE(review): index_label=False still writes the row index as a first column and only omits
# its header field; if the file must match sample_submission.csv, index=False is presumably
# what was intended. Confirm.
blend_all.to_csv(PATH + 'Blend_0820.csv', index_label=False)