In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import warnings
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import platform

warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [2]:
CFG = {
    'TRAIN_WINDOW_SIZE':90, # 90일치로 학습
    'PREDICT_SIZE':21, # 21일치 예측
    'EPOCHS':10,
    'LEARNING_RATE':5e-7,
    'BATCH_SIZE':2048,
    'SEED':41
}

PATH = os.getcwd() + '/data/'
LOADPATH = '/home/a1r/바탕화면/DL/timeseries_new_data/'

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

In [None]:
submit = pd.read_csv(PATH + 'sample_submission.csv')
new_train = pd.read_csv(LOADPATH + 'train_fe.csv', low_memory=False)
new_train = new_train.sort_values(by = ['ID', 'date']).reset_index(drop = True)
origin_train = pd.read_csv(PATH + 'train.csv')

## About Features
#### New_Train : train_data after feature engineering
O : 학습에 사용될 Features
X : 학습에서 drop할 Features

* [O] sales - 제품의 일별 판매량  => **Target**
* [X] ID - 제품 ID
* [X] 제품 - 제품 코드
* [O] 대분류 - 제품의 대분류
* [O] 중분류 - 제품의 중분류
* [O] 소분류 - 제품의 소분류
* [O] 브랜드 - 제품의 브랜드
* [X] date - 제품의 판매 날짜
    *  `23.02.23 ~ 23.03.28` : 약 92.65%의 상품이 이 기간동안 0임을 알 수 있음
* [O] quarter - 제품의 판매 분기 (1, 2, 3, 4)분기 존재
* [O] day_name - 제품의 판매 요일
* [O] keyword - 정규화된 제품 브랜드의 키워드 언급 횟수 : 브랜드의 인지도로 판단
* [O] price - 제품의 판매 가격(₩)
* [O] event - 해당 날짜에 event가 있음 (binary? or Category?)

In [None]:
def data_FE(df):
    one = df.query('event != "0"')
    one.event = np.ones(len(one), dtype = np.int16)
    df.loc[one.index, 'event'] = one.event
    df.event = df.event.astype(np.int16)

    drop_date = []
    df_enc = df.copy()
    columns = ['ID', '대분류', '중분류', '소분류', '브랜드', 'day_name', 'quarter', 'keyword', 'event', 'sales', 'date']

    for i in range(34):
        drop_date.append(list(np.array(pd.date_range('2023-02-23', periods = 34)).astype(str))[i].split('T')[0])

    for date in tqdm(drop_date):
        drop_idx = df_enc.query('date == @date').index
        df_enc.drop(drop_idx, axis = 0, inplace = True)

    df_enc.drop(['제품','year', 'month', 'day', 'day_of_week', 'price'], axis = 1, inplace = True)
    df_enc = df_enc[columns]
    df_enc = df_enc.drop_duplicates()
    df_enc = df_enc.reset_index(drop = True)

    return df_enc

train_enc = data_FE(new_train)
train_enc.head(3)

In [None]:
# Label Encoding

col = ['대분류', '중분류', '소분류', '브랜드', 'day_name']
encoder = LabelEncoder()

for c in col:
    train_enc[c] = encoder.fit_transform(train_enc[c])

train_enc.head(3)

#### Preprocessing

In [7]:
class IndividualMinMaxScaler:
    def __get_min_val(self, df, target):
        df_mins = df[target].min(axis=0)
        return df_mins

    def __get_max_val(self, df, target):
        df_maxs = df[target].max(axis=0)
        return df_maxs


    def fit(self, df: pd.DataFrame):
        target = 'sales'

        self.min_val = self.__get_min_val(df, target)
        self.max_val = self.__get_max_val(df, target)
        self.denom = self.max_val - self.min_val
        

    def transform(self, df: pd.DataFrame):
        target = 'sales'
        return df[target] \
            .apply(lambda x: (x-self.min_val)/self.denom)

    def fit_transform(self, df: pd.DataFrame):
        self.fit(df)
        return self.transform(df)

In [None]:
sort_date = train_enc.sort_values(by = 'date')   # 날짜 당 15890
min_ = []
max_ = []

for i in tqdm(range(425)):
    scaler = IndividualMinMaxScaler()
    date_cluster = sort_date.iloc[i*15890:(i+1)*15890]
    min_.append(date_cluster.sales.min())
    max_.append(date_cluster.sales.max())
    sort_date.iloc[i*15890:(i+1)*15890, 9] = scaler.fit_transform(date_cluster)

train_enc.sales = sort_date.sort_index().sales

In [None]:
min_ = np.array(min_)
max_ = np.array(max_)

np.save('MIN', min_)
np.save('MAX', max_)

In [None]:
#  시간이 너무 오래걸림
## 현실적으로 쓸 수 없는 함수
## pandas 라이브러리가 너무 무거운 거로 판단됨 -> numpy 데이터로 변형 후 함수 적용하는게 맞는듯
## numpy로 바꿔도 차이가 없음 : 시간복잡도가 너무 높아서 생기는 문제로 판단됨 O(n^2)
## pandas.DataFrame.query()의 문제라고 판명
## ID및 date의 순서로 되어있기 때문에 iloc을 사용해 순서대로 잘라서 dataset을 만들어서 작업 소요시간이 매우 줄음

def make_train_data(data, train_size = CFG['TRAIN_WINDOW_SIZE'], predict_size = CFG['PREDICT_SIZE']):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : date를 melt시킨 새로운 train data
    train_size : 학습에 활용할 기간 => 90 Days
    predict_size : 추론할 기간 => 21 Days
    '''
    window_size = train_size + predict_size         # 90 + 21 = 111
    num_id = data.ID.nunique()                      # 15890
    num_date = data.date.nunique()                  # 425
    num_features = len(data.iloc[0, 1:9])           # 대분류 ~ sales까지
    data = np.array(data)                           # DataFrame to Numpy Data
    
    input_data = np.empty((num_id * ((num_date + num_features) - window_size + 1), train_size, num_features + 1), dtype = np.float16)
    target_data = np.empty((num_id * ((num_date + num_features) - window_size + 1), predict_size), dtype = np.float16)

    for id in tqdm(range(num_id)):
        for j in range(num_date - window_size + 1):      # 315
            temp_data = data[id*425: 425*(id+1)][j:train_size+j, 1:10]
            input_data[id * ((num_date + num_features) - window_size + 1) + j] = temp_data
            target_data[id * ((num_date + num_features) - window_size + 1) + j] = data[id*425: 425*(id+1)][train_size+j:window_size+j, 9] # sales

    return input_data, target_data

In [None]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']): #90
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : date를 melt시킨 새로운 train data
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_id = data.ID.nunique()
    num_date = data.date.nunique()
    num_features = data.iloc[0:1, 1:9].shape[1]   # 대분류 ~ sales까지
    data = np.array(data)
    
    test_input = np.empty((num_id, train_size, num_features + 1), dtype = np.float16)

    for id in tqdm(range(num_id)):
        temp_data = data[id*425: 425*(id+1)][-train_size:, 1:10]
        test_input[id] = temp_data

    return test_input

In [None]:
train_input, train_target = make_train_data(train_enc)
test_input = make_predict_data(train_enc)

In [None]:
np.save('train_input', train_input)
np.save('train_target', train_target)
np.save('test_input', test_input)

In [4]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

## CNN-LSTM MODEL
![스크린샷 2023-08-04 오후 5 16 00](https://github.com/Megvii-BaseDetection/YOLOX/assets/103639510/6e612140-39b5-4198-bb14-81ff2aca5f22)


In [5]:
class Conv1d_LSTM(nn.Module):
    def __init__(self, in_channel=9, hidden_size = 128, out_channel=CFG['PREDICT_SIZE']):
        super(Conv1d_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.conv1d_1 = nn.Conv1d(in_channels=in_channel,
                                out_channels=16,
                                kernel_size=3,
                                stride=1,
                                padding=1)
        
        self.conv1d_2 = nn.Conv1d(in_channels=16,
                                out_channels=32,
                                kernel_size=3,
                                stride=1,
                                padding=1)
        
        self.lstm = nn.LSTM(input_size = 32,
                            hidden_size=hidden_size,
                            num_layers=1,
                            bias=True,
                            bidirectional=False,
                            batch_first=True)
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size//2, out_channel)
        )

        self.actv = nn.ReLU()
        # self.dense1 = nn.Linear(hidden_size, hidden_size//2)
        # self.dropout = nn.Dropout()
        # self.dense2 = nn.Linear(hidden_size//2, out_channel)
    
    def init_hidden(self, batch_size, device):
        return (
            torch.zeros(1, batch_size, self.hidden_size, device = device),
            torch.zeros(1, batch_size, self.hidden_size, device = device)
        )

    def forward(self, x):
	# Raw x shape : (B, TRAIN_WINDOW_SIZE, in_channel) => (B, 90, 5)
        
        # Shape : (B, F: in_channel, S: TRAIN_WINDOW_SIZE) => (B, 5, 90)
        x = x.transpose(1, 2)
        # Shape with Conv1d_1 : (B, F, S) == (B, C, S) || C = channel => (B, 16, 90)
        x = self.conv1d_1(x)
        ## Shape with Conv1d_2 : (B, C, S) => (B, 32, 90)
        x = self.conv1d_2(x)
        ## Reshape : (B, S, C) == (B, S, F) => (B, 90, 32)
        x = x.transpose(1, 2)
        
        # self.lstm.flatten_parameters()
        # # Shape : (B, S, H) // H = hidden_size => (B, 90, 512)
        # _, (hidden, _) = self.lstm(x)
        # # Shape : (B, H) // -1 means the last sequence => (B, 512)
        # x = hidden[-1]
        
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)

        # LSTM Layer
        lstm_out, hidden = self.lstm(x, hidden)

        # Only use the last output sequence
        last_output = lstm_out[:, -1, :]

        # Fully connected layer
        output = self.actv(self.fc(last_output))
        x = output.squeeze(1)
        
        # # Shape : (B, 512)
        # x = self.dense1(x)
        # # ReLU
        # x = F.relu(x)
        # # Shape : (B, H) => (B, 256)
        # x = self.dropout(x)
        # # Shape : (B, O) // O = output => (B, 21)
        # x = self.dense2(x)

        return x

## Loss

#### Metric

In [None]:
def pseudo_sfa(pred, df):
    pred_length = pred.shape[1] - 1
    true = df.iloc[:, -pred_length:].reset_index() \
        .rename(columns={"index": "ID"})

    main_id = {}
    for main_cat in df["대분류"].unique():
        main_id[main_cat] = df.query("대분류==@main_cat")["ID"].to_list()

    psfa = []
    for main_cat in main_id.keys():
        indices = true["ID"].isin(main_id[main_cat])

        true_arr = true[indices].iloc[:, 1:].to_numpy()
        pred_arr = pred[indices].iloc[:, 1:].to_numpy()

        eps = np.ones((true_arr.shape)) / 1e8

        true_sum = true_arr.sum(axis=0)
        true_sum = np.stack([true_sum]*len(true_arr)) + eps
        true_rate = true_arr / true_sum

        abs_error = np.abs(true_arr - pred_arr)
        denom = np.maximum(true_arr, pred_arr+eps)
        
        score = 1 - (1 / true_arr.shape[1]
                     * (abs_error / denom) * true_rate).sum()
        psfa.append(score)
        print(main_cat, score)

    return np.mean(psfa)

#### Loss Function

In [None]:
class PsfaLoss(nn.Module):
    def __init__(self, scaler, df):
        super().__init__()
        self.scaler = scaler
        self.main_cats = df.groupby("대분류")["ID"].unique().values
    
    def forward(self, pred, true):
        # pred: [batch_size, length, products(15890)]
        pred = pred * torch.tensor(self.scaler.denom) \
            + torch.tensor(self.scaler.min_val)
        true = true * torch.tensor(self.scaler.denom) \
            + torch.tensor(self.scaler.min_val)

        L1scaled = torch.abs(true-pred) / torch.maximum(pred, true+1e-8)
        
        rate = torch.zeros_like(true)
        for i in range(len(self.main_cats)):
            rate[:, :, self.main_cats[i]] = \
                true[:, :, self.main_cats[i]] \
                / (true[:, :, self.main_cats[i]].sum(dim=-1, keepdim=True) + 1e-8) \
                / len(self.main_cats)
        return (L1scaled * rate).sum() / (true.shape[0] * true.shape[1])

In [None]:
scaler = IndividualMinMaxScaler()
loss = PsfaLoss(scaler, train_gate)

## Train Function

In [5]:
train_input = np.load(LOADPATH + 'train_input.npy')
train_target = np.load(LOADPATH + 'train_target.npy')

In [9]:
data_len = len(train_input)
train_dataset = CustomDataset(train_input[:-int(data_len*0.2)], train_target[:-int(data_len*0.2)])
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(train_input[-int(data_len*0.2):], train_target[-int(data_len*0.2):])
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [10]:
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None
    train_loss = {}
    val_loss = {}
    
    # Epoch
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss_li = []
        train_mae = []
    
    # Iteration
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss_li.append(loss.item())
        
        val_loss_li = validation(model, val_loader, criterion, device)
        val_loss_mean = np.mean(val_loss_li)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss_li):.5f}] Val Loss : [{val_loss_mean:.5f}]')
        
        if best_loss > val_loss_mean:
            best_loss = val_loss_mean
            best_model = model
            print('Model Saved')

        train_loss[epoch] = train_loss_li
        val_loss[epoch] = val_loss_li

    return best_model, train_loss, val_loss

In [11]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return val_loss

In [13]:
model = Conv1d_LSTM()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG['LEARNING_RATE'])
infer_model, train_loss, val_loss = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [0.00868] Val Loss : [0.00375]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [0.00323] Val Loss : [0.00101]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [0.00176] Val Loss : [0.00054]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [4] Train Loss : [0.00128] Val Loss : [0.00045]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [5] Train Loss : [0.00108] Val Loss : [0.00039]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [6] Train Loss : [0.00097] Val Loss : [0.00036]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [7] Train Loss : [0.00089] Val Loss : [0.00029]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [8] Train Loss : [0.00083] Val Loss : [0.00025]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [9] Train Loss : [0.00078] Val Loss : [0.00023]
Model Saved


  0%|          | 0/2005 [00:00<?, ?it/s]

  0%|          | 0/502 [00:00<?, ?it/s]

Epoch : [10] Train Loss : [0.00074] Val Loss : [0.00020]
Model Saved


In [14]:
SAVE_PATH = os.getcwd() + '/cnn_lstm_with_fe_data.pth'
torch.save(infer_model.state_dict(), SAVE_PATH)

In [15]:
train_loss = np.array(train_loss)
val_loss = np.array(val_loss)

In [16]:
np.save(PATH + 'train_loss', train_loss)
np.save(PATH + 'val_loss', val_loss)

In [6]:
test_input = np.load(LOADPATH + 'test_input.npy')

In [7]:
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)
            
            output = model(X)
            
            # 모델 출력인 output을 CPU로 이동하고 numpy 배열로 변환
            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [9]:
model = Conv1d_LSTM()
model.load_state_dict(torch.load(os.getcwd() + '/cnn_lstm_with_fe_data.pth'))

<All keys matched successfully>

In [None]:
test_smp = test_loader.dataset.X

In [None]:
prediction = inference(model, test_loader, device)
prediction.shape

In [None]:
test_input

In [None]:
submit = pd.read_csv(PATH + '/sample_submission.csv') # submit은 15690개 상품 / 예측은 15682개 상품
submit.iloc[:, 1:] = prediction

In [None]:
SAVE_SUBMIT = os.getcwd() + '/data/cnn_lstm_test_2.csv'
submit.to_csv(SAVE_SUBMIT)

In [None]:
submit