# Library import 

In [1]:
import os
import random
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from omegaconf import OmegaConf, DictConfig

from tqdm.auto import tqdm
import warnings
import wandb
from datetime import datetime
import re
from typing import Tuple

warnings.filterwarnings("ignore")

In [2]:
def sanitize_filename(filename):
    # Remove characters that are not allowed in Windows file names
    # (e.g., : / \ ? * < > | ")
    filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
    return filename

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Configuration

In [4]:
project_root = os.getcwd()
data_root = os.path.join(project_root, "data")

In [161]:
cfg_dict: dict = {
    "WINDOW_SIZE" : 90,
    "PREDICT_SIZE" : 21,
    "EPOCHS" : 2,
    "LEARNING_RATE" : 1e-3,
    "BATCH_SIZE" : 2048,
    "NUM_WORKERS" : 0,
    "SEED" : 29,
    "input_size" : 5,
    "hidden_size" : 512,
    "output_size" : 21,
    "num_layers" : 3,
    "num_attention_heads" : 4,
    "feedforward_dim" : 25,
    "dropout_rate" : 0.2
}

cfg = OmegaConf.create(cfg_dict)
print(OmegaConf.to_yaml(cfg))

WINDOW_SIZE: 90
PREDICT_SIZE: 21
EPOCHS: 2
LEARNING_RATE: 0.001
BATCH_SIZE: 2048
NUM_WORKERS: 0
SEED: 29
input_size: 5
hidden_size: 512
output_size: 21
num_layers: 3
num_attention_heads: 4
feedforward_dim: 25
dropout_rate: 0.2



### SET SEED

In [6]:
random.seed(cfg["SEED"])
os.environ["PYTHONHASHSEED"] = str(cfg["SEED"])
np.random.seed(cfg["SEED"])
torch.manual_seed(cfg["SEED"])
torch.cuda.manual_seed(cfg["SEED"])
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False # 실험시 False

# Data Load

In [7]:
train = pd.read_csv(data_root+"\\train.csv")
train.drop(["ID", "제품"], axis=1, inplace=True)

In [8]:
train.head()

Unnamed: 0,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B002-C001-0001,B002-C002-0001,B002-C003-0003,B002-00003,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Data Scaling
scale_max_dict = {}
scale_min_dict = {}

for idx in tqdm(range(len(train))):
    maxi = np.max(train.iloc[idx,4:])
    mini = np.min(train.iloc[idx,4:])
    
    if maxi == mini :
        train.iloc[idx,4:] = 0
    else:
        train.iloc[idx,4:] = (train.iloc[idx,4:] - mini) / (maxi - mini)
    
    scale_max_dict[idx] = maxi
    scale_min_dict[idx] = mini

  0%|          | 0/15890 [00:00<?, ?it/s]

In [14]:
encoder = LabelEncoder()
categorical_col = ["대분류", "중분류", "소분류", "브랜드"]

for col in categorical_col:
    train[col] = encoder.fit_transform(train[col])

In [11]:
def make_train_data(data, train_size=cfg["WINDOW_SIZE"], predict_size=cfg["PREDICT_SIZE"]):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량
    train_size : 학습에 활용할 기간
    predict_size : 추론할 기간
    '''
    num_rows = len(data)
    window_size = train_size + predict_size
    
    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, len(data.iloc[0, :4]) + 1))
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, 4:])
        
        for j in range(len(sales_data) - window_size + 1):
            window = sales_data[j : j + window_size]
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
            input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
    return input_data, target_data

In [12]:
def make_predict_data(data, train_size=cfg["WINDOW_SIZE"]):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)
    
    input_data = np.empty((num_rows, train_size, len(data.iloc[0, :4]) + 1))
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, -train_size:])
        
        window = sales_data[-train_size : ]
        temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
        input_data[i] = temp_data
    
    return input_data

In [13]:
train_input, train_target = make_train_data(train)
test_input = make_predict_data(train)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [16]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [17]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((4487336, 90, 5),
 (4487336, 21),
 (1121834, 90, 5),
 (1121834, 21),
 (15890, 90, 5))

# DataSet

In [166]:
class CustomDataset(Dataset):
    def __init__(self, X, Y, pad_to_length=None):
        self.X = X
        self.Y = Y
        self.pad_to_length = pad_to_length

    def __getitem__(self, index):
        x = torch.Tensor(self.X[index])
        y = torch.Tensor(self.Y[index]) if self.Y is not None else None

        if self.pad_to_length is not None:
            seq_len = x.shape[0]
            if seq_len < self.pad_to_length:
                # Pad the sequence to the desired length
                padding = torch.zeros(self.pad_to_length - seq_len, x.shape[1])
                x = torch.cat([x, padding])

        return x, y

    def __len__(self):
        return len(self.X)

In [66]:
def collate_fn(batch):
    batch = sorted(batch, key=lambda x: x[0].shape[0], reverse=True)
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    inputs_padded = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    targets_padded = nn.utils.rnn.pad_sequence(targets, batch_first=True)
    return inputs_padded, targets_padded

In [173]:
train_dataset = CustomDataset(train_input, train_target)
train_dataloader = DataLoader(train_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=True, num_workers=0, collate_fn=collate_fn)

val_dataset = CustomDataset(val_input, val_target)
val_dataloader = DataLoader(val_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=0, collate_fn=collate_fn)

In [167]:
pad_to_length = max(len(max(train_input, key=len)), len(max(val_input, key=len)))

train_dataset = CustomDataset(train_input, train_target, pad_to_length)
train_dataloader = DataLoader(train_dataset, batch_size=cfg['BATCH_SIZE'], shuffle=True)

val_dataset = CustomDataset(val_input, val_target, pad_to_length)
val_dataloader = DataLoader(val_dataset, batch_size=cfg['BATCH_SIZE'], shuffle=False)

# Define Model

In [21]:
class SalesForecastNet(nn.Module):
    def __init__(self, cfg: DictConfig = cfg):
        super().__init__()
        self.hidden_size = cfg.hidden_size
        self.bilstm1 = nn.LSTM(cfg.input_size, cfg.hidden_size, batch_first=True, bidirectional=True)
        self.bilstm2 = nn.LSTM(cfg.hidden_size * 2, cfg.hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Linear(cfg.hidden_size * 2, cfg.hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(cfg.hidden_size // 2, cfg.output_size)
        )
        self.actv = nn.ReLU()

    def forward(self, x):
        batch_size = x.size(0)
        hidden1 = self.init_hidden(batch_size, x.device)
        lstm_out1, hidden1 = self.bilstm1(x, hidden1)
        hidden2 = self.init_hidden(batch_size, x.device)
        lstm_out2, hidden2 = self.bilstm2(lstm_out1, hidden2)
        last_output = lstm_out2[:, -1, :]
        output = self.actv(self.fc(last_output))
        return output.squeeze(1)

    def init_hidden(self, batch_size, device):
        # Initialize hidden state and cell state
        return (torch.zeros(2, batch_size, cfg.hidden_size, device=device),
                torch.zeros(2, batch_size, cfg.hidden_size, device=device))

model = SalesForecastNet(cfg)

In [22]:
model

SalesForecastNet(
  (bilstm1): LSTM(5, 512, batch_first=True, bidirectional=True)
  (bilstm2): LSTM(1024, 512, batch_first=True, bidirectional=True)
  (fc): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=256, out_features=21, bias=True)
  )
  (actv): ReLU()
)

In [155]:
# class PadToEqualLength(nn.Module):
#     def forward(self, x, lengths):
#         max_len = torch.max(lengths)
#         batch_size, seq_len, input_size = x.size()
#         padding = torch.zeros(batch_size, max_len - seq_len, input_size, device=x.device)
#         x = torch.cat([x, padding], dim=1)
#         return x


# class TransformerLayer(nn.Module):
#     def __init__(self, hidden_size, num_attention_heads, feedforward_dim, dropout_rate=0.1):
#         super(TransformerLayer, self).__init__()
#         self.attention = nn.MultiheadAttention(hidden_size, num_attention_heads, dropout=dropout_rate)
#         self.norm1 = nn.LayerNorm(hidden_size)
#         self.dropout1 = nn.Dropout(dropout_rate)
#         self.feedforward = nn.Sequential(
#             nn.Linear(hidden_size, feedforward_dim),
#             nn.ReLU(),
#             nn.Linear(feedforward_dim, hidden_size),
#         )
#         self.norm2 = nn.LayerNorm(hidden_size)
#         self.dropout2 = nn.Dropout(dropout_rate)
#         self.activation = nn.ReLU()

#     def forward(self, x, mask=None):
#         # Self-attention
#         attn_output, _ = self.attention(x, x, x, key_padding_mask=~mask)

#         # Residual connection and normalization
#         x = x + self.dropout1(attn_output)
#         x = self.norm1(x)

#         # Feedforward
#         feedforward_output = self.feedforward(x)

#         # Residual connection and normalization
#         x = x + self.dropout2(feedforward_output)
#         x = self.norm2(x)

#         # Activation
#         x = self.activation(x)

#         return x


# class SalesForecastNet(nn.Module):
#     def __init__(self, cfg):
#         super(SalesForecastNet, self).__init__()
#         self.hidden_size = cfg["hidden_size"]
#         self.num_layers = cfg["num_layers"]
#         self.num_attention_heads = cfg["num_attention_heads"]
#         self.feedforward_dim = cfg["feedforward_dim"]
#         self.output_size = cfg["output_size"]
#         self.dropout_rate = cfg["dropout_rate"]

#         self.embedding = nn.Linear(cfg["input_size"], self.hidden_size)
#         self.transformer_layers = nn.ModuleList(
#             [TransformerLayer(self.hidden_size, self.num_attention_heads, self.feedforward_dim, self.dropout_rate)
#              for _ in range(self.num_layers)]
#         )
#         self.pad_to_equal_length = PadToEqualLength()
#         self.fc = nn.Linear(self.hidden_size, self.output_size)
#         self.actv = nn.ReLU()

#     def forward(self, x, lengths):
#         x = self.embedding(x)

#         # Transformer layers
#         mask = self.pad_to_equal_length(x, lengths).sum(dim=2) != 0
#         for layer in self.transformer_layers:
#             x = layer(x, mask=mask)

#         # Take the last output
#         last_output = x[:, -1, :]

#         output = self.actv(self.fc(last_output))
#         return output

In [179]:
for sample in train_dataloader:
    print(sample[0].shape)
    print(sample[1].shape)
    break

torch.Size([2048, 90, 5])
torch.Size([2048, 21])


# model compile

In [42]:
# Warmup Scheduler
class WarmupLR(optim.lr_scheduler.LambdaLR):

    def __init__(
        self,
        optimizer: optim.Optimizer,
        warmup_end_steps: int,
        last_epoch: int = -1,
    ):
        
        def wramup_fn(step: int):
            if step < warmup_end_steps:
                return float(step) / float(max(warmup_end_steps, 1))
            return 1.0
        
        super().__init__(optimizer, wramup_fn, last_epoch)


In [43]:
# set up gpu
gpu = 0

# define model
model = SalesForecastNet(cfg)
if gpu is not None:
    model.cuda(gpu)
model_name = type(model).__name__

# define loss
loss_function = nn.MSELoss()

# define optimizer
lr = cfg["LEARNING_RATE"]
optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer_name = type(optimizer).__name__

# define scheduler
scheduler = WarmupLR(optimizer, 1500)
scheduler_name = type(scheduler).__name__ if scheduler is not None else "no"

max_epoch = cfg["EPOCHS"]

# define wandb
project_name = "LG_AIMERS_Sales_Forecast"
current_time = datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
run_name = f"{current_time}_{model_name}_{optimizer_name}_optim_{lr}_with_{scheduler_name}"
run_name = sanitize_filename(run_name)
run_tags = [project_name]
wandb.init(
    project=project_name,
    name=run_name,
    tags=run_tags,
    config={"lr": lr, "model_name": model_name, "optimizer_name": optimizer_name, "scheduler_name": scheduler_name},
    reinit=True
)
wandb.watch(model)

# # set save model path
# run_dirname = "LG_AIMERS_Sales_Forecast"
# log_dir = os.path.join(project_root, "runs", run_dirname, run_name)
# log_model_path = os.path.join(log_dir, "models")
# os.makedirs(log_model_path, exist_ok=True)

VBox(children=(Label(value='0.009 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.667522…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

[]

# Define EarlyStopping

In [44]:
clip_value = 1.0

In [175]:
def train(model, optimizer, train_dataloader, val_dataloader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None
    
    for epoch in range(1, cfg['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        for X, Y in tqdm(iter(train_dataloader)):
            X = X.to(device)
            Y = Y.to(device)
            
            # Foward
            optimizer.zero_grad()
            # get prediction
            output = model(X)
            
            loss = criterion(output, Y)
            
            # back propagation

            loss.backward()
            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            # Perform LR scheduler Work
            if scheduler is not None:
                scheduler.step()
            
            train_loss.append(loss.item())
        
        val_loss = validation(model, val_dataloader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
        
        # WandB logging
        wandb.log({
            "Epoch": epoch,
            "Train Loss": np.mean(train_loss),
            "Validation Loss": val_loss,
        })
        
    return best_model

def validation(model, val_dataloader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_dataloader)):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return np.mean(val_loss)

In [61]:
infer_model = train(model, optimizer, train_dataloader, val_dataloader, device)

  0%|          | 0/2192 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (5) must match the size of tensor b (512) at non-singleton dimension 2

In [28]:
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = cfg['BATCH_SIZE'], shuffle=False, num_workers=cfg["NUM_WORKERS"])

In [29]:
def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)
            
            output = model(X)
            
            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [30]:
pred = inference(infer_model, test_loader, device)

  0%|          | 0/8 [00:00<?, ?it/s]

# Submission

In [31]:
submit = pd.read_csv(data_root + "/sample_submission.csv")
submit.iloc[:,1:] = pred
submit.head()

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0.0,0.0,0.0,0.0,0.002025,0.002418,0.002897,0.007942,0.009496,...,0.010155,0.011174,0.011677,0.01383,0.014613,0.014759,0.018046,0.01779,0.020457,0.022615
1,1,0.06171,0.066687,0.074364,0.075773,0.07364,0.072372,0.070945,0.071412,0.073564,...,0.073324,0.07214,0.070894,0.072001,0.074156,0.074091,0.07397,0.071657,0.071515,0.072803
2,2,0.0,0.0,0.0,0.0,0.002356,0.002863,0.003147,0.008268,0.009726,...,0.01043,0.011593,0.01204,0.014246,0.014894,0.014944,0.018153,0.017871,0.020594,0.02283
3,3,0.0,0.0,0.0,0.0,0.002355,0.002862,0.003145,0.008267,0.009725,...,0.010428,0.011592,0.012039,0.014245,0.014893,0.014943,0.018152,0.01787,0.020593,0.02283
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000394,0.001176,0.004528,0.006145,0.008072


In [32]:
submit.to_csv('./baseline_submit.csv', index=False)