In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import warnings
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import platform

warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [5]:
CFG = {
    'TRAIN_WINDOW_SIZE':90, # 90일치로 학습
    'PREDICT_SIZE':21, # 21일치 예측
    'EPOCHS':10,
    'LEARNING_RATE':5e-6,
    'BATCH_SIZE':2048,
    'SEED':41
}

PATH = os.getcwd() + '/data/'

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

In [7]:
train_data = pd.read_csv(PATH + 'train.csv')
train_data.head(3)

Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,2022-01-01,2022-01-02,2022-01-03,2022-01-04,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,0,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,2,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CNN-LSTM MODEL
![스크린샷 2023-08-04 오후 5 16 00](https://github.com/Megvii-BaseDetection/YOLOX/assets/103639510/6e612140-39b5-4198-bb14-81ff2aca5f22)


In [5]:
class Conv1d_LSTM(nn.Module):
    def __init__(self, in_channel=5, hidden_size = 64, out_channel=CFG['PREDICT_SIZE']):
        super(Conv1d_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.conv1d_1 = nn.Conv1d(in_channels=in_channel,
                                out_channels=32,
                                kernel_size=3,
                                stride=1,
                                padding=1)
        
        # self.conv1d_2 = nn.Conv1d(in_channels=16,
        #                         out_channels=32,
        #                         kernel_size=3,
        #                         stride=1,
        #                         padding=1)
        
        self.lstm = nn.LSTM(input_size = 32,
                            hidden_size=hidden_size,
                            num_layers=1,
                            bias=True,
                            bidirectional=False,
                            batch_first=True)
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(hidden_size//2, out_channel)
        )

        self.actv = nn.ReLU()
        # self.dense1 = nn.Linear(hidden_size, hidden_size//2)
        # self.dropout = nn.Dropout()
        # self.dense2 = nn.Linear(hidden_size//2, out_channel)
    
    def init_hidden(self, batch_size, device):
        return (
            torch.zeros(1, batch_size, self.hidden_size, device = device),
            torch.zeros(1, batch_size, self.hidden_size, device = device)
        )

    def forward(self, x):
	# Raw x shape : (B, TRAIN_WINDOW_SIZE, in_channel) => (B, 90, 5)
        
        # Shape : (B, F: in_channel, S: TRAIN_WINDOW_SIZE) => (B, 5, 90)
        x = x.transpose(1, 2)
        # Shape with Conv1d_1 : (B, F, S) == (B, C, S) || C = channel => (B, 16, 90)
        x = self.conv1d_1(x)
        ## Shape with Conv1d_2 : (B, C, S) => (B, 32, 90)
        # x = self.conv1d_2(x)
        ## Reshape : (B, S, C) == (B, S, F) => (B, 90, 32)
        x = x.transpose(1, 2)
        
        # self.lstm.flatten_parameters()
        # # Shape : (B, S, H) // H = hidden_size => (B, 90, 512)
        # _, (hidden, _) = self.lstm(x)
        # # Shape : (B, H) // -1 means the last sequence => (B, 512)
        # x = hidden[-1]
        
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size, x.device)

        # LSTM Layer
        lstm_out, hidden = self.lstm(x, hidden)

        # Only use the last output sequence
        last_output = lstm_out[:, -1, :]

        # Fully connected layer
        output = self.actv(self.fc(last_output))
        x = output.squeeze(1)
        
        # # Shape : (B, 512)
        # x = self.dense1(x)
        # # ReLU
        # x = F.relu(x)
        # # Shape : (B, H) => (B, 256)
        # x = self.dropout(x)
        # # Shape : (B, O) // O = output => (B, 21)
        # x = self.dense2(x)

        return x

## Make Dataset

In [None]:
train_data = train_data.drop(columns = ['ID', '제품'])

# Data Scaling
scale_max_dict = {}
scale_min_dict = {}

for idx in tqdm(range(len(train_data))):
    maxi = np.max(train_data.iloc[idx,4:])
    mini = np.min(train_data.iloc[idx,4:])
    
    if maxi == mini :
        train_data.iloc[idx,4:] = 0
    else:
        train_data.iloc[idx,4:] = (train_data.iloc[idx,4:] - mini) / (maxi - mini)
    
    scale_max_dict[idx] = maxi
    scale_min_dict[idx] = mini

In [7]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])

In [8]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량
    train_size : 학습에 활용할 기간 => 90 Days
    predict_size : 추론할 기간 => 21 Days
    '''
    num_rows = len(data) # 15890
    window_size = train_size + predict_size # 90 + 21 = 111
    
    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, len(data.iloc[0, :4]) + 1), dtype = np.float16)
    # (5609170, 90, 5)
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size), dtype = np.float16)
    # (5640950, 21)

    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])  # Label: 대분류, 중분류, 소분류, 브랜드
        sales_data = np.array(data.iloc[i, 4:]) # 날짜 데이터: 2022-01-01 ~ 2023-04-04
        
        for j in range(len(sales_data) - window_size + 1): # 0 ~ 348
            window = sales_data[j : j + window_size]
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
            input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
    return input_data, target_data

In [9]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)
    
    input_data = np.empty((num_rows, train_size, len(data.iloc[0, :4]) + 1), dtype = np.float16)
    
    for i in tqdm(range(num_rows)):
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, -train_size:])
        
        window = sales_data[-train_size : ]
        temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
        input_data[i] = temp_data
    
    return input_data

In [None]:
train_input, train_target = make_train_data(train_data)
test_input = make_predict_data(train_data)

In [11]:
# Train / Validation Split
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

print(train_input.shape)
print(train_target.shape)
print(val_input.shape)
print(val_target.shape)
print(test_input.shape)

(4487336, 90, 5)
(4487336, 21)
(1121834, 90, 5)
(1121834, 21)
(15890, 90, 5)


In [12]:
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

## Loss

#### Metric

In [None]:
def pseudo_sfa(pred, df):
    pred_length = pred.shape[1] - 1
    true = df.iloc[:, -pred_length:].reset_index() \
        .rename(columns={"index": "ID"})

    main_id = {}
    for main_cat in df["대분류"].unique():
        main_id[main_cat] = df.query("대분류==@main_cat")["ID"].to_list()

    psfa = []
    for main_cat in main_id.keys():
        indices = true["ID"].isin(main_id[main_cat])

        true_arr = true[indices].iloc[:, 1:].to_numpy()
        pred_arr = pred[indices].iloc[:, 1:].to_numpy()

        eps = np.ones((true_arr.shape)) / 1e8

        true_sum = true_arr.sum(axis=0)
        true_sum = np.stack([true_sum]*len(true_arr)) + eps
        true_rate = true_arr / true_sum

        abs_error = np.abs(true_arr - pred_arr)
        denom = np.maximum(true_arr, pred_arr+eps)
        
        score = 1 - (1 / true_arr.shape[1]
                     * (abs_error / denom) * true_rate).sum()
        psfa.append(score)
        print(main_cat, score)

    return np.mean(psfa)

#### Scaler

In [None]:
class IndividualMinMaxScaler:
    def __get_min_val(self, df, date_cols):
        df_mins = df[date_cols].min(axis=1).to_numpy()
        return df_mins

    def __get_max_val(self, df, date_cols):
        df_maxs = df[date_cols].max(axis=1).to_numpy()
        return df_maxs


    def fit(self, df: pd.DataFrame):
        date_cols = [col for col in df.columns  # 일자 변수들만 Scale하기
                     if col.startswith("2")]

        self.min_val = self.__get_min_val(df, date_cols)
        self.max_val = self.__get_max_val(df, date_cols)
        denom = self.max_val - self.min_val
        self.denom = np.where(denom==0, 1, denom)

    def transform(self, df: pd.DataFrame):
        date_cols = [col for col in df.columns
                     if col.startswith("2")]
        return df[date_cols] \
            .apply(lambda x: (x-self.min_val)/self.denom)

    def fit_transform(self, df: pd.DataFrame):
        self.fit(df)
        return self.transform(df)

#### Loss Function

In [None]:
class PsfaLoss(nn.Module):
    def __init__(self, scaler, df):
        super().__init__()
        self.scaler = scaler
        self.main_cats = df.groupby("대분류")["ID"].unique().values
    
    def forward(self, pred, true):
        # pred: [batch_size, length, products(15890)]
        pred = pred * torch.tensor(self.scaler.denom) \
            + torch.tensor(self.scaler.min_val)
        true = true * torch.tensor(self.scaler.denom) \
            + torch.tensor(self.scaler.min_val)

        L1scaled = torch.abs(true-pred) / torch.maximum(pred, true+1e-8)
        
        rate = torch.zeros_like(true)
        for i in range(len(self.main_cats)):
            rate[:, :, self.main_cats[i]] = \
                true[:, :, self.main_cats[i]] \
                / (true[:, :, self.main_cats[i]].sum(dim=-1, keepdim=True) + 1e-8) \
                / len(self.main_cats)
        return (L1scaled * rate).sum() / (true.shape[0] * true.shape[1])

## Train Function

In [13]:
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999
    best_model = None
    
    # Epoch
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
    
    # Iteration
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        val_loss, val_loss_li = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
    
    return best_model, train_loss, val_loss_li

In [14]:
def validation(model, val_loader, criterion, device):
    model.eval()
    val_loss = []
    
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    return np.mean(val_loss), val_loss

In [None]:
model = Conv1d_LSTM()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG['LEARNING_RATE'])
infer_model, train_loss, val_loss = train(model, optimizer, train_loader, val_loader, device)

In [None]:
SAVE_PATH = os.getcwd() + '/cnn_lstm.pth'
torch.save(infer_model.state_dict(), SAVE_PATH)

In [67]:
infer_model.state_dict()

OrderedDict([('conv1d_1.weight',
              tensor([[[-0.2139,  0.0341,  0.0178],
                       [ 0.2430,  0.1748, -0.1341],
                       [ 0.1950, -0.2251, -0.1796],
                       [ 0.1951,  0.1846, -0.2366],
                       [-0.2379,  0.2016, -0.1399]],
              
                      [[ 0.1218,  0.2366,  0.0295],
                       [-0.2214,  0.1076, -0.1064],
                       [-0.1019, -0.0861, -0.1100],
                       [ 0.2166,  0.2255,  0.0938],
                       [-0.2652,  0.0379,  0.0283]],
              
                      [[ 0.1974,  0.0710, -0.1311],
                       [ 0.1524,  0.2426,  0.2417],
                       [ 0.0776, -0.0773,  0.0488],
                       [ 0.2244, -0.1414,  0.1487],
                       [-0.0419,  0.1354, -0.0563]],
              
                      [[-0.0784, -0.0625,  0.1732],
                       [-0.0773, -0.2054, -0.2235],
                       [ 0.2032,  0

In [68]:
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        for X in tqdm(iter(test_loader)):
            X = X.to(device)
            
            output = model(X)
            
            # 모델 출력인 output을 CPU로 이동하고 numpy 배열로 변환
            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [69]:
prediction = inference(infer_model, test_loader, device)
prediction.shape

  0%|          | 0/8 [00:00<?, ?it/s]

(15890, 21)

In [70]:
submit = pd.read_csv(PATH + '/sample_submission.csv')
submit.iloc[:, 1:] = prediction

In [74]:
SAVE_SUBMIT = os.getcwd() + '/data/cnn_lstm_test.csv'
submit.to_csv(SAVE_SUBMIT)

In [75]:
submit

Unnamed: 0,ID,2023-04-05,2023-04-06,2023-04-07,2023-04-08,2023-04-09,2023-04-10,2023-04-11,2023-04-12,2023-04-13,...,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24,2023-04-25
0,0,0.074958,0.021919,0.058938,0.063323,0.077961,0.033219,0.000000,0.062225,0.004978,...,0.028688,0.000000,0.000000,0.0,0.072217,0.085864,0.046388,0.057272,0.031109,0.083085
1,1,0.067382,0.019339,0.062163,0.064648,0.083207,0.032057,0.000000,0.059975,0.001315,...,0.023031,0.000000,0.000000,0.0,0.071236,0.083069,0.051398,0.052051,0.032542,0.082119
2,2,0.067413,0.018728,0.062136,0.065073,0.083972,0.031766,0.000000,0.060243,0.001488,...,0.022108,0.000000,0.000000,0.0,0.070908,0.083447,0.051409,0.052545,0.031610,0.081690
3,3,0.067411,0.018729,0.062136,0.065076,0.083977,0.031768,0.000000,0.060244,0.001481,...,0.022104,0.000000,0.000000,0.0,0.070906,0.083448,0.051409,0.052544,0.031609,0.081694
4,4,0.058988,0.000000,0.065890,0.063309,0.063863,0.000000,0.000317,0.045002,0.000000,...,0.008858,0.000000,0.000000,0.0,0.053872,0.066913,0.062819,0.052554,0.011568,0.065357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,15885,0.084488,0.056618,0.084525,0.094908,0.097554,0.057195,0.052711,0.087004,0.050217,...,0.067580,0.031295,0.026144,0.0,0.078733,0.086771,0.074674,0.092337,0.042366,0.087180
15886,15886,0.082372,0.054405,0.083648,0.093777,0.096568,0.052019,0.049801,0.084127,0.048317,...,0.066425,0.029713,0.023569,0.0,0.078804,0.085397,0.074005,0.090643,0.041790,0.087124
15887,15887,0.082361,0.054393,0.083643,0.093771,0.096563,0.051995,0.049787,0.084114,0.048307,...,0.066420,0.029706,0.023555,0.0,0.078804,0.085390,0.074001,0.090634,0.041787,0.087123
15888,15888,0.081895,0.054128,0.083854,0.093492,0.096244,0.051313,0.049531,0.083500,0.048328,...,0.066638,0.030191,0.023226,0.0,0.079321,0.085127,0.074197,0.089995,0.042427,0.087072
