In [1]:
import random
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import DataLoader, TensorDataset

In [2]:
device = torch.device('mps') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
CFG = {
    'TRAIN_WINDOW_SIZE':120, # 90일치로 학습
    'PREDICT_SIZE':21, # 21일치 예측
    'EPOCHS':30,
    'LEARNING_RATE':0.0001,
    'BATCH_SIZE':512,
    'SEED':41
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [5]:
train_data = pd.read_csv('/Users/leeshinhaeng/Desktop/open/train.csv').drop(columns=['ID', '제품'])

In [6]:
# Scaling 코드 (이희원님 코드 가져옴)

# 숫자형 변수들의 min-max scaling을 수행하는 코드입니다.
numeric_cols = train_data.columns[4:]
# 칵 column의 min 및 max 계산
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)
# 각 행의 범위(max-min)를 계산하고, 범위가 0인 경우 1로 대체
ranges = max_values - min_values
ranges[ranges == 0] = 1
# min-max scaling 수행
train_data[numeric_cols] = (train_data[numeric_cols].subtract(min_values, axis=0)).div(ranges, axis=0)
# max와 min 값을 dictionary 형태로 저장
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [7]:
# Label Encoding
label_encoder = LabelEncoder()
categorical_columns = ['대분류', '중분류', '소분류', '브랜드']

# 범주형 칼럼별로 레이블 인코딩
for col in categorical_columns:
    label_encoder.fit(train_data[col])
    train_data[col] = label_encoder.transform(train_data[col])

In [8]:
def make_train_data(data, train_size=CFG['TRAIN_WINDOW_SIZE'], predict_size=CFG['PREDICT_SIZE']):
    '''
    학습 기간 블럭, 예측 기간 블럭의 세트로 데이터를 생성
    data : 일별 판매량
    train_size : 학습에 활용할 기간
    predict_size : 추론할 기간
    '''
    # 데이터 행 수 저장
    num_rows = len(data)
    # 전체 시간 간격 (학습 기간 + 예측 기간)
    window_size = train_size + predict_size
    
    # 빈 배열로 초기화
    input_data = np.empty((num_rows * (len(data.columns) - window_size + 1), train_size, len(data.iloc[0, :4]) + 1))
    target_data = np.empty((num_rows * (len(data.columns) - window_size + 1), predict_size))
    
    # 학습 데이터, 레이블 데이터 학습
    for i in tqdm(range(num_rows)):
        # 정보를 나타내는 4개 열을 encode_info로
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, 4:])
        # 학습 기간 블럭 : temp_data에 저장, input_data에 추가
        # 예측 기간 블럭 : windorw[train_size:]로 추출, target_data에 추가
        for j in range(len(sales_data) - window_size + 1):
            window = sales_data[j : j + window_size]
            temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
            input_data[i * (len(data.columns) - window_size + 1) + j] = temp_data
            target_data[i * (len(data.columns) - window_size + 1) + j] = window[train_size:]
    
    return input_data, target_data

In [9]:
data=train_data
train_size=CFG['TRAIN_WINDOW_SIZE'] # 120
predict_size=CFG['PREDICT_SIZE'] # 21

num_rows = len(data) # 15890
window_size = train_size + predict_size # 120+21

In [10]:
def make_predict_data(data, train_size=CFG['TRAIN_WINDOW_SIZE']):
    '''
    평가 데이터(Test Dataset)를 추론하기 위한 Input 데이터를 생성
    data : 일별 판매량
    train_size : 추론을 위해 필요한 일별 판매량 기간 (= 학습에 활용할 기간)
    '''
    num_rows = len(data)
    
    # 초기 빈 배열로 초기화
    input_data = np.empty((num_rows, train_size, len(data.iloc[0, :4]) + 1))
    
    for i in tqdm(range(num_rows)):
        # 정보를 나타내는 4개 열을 encode_info로
        encode_info = np.array(data.iloc[i, :4])
        sales_data = np.array(data.iloc[i, -train_size:])
        
        window = sales_data[-train_size : ]
        temp_data = np.column_stack((np.tile(encode_info, (train_size, 1)), window[:train_size]))
        input_data[i] = temp_data
    
    return input_data

In [11]:
train_input, train_target = make_train_data(train_data)
test_input = make_predict_data(train_data)

  0%|          | 0/15890 [00:00<?, ?it/s]

  0%|          | 0/15890 [00:00<?, ?it/s]

In [12]:
# Train / Validation Split (60,20,20)
data_len = len(train_input)
val_input = train_input[-int(data_len*0.2):]
val_target = train_target[-int(data_len*0.2):]
train_input = train_input[:-int(data_len*0.2)]
train_target = train_target[:-int(data_len*0.2)]

In [13]:
train_input.shape, train_target.shape, val_input.shape, val_target.shape, test_input.shape

((4105976, 120, 5),
 (4105976, 21),
 (1026494, 120, 5),
 (1026494, 21),
 (15890, 120, 5))

In [14]:
# 주어진 데이터셋을 X와 Y로 나누어 입력과 타겟을 각각 저장
# 학습과 평가를 위한 데이터셋을 편리하게 관리
class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __getitem__(self, index):
        if self.Y is not None:
            return torch.Tensor(self.X[index]), torch.Tensor(self.Y[index])
        return torch.Tensor(self.X[index])
    
    def __len__(self):
        return len(self.X)

In [15]:
train_dataset = CustomDataset(train_input, train_target)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_input, val_target)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [117]:
# Model
class CNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, seq_length):
        super(CNN_LSTM, self).__init__()
        # 변수 정의
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.seq_length = seq_length
        
        # 모델 정의
        self.cnn1 = nn.Sequential(
            nn.Conv1d(in_channels=5, out_channels=32, kernel_size = 3, stride=1, padding=1),
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.cnn2 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size = 3, stride=1, padding=1),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        # LSTM
        self.lstm = nn.LSTM(64, hidden_size, num_layers, bias=True, batch_first=True)
        
        self.dropout = nn.Dropout(0.1)
        
        self.fc1 = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        self.fc2 = nn.Sequential(
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        self.fc3 = nn.Linear(64, output_size)
        
    def forward(self, x):
        # CNN Layer
        # (512, 120, 5) -> (512, 5, 120)
        x = x.transpose(1, 2)
        # (512, 32, 118)
        x = self.cnn1(x)
        # (512, 64, 116)
        x = self.cnn2(x)
        # (512, 64, 116) -> (512, 116, 64)
        x = x.transpose(1,2)
        print('CNN layer Complete')
        
        # LSTM Layer
        # x: (batch_size, seq_length, input_size)
        # h0 size : (2, 512, 256)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        # c0 size : (2, 512, 256)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        # (512,116, 256)
        out, _ = self.lstm(x, (h0, c0))
        print('LSTM Layer Complete')
        
        # (512, 256) (두번째 차원 제외)
        out = out[:, -1, :]  # 마지막 time step의 출력만 사용
        # (512, 256) -> (512, 128)
        out = self.fc1(out)
        # (512, 128) -> (512, 64)
        out = self.fc2(out)
        # (512, 64) -> (512, 21)
        out = self.fc3(out)
        print('Fully Connected Layer Complete')
        return out

In [118]:
for sample in train_loader:
    print(sample[0].shape)
    print(sample[1].shape)
    break

torch.Size([512, 120, 5])
torch.Size([512, 21])


In [119]:
train_input.shape, train_target.shape

((4105976, 120, 5), (4105976, 21))

In [120]:
input_size = train_input.shape[2] # 5
hidden_size = 256
output_size = 21
num_layers = 2
seq_length = train_input.shape[1]
epochs = CFG['EPOCHS'] # 30
learning_rate = CFG['LEARNING_RATE'] # 0.0001
dropout = 0.1

In [121]:
model = CNN_LSTM(input_size, hidden_size, output_size, num_layers, seq_length)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [122]:
model

CNN_LSTM(
  (cnn1): Sequential(
    (0): Conv1d(5, 32, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (cnn2): Sequential(
    (0): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc1): Sequential(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.1, inplace=False)
  )
  (fc2): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.1, inplace=False)
  )
  (fc3): Linear(in_features=64, out_features=21, bias=True)
)

In [124]:
def train(model, optimizer, train_loader, val_loader, device):
    model.to(device)
    # MSE 정의
    criterion = nn.MSELoss().to(device)
    best_loss = 9999999 # 가장 낮은 손실
    best_model = None
    # 에포크 횟수만큼 반복 (10번)
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        train_mae = []
        for X, Y in tqdm(iter(train_loader)):
            X = X.to(device)
            Y = Y.to(device)
            # gradient 0으로 초기화
            optimizer.zero_grad()
            
            output = model(X)
            loss = criterion(output, Y)
            # 손실을 역전파해서 그래디언트 계산
            loss.backward()
            # 옵티마이저 이용해 모델 파라미터 업데이트
            optimizer.step()
            
            train_loss.append(loss.item())
        
        val_loss = validation(model, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}]')
        # 검증 손실과 학습 손실을 출력하고, 현재 검증 손실이 이전까지의 최적의 검증 손실보다 작다면, 모델을 best_model에 저장하고 'Model Saved'를 출력
        if best_loss > val_loss:
            best_loss = val_loss
            best_model = model
            print('Model Saved')
    return best_model

In [125]:
def validation(model, val_loader, criterion, device):
    # 평가모드
    model.eval()
    val_loss = []
    # 검증 과정에서 gradient 계산하지 않도록
    with torch.no_grad():
        for X, Y in tqdm(iter(val_loader)):
            X = X.to(device)
            Y = Y.to(device)
            
            # 모델에 입력 데이터 전달해 출력 계산
            output = model(X)
            loss = criterion(output, Y)
            
            val_loss.append(loss.item())
    # 평균 손실 계산해 반환
    return np.mean(val_loss)

In [126]:
infer_model = train(model, optimizer, train_loader, val_loader, device)

  0%|          | 0/8020 [00:00<?, ?it/s]

CNN layer Complete
LSTM Layer Complete
Fully Connected Layer Complete
CNN layer Complete
LSTM Layer Complete
Fully Connected Layer Complete


KeyboardInterrupt: 

In [None]:
# 모델 추론
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    predictions = []
    
    with torch.no_grad():
        # gradient 계산하지 않도록
        for X in tqdm(iter(test_loader)):
            X = X.to(device)
            # 입력 데이터 전달
            output = model(X)
            
            # 모델 출력인 output을 CPU로 이동하고 numpy 배열로 변환
            output = output.cpu().numpy()
            
            predictions.extend(output)
    
    return np.array(predictions)

In [None]:
pred = inference(infer_model, test_loader, device)

In [None]:
# 추론 결과를 inverse scaling
for idx in range(len(pred)):
    pred[idx, :] = pred[idx, :] * (scale_max_dict[idx] - scale_min_dict[idx]) + scale_min_dict[idx]
    
# 결과 후처리
pred = np.round(pred, 0).astype(int)

In [None]:
submit = pd.read_csv('/Users/leeshinhaeng/Desktop/open/sample_submission.csv')
submit.head()

In [None]:
submit.iloc[:,1:] = pred
submit.head()