In [None]:
import random
import os
from time import time
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
# Mount Google Drive (Colab only) so the competition archive stored there can be read below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters shared by the rest of the notebook.
cfg = {
    'train_window_size':70, # number of past days fed to the model (the original note said 90 days, but the value is 70)
    'predict_size':21, # number of future days to predict
    'epochs':7,
    'learning_rate':1e-3,
    'batch_size':128,
    'SEED':4
}

def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    algorithm selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick a possibly different
    # algorithm on each run, which defeats the determinism requested above.
    torch.backends.cudnn.benchmark = False

seed_everything(cfg['SEED']) # Fix the random seeds for reproducibility

In [None]:
# NOTE(review): `os` is already imported in the first cell; the re-import is harmless but redundant.
import zipfile
import os

# Archive on the mounted Drive and the local directory it is unpacked into.
zip_path = '/content/drive/MyDrive/LG/LG Aimers 3기_온라인 채널 제품 판매량 예측 AI 오프라인 해커톤_오프라인_data.zip'
extract_path = '/content/LG_data'
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [None]:
# Load the raw training table from the extracted archive and display it.
# Per the rendered output: 7 identifier columns followed by one column per calendar day.
train_row = pd.read_csv('/content/LG_data/train.csv')
train_row

Unnamed: 0,ID,제품,대분류,중분류,소분류,브랜드,쇼핑몰,2022-01-01,2022-01-02,2022-01-03,...,2023-04-15,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24
0,SAMPLE_00000,B002-00001-00001,B002-C001-0002,B002-C002-0007,B002-C003-0038,B002-00001,S001-00001,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,SAMPLE_00001,B002-00002-00001,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,0,0,0,...,2,0,2,0,2,2,1,0,0,0
2,SAMPLE_00002,B002-00002-00002,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,SAMPLE_00003,B002-00002-00003,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00001,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,SAMPLE_00004,B002-00002-00004,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-00002,S001-00010,0,0,0,...,10,6,4,4,4,0,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28889,SAMPLE_28889,B002-03798-00046,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03798,S001-00001,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28890,SAMPLE_28890,B002-03799-00002,B002-C001-0003,B002-C002-0008,B002-C003-0042,B002-03799,S001-00001,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28891,SAMPLE_28891,B002-03799-00003,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,S001-00001,0,0,0,...,2,4,1,0,0,0,4,0,2,0
28892,SAMPLE_28892,B002-03799-00004,B002-C001-0003,B002-C002-0008,B002-C003-0044,B002-03799,S001-00001,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Pull out the rows of the two shops that are spliced together further down,
# each with a fresh 0..n-1 index.
shop4 = train_row.loc[train_row['쇼핑몰'].eq('S001-00004')].reset_index(drop=True)
shop10 = train_row.loc[train_row['쇼핑몰'].eq('S001-00010')].reset_index(drop=True)

In [None]:
# NOTE(review): `temp` is only assigned in the following cell, so on a fresh kernel
# (Restart & Run All) this cell raises NameError; it should be displayed after that cell.
temp

Unnamed: 0,제품,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,...,2023-04-15,2023-04-16,2023-04-17,2023-04-18,2023-04-19,2023-04-20,2023-04-21,2023-04-22,2023-04-23,2023-04-24
0,B002-00002-00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10,6,4,4,4,0,0,0,0,8
1,B002-00005-00002,0.0,0.0,0.0,130.0,80.0,120.0,0.0,840.0,0.0,...,40,0,10,172,320,45,5,50,5,144
2,B002-00005-00003,26.0,40.0,58.0,32.0,18.0,10.0,20.0,2.0,2.0,...,9,2,23,24,15,16,3,36,8,4
3,B002-00005-00005,12.0,24.0,0.0,50.0,36.0,6.0,0.0,6.0,0.0,...,75,44,109,60,29,32,87,84,48,72
4,B002-00012-00001,0.0,18.0,20.0,10.0,8.0,12.0,10.0,16.0,8.0,...,4,5,6,11,5,0,8,11,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1839,B002-03786-00003,12.0,12.0,16.0,36.0,64.0,16.0,12.0,20.0,76.0,...,2,2,16,54,8,42,22,12,0,26
1840,B002-03791-00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,6,18,0,0,0,0
1841,B002-03794-00001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,3,1,2,1,1,0,0,0,1
1842,B002-03796-00004,86.0,14.0,54.0,60.0,44.0,40.0,36.0,0.0,132.0,...,24,19,10,0,11,23,40,39,48,28


In [None]:
# Splice two shops into one history per product: column position 1 (the product code,
# per the rendered table) plus date-column positions 7..409 from shop S001-00004, and
# positions 410..485 from shop S001-00010.
# NOTE(review): presumably the product moved from shop 4 to shop 10 at column 410 — confirm.
temp4 = shop4.iloc[:,[1]+list(range(7,410))]
temp10 = shop10.iloc[:,[1]+list(range(410,486))]
# No `on=` is given, so pandas joins on the columns the two frames share (expected to be
# only the product column, since the date ranges do not overlap). how='right' keeps every
# shop-10 row; products without shop-4 history get NaN, replaced by 0 on the next line.
temp = pd.merge(temp4,temp10, how='right')
temp = temp.fillna(0)

# Re-attach the 7 identifier columns of the shop-10 rows to the spliced history
# (again an implicit join on the shared column(s)).
shop10_id = shop10.iloc[:,:7]
final = pd.merge(shop10_id,temp).dropna()
# fillna(0) turned the date columns into floats; cast them back to integers.
final_numeric = final.iloc[:,7:].astype(int)
final_idx = final.iloc[:,:7]
final_real = pd.concat([final_idx,final_numeric],axis=1)

In [None]:
# Every row that belongs to neither of the two spliced shops.
train_data_no410 = train_row[~train_row['쇼핑몰'].isin(['S001-00004', 'S001-00010'])]

In [None]:
# Recombine the untouched shops with the rebuilt S001-00010 rows, ordered by ID,
# with a fresh 0..n-1 index (later cells rely on that positional index).
train_data = pd.concat([train_data_no410,final_real]).sort_values('ID').reset_index(drop=True)
# Everything after the first 7 columns is treated as the numeric (daily) part.
numeric_cols = train_data.columns[7:]

In [None]:
# Row-wise (per-series) min-max scaling of the daily values to [0, 1].
# `train_data` itself is deliberately left unscaled: writing the scaled values back
# into it made this cell non-idempotent, because a second run would recompute
# min/max on already scaled data and silently corrupt the inverse-scaling dicts.
min_values = train_data[numeric_cols].min(axis=1)
max_values = train_data[numeric_cols].max(axis=1)

# Constant series have a zero range; divide by 1 instead so they map to 0 rather than NaN.
ranges = (max_values - min_values).replace(0, 1)

train_value = train_data[numeric_cols].subtract(min_values, axis=0).div(ranges, axis=0)

# Keyed by the positional row index of `train_data`; used to undo the scaling on predictions.
scale_min_dict = min_values.to_dict()
scale_max_dict = max_values.to_dict()

In [None]:
# univariate
def make_train_data(data, train_size=cfg['train_window_size'], predict_size=cfg['predict_size']):
    """Cut every row of ``data`` into sliding (input, target) windows.

    A window of ``train_size + predict_size`` consecutive columns is slid one
    column at a time over each row. The first ``train_size`` values of a window
    form the model input, the remaining ``predict_size`` values the target.
    Samples are ordered row by row: all windows of row 0, then row 1, and so on.

    Args:
        data: DataFrame of numeric values, one series per row.
        train_size: Number of input steps per sample.
        predict_size: Number of target steps per sample.

    Returns:
        Tuple ``(input_data, target_data)`` of float64 arrays with shapes
        ``(num_samples, train_size, 1)`` and ``(num_samples, predict_size)``,
        where ``num_samples = len(data) * (n_columns - window + 1)``.

    Raises:
        ValueError: If ``data`` has fewer columns than ``train_size + predict_size``.
    """
    window_size = train_size + predict_size
    values = data.to_numpy(dtype=np.float64)

    # Read-only strided view of shape (num_rows, num_windows, window_size); nothing is
    # copied yet. This replaces the Python loop over every (row, offset) pair.
    windows = np.lib.stride_tricks.sliding_window_view(values, window_size, axis=1)
    num_samples = windows.shape[0] * windows.shape[1]

    # Materialise each half exactly once as a contiguous, writable array.
    input_data = np.array(windows[:, :, :train_size], order='C').reshape(num_samples, train_size, 1)
    target_data = np.array(windows[:, :, train_size:], order='C').reshape(num_samples, predict_size)

    return input_data, target_data

def make_predict_data(data, train_size=cfg['train_window_size']):
    """Build one inference input per row from its last ``train_size`` values.

    Args:
        data: DataFrame of numeric values, one series per row.
        train_size: Number of trailing columns used as model input.

    Returns:
        A new float64 array of shape ``(len(data), train_size, 1)``.

    Raises:
        ValueError: If ``data`` has fewer than ``train_size`` columns.
    """
    # np.array copies, so the result never aliases the DataFrame's buffer.
    last_window = np.array(data.iloc[:, -train_size:], dtype=np.float64, order='C')
    return last_window.reshape(len(data), train_size, 1)

In [None]:
# Build the sliding-window training samples and the final input window of every series.
train_input, train_target = make_train_data(train_value)
test_input = make_predict_data(train_value)

  0%|          | 0/21967 [00:00<?, ?it/s]

  0%|          | 0/21967 [00:00<?, ?it/s]

In [None]:
# Train / Validation Split
# Hold out the last 20% of the samples for validation. Samples are ordered row by row,
# so the hold-out consists of the windows of the last series (by ID), not of the most
# recent dates.
# NOTE: this cell overwrites train_input/train_target; re-running it without re-running
# the previous cell shrinks the training set again.
data_len = len(train_input)
val_len = int(data_len*0.2)
# An explicit split index avoids the `-0` pitfall: when val_len is 0, `x[-0:]` is the
# whole array and `x[:-0]` is empty, which would swap the roles of the two sets.
split_idx = data_len - val_len
val_input = train_input[split_idx:]
val_target = train_target[split_idx:]
train_input = train_input[:split_idx]
train_target = train_target[:split_idx]

In [None]:
class CustomDataset(Dataset):
    """Index-aligned dataset over an input array and an optional target array.

    Args:
        X: Indexable collection of inputs (e.g. a NumPy array); ``X[i]`` is one sample.
        Y: Indexable collection of targets aligned with ``X``, or ``None`` for
            inference, in which case only inputs are returned.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __getitem__(self, index):
        """Return ``(x, y)`` as float32 tensors, or just ``x`` when ``Y`` is ``None``."""
        # torch.tensor with an explicit dtype replaces the legacy torch.Tensor(...)
        # constructor, which treats an integer argument as a size and rejects scalars.
        features = torch.tensor(self.X[index], dtype=torch.float32)
        if self.Y is None:
            return features
        return features, torch.tensor(self.Y[index], dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from ``X``."""
        return len(self.X)

In [None]:
# Wrap the arrays in datasets and batch them; only the training loader shuffles.
train_dataset = CustomDataset(train_input, train_target)
val_dataset = CustomDataset(val_input, val_target)

train_loader = DataLoader(train_dataset, batch_size = cfg['batch_size'], shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size = cfg['batch_size'], shuffle=False, num_workers=0)

In [None]:
class LTSF_NLinear(torch.nn.Module):
    """Linear forecaster with last-value normalisation.

    The last time step of the input is subtracted from the whole window, a linear
    layer maps the ``window_size`` steps to ``forcast_size`` steps, and the
    subtracted value is added back to the output.

    Args:
        window_size: Number of input time steps.
        forcast_size: Number of predicted time steps (spelling kept for compatibility).
        individual: If true, use one linear layer per channel; otherwise a single
            layer shared by all channels.
        feature_size: Number of channels (size of the last input dimension).
    """

    # Training-time masking augmentation. The masked span was specified as steps
    # 30..65 of a 90-step window and is shifted so that it stays aligned to the
    # END of the actual window (steps 10..45 for window_size=70).
    MASK_PROB = 0.9
    MASK_REF_WINDOW = 90
    MASK_REF_START = 30
    MASK_REF_END = 65

    def __init__(self, window_size, forcast_size, individual, feature_size):
        super().__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.individual = individual
        self.channels = feature_size
        if self.individual:
            self.Linear = torch.nn.ModuleList()
            for _ in range(self.channels):
                self.Linear.append(torch.nn.Linear(self.window_size, self.forcast_size))
        else:
            self.Linear = torch.nn.Linear(self.window_size, self.forcast_size)

    def forward(self, x):
        """Forecast ``forcast_size`` steps.

        Args:
            x: Tensor of shape ``(batch, window_size, channels)``.

        Returns:
            Tensor of shape ``(batch, forcast_size, channels)``.
        """
        # In training mode only, with probability MASK_PROB, zero a span of the last
        # channel. The span is derived from this model's own window_size (not a
        # notebook global) and clamped so short windows cannot wrap to negative indices.
        if self.training and torch.rand(1).item() < self.MASK_PROB:
            offset = self.window_size - self.MASK_REF_WINDOW
            start = max(0, self.MASK_REF_START + offset)
            end = max(0, self.MASK_REF_END + offset)
            if end > start:
                # Work on a copy so the caller's batch tensor is never modified.
                x = x.clone()
                x[:, start:end, -1] = 0

        # Normalise by the last observed step, project, then restore the level.
        seq_last = x[:, -1:, :].detach()
        x = x - seq_last
        if self.individual:
            output = torch.zeros(
                [x.size(0), self.forcast_size, x.size(2)], dtype=x.dtype, device=x.device
            )
            for i in range(self.channels):
                output[:, :, i] = self.Linear[i](x[:, :, i])
            x = output
        else:
            # Linear acts on the last dimension, so move time there and back.
            x = self.Linear(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = x + seq_last
        return x

In [None]:
# Per-epoch mean losses (one float per epoch in each list).
train_loss_list = []
valid_loss_list = []

# NOTE: the model and the batches stay on the CPU; `device` is not used for training.
model = LTSF_NLinear(window_size=cfg['train_window_size'],
                     forcast_size=cfg['predict_size'],
                     individual=False,
                     feature_size=1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=cfg['learning_rate'])
max_loss = float('inf')  # best (lowest) validation loss seen so far
best_state = None        # snapshot of the weights that achieved it

for epoch in range(1, cfg['epochs']+1):
    loss_list = []
    model.train()
    for data, target in tqdm(train_loader):
        optimizer.zero_grad()
        output = model(data)
        # Targets are (batch, predict_size); the model outputs (batch, predict_size, 1).
        loss = criterion(output, target.unsqueeze(-1))
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
    train_loss = np.mean(loss_list)
    train_loss_list.append(train_loss)

    # Validation loss averaged over ALL validation samples. Previously only the last
    # batch's loss was used to decide which epoch was best.
    model.eval()
    valid_loss_sum = 0.0
    valid_count = 0
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data)
            batch_loss = criterion(output, target.unsqueeze(-1)).item()
            # MSELoss averages within the batch; weight by batch size so the
            # (possibly smaller) final batch does not skew the mean.
            valid_loss_sum += batch_loss * len(data)
            valid_count += len(data)
    valid_loss = valid_loss_sum / valid_count if valid_count else float('nan')
    valid_loss_list.append(valid_loss)

    if valid_loss < max_loss:
        torch.save(model, 'model.pth')
        max_loss = valid_loss
        # Copy the weights: keeping only a reference to `model` would silently track
        # the later epochs and end up holding the LAST epoch's weights, not the best.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        print("valid_loss={:.5f},Model Save".format(valid_loss))
        dlinear_best_epoch = epoch
        dlinear_best_train_loss = train_loss
        dlinear_best_valid_loss = valid_loss

    print("epoch = {}, train_loss : {:.5f}, valid_loss : {:.5f}".format(epoch, np.mean(loss_list), valid_loss))

# Rewind to the best epoch's weights before exposing the model for inference.
if best_state is not None:
    model.load_state_dict(best_state)
best_model = model
best_model.eval()

  0%|          | 0/53408 [00:00<?, ?it/s]

valid_loss=0.00378,Model Save
epoch = 1, train_loss : 0.01698, valid_loss : 0.00378


  0%|          | 0/53408 [00:00<?, ?it/s]

epoch = 2, train_loss : 0.01696, valid_loss : 0.00390


  0%|          | 0/53408 [00:00<?, ?it/s]

epoch = 3, train_loss : 0.01696, valid_loss : 0.00402


  0%|          | 0/53408 [00:00<?, ?it/s]

valid_loss=0.00377,Model Save
epoch = 4, train_loss : 0.01697, valid_loss : 0.00377


  0%|          | 0/53408 [00:00<?, ?it/s]

epoch = 5, train_loss : 0.01696, valid_loss : 0.00387


  0%|          | 0/53408 [00:00<?, ?it/s]

valid_loss=0.00348,Model Save
epoch = 6, train_loss : 0.01696, valid_loss : 0.00348


  0%|          | 0/53408 [00:00<?, ?it/s]

epoch = 7, train_loss : 0.01696, valid_loss : 0.00373


In [None]:
# Y=None makes the dataset yield inputs only; shuffle=False keeps predictions in row order.
test_dataset = CustomDataset(test_input, None)
test_loader = DataLoader(test_dataset, batch_size = cfg['batch_size'], shuffle=False, num_workers=2)

In [None]:
def inference(model, test_loader, device):
    """Run ``model`` over every batch of ``test_loader`` and collect the outputs.

    Args:
        model: A ``torch.nn.Module``; it is switched to eval mode and moved to ``device``.
        test_loader: Iterable yielding input tensors (no targets).
        device: Device to run on; ``None`` leaves model and inputs where they are.

    Returns:
        NumPy array with all batch outputs concatenated along the first axis,
        or an empty array when the loader yields nothing.
    """
    if device is not None:
        model = model.to(device)
    # Force eval mode so train-only behaviour (e.g. random input masking) is disabled
    # regardless of the state the caller left the model in.
    model.eval()

    predictions = []
    with torch.no_grad():
        for X in tqdm(test_loader):
            if device is not None:
                X = X.to(device)
            predictions.append(model(X).cpu().numpy())

    if not predictions:
        return np.array([])
    return np.concatenate(predictions, axis=0)

In [None]:
# Predict in the scaled space and apply the x1.2 upward correction (a hand-tuned factor,
# applied before the scaling is undone). squeeze(-1) drops only the trailing channel axis,
# so a single-row prediction keeps its batch dimension.
pred = inference(best_model, test_loader, device).squeeze(-1)*1.2

# Undo the per-series min-max scaling for all rows at once. The dicts are keyed by the
# positional row index, which matches the row order of `pred`.
row_min = np.array([scale_min_dict[idx] for idx in range(len(pred))], dtype=pred.dtype).reshape(-1, 1)
row_max = np.array([scale_max_dict[idx] for idx in range(len(pred))], dtype=pred.dtype).reshape(-1, 1)
pred = pred * (row_max - row_min) + row_min

# Sales cannot be negative; clamp, then round to whole units.
pred = np.where(pred<=0,0,pred)
pred = np.round(pred, 0).astype(int)

  0%|          | 0/172 [00:00<?, ?it/s]

In [None]:
# Template that fixes the column layout of the submission file.
submit = pd.read_csv('/content/LG_data/sample_submission.csv')

# Shop S001-00004 rows are not part of `train_data`, so they get no model prediction;
# their rows are taken from the template as-is.
# NOTE: this rebinds `shop4` (defined earlier with a reset index) to the unreset selection.
shop4 = train_row[train_row['쇼핑몰']=='S001-00004']
shop4_ID = shop4['ID']
# NOTE(review): submit_pred is not used again in the visible cells.
submit_pred = submit[~submit['ID'].isin(shop4_ID)]
submit_shop4 = submit[submit['ID'].isin(shop4_ID)]

# Attach the IDs to the predictions. concat(axis=1) aligns on the index, so this relies on
# `train_data` having the 0..n-1 index set when it was built and on `pred` being in the
# same row order.
no_shop4_id = train_data['ID']
pred_submit = pd.concat([no_shop4_id,pd.DataFrame(pred)],axis=1)
pred_submit.columns = submit.columns

In [None]:
# Full NLinear submission: predicted rows plus the untouched shop-4 template rows, ordered by ID.
nlinear = pd.concat([pred_submit,submit_shop4]).sort_values('ID').reset_index(drop=True)
# NOTE(review): this relative path does not exist in the recorded run (the cell output is a
# FileNotFoundError); the ARIMA submission has to be placed at this location first.
arima = pd.read_csv('data/submission/auto.arima_1.2.csv')

# Weighted blend (55% NLinear, 45% ARIMA) of every column after the first, rounded to integers.
# NOTE(review): the blend is positional — it assumes `arima` has the same row order and
# column layout as `nlinear`; confirm before relying on the result.
ensembling = round((nlinear.iloc[:,1:]*0.55+arima.iloc[:,1:]*0.45)).astype(int)
ensembling = pd.concat([nlinear.iloc[:,0],ensembling],axis=1)
ensembling.to_csv('data/submission/간지팀제출.csv',index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/submission/auto.arima_1.2.csv'