<a href="https://colab.research.google.com/github/100jy/dacon_ts_forecasting/blob/main/ARmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torchcontrib



In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import Dataset
import torch.optim.adam
from torchcontrib.optim import SWA
import datetime
import matplotlib.pyplot as plt 
from tqdm import tqdm

In [4]:

train = pd.read_csv("./drive/MyDrive/데이콘/train.csv", encoding = 'euc-kr')

# 시간 관련 변수들
date_time = pd.to_datetime(train.DateTime)
#일자
train['Date'] = date_time.dt.date
train = train.groupby(train['Date']).sum().reset_index()  


def log_trans(x):
  return np.log(1+x)

def minmax(x):
  return (x-x.min())/(x.max()-x.min())


for target in ['사용자', '세션', '신규방문자', '페이지뷰']:
  train[target+'_mean'] = (train[target].rolling(60).mean())
  train[target+'_std'] = (train[target].rolling(60).std())


# 보간해줌
train.iloc[479,1:] = (train.iloc[477,1:] + train.iloc[480,1:]) // 2
train.iloc[478,1:] = (train.iloc[477,1:] + train.iloc[479,1:]) // 2

# ts feature 생성 
for target in ['사용자', '세션', '신규방문자', '페이지뷰']:

    #normalizing
    train[target] = (train[target] - train[target+'_mean']) / (train[target+'_std'] + 1e-5)
    train[target+'_mean'] = log_trans(train[target+'_mean'])
    train[target+'_std'] = log_trans(train[target+'_std'])
    
train = train.dropna()

In [5]:
train

Unnamed: 0,Date,사용자,세션,신규방문자,페이지뷰,사용자_mean,사용자_std,세션_mean,세션_std,신규방문자_mean,신규방문자_std,페이지뷰_mean,페이지뷰_std
59,2018-11-07,-0.556295,-0.584247,-0.384991,-0.432610,5.160300,4.607746,5.132066,4.561907,3.878811,3.885817,7.059603,6.741687
60,2018-11-08,-0.383759,-0.352675,0.001401,-0.468571,5.146040,4.598840,5.118892,4.553833,3.869811,3.883374,7.044077,6.738265
61,2018-11-09,-0.374020,-0.355272,-0.162480,-0.229845,5.133148,4.592309,5.107258,4.548561,3.865630,3.883535,7.027093,6.727741
62,2018-11-10,-0.348252,-0.372892,0.277985,-0.154622,5.113593,4.569798,5.088625,4.528572,3.866328,3.883715,7.011935,6.718447
63,2018-11-11,-0.219710,-0.251182,0.190051,0.441710,5.097628,4.552946,5.072984,4.512766,3.870159,3.884008,6.994514,6.691493
...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,2020-11-04,2.359355,2.389423,2.271748,2.233255,7.821449,6.755353,7.803183,6.743263,6.387665,5.584357,10.959621,10.114925
788,2020-11-05,1.845771,1.789555,1.629254,1.754383,7.836429,6.781409,7.818108,6.767000,6.407375,5.599400,10.977834,10.137575
789,2020-11-06,1.258113,1.230108,0.797956,1.134281,7.841165,6.793336,7.822912,6.778599,6.413377,5.604898,10.985730,10.148753
790,2020-11-07,-0.077260,-0.088545,-0.283087,-0.067939,7.840641,6.793387,7.822338,6.778666,6.411326,5.605585,10.987362,10.148065


In [6]:
def make_data(df, window_size=30):
  # in 180
  input_window = window_size
  # out 7
  output_window = 7

  window_x = np.zeros((df.shape[0] - (input_window + output_window), input_window, df.shape[1]-1))
  window_y = np.zeros((df.shape[0] - (input_window + output_window), output_window, 12))

  for start in range(df.shape[0] - (input_window + output_window)):
      end = start + input_window    
      window_x[start,:, :] = df.iloc[start : end, 1: ].values
      window_y[start,:, :] = df.iloc[end   : end + output_window, 1: 13].values


  return window_x, window_y

In [7]:
class DatasetWindows(Dataset):
  def __init__(self, df, input_days):

    x, y = make_data(df, input_days)
    
    self.x = torch.tensor(x, dtype=torch.float32).cuda()
    self.y = torch.tensor(y, dtype=torch.float32).cuda()
  
  def __len__(self):
    return len(self.x)
    
  def __getitem__(self, idx):
    return self.x[idx,...], self.y[idx,...]

In [8]:
# in 61 out 61일
# B x 61 x 75
# embedding
# B x 61 x 4
class SimpleLinear(nn.Module):
    def __init__(self, input_days, embedded_dim, hidden_size, num_layers, batch_out_p):
        super(SimpleLinear, self).__init__()

        self.embedding = nn.Sequential(nn.Linear(12, embedded_dim),
                                       nn.Dropout(batch_out_p),
                                       nn.ReLU())
        
        self.LSTM = nn.Sequential(nn.LSTM(input_size = input_days,
                            hidden_size = hidden_size,
                            num_layers=num_layers,
                            batch_first=True))
        
        self.last = nn.Sequential(nn.Linear(embedded_dim, 12))
    
    def forward(self, x_time):
        x_time = self.embedding(x_time)
        x_time = x_time.transpose(-2,-1)
        x_time,_ = self.LSTM(x_time)
        x_time = x_time[...,-7:]
        x_time = x_time.transpose(-2,-1)
        out_time = self.last(x_time)
        
        return out_time

In [17]:
class ModelManager():
  def __init__(self, model_name, embedded_dim, hidden_size, num_layers, batch_out_p, df, device='gpu', cv=1):
    super(ModelManager, self).__init__()
    self.cv = cv
    # CV 구현..
    self.models = []
    self.dataloders = []
    self.days = [90, 120, 180, 360]

    for i in range(cv):  
      input_days = self.days[i%len(self.days)]
      model =  model_name(input_days, embedded_dim, hidden_size, num_layers, batch_out_p)
      if device == 'gpu':
        model =  model.cuda()
      self.models.append(model)

      cv_set = df
      dataset = DatasetWindows(cv_set, input_days)
      self.dataloders.append(DataLoader(dataset, batch_size=30,  num_workers=0, pin_memory=False,
                                        shuffle=True))
    
  def fit(self, num_epochs=500, lr=1e-2 ,log=False, val_set=None, train_set=None):
    
    def get_val_loss():
      val_loss = self.make_val_plot(val_set, train_set, get_loss=True)
      return val_loss

    for i in tqdm(range(self.cv)):  
      # Train model
      model = self.models[i]
      dataloader = self.dataloders[i]
      adam = torch.optim.Adam(model.parameters(), lr=lr)
      optimizer = SWA(adam, swa_start=10, swa_freq=5, swa_lr=lr/2)
      criterion = nn.MSELoss(reduction='mean')
      running_loss = 0

      for epoch in range(num_epochs):
          for idx,data in enumerate(dataloader):
              x, y = data
              train_pred = model(x)
              loss = criterion(train_pred, y)
              optimizer.zero_grad()
              loss.backward()
              optimizer.step()
              running_loss += loss.item()
          if epoch % 100 == 99:
            if log:
              if val_set.any().any():
                val_loss = get_val_loss()
                print(f"{epoch+1} Epochs train MSE: {running_loss/(100*idx):1.5f}, ", f"{epoch+1} Epochs val MSE: {val_loss:1.5f}")
              else: 
                print(f"{epoch+1} Epochs train MSE: {loss.item():1.5f}")
              running_loss = 0

  @staticmethod
  def inverse_log(x):
    # 32bit 사용시 단위문제 발생..
    return np.exp(x)-1

  def predict(self, df):
    input_days = self.days[0]
    last_observe = df.iloc[-input_days:,1:]
    inp_tensor = torch.tensor(last_observe.values, dtype=torch.float32).cuda()
    inp_tensor = inp_tensor.unsqueeze(0)
    # normalizing 추가
    # AR
    for i in range(61//7 + 1):
      model = self.models[0].eval()
      prediction = model(inp_tensor)

      mu_7 = self.inverse_log(prediction[...,[4,6,8,10]].cpu().detach().squeeze().numpy())
      sigma_7 = self.inverse_log(prediction[...,[5,7,9,11]].cpu().detach().squeeze().numpy())
      x_7 = prediction[...,:4].cpu().detach().squeeze().numpy()

      if i > 0:
        mu = np.concatenate((mu, mu_7), axis=0)
        sigma = np.concatenate((sigma, sigma_7), axis=0)
        x = np.concatenate((x, x_7), axis=0)
      else:
        mu = mu_7
        sigma = sigma_7
        x =x_7

      inp_tensor = torch.cat([inp_tensor[:,7:,:], prediction], dim=1)
      
    prediction = ((x * sigma) + mu)

    self.models[0] = model.train()

    for i in range(1, self.cv): 
      input_days = self.days[i]
      last_observe = df.iloc[-input_days:,1:]
      inp_tensor = torch.tensor(last_observe.values, dtype=torch.float32).cuda()
      inp_tensor = inp_tensor.unsqueeze(0)
     
      for j in range(61//7 + 1):
        model = self.models[i].eval()
        prediction = model(inp_tensor)
        mu_7 = self.inverse_log(prediction[...,[4,6,8,10]].cpu().detach().squeeze().numpy())
        sigma_7 = self.inverse_log(prediction[...,[5,7,9,11]].cpu().detach().squeeze().numpy())
        x_7 = prediction[...,:4].cpu().detach().squeeze().numpy()
        
        if j > 0:
          mu = np.concatenate((mu, mu_7), axis=0)
          sigma_7 = np.concatenate((sigma, sigma_7), axis=0)
          x_7 = np.concatenate((x, x_7), axis=0)

        else:
          mu = mu_7
          sigma = sigma_7
          x =x_7
        
        inp_tensor = torch.cat([inp_tensor[:,7:,:], prediction],dim=1)

      pred = ((x * sigma) + mu)
      prediction += pred
      self.models[i] = model.train()
    
    prediction /= self.cv
    return  prediction.astype(np.int64)[:61,:]

  def make_val_plot(self, val_df, train_df, get_loss=False):
    #predict
    pred = self.predict(train_df)
    label = val_df

    def dacon_rmse(true, pred):  
      w0 = 1095.214646
      w1 = 1086.728535
      w2 = 268.070707
      w3 = 24236.194444

      score = (np.sqrt(np.mean(np.square(true[:,0] - pred[:,0]))) / w0 + 
               np.sqrt(np.mean(np.square(true[:,1] - pred[:,1]))) / w1 + 
               np.sqrt(np.mean(np.square(true[:,2] - pred[:,2]))) / w2 + 
               np.sqrt(np.mean(np.square(true[:,3] - pred[:,3]))) / w3  )
      return score

    if get_loss:
      return dacon_rmse(label.values, pred)

    for idx, key in enumerate(val_df.columns):
      plt.plot(figsize=(20,10))
      plt.plot(label.index,pred[:,idx])
      plt.plot(label[key])
      plt.legend(['predict', 'label'])
      plt.show()
      
    loss = dacon_rmse(label.iloc[:,:4].values, pred)
    print('RMSE : ' + str(loss))

In [None]:
# train val split
train_whole = train
train_split = train[:-61]

val = train.iloc[-61:, 1:5]
mu = (np.exp(train.iloc[-61:, [5,7,9,11]])-1).values
sigma = (np.exp(train.iloc[-61:, [6,8,10,12]])-1).values
val.iloc[:,:] = ((val.values * sigma) + mu).astype(np.int64)

embedded_dim = 256
hidden_size = 256
num_layers = 1
batch_out_p = 0.3

simple = ModelManager(SimpleLinear, embedded_dim, hidden_size, num_layers, batch_out_p, train_split, cv=1)
simple.fit(num_epochs=1000, lr=1e-3, log=True, val_set=val, train_set=train_split)


  0%|          | 0/1 [00:00<?, ?it/s][A

100 Epochs train MSE: 0.85967,  100 Epochs val MSE: 3.11933
200 Epochs train MSE: 0.17689,  200 Epochs val MSE: 3.57001
300 Epochs train MSE: 0.04552,  300 Epochs val MSE: 3.48797
400 Epochs train MSE: 0.02580,  400 Epochs val MSE: 3.62716
