In [1]:
import pandas as pd
import numpy as np
import random 
import os

from sklearn.preprocessing import MinMaxScaler, LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

In [2]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and configures
    cuDNN for reproducible execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the running interpreter reads
    # PYTHONHASHSEED only at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes kernels per run and can pick different
    # algorithms each time, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(42)  # fix the seed

# Data Loading

In [3]:
# Read the training set, the test set and the submission template, in that order.
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train2.csv',
        'data/test2.csv',
        'data/sample_submission.csv',
    )
)

# Data Pre-processing

In [4]:
# Remove the two weather columns that are not carried forward as model inputs.
unused_columns = ['sunshine', 'radiation']
train_df = train_df.drop(columns=unused_columns)
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,temp,rain,wind,humidity,power,month,day,time,건물유형,area,cooling_area
0,1_20220601 00,1,20220601 00,18.6,2.302924,0.9,42.0,1085.28,6,1,0,건물기타,110634.0,39570.0
1,1_20220601 01,1,20220601 01,18.0,2.302924,1.1,45.0,1047.36,6,1,1,건물기타,110634.0,39570.0
2,1_20220601 02,1,20220601 02,17.7,2.302924,1.5,45.0,974.88,6,1,2,건물기타,110634.0,39570.0
3,1_20220601 03,1,20220601 03,16.7,2.302924,1.4,48.0,953.76,6,1,3,건물기타,110634.0,39570.0
4,1_20220601 04,1,20220601 04,18.4,2.302924,2.8,43.0,986.4,6,1,4,건물기타,110634.0,39570.0


In [5]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,num_date_time,건물번호,일시,temp,rain,wind,humidity,month,day,time,건물유형,area,cooling_area
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8,25,0,건물기타,110634.0,39570.0
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8,25,1,건물기타,110634.0,39570.0
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8,25,2,건물기타,110634.0,39570.0
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8,25,3,건물기타,110634.0,39570.0
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8,25,4,건물기타,110634.0,39570.0


In [6]:
# Reorder columns: move the target column `power` to the last position while
# keeping every other column in its current order. Selecting by name instead
# of by position keeps this cell idempotent: re-running it leaves the frame
# unchanged instead of shifting a different column to the end.
feature_columns = [col for col in train_df.columns if col != 'power']
train_df = train_df[feature_columns + ['power']]
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,temp,rain,wind,humidity,month,day,time,건물유형,area,cooling_area,power
0,1_20220601 00,1,20220601 00,18.6,2.302924,0.9,42.0,6,1,0,건물기타,110634.0,39570.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,2.302924,1.1,45.0,6,1,1,건물기타,110634.0,39570.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,2.302924,1.5,45.0,6,1,2,건물기타,110634.0,39570.0,974.88
3,1_20220601 03,1,20220601 03,16.7,2.302924,1.4,48.0,6,1,3,건물기타,110634.0,39570.0,953.76
4,1_20220601 04,1,20220601 04,18.4,2.302924,2.8,43.0,6,1,4,건물기타,110634.0,39570.0,986.4


In [7]:
# Encode the building-type labels as integers. The encoder is fitted on the
# training frame only and then applied to both frames.
le = LabelEncoder()
le.fit(train_df['건물유형'])
train_df['건물유형'] = le.transform(train_df['건물유형'])
test_df['건물유형'] = le.transform(test_df['건물유형'])

# Cast the calendar columns of the training frame to float64.
for calendar_col in ('month', 'day', 'time'):
    train_df[calendar_col] = train_df[calendar_col].astype('float64')
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,temp,rain,wind,humidity,month,day,time,건물유형,area,cooling_area,power
0,1_20220601 00,1,20220601 00,18.6,2.302924,0.9,42.0,6.0,1.0,0.0,0,110634.0,39570.0,1085.28
1,1_20220601 01,1,20220601 01,18.0,2.302924,1.1,45.0,6.0,1.0,1.0,0,110634.0,39570.0,1047.36
2,1_20220601 02,1,20220601 02,17.7,2.302924,1.5,45.0,6.0,1.0,2.0,0,110634.0,39570.0,974.88
3,1_20220601 03,1,20220601 03,16.7,2.302924,1.4,48.0,6.0,1.0,3.0,0,110634.0,39570.0,953.76
4,1_20220601 04,1,20220601 04,18.4,2.302924,2.8,43.0,6.0,1.0,4.0,0,110634.0,39570.0,986.4


In [8]:
# Preview the test frame again after the building-type column was label-encoded.
test_df.head()

Unnamed: 0,num_date_time,건물번호,일시,temp,rain,wind,humidity,month,day,time,건물유형,area,cooling_area
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8,25,0,0,110634.0,39570.0
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8,25,1,0,110634.0,39570.0
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8,25,2,0,110634.0,39570.0
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8,25,3,0,110634.0,39570.0
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8,25,4,0,110634.0,39570.0


# Hyperparameters

In [33]:
# Hyperparameters
# NOTE(review): input_size must equal the number of columns in the scaled
# arrays fed to the model (the test array in this notebook has 11 columns).
# NOTE(review): num_layers is applied to each of the two LSTM stages of the
# model — confirm that this depth is intended.
input_size = 11  # number of features
hidden_size = 128
num_layers = 10
output_size = 1
num_epochs = 30
window_size = 24  # size of the time window used for prediction
batch_size = 128
learning_rate = 0.001

# Dataset

In [34]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a 2-D array of consecutive rows.

    Item ``idx`` is the pair ``(x, y)``: ``x`` holds rows
    ``idx .. idx + window_size - 1`` with all columns, and ``y`` is the last
    column of the row that immediately follows the window.

    Args:
        df: 2-D array supporting NumPy-style ``[rows, cols]`` indexing and
            ``.shape`` (e.g. ``DataFrame.values``; despite the name, a pandas
            DataFrame itself does not support this indexing).
        window_size: Number of consecutive rows in each input window.

    NOTE(review): windows are taken over consecutive rows regardless of what
    the rows represent; if several independent series are stacked in one
    array, windows near the seams will mix them — confirm against the caller.
    """

    def __init__(self, df, window_size):
        self.df = df
        self.window_size = window_size

    def __len__(self):
        # Clamp at zero: a negative value would make ``len()`` raise
        # ValueError when the array holds fewer rows than one window.
        return max(0, len(self.df) - self.window_size)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for window ``idx``; negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            # Without this check an out-of-range index would silently yield a
            # truncated (or empty) window instead of failing.
            raise IndexError(f"window index {idx} out of range for {n_windows} windows")

        stop = idx + self.window_size
        x = torch.tensor(self.df[idx:stop, :], dtype=torch.float)
        if self.df.shape[1] > 1:
            y = torch.tensor(self.df[stop, -1], dtype=torch.float)
        else:
            # Single-column input: no target is produced. Note that the
            # default DataLoader collation cannot batch ``None`` values.
            y = None
        return x, y

def create_data_loader(df, window_size, batch_size, shuffle=False):
    """Wrap an array in a ``TimeSeriesDataset`` and return a ``DataLoader`` over it.

    Args:
        df: 2-D array passed straight to ``TimeSeriesDataset``.
        window_size: Number of rows in each input window.
        batch_size: Number of windows per batch.
        shuffle: Whether the loader reshuffles the windows every epoch.
            Defaults to ``False`` (sequential order), which matches the
            previous hard-coded behaviour.

    Returns:
        A ``DataLoader`` yielding ``(inputs, targets)`` batches.
    """
    dataset = TimeSeriesDataset(df, window_size)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [35]:
# Min-max scale every remaining column (the features and the trailing `power`
# target) after dropping the identifier columns, then build the training loader.
scaler = MinMaxScaler()
train_features = train_df.drop(columns=['num_date_time', '건물번호', '일시'])
train_data = scaler.fit_transform(train_features.to_numpy())
train_loader = create_data_loader(train_data, window_size, batch_size)

In [36]:
class LSTM(nn.Module):
    """Two multi-layer LSTM stages joined by Linear + ReLU, with a linear head.

    The final hidden and cell states of the first stage initialise the second
    stage, and the prediction is read from the last time step only.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state and of the middle Linear layer.
        num_layers: Number of stacked layers inside each LSTM stage.
        output_size: Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Submodules are created in this fixed order so that seeded parameter
        # initialisation and state_dict keys stay stable.
        self.lstm1 = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.lstm2 = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first input ``(batch, seq, input_size)`` to ``(batch, output_size)``."""
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        zero_state = (
            torch.zeros(state_shape, device=x.device),
            torch.zeros(state_shape, device=x.device),
        )

        sequence, carried_state = self.lstm1(x, zero_state)
        sequence = self.relu(self.linear(sequence))
        # The second stage starts from the states the first stage ended with.
        sequence, _ = self.lstm2(sequence, carried_state)

        # Only the last time step feeds the output head.
        return self.fc(sequence[:, -1, :])

In [37]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"current device: {device}")

# Build the network on the chosen device.
model = LSTM(input_size, hidden_size, num_layers, output_size)
model = model.to(device)

# Mean-squared-error objective optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

current device: cuda


In [38]:
for epoch in range(num_epochs):
    # Switch to training mode at the start of every epoch. The inference cell
    # of this notebook calls model.eval(); without this line, re-running the
    # training cell afterwards would back-propagate through a model that is
    # still in eval mode.
    model.train()
    print('Epoch {} ----------' .format(epoch+1))
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        # Add a trailing axis so the targets line up with the model output,
        # which has one column per predicted value.
        labels = labels.unsqueeze(1).to(device)

        # Forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of the current batch every 300 steps.
        if (i+1) % 300 == 0:
            print ('Step [{}/{}], Loss: {:.4f}' 
                   .format(i+1, len(train_loader), loss.item()))
    print()

Epoch 1 ----------
Step [300/1594], Loss: 0.0007
Step [600/1594], Loss: 0.0049
Step [900/1594], Loss: 0.0019
Step [1200/1594], Loss: 0.0040
Step [1500/1594], Loss: 0.0006

Epoch 2 ----------
Step [300/1594], Loss: 0.0007
Step [600/1594], Loss: 0.0049
Step [900/1594], Loss: 0.0021
Step [1200/1594], Loss: 0.0039
Step [1500/1594], Loss: 0.0008

Epoch 3 ----------
Step [300/1594], Loss: 0.0007
Step [600/1594], Loss: 0.0077
Step [900/1594], Loss: 0.0013
Step [1200/1594], Loss: 0.0041
Step [1500/1594], Loss: 0.0006

Epoch 4 ----------
Step [300/1594], Loss: 0.0007
Step [600/1594], Loss: 0.0128
Step [900/1594], Loss: 0.0016
Step [1200/1594], Loss: 0.0043
Step [1500/1594], Loss: 0.0006

Epoch 5 ----------
Step [300/1594], Loss: 0.0008
Step [600/1594], Loss: 0.0035
Step [900/1594], Loss: 0.0024
Step [1200/1594], Loss: 0.0038
Step [1500/1594], Loss: 0.0009

Epoch 6 ----------
Step [300/1594], Loss: 0.0007
Step [600/1594], Loss: 0.0107
Step [900/1594], Loss: 0.0013
Step [1200/1594], Loss: 0.0041


# Test Dataset

In [105]:
# Take the last `window_size` rows of the training data as warm-up context for
# the first test windows. Positional slicing (.iloc) is used so the result
# does not depend on train_df carrying a default 0..n-1 index.
# NOTE(review): these are the final rows of the whole training frame, not the
# final rows per building; if several buildings are stacked in train_df, the
# warm-up context belongs to a different building than the first test rows —
# confirm whether per-building context is needed.
id_columns = ['num_date_time', '건물번호', '일시']
last_train_data = train_df.drop(columns=id_columns).iloc[len(train_df) - window_size:]

# Convert to floating-point so the test columns match the training dtypes.
for float_col in ('humidity', 'month', 'day', 'time'):
    test_df[float_col] = test_df[float_col].astype('float64')

# Create the power-consumption column as a zero placeholder; the inference
# loop later overwrites it with the model's predictions. `assign` appends the
# column row-by-row, so no index alignment between two frames is involved.
final_df = test_df.drop(columns=id_columns).assign(power=0.0)

In [106]:
# Stack the training warm-up rows on top of the test rows with a fresh 0..n-1 index.
test_df = pd.concat([last_train_data, final_df], ignore_index=True)
# Apply the scaler that was fitted on the training data.
test_data = scaler.transform(test_df.to_numpy())
test_data.shape

(16824, 11)

In [107]:
# Dataset & DataLoader: one window per batch, in sequential order.
test_loader = create_data_loader(test_data, window_size, 1)
test_dataset = test_loader.dataset

# Inference

In [108]:
model.eval()

test_predictions = []

# Autoregressive roll-out: each prediction is written back into the target
# column of test_data so that later windows see it as history.
with torch.no_grad():
    for i in range(test_data.shape[0] - window_size):
        x = torch.as_tensor(
            test_data[i:i+window_size, :], dtype=torch.float32, device=device
        )
        # Add the batch axis and pull the single predicted value out as a
        # Python float, instead of relying on implicit coercion of a device
        # tensor when assigning into the NumPy array.
        prediction = model(x.unsqueeze(0)).item()

        test_data[i+window_size, -1] = prediction  # update the input for the next windows
        test_predictions.append(prediction)  # store the prediction

In [109]:
# Undo the min-max scaling, then keep the target column of the test rows only
# (the first `window_size` rows are the training warm-up context).
restored = scaler.inverse_transform(test_data)
predictions = restored[window_size:, -1]

In [111]:
# Write the restored predictions into the submission template and display it.
# NOTE(review): every row shown in the saved output of this cell carries the
# same value, which suggests the model produces a constant prediction —
# investigate before submitting.
sample_submission['answer'] = predictions
sample_submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,939.109048
1,1_20220825 01,939.109048
2,1_20220825 02,939.109048
3,1_20220825 03,939.109048
4,1_20220825 04,939.109048
...,...,...
16795,100_20220831 19,939.109048
16796,100_20220831 20,939.109048
16797,100_20220831 21,939.109048
16798,100_20220831 22,939.109048
