# DACON 전력사용량 예측 AI 경진대회

- XGBoost

In [1]:
# 라이브러리 호출
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

## 데이터 전처리
- building_info.csv (이 노트북에서는 불러오지 않음)
- train.csv
- test.csv


In [2]:
# Load train.csv, discard the columns that are not used as features,
# and switch to short English column names.
train = pd.read_csv('train_dataset/train.csv')
train.drop(
    columns=['일시', '강수량(mm)', '일조(hr)', '일사(MJ/m2)'],
    inplace=True,
)
# Names are assigned by position, so this relies on the CSV column order.
train.columns = [
    'num_date_time',
    'BNum',
    'Temp',
    'Wind',
    'Humid',
    'Elec',
]
# Remove every row that still has a missing value in any remaining column.
train.dropna(axis=0, inplace=True)


In [3]:
# Load test.csv, discard the columns that are not used as features,
# and switch to the same short English column names as the training frame.
test = pd.read_csv('test/test.csv')
test.drop(columns=['일시', '강수량(mm)'], inplace=True)
# Names are assigned by position, so this relies on the CSV column order.
test.columns = [
    'num_date_time',
    'BNum',
    'Temp',
    'Wind',
    'Humid',
]

In [4]:
# train.csv: hour of day = last two characters of num_date_time
# (e.g. '1_20220825 00' -> 0).
train['Time'] = train['num_date_time'].map(lambda stamp: int(stamp[-2:]))

In [5]:
# train.csv: day of month = the two characters before the space and hour
# (e.g. '1_20220825 00' -> 25).
train['Date'] = train['num_date_time'].map(lambda stamp: int(stamp[-5:-3]))

In [6]:
# train.csv: month = the two characters preceding the day of month
# (e.g. '1_20220825 00' -> 8).
train['Month'] = train['num_date_time'].map(lambda stamp: int(stamp[-7:-5]))

In [7]:
# test.csv: hour of day = last two characters of num_date_time
# (e.g. '1_20220825 00' -> 0).
test['Time'] = test['num_date_time'].map(lambda stamp: int(stamp[-2:]))

In [8]:
# test.csv: day of month = the two characters before the space and hour
# (e.g. '1_20220825 00' -> 25).
test['Date'] = test['num_date_time'].map(lambda stamp: int(stamp[-5:-3]))

In [9]:
# test.csv: month = the two characters preceding the day of month
# (e.g. '1_20220825 00' -> 8).
test['Month'] = test['num_date_time'].map(lambda stamp: int(stamp[-7:-5]))

In [10]:
# Split the training frame into the target (Elec) and the feature matrix.
# The raw num_date_time string is dropped because its parts already live in
# the Time / Date / Month columns.
Y = train['Elec']
X = train.drop(columns=['num_date_time', 'Elec'])

## train, test 분리

In [11]:
# Hold out 20% of the rows for validation. random_state is fixed (same value
# as the model's seed) so the split, and therefore the reported SMAPE, is
# reproducible from run to run.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [12]:
# Display the training feature matrix produced by the split.
train_x

Unnamed: 0,BNum,Temp,Wind,Humid,Time,Date,Month
169183,83,23.4,1.4,87.0,7,19,8
80097,40,27.1,3.3,89.0,9,23,6
87017,43,31.1,1.4,68.0,17,26,7
197197,97,30.7,5.6,70.0,13,27,7
90297,45,24.7,2.3,82.0,9,23,6
...,...,...,...,...,...,...,...
108428,54,19.4,3.9,80.0,20,13,6
145514,72,26.1,3.5,96.0,2,29,6
51608,26,25.6,1.5,78.0,8,26,6
142161,70,30.0,2.9,64.0,9,29,7


## 모델 학습
- GradientBoostingRegressor (이 노트북에서는 사용하지 않음)
- XGBoost

In [13]:
# Weighted MSE objective; see
# https://saturncloud.io/blog/customizing-loss-functions-in-scikitlearn/
def MSE_W(alpha=1):
    """Build an asymmetric squared-error objective.

    Samples where the label exceeds the prediction (positive residual) have
    their gradient and hessian scaled by ``alpha``; all other samples use the
    plain squared-error derivatives.

    Args:
        alpha: Multiplier applied to samples with a positive residual.

    Returns:
        A callable ``(label, pred) -> (grad, hess)`` returning the first and
        second derivatives of the loss with respect to ``pred``.
    """
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        # Per-sample weight: alpha where label > pred, otherwise 1.
        weight = np.where(residual > 0, alpha, 1.0)
        grad = -2.0 * weight * residual
        hess = 2.0 * weight
        return grad, hess

    return weighted_mse_fixed

In [14]:
# Model definition: collect the hyper-parameters first, then build the
# regressor from them.
xgb_params = {
    'booster': 'gbtree',
    'subsample': 0.7,
    'colsample_bytree': 1,
    'n_estimators': 3000,
    'eta': 0.01,
    'max_depth': 15,
    'seed': 42,
    # NOTE(review): whether gpu_id alone enables GPU training depends on the
    # installed xgboost version; confirm.
    'gpu_id': 0,
    'reg_lambda': 5,
    'reg_alpha': 5,
    'gamma': 1,
}
model = xgb.XGBRegressor(**xgb_params)
# Train with the asymmetric squared error: samples where the label exceeds
# the prediction are weighted by 3.
model.set_params(objective=MSE_W(3))

In [15]:
# Evaluate on both the training split and the hold-out split while fitting.
# Early stopping gives up after 300 rounds without improvement
# (presumably judged on the last eval_set entry; see the XGBoost docs).
eval_pairs = [(train_x, train_y), (test_x, test_y)]
model.fit(
    train_x,
    train_y,
    eval_set=eval_pairs,
    early_stopping_rounds=300,
    verbose=False,
)



## 모델 평가
- SMAPE

In [16]:
# Evaluation function (the metric used by this competition).
def SMAPE(true, pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 * 2 * mean(|true - pred| / (|true| + |pred|))``.
    A sample where both ``true`` and ``pred`` are exactly 0 would be ``0/0``;
    it is counted as zero error instead of turning the whole score into NaN.

    Args:
        true: Array-like of actual values.
        pred: Array-like of predicted values, same shape as ``true``.
            Inputs are compared positionally (any pandas index is ignored).

    Returns:
        The SMAPE score as a float in the range [0, 200].
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)
    # Divide only where the denominator is non-zero; other entries stay 0.
    ratio = np.divide(
        abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return 2 * np.mean(ratio) * 100

In [17]:
# Predict on the hold-out split created by train_test_split.
pred = model.predict(test_x)

In [18]:
# Score the hold-out predictions with the competition metric (in percent).
# NOTE(review): test_x/test_y were also passed as an eval_set during fit with
# early stopping, so this score may be optimistic.
SMAPE(test_y, pred)

5.232840303583486

## 결과 제출

In [19]:
# Build the submission: one prediction per test row, keyed by num_date_time.
# The identifier column is removed before predicting, leaving the test
# columns in their existing order as the model input.
test_features = test.drop(columns='num_date_time')
ansXGB = pd.DataFrame({
    'num_date_time': test['num_date_time'],
    'answer': model.predict(test_features),
})
ansXGB.to_csv('ansXGB.csv', index=False)


In [20]:
# Display the submission frame that was written to ansXGB.csv.
ansXGB

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1977.320801
1,1_20220825 01,2074.975586
2,1_20220825 02,2038.807373
3,1_20220825 03,2002.721558
4,1_20220825 04,1965.742432
...,...,...
16795,100_20220831 19,893.678833
16796,100_20220831 20,760.837830
16797,100_20220831 21,663.249756
16798,100_20220831 22,613.011902
