# DACON 전력사용량 예측 AI 경진대회

- XGBoost

In [93]:
# 라이브러리 호출
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

## 데이터 전처리
- building_info.csv
- train.csv
- test.csv


In [94]:
# Load train.csv, drop the columns that are not used, and rename the remaining ones.
train = pd.read_csv('train_dataset/train.csv')
train.drop(
    columns=['일시', '강수량(mm)', '일조(hr)', '일사(MJ/m2)'],
    inplace=True,
)
# Positional rename: the six columns left after the drop get short English names.
train.columns = ['num_date_time', 'BNum', 'Temp', 'Wind', 'Humid', 'Elec']
# Discard every row that still contains a missing value.
train.dropna(inplace=True)


In [95]:
# Load test.csv, drop the columns that are not used, and rename the remaining ones.
test = pd.read_csv('test/test.csv')
test.drop(columns=['일시', '강수량(mm)'], inplace=True)
# Positional rename: the five columns left after the drop get short English names.
test.columns = ['num_date_time', 'BNum', 'Temp', 'Wind', 'Humid']

In [96]:
# train.csv: extract the hour, i.e. the last two characters of num_date_time.
tmp = train['num_date_time'].map(lambda stamp: int(stamp[-2:])).tolist()
train['Time'] = tmp

In [97]:
# train.csv: extract the day of month, the two characters just before the
# separator that precedes the hour in num_date_time.
tmp = train['num_date_time'].map(lambda stamp: int(stamp[-5:-3])).tolist()
train['Date'] = tmp

In [98]:
# train.csv: extract the month, the two characters just before the day in
# num_date_time.
tmp = train['num_date_time'].map(lambda stamp: int(stamp[-7:-5])).tolist()
train['Month'] = tmp

In [99]:
# test.csv: extract the hour, i.e. the last two characters of num_date_time.
tmp = test['num_date_time'].map(lambda stamp: int(stamp[-2:])).tolist()
test['Time'] = tmp

In [100]:
# test.csv: extract the day of month, the two characters just before the
# separator that precedes the hour in num_date_time.
tmp = test['num_date_time'].map(lambda stamp: int(stamp[-5:-3])).tolist()
test['Date'] = tmp

In [101]:
# test.csv: extract the month, the two characters just before the day in
# num_date_time.
tmp = test['num_date_time'].map(lambda stamp: int(stamp[-7:-5])).tolist()
test['Month'] = tmp

In [102]:
# Separate train.csv into the target (Elec) and the feature matrix.
# The raw num_date_time key is excluded from the features.
Y = train['Elec']
X = train.drop(columns=['num_date_time', 'Elec'])

## train, test 분리

In [103]:
# Hold out 20% of the rows for validation. The split is seeded so that the
# hold-out set, the early-stopping point and the reported score are
# reproducible across runs (42 matches the seed passed to the model).
# NOTE(review): this is a random row-wise split of hourly records, so the
# hold-out hours are interleaved with training hours — confirm that this is
# the intended validation scheme for a forecasting task.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [104]:
# Display the training features to inspect the result of the split.
train_x

Unnamed: 0,BNum,Temp,Wind,Humid,Time,Date,Month
148590,73,23.6,1.1,100.0,6,11,8
21901,11,30.5,4.7,72.0,13,2,8
34165,17,28.6,2.6,85.0,13,3,8
35166,18,19.7,0.7,100.0,6,21,6
85959,43,28.1,1.5,49.0,15,12,6
...,...,...,...,...,...,...,...
66950,33,25.5,2.7,96.0,14,9,8
173386,85,26.0,4.2,65.0,10,24,8
100656,50,25.3,3.6,87.0,0,30,6
85519,42,23.0,0.8,88.0,7,18,8


## 모델 학습
- GradientBoostingRegressor
- XGBoost

In [108]:
# Weighted MSE objective; see
# https://saturncloud.io/blog/customizing-loss-functions-in-scikitlearn/
def MSE_W(alpha=3):
    """Build an asymmetric squared-error objective.

    Samples where the label exceeds the prediction (positive residual) are
    weighted by ``alpha``; all other samples keep a weight of 1.

    Args:
        alpha: Multiplier applied to the loss of under-predicted samples.

    Returns:
        A function ``(label, pred) -> (grad, hess)`` giving the first and
        second derivatives of the weighted squared error w.r.t. ``pred``.
    """
    def weighted_mse_fixed(label, pred):
        """Return the per-sample gradient and hessian of the weighted loss."""
        diff = (label - pred).astype("float")
        # True where the model predicted below the label.
        under = diff > 0
        # d/dpred of w * (label - pred)^2 is -2 * w * (label - pred).
        grad = np.where(under, -2 * alpha * diff, -2 * diff)
        # The second derivative is the constant 2 * w.
        hess = np.where(under, 2 * alpha, 2.0)
        return grad, hess

    return weighted_mse_fixed

In [127]:
# Define the model: an XGBoost regressor with the default objective.
model = xgb.XGBRegressor(
    # Boosting schedule: many rounds with a small step size.
    n_estimators=1000,
    eta=0.01,
    # Tree complexity: deep trees, with a minimum loss reduction per split.
    max_depth=15,
    gamma=1,
    # L1 / L2 regularisation on the leaf weights.
    reg_alpha=5,
    reg_lambda=5,
    # Reproducibility and device selection.
    # NOTE(review): no GPU tree_method is set here — confirm gpu_id has any effect.
    seed=42,
    gpu_id=0,
)

In [128]:
# Train the regressor while tracking the metric on both the training rows and
# the hold-out rows; training stops once 300 rounds pass without improvement.
# NOTE(review): the hold-out set doubles as the early-stopping set here, so
# any score computed on it afterwards is likely optimistic.
# NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
# estimator constructor rather than on fit() — confirm against the installed
# version.
model.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (test_x, test_y)],
    early_stopping_rounds=300,
    verbose=False,
)



## 모델 평가
- SMAPE

In [129]:
# Evaluation function (the metric used by this competition).
def SMAPE(true, pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |true - pred| / (|true| + |pred|))``.

    Both inputs are converted to float NumPy arrays first, so lists, arrays
    and pandas Series are accepted and compared position by position (no
    index alignment takes place).

    A sample whose true value and prediction are both zero contributes an
    error of 0 instead of the NaN that 0/0 would produce, so a single exact
    zero no longer turns the whole score into NaN.

    Args:
        true: Ground-truth values (scalar or array-like).
        pred: Predicted values, broadcastable against ``true``.

    Returns:
        The SMAPE score as a float in the range [0, 200]; NaN if the input is
        empty or contains NaN.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    denom = np.abs(true) + np.abs(pred)
    # denom == 0 implies true == pred == 0, hence a zero numerator; dividing
    # by 1 there yields the intended 0 contribution without a 0/0 warning.
    safe_denom = np.where(denom == 0, 1.0, denom)
    return 2 * np.mean(np.abs(true - pred) / safe_denom) * 100

In [130]:
# Predict the target for the hold-out feature rows.
pred = model.predict(test_x)

In [131]:
# Score the hold-out predictions with the competition metric.
# NOTE(review): test_x/test_y were also passed to fit() as an eval_set with
# early stopping, so this score is likely optimistic — confirm.
SMAPE(test_y, pred)

5.659556544844626

## 결과 제출

In [132]:
# Build the submission table: one prediction per num_date_time key of
# test.csv, using every test column except the key as model input.
ansXGB = pd.DataFrame(
    {
        'num_date_time': test['num_date_time'],
        'answer': model.predict(test.drop(columns='num_date_time')),
    }
)
# Write the submission file without the row index.
ansXGB.to_csv('ansXGB.csv', index=False)


In [133]:
# Display the submission table for a visual check.
ansXGB

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2060.836182
1,1_20220825 01,2033.545288
2,1_20220825 02,1979.461548
3,1_20220825 03,1932.427734
4,1_20220825 04,1818.896240
...,...,...
16795,100_20220831 19,986.890564
16796,100_20220831 20,855.643066
16797,100_20220831 21,695.974548
16798,100_20220831 22,613.997620
