# DACON 전력사용량 예측 AI 경진대회

- XGBoost

In [1]:
# 라이브러리 호출
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

## 데이터 전처리
- building_info.csv
- train.csv
- test.csv


In [2]:
# Load train.csv, discard the columns that are not needed, and rename the
# remaining columns to short English names.
# The rename is positional: it relies on exactly six columns being left, in
# this order, after the drop.
train = (
    pd.read_csv('train_dataset/train.csv')
    .drop(columns=['일시', '강수량(mm)', '일조(hr)', '일사(MJ/m2)'])
)
train.columns = ['num_date_time', 'BNum', 'Temp', 'Wind', 'Humid', 'Elec']
# Discard every row that still has a missing value.
train = train.dropna()


In [3]:
# Load test.csv, discard the columns that are not needed, and rename the
# remaining columns to the same short names used for the training frame.
# The rename is positional: it relies on exactly five columns being left, in
# this order, after the drop.
test = pd.read_csv('test/test.csv').drop(columns=['일시', '강수량(mm)'])
test.columns = ['num_date_time', 'BNum', 'Temp', 'Wind', 'Humid']

In [4]:
# Extract the hour for train.csv: the last two characters of
# 'num_date_time' are parsed as an integer and stored in a new 'Time' column.
train['Time'] = train['num_date_time'].str[-2:].astype('int64')

In [5]:
# Extract the hour for test.csv: the last two characters of
# 'num_date_time' are parsed as an integer and stored in a new 'Time' column.
test['Time'] = test['num_date_time'].str[-2:].astype('int64')

In [6]:
# Separate the target from the features of train.csv.
# Y is the 'Elec' column; X is every other column except the
# 'num_date_time' identifier.
Y = train['Elec']
X = train.drop(columns=['num_date_time', 'Elec'])

## train, test 분리

In [7]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# split (and therefore the reported score) is reproducible between runs; 42
# matches the seed given to the model.
# NOTE(review): this is a random row-wise split; if rows are hourly readings,
# a time-based hold-out may reflect the forecasting task better - confirm.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [8]:
# Display the training feature frame as the cell output.
train_x

Unnamed: 0,BNum,Temp,Wind,Humid,Time
136569,67,26.4,1.9,92.0,9
128807,64,20.0,1.7,85.0,23
8390,5,22.8,2.9,74.0,14
100067,50,17.2,0.9,88.0,11
161381,80,17.4,1.8,89.0,5
...,...,...,...,...,...
117014,58,30.7,3.7,65.0,14
15544,8,26.5,0.8,81.0,16
116678,58,25.2,2.4,54.0,14
98814,49,25.7,1.8,99.0,6


## 모델 학습
- GradientBoostingRegressor
- XGBoost

In [9]:
# Weighted MSE objective; see
# https://saturncloud.io/blog/customizing-loss-functions-in-scikitlearn/
def MSE_W(alpha=1):
    """Build a custom objective for an asymmetrically weighted squared error.

    Rows where the label exceeds the prediction (positive residual) have
    their gradient and Hessian scaled by ``alpha``; all other rows use the
    plain squared-error derivatives.

    Args:
        alpha: Weight applied to rows with a positive residual.

    Returns:
        A function ``(label, pred) -> (grad, hess)``.
    """
    def weighted_mse_fixed(label, pred):
        """Return the per-row gradient and Hessian with respect to ``pred``."""
        err = (label - pred).astype("float")
        underpredicted = err > 0
        # d/dpred of w * (label - pred)^2 is -2 * w * (label - pred).
        gradient = np.where(underpredicted, -2 * alpha * err, -2 * err)
        # The second derivative is the constant 2 * w.
        hessian = np.where(underpredicted, 2 * alpha, 2.0)
        return gradient, hessian
    return weighted_mse_fixed

In [29]:
# Model definition.
# No custom objective is passed here, so the estimator uses its default one.
model = xgb.XGBRegressor(
    # Boosting schedule: many rounds with a small step size.
    n_estimators=1000,
    eta=0.01,
    # Tree complexity.
    max_depth=15,
    # L1 / L2 regularisation on the leaf weights.
    reg_alpha=5,
    reg_lambda=5,
    # Reproducibility and device selection.
    seed=42,
    gpu_id=0,
)

In [30]:
# Fit on the training split while monitoring both the training split and the
# hold-out split; training stops early after 300 rounds without improvement.
# NOTE(review): the hold-out split used for early stopping is the same one
# scored later, so that score is optimistic - confirm this is acceptable.
# NOTE(review): recent XGBoost releases expect `early_stopping_rounds` on the
# estimator constructor rather than on fit(); check the installed version.
model.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (test_x, test_y)],
    early_stopping_rounds=300,
    verbose=False,
)



## 모델 평가
- SMAPE

In [31]:
# Evaluation function (the metric used by this competition).
def SMAPE(true, pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``mean(|true - pred| / (|true| + |pred|)) * 100``.

    Both inputs are converted to plain float arrays first, so values are
    compared by position: two pandas objects with different indexes are not
    realigned by label, and plain lists are accepted. Positions where both
    values are zero contribute an error of 0 instead of NaN (0 / 0).

    NOTE(review): the textbook SMAPE divides the denominator by 2, which
    gives values twice as large as this function; confirm which form the
    competition scores with before comparing against the leaderboard.

    Args:
        true: Array-like of ground-truth values.
        pred: Array-like of predicted values, same length as ``true``.

    Returns:
        The metric as a float in the range [0, 100].
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)
    # Skip the division where scale == 0 and keep the preset 0 there.
    ratio = np.divide(
        abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return np.mean(ratio) * 100

In [32]:
# Predict the target for the hold-out feature split.
pred = model.predict(test_x)

In [33]:
# Score the hold-out predictions; the value is shown as the cell output.
SMAPE(test_y, pred)

4.734672655069357

## 결과 제출

In [34]:
# Build the submission: one row per test record, with its identifier and the
# model's prediction on every test column except that identifier.
ansXGB = pd.DataFrame({
    'num_date_time': test['num_date_time'],
    'answer': model.predict(test.drop(columns='num_date_time')),
})
# Write the submission without the index column.
ansXGB.to_csv('ansXGB.csv', index=False)


In [35]:
# Display the submission frame as the cell output.
ansXGB

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1861.048584
1,1_20220825 01,1448.675293
2,1_20220825 02,1632.581543
3,1_20220825 03,1217.203247
4,1_20220825 04,954.250549
...,...,...
16795,100_20220831 19,874.658020
16796,100_20220831 20,728.286499
16797,100_20220831 21,643.138794
16798,100_20220831 22,610.183167
