# DACON 전력사용량 예측 AI 경진대회

- XGBoost

In [10]:
# 라이브러리 호출
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

## 데이터 전처리
- building_info.csv
- train.csv
- test.csv


In [2]:
# Load train.csv, discard the columns the model does not use, and switch to
# short English column names.
train = pd.read_csv('train_dataset/train.csv')
train.drop(
    columns=['일시', '강수량(mm)', '일조(hr)', '일사(MJ/m2)'],
    inplace=True,
)
# Names are assigned by position, so exactly six columns must remain after the
# drop, in the order listed here.
train.columns = [
    'num_date_time',
    'BNum',
    'Temp',
    'Wind',
    'Humid',
    'Elec',
]
# Remove every row that still contains a missing value.
train.dropna(inplace=True)


In [3]:
# Load test.csv, discard the columns the model does not use, and switch to
# short English column names.
test = pd.read_csv('test/test.csv')
test.drop(columns=['일시', '강수량(mm)'], inplace=True)
# Names are assigned by position, so exactly five columns must remain after the
# drop, in the order listed here.
test.columns = [
    'num_date_time',
    'BNum',
    'Temp',
    'Wind',
    'Humid',
]

In [4]:
# Derive an integer 'Time' feature for the training rows: the last two
# characters of each num_date_time string are parsed as an int.
tmp = train['num_date_time'].str[-2:].map(int).tolist()
train['Time'] = tmp

In [5]:
# Derive an integer 'Time' feature for the test rows: the last two characters
# of each num_date_time string are parsed as an int.
tmp = test['num_date_time'].str[-2:].map(int).tolist()
test['Time'] = tmp

In [6]:
# Split the training frame into the target ('Elec') and the feature matrix.
# The row identifier 'num_date_time' is not used as a feature.
Y = train['Elec']
X = train.drop(columns=['num_date_time', 'Elec'])

## train, test 분리

In [7]:
# Hold out 20% of the rows for validation. The split is seeded so the
# validation set, and therefore the reported score, is the same on every run
# (42 matches the seed given to the model).
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [8]:
# Display the training feature matrix (notebook cell output) as a sanity check.
train_x

Unnamed: 0,BNum,Temp,Wind,Humid,Time
36946,19,24.3,1.6,65.0,10
48559,24,27.3,3.7,89.0,7
153885,76,28.1,2.2,82.0,21
12298,7,23.9,1.8,57.0,10
89516,44,26.2,1.1,90.0,20
...,...,...,...,...,...
16847,9,24.0,3.3,84.0,23
68000,34,25.0,6.1,95.0,8
173603,86,23.1,1.0,54.0,11
36631,18,25.0,0.9,94.0,7


## 모델 학습
- GradientBoostingRegressor
- XGBoost

In [22]:
# Weighted MSE objective; adapted from
# https://saturncloud.io/blog/customizing-loss-functions-in-scikitlearn/
def MSE_W(alpha=1):
    """Build an asymmetric squared-error objective.

    Rows where the label is above the prediction (positive residual) have
    their gradient and hessian scaled by ``alpha``; all other rows use the
    plain squared-error derivatives.

    Args:
        alpha: Multiplier applied to rows with a positive residual.

    Returns:
        A function ``(label, pred) -> (grad, hess)`` giving the first and
        second derivatives of the loss with respect to ``pred``.
    """
    def weighted_mse_fixed(label, pred):
        err = (label - pred).astype("float")
        under_predicted = err > 0
        # d/dpred of w * (label - pred)^2 is -2 * w * (label - pred).
        grad = np.where(under_predicted, -2*alpha*err, -2*err)
        # The second derivative is the constant 2 * w.
        hess = np.where(under_predicted, 2*alpha, 2.0)
        return grad, hess

    return weighted_mse_fixed


In [113]:
# Model definition: an XGBoost regressor trained with the custom weighted-MSE
# objective (alpha=100).
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`, yet
# the two are given different values here (0.1 vs 1). Confirm which one is
# intended and remove the other.
# NOTE(review): `gpu_id` presumably only has an effect together with a GPU
# tree method, which is not configured here — confirm.
model_params = {
    'learning_rate': 1,
    'n_estimators': 1000,
    'eta': 0.1,
    'max_depth': 10,
    'seed': 42,
    'gpu_id': 1,
}
model = xgb.XGBRegressor(**model_params)
model.set_params(objective=MSE_W(100))

In [114]:
# Train the model, evaluating on both the training and the hold-out split
# after each boosting round.
# NOTE(review): per the XGBoost docs, early stopping watches the last
# eval_set entry, i.e. the hold-out split that is also used for the SMAPE
# score below — that score is therefore likely optimistic.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
# estimator rather than in fit() — confirm against the installed version.
model.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (test_x, test_y)],
    early_stopping_rounds=300,
    verbose=False,
)





## 모델 평가
- SMAPE

In [115]:
# Evaluation function (the metric this notebook uses for the competition).
def SMAPE(true, pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|true - pred| / (|true| + |pred|)``; the mean
    of those terms is multiplied by 100. The denominator is not halved, so
    the result lies in [0, 100].

    Args:
        true: Actual values (array-like: ndarray, Series, list, scalar).
        pred: Predicted values, broadcastable against ``true``. Values are
            compared element-wise by position.

    Returns:
        The score as a NumPy float. An element where both ``true`` and
        ``pred`` are 0 counts as zero error instead of producing NaN.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)
    # Guarded division: where scale == 0 both values are 0 (a perfect
    # prediction), so the term stays at the 0.0 pre-filled in `out`.
    terms = np.divide(
        abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return np.mean(terms) * 100

In [116]:
# Predict on the hold-out split produced by train_test_split.
pred = model.predict(test_x)

In [118]:
# Score the hold-out predictions; the value is shown as the cell output.
SMAPE(test_y, pred)

7.978121004440116

## 결과 제출

In [119]:
# Build the submission: one prediction per test row, keyed by num_date_time.
# The identifier column is dropped before predicting so only the feature
# columns reach the model.
test_features = test.drop(columns='num_date_time')
ans = pd.DataFrame({
    'num_date_time': test['num_date_time'],
    'answer': model.predict(test_features),
})
# Write without the index so the file contains only the two columns above.
ans.to_csv('ans.csv', index=False)


In [120]:
# Display the submission frame (notebook cell output) as a sanity check.
ans

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1733.909790
1,1_20220825 01,1733.909790
2,1_20220825 02,1733.909790
3,1_20220825 03,1037.352051
4,1_20220825 04,1037.352051
...,...,...
16795,100_20220831 19,884.696411
16796,100_20220831 20,679.274353
16797,100_20220831 21,639.320740
16798,100_20220831 22,639.320740
