# DACON 전력사용량 예측 AI 경진대회

- XGBoost

In [170]:
# 라이브러리 호출
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

## 데이터 전처리
- building_info.csv
- train.csv
- test.csv


In [30]:
# Load train.csv, drop the columns that are not needed, and rename the rest.
train = pd.read_csv('train_dataset/train.csv')
train = train.drop(columns=['일시', '강수량(mm)', '일조(hr)', '일사(MJ/m2)'])
# The six names are assigned positionally to the columns left after the drop.
train.columns = ['num_date_time', 'BNum', 'Temp', 'Wind', 'Humid', 'Elec']
# Discard every row that still contains a missing value.
train = train.dropna()


In [31]:
# Load test.csv, drop the columns that are not needed, and rename the rest.
test = pd.read_csv('test/test.csv')
test = test.drop(columns=['일시', '강수량(mm)'])
# The five names are assigned positionally to the columns left after the drop.
test.columns = ['num_date_time', 'BNum', 'Temp', 'Wind', 'Humid']

In [32]:
# Extract only the time from train.csv: the last two characters of every
# num_date_time value are parsed as an integer and stored in 'Time'.
tmp = train['num_date_time'].map(lambda stamp: int(stamp[-2:])).tolist()
train['Time'] = tmp

In [33]:
# Extract only the time from test.csv: the last two characters of every
# num_date_time value are parsed as an integer and stored in 'Time'.
tmp = test['num_date_time'].map(lambda stamp: int(stamp[-2:])).tolist()
test['Time'] = tmp

In [34]:
# Separate the target from train.csv: Y is the 'Elec' column, X is every
# remaining column except the 'num_date_time' identifier.
Y = train['Elec']
X = train.drop(columns=['num_date_time', 'Elec'])

## train, test 분리

In [35]:
# Hold out 20% of the labelled rows for validation. The split is seeded so the
# validation score can be reproduced between runs (the model below is seeded
# the same way). Note that test_x / test_y are a hold-out taken from
# train.csv; they are not the rows of test.csv.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [36]:
# Display the training feature frame as the cell output.
train_x

Unnamed: 0,BNum,Temp,Wind,Humid,Time
48658,24,26.7,1.3,78.0,10
163276,81,21.0,0.9,83.0,4
14219,7,29.2,2.1,70.0,11
33233,17,29.6,2.7,74.0,17
60469,30,29.3,1.1,66.0,13
...,...,...,...,...,...
3754,2,24.2,1.7,93.0,10
28249,14,23.0,1.8,92.0,1
14639,8,18.1,0.1,100.0,23
173233,85,23.4,2.2,82.0,1


## 모델 학습
- GradientBoostingRegressor
- XGBoost

In [157]:
# Weighted MSE; see https://saturncloud.io/blog/customizing-loss-functions-in-scikitlearn/
def MSE_W(y_true, y_pred):
    """Return a mean squared error that penalises under-prediction more.

    Each squared error is multiplied by 1.2 where ``y_true > y_pred`` and by
    1.0 everywhere else; the weighted squared errors are then averaged.

    The weighting is applied element-wise, so the function accepts arrays as
    well as scalars. (A plain ``if y_true > y_pred`` raises ``ValueError`` for
    arrays with more than one element.) For scalar inputs the result is the
    same as applying the 1.2 factor to the single squared error.

    Args:
        y_true: Observed value(s); a scalar or array-like of numbers.
        y_pred: Predicted value(s); broadcastable against ``y_true``.

    Returns:
        The weighted mean squared error as a NumPy float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # 1.2 where the prediction falls short of the truth, 1.0 otherwise.
    weights = np.where(y_true > y_pred, 1.2, 1.0)
    return np.mean(weights * (y_true - y_pred) ** 2)


In [165]:
# Model definition
model = GradientBoostingRegressor(
    n_estimators=100,          # number of boosting stages
    learning_rate=1,           # contribution of each tree is not shrunk
    max_depth=7,               # depth limit of each regression tree
    criterion='friedman_mse',  # split-quality measure
    loss='huber',              # loss function being optimised
    random_state=42,           # fixed seed for repeatable fits
)

In [166]:
# Fit the model on the training part of the train/validation split.
model.fit(train_x, train_y)

## 모델 평가
- SMAPE

In [167]:
# Evaluation function (the metric used by this competition)
def SMAPE(true, pred):
    """Return the symmetric mean absolute percentage error, scaled to 0-100.

    Each element contributes ``|true - pred| / (|true| + |pred|)``; the mean
    of those ratios is multiplied by 100. Because the denominator is the sum
    of the two magnitudes, every ratio lies in [0, 1].

    Elements where both ``true`` and ``pred`` are zero would divide 0 by 0 and
    turn the whole mean into NaN; they are counted as zero error instead.

    Args:
        true: Observed values; a scalar or array-like of numbers.
        pred: Predicted values; broadcastable against ``true``.

    Returns:
        The SMAPE value as a NumPy float.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    abs_error = np.abs(true - pred)
    magnitude = np.abs(true) + np.abs(pred)
    # Only divide where the denominator is non-zero; other slots keep the 0.0
    # they were initialised with.
    ratio = np.divide(
        abs_error,
        magnitude,
        out=np.zeros_like(abs_error),
        where=magnitude != 0,
    )
    return np.mean(ratio) * 100

In [168]:
# Predict on the held-out part of the train/validation split.
pred = model.predict(test_x)

In [169]:
# Score the hold-out predictions; the value is shown as the cell output.
SMAPE(test_y, pred)

6.9892066904747

## 결과 제출

In [129]:
# Build the submission: the row identifier from test.csv next to the model's
# prediction for every remaining column, then write it without the index.
ans = pd.DataFrame({
    'num_date_time': test['num_date_time'],
    'answer': model.predict(test.drop(columns='num_date_time')),
})
ans.to_csv('ans.csv', index=False)


In [130]:
# Display the submission frame as the cell output.
ans

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1820.976417
1,1_20220825 01,1942.746457
2,1_20220825 02,1750.081486
3,1_20220825 03,1116.202304
4,1_20220825 04,942.111747
...,...,...
16795,100_20220831 19,840.161109
16796,100_20220831 20,748.019439
16797,100_20220831 21,707.016966
16798,100_20220831 22,611.822732
