In [1]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ------------ --------------------------- 0.4/1.3 MB 13.2 MB/s eta 0:00:01
   --------------- ------------------------ 0.5/1.3 MB 10.9 MB/s eta 0:00:01
   ----------------------------- ---------- 1.0/1.3 MB 7.8 MB/s eta 0:00:01
   ---------------------------------------  1.3/1.3 MB 8.5 MB/s eta 0:00:01
   ---------------------------------------- 1.3/1.3 MB 7.7 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import random
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb

In [3]:
train_df = pd.read_csv('./Data/train.csv')
building_info = pd.read_csv('./Data/building_info.csv')
test_df = pd.read_csv('./Data/test.csv')

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [5]:
train_df = train_df.fillna(0)

In [6]:
# 파생변수 THI, CDH
train_df['THI'] = 9/5*train_df['기온(C)'] - 0.55*(1-train_df['습도(%)']/100)*(9/5*train_df['습도(%)']-26)+32

def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])

for num in range(1,101,1):
    temp = train_df[train_df['건물번호'] == num]
    cdh = CDH(temp['기온(C)'].values)
    cdhs = np.concatenate([cdhs, cdh])
    
train_df['CDH'] = cdhs

In [7]:
# Weekday 변수 추가
import datetime

def to_datetime(s):
    """
    Args:
        s: ex) '20220601 01'
    Returns:
        weekday: 0~6(int), 0: 월요일, 1: 화요일, ...
    """
    s = s.split()[0]  # 20220601
    date = datetime.datetime.strptime(s, '%Y%m%d')
    weekday = date.weekday()  
    return weekday

In [8]:
train_df['Weekday'] = train_df.apply(lambda x:to_datetime(x['일시']), axis=1)

In [9]:
# 전날과의 차이 변수 추가
for i in range(1, 101):
    df = train_df[train_df['건물번호'] == i]
    train_df.loc[df.index, '기온_gap'] = df['기온(C)'] - df.shift(1)['기온(C)']
    train_df.loc[df.index, '풍속_gap'] = df['풍속(m/s)'] - df.shift(1)['풍속(m/s)']
    train_df.loc[df.index, '습도_gap'] = df['습도(%)'] - df.shift(1)['습도(%)']

In [10]:
train_df['기온_gap'] = train_df['기온_gap'].fillna(0)
train_df['풍속_gap'] = train_df['풍속_gap'].fillna(0)
train_df['습도_gap'] = train_df['습도_gap'].fillna(0)

In [11]:
def train_test_split(df, th):
    train = df[df['일시'].str[:8].astype(int) < th].reset_index(drop=True)
    test = df[df['일시'].str[:8].astype(int) >= th].reset_index(drop=True)
    return train, test

In [12]:
def preprocess_x(df):
    to_remove_columns = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)']
    df = df.fillna(0)
    #시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
    df['month'] = df['일시'].apply(lambda x : int(x[4:6]))
    df['day'] = df['일시'].apply(lambda x : int(x[6:8]))
    df['time'] = df['일시'].apply(lambda x : int(x[9:11]))

    df['holiday'] = df.apply(lambda x : 1 if x['day']==6 else 0, axis = 1)

    df['holiday'][(df['month']==6) & (df['day']==1)] = 1
    df['holiday'][(df['month']==6) & (df['day']==6)] = 1
    df['holiday'][(df['month']==8) & (df['day']==15)] = 1

    df['sin_time'] = np.sin(2*np.pi*df.time/24)
    df['cos_time'] = np.cos(2*np.pi*df.time/24)

    df = df.merge(building_info.iloc[:, :4])
    df['건물유형'] = df['건물유형'].astype('category').cat.codes

    # 요금 가중치
    df['fare_w'] = 0

    # 일반, 산업
    for i in [0,1,3,4,5,6,8,9,10,11] :
        df['fare_w'][(df['건물유형']==i) & ((df['time'] >= 9) & (df['time'] <10))] = 1.144
        df['fare_w'][(df['건물유형']==i) & ((df['time'] >= 12) & (df['time'] <13))] = 1.144
        df['fare_w'][(df['건물유형']==i) & ((df['time'] >= 17) & (df['time'] <22))] = 1.144
        df['fare_w'][(df['건물유형']==i) & ((df['time'] >= 10) & (df['time'] <12))] = 1.965
        df['fare_w'][(df['건물유형']==i) & ((df['time'] >= 13) & (df['time'] <17))] = 1.965
        df['fare_w'][(df['건물유형']==i) & ((df['time'] >= 8) & (df['time'] <9))] = 0.615
        df['fare_w'][(df['건물유형']==i) & ((df['time'] >= 22) | (df['time'] <8))] = 0.451
    # 교육
    df['fare_w'][(df['건물유형']==2) & ((df['time'] >= 9) & (df['time'] <10))] = 0.944
    df['fare_w'][(df['건물유형']==2) & ((df['time'] >= 12) & (df['time'] <13))] = 0.944
    df['fare_w'][(df['건물유형']==2) & ((df['time'] >= 17) & (df['time'] <22))] = 0.944
    df['fare_w'][(df['건물유형']==2) & ((df['time'] >= 10) & (df['time'] <12))] = 1.603
    df['fare_w'][(df['건물유형']==2) & ((df['time'] >= 13) & (df['time'] <17))] = 1.603
    df['fare_w'][(df['건물유형']==2) & ((df['time'] >= 8) & (df['time'] <9))] = 0.497
    df['fare_w'][(df['건물유형']==2) & ((df['time'] >= 22) | (df['time'] <8))] = 0.451

    # 토요일
    df['fare_w'][(df['Weekday']==5) & ((df['time'] >= 10) & (df['time'] <12))] = 1.144
    df['fare_w'][(df['Weekday']==5) & ((df['time'] >= 13) & (df['time'] <17))] = 1.144
    df['fare_w'][(df['Weekday']==5) & ((df['time'] >= 10) & (df['time'] <12)) & (df['건물유형']==2)] = 0.944
    # 공휴일
    df['fare_w'][(df['holiday']==1) & ((df['time'] >= 10) & (df['time'] <12))] = 0.615
    df['fare_w'][(df['holiday']==1) & ((df['time'] >= 13) & (df['time'] <17))] = 0.615
    df['fare_w'][(df['holiday']==1) & ((df['time'] >= 10) & (df['time'] <12)) & (df['건물유형']==2)] = 0.497

    # 주택
    df['fare_w'][(df['건물유형']==7)] = 0.782

    for c in to_remove_columns:
        if c in df.columns:
            df = df.drop(columns=[c])
    return df

In [13]:
date_th = 20220818
train, val = train_test_split(train_df, date_th)
train_x = preprocess_x(train)
train_y = train['전력소비량(kWh)']

valid_x = preprocess_x(val)
valid_y = val['전력소비량(kWh)']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['holiday'][(df['month']==6) & (df['day']==1)] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['holiday'][(df['month']==6) & (df['day']==6)] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['holiday'][(df['month']==8) & (df['day']==15)] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fare

In [14]:
def validate_multi(valid_x, valid_y, models):
    """
    Args:
        models: dict, {1: model1, 2: model2, ..., 100: model100}
    """
    preds = []
    
    for i in range(1, 101):
        _x = valid_x[valid_x['건물번호'] == i]
        _x = _x.drop(columns=['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)'])
        pred = models[i].predict(_x).tolist()
        preds.extend(pred)
        
    preds = np.array(preds)
    smape_score, mae_score = SMAPE(valid_y, preds), mae(valid_y, preds)
    
    return smape_score, mae_score

In [15]:
def train_multiple_models(train_x, train_y, n_estimators=100, lr=0.05, md=-1):
    models = {}
    for i in tqdm(range(1, 101)):
        _x = train_x[train_x['건물번호'] == i]
        _x = _x.drop(columns=['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)'])
        _y = train_y[_x.index]
        model_lgb = lgb.LGBMRegressor(objective='regression', n_estimators=n_estimators, learning_rate=lr, max_depth = md, verbose=-1)
        model_lgb.fit(_x, _y)
        models[i] = model_lgb
    return models

In [16]:
def SMAPE(y, pred):
    smape = abs((y - pred))/((abs(y) + abs(pred)) / 2) * 100
    smape = np.mean(smape)
    return smape

def mae(y, pred):
    return np.mean(abs(y-pred))

def validate(valid_x, valid_y, model):
    pred = model.predict(valid_x)
    smape_score, mae_score = SMAPE(valid_y, pred), mae(valid_y, pred)
    return smape_score, mae_score

In [24]:
models = train_multiple_models(train_x, train_y, 100, 0.04, 10)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:21<00:00,  4.75it/s]


In [25]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

SMAPE: 5.379266940723972
MAE: 105.87189425466114


In [None]:
test_df['THI'] = 9/5*test_df['기온(C)'] - 0.55*(1-test_df['습도(%)']/100)*(9/5*test_df['습도(%)']-26)+32

cdhs = np.array([])
for num in range(1,101,1):
    temp = test_df[test_df['건물번호'] == num]
    cdh = CDH(temp['기온(C)'].values)
    cdhs = np.concatenate([cdhs, cdh])
test_df['CDH'] = cdhs

In [None]:
test_df['Weekday'] = test_df.apply(lambda x:to_datetime(x['일시']), axis=1)

In [None]:
for i in range(1, 101):
    df = test_df[test_df['건물번호'] == i]
    test_df.loc[df.index, '기온_gap'] = df['기온(C)'] - df.shift(1)['기온(C)']
    test_df.loc[df.index, '풍속_gap'] = df['풍속(m/s)'] - df.shift(1)['풍속(m/s)']
    test_df.loc[df.index, '습도_gap'] = df['습도(%)'] - df.shift(1)['습도(%)']

In [None]:
test_df['기온_gap'] = test_df['기온_gap'].fillna(0)
test_df['풍속_gap'] = test_df['풍속_gap'].fillna(0)
test_df['습도_gap'] = test_df['습도_gap'].fillna(0)

In [None]:
test_df = preprocess_x(test_df)

In [None]:
preds_real = []
for i in tqdm(range(1, 101)):
    _x = test_df[test_df['건물번호'] == i]
    _x = _x.drop(columns=['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)'])
    pred = models[i].predict(_x).tolist()
    preds_real.extend(pred)

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission['answer'] = preds_real
submission.to_csv('lgbm.csv', index = False)