In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

In [2]:
os.chdir('./energy/')

In [3]:
train = pd.read_csv('train.csv', encoding='euc-kr')
test = pd.read_csv('test.csv', encoding='euc-kr')
submission = pd.read_csv('sample_submission.csv', encoding='euc-kr')

In [4]:
train.head(3)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0


In [5]:
test.head(3) #테스트에는 전력사용량이 없다

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,


# 결측치 파악 및 보간

In [6]:
train[['num', '비전기냉방설비운영','태양광보유']]
ice = {}
hot = {}
count = 0

In [7]:
for i in range(0, len(train), len(train)//60):
    count +=1
    ice[count] = train.loc[i, '비전기냉방설비운영']
    hot[count] = train.loc[i, '태양광보유']

In [8]:
for i in range(len(test)):
    test.loc[i, '비전기냉방설비운영'] = ice[test['num'][i]]
    test.loc[i, '태양광보유'] = hot[test['num'][i]]

In [9]:
train.head(3)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0


In [10]:
test.head(3)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0
1,1,2020-08-25 01,,,,,,0.0,0.0
2,1,2020-08-25 02,,,,,,0.0,0.0


In [11]:
def time(x): return int(x[-2:])

In [12]:
#시간, 요일, 주말여부
train['time'] = train['date_time'].apply(lambda x: time(x))
test['time'] = test['date_time'].apply(lambda x: time(x))

In [13]:
train.head(3)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,time
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2


In [14]:
test.head(3)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,time
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,0
1,1,2020-08-25 01,,,,,,0.0,0.0,1
2,1,2020-08-25 02,,,,,,0.0,0.0,2


In [15]:
def weekday(x): return pd.to_datetime(x[:10]).weekday()

In [16]:
#평일 0~4 , 주말 5~6
train['weekday'] = train['date_time'].apply(lambda x: weekday(x))
test['weekday'] = test['date_time'].apply(lambda x: weekday(x))

In [17]:
#평일 0 주말 1
train['weekend'] = train['weekday'].apply(lambda x:0 if x < 4 else 1)
test['weekend'] = test['weekday'].apply(lambda x:0 if x < 4 else 1)

In [18]:
train.head(3)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,time,weekday,weekend
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,0,0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,0,0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,0,0


In [19]:
test.head(3)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,time,weekday,weekend
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,0,1,0
1,1,2020-08-25 01,,,,,,0.0,0.0,1,1,0
2,1,2020-08-25 02,,,,,,0.0,0.0,2,1,0


In [20]:
#기온 풍속 습도 등 기타 결측치도 즉당히 1/3 2/3 수치로 보간해줍시다
test = test.interpolate(method='values')
test.head(3)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,time,weekday,weekend
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,0,1,0
1,1,2020-08-25 01,27.633333,1.366667,75.333333,0.0,0.0,0.0,0.0,1,1,0
2,1,2020-08-25 02,27.466667,1.233333,76.666667,0.0,0.0,0.0,0.0,2,1,0


# 2.모델링

In [21]:
#학습용 set 생성
train.drop('date_time', axis=1, inplace=True) #학습에 불필요한 날짜 제거
train_x = train.drop('전력사용량(kWh)', axis=1) #문제
train_y = train[['전력사용량(kWh)']] #정답지

날짜가 불필요한 데이터???

In [22]:
train_x.head(3)

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,time,weekday,weekend
0,1,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,0,0
1,1,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,0,0
2,1,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,0,0


In [23]:
train_y.head(3)

Unnamed: 0,전력사용량(kWh)
0,8179.056
1,8135.64
2,8107.128


In [24]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=156)

## 추후에 하이퍼 파라미터 변경해보자

train_test_split 이거 모르겠다...   
4개 변수가 각자 어떤 목적으로 사용되는지를 모르겠음  

In [25]:
X_train.head(1)

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,time,weekday,weekend
91868,46,21.0,5.2,80.0,0.0,0.0,1.0,0.0,20,2,0


In [26]:
y_train.head(1)

Unnamed: 0,전력사용량(kWh)
91868,1609.632


In [27]:
# loss function : SMAPE정의
#from sklearn.metrics import mean_absolute_error
def smape(true, pred):
    true = np.array(true)
    pred = np.array(pred)
    return np.mean((np.abs(true-pred)) / (np.abs(true) + np.abs(pred))) 
# *2, *100 은 상수이므로 생략

In [28]:
SMAPE = make_scorer(smape, greater_is_better=False) #smape값이 작아져야하므로 False

In [29]:
def get_best_params(model, params):
    grid_model = GridSearchCV(
    model,
    param_grid = params,
    cv=5,
    scoring=SMAPE)
    
    grid_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=100)
    scr = grid_model.best_score_
    print(f'{model.__class__.__name__} 최적 score 값{scr}')
    return grid_model.best_estimator_ 

In [38]:
#파라미터 후보군 설정
# 어떤 파라미터로 하는게 좋을지 고민된다면 고민하는 것들을 리스트 안에 다 넣어보세요 알아서 골라줄겁니다.
# 저는 예시로 learning_rate만 0.1 or 0.01 중 더 좋은걸 골라달라고 했습니다.

params = {}
params['boosting_type'] = ['gbdt'] #gbdt?? 
params['objective'] = ['regression']
params['n_estimators'] = [1]
params['learning_rate'] = [0.001]
params['subsample'] = [1]

params['n_estimators'] = [50, 100, 150, 200, 250,300,350,400,450,500]  
params['learning_rate'] = [0.1,0.01,0.03,0.001,0.003]   
  
=> 이렇게 돌렸더니   

LGBMRegressor 최적 score 값-0.29181390036584975  
LGBMRegressor(learning_rate=0.001, n_estimators=50, objective='regression',subsample=1)  
  
이렇게 나온다  

이곳 hyper param 추후 변경 예정

In [39]:
#모델 정의
model = LGBMRegressor(params)

In [40]:
#학습 진행
best_lgbm = get_best_params(model, params)
best_lgbm

LGBMRegressor 최적 score 값-0.2915382416967771


LGBMRegressor(learning_rate=0.001, n_estimators=1, objective='regression',
              subsample=1)

In [41]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred = best_lgbm.predict(X_train)

mse_score = mean_squared_error(y_train, y_pred)
r2_score = r2_score(y_train, y_pred)
print('MSE:', mse_score)
print('R2 :',r2_score)

MSE: 4184009.556690822
R2 : 0.001697887018490274


In [42]:
#from sklearn.externals import joblib
import joblib #얘는 어떤 lib일까?
joblib.dump(best_lgbm, 'best_lgbm.pkl')
load_lgbm = joblib.load('best_lgbm.pkl')

# 3. 제출용 데이터 

In [43]:
#모델에 넣기 위해 날짜 칼럼 제거
test_x = test.drop('date_time', axis=1)

In [44]:
print(test_x)

       num     기온(°C)   풍속(m/s)      습도(%)  강수량(mm, 6시간)  일조(hr, 3시간)  \
0        1  27.800000  1.500000  74.000000           0.0     0.000000   
1        1  27.633333  1.366667  75.333333           0.0     0.000000   
2        1  27.466667  1.233333  76.666667           0.0     0.000000   
3        1  27.300000  1.100000  78.000000           0.0     0.000000   
4        1  26.900000  1.166667  79.666667           0.0     0.000000   
...    ...        ...       ...        ...           ...          ...   
10075   60  28.633333  3.566667  66.000000           0.0     0.533333   
10076   60  28.266667  3.833333  67.000000           0.0     0.266667   
10077   60  27.900000  4.100000  68.000000           0.0     0.000000   
10078   60  27.900000  4.100000  68.000000           0.0     0.000000   
10079   60  27.900000  4.100000  68.000000           0.0     0.000000   

       비전기냉방설비운영  태양광보유  time  weekday  weekend  
0            0.0    0.0     0        1        0  
1            0.0    0.0

In [45]:
#모델 예측
submission_y = best_lgbm.predict(test_x)

In [46]:
#submission.csv 생성
test['answer'] = submission_y
test['num_date_time'] = test.apply(lambda x: str(x['num']) + ' ' + x['date_time'], axis=1)
submission = test[['num_date_time', 'answer']]
submission.to_csv('advanced_lgbm_submission.csv',index=False, encoding='euc-kr')
submission

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,2321.005514
1,1 2020-08-25 01,2321.005514
2,1 2020-08-25 02,2321.005514
3,1 2020-08-25 03,2321.005514
4,1 2020-08-25 04,2321.005514
...,...,...
10075,60 2020-08-31 19,2315.666557
10076,60 2020-08-31 20,2314.630035
10077,60 2020-08-31 21,2314.630035
10078,60 2020-08-31 22,2314.630035
