# 전처리 ver.2를 사용해 예측 모델링

후보 모델

 - Xgboost
 
 - Random Forest
 
 - Lightgbm

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Folder that holds the competition CSV files.
directory = './energy'

# All three files are read as EUC-KR encoded text.
train_energy = pd.read_csv(directory + '/train.csv', encoding = 'euc-kr')
test_energy = pd.read_csv(directory + '/test.csv', encoding = 'euc-kr')
# Fixed: the path used to be built with './sample_submission.csv', producing
# './energy./sample_submission.csv' (a folder named 'energy.'), which is not
# the folder the other two files are read from.
sample_submission = pd.read_csv(directory + '/sample_submission.csv', encoding = 'euc-kr')

In [3]:
def preprocessing_dataset(train, test):
    """Preprocess the raw train/test tables into per-building model inputs.

    Both inputs are copied first, so the caller's frames are left untouched.

    Steps:
      1. Rename the columns positionally (train: 10 columns, test: 9 columns).
      2. Cast dtypes and parse ``date_time`` with the format ``'%Y-%m-%d %H'``.
      3. Fill the test set: ``operation_non_elec`` / ``solar`` are taken from
         the first training row of each building, and the columns at
         positions 2-6 (``temp`` .. ``sunshine``) are linearly interpolated
         per building.
      4. Add ``dayofweek`` and ``hour``.
      5. Add ``discomfort_index``, ``windchill_temp`` and ``sunshine_rolling``
         (centred 12-row rolling mean of ``sunshine`` per building).
      6. One-hot encode the day of week into seven float columns
         (``Monday`` .. ``Sunday``); no column is dropped.
      7. Split by building and patch hand-picked outliers in ``usage``.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training table. Its 10 columns are renamed, in order, to
        ``num, date_time, usage, temp, wind, humidity, precipitation,
        sunshine, operation_non_elec, solar``.
    test : pandas.DataFrame
        Raw test table with the same columns except ``usage`` (9 columns).

    Returns
    -------
    tuple of (list, list, list)
        ``X_train_list``, ``X_test_list`` (60 feature DataFrames each, one per
        building number 1..60, without ``num``) and ``y_train_list`` (60
        ``usage`` Series with a fresh 0-based index).

    Notes
    -----
    Rows of each building are processed in their input order, so they are
    expected to be chronological. The outlier patch in step 7 addresses fixed
    row positions (up to 1737) and fails if a patched building has fewer rows.
    """
    n_buildings = 60
    building_numbers = range(1, n_buildings + 1)
    dayofweek_label = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    def calculate_discomfort_index(temp, humidity):
        """Return the discomfort index; humidity is divided by 100 (percent scale)."""
        return 9 * temp / 5 - 0.55 * (1 - humidity / 100) * (9 * temp / 5 - 26) + 32

    def calculate_windchill_temp(temp, wind):
        """Return the wind-chill (perceived) temperature."""
        # NOTE(review): this formula is usually stated for wind speed in km/h;
        # confirm the unit of the `wind` column.
        return 13.12 + 0.6215 * temp - 11.37 * (wind ** 0.16) + 0.3965 * (wind ** 0.16) * temp

    def split_by_building(dataset):
        """Return an independent copy of each building's rows, in building order."""
        return [dataset.loc[dataset.num == number].copy() for number in building_numbers]

    def add_sunshine_rolling(dataset):
        """Add the per-building centred rolling mean of `sunshine`."""
        per_building = split_by_building(dataset)
        for rows in per_building:
            rows['sunshine_rolling'] = rows.sunshine.rolling(window=12, min_periods=1, center=True).mean()
        return pd.concat(per_building, axis=0)

    def add_dayofweek_onehot(dataset):
        """Add one float indicator column per weekday (0 = Monday .. 6 = Sunday)."""
        # Column assignment keeps the indicators on the frame's own index.
        # The previous approach (sklearn OneHotEncoder(sparse=False) followed
        # by concat(axis=1) with a fresh RangeIndex) breaks on scikit-learn
        # releases that removed `sparse` and misaligns rows whenever the
        # frame's index is not 0..n-1.
        for code, label in enumerate(dayofweek_label):
            dataset[label] = (dataset['dayofweek'] == code).astype('float64')
        return dataset

    # Work on copies so the originals are not affected.
    train_dataset = train.copy()
    test_dataset = test.copy()

    # 1. Rename the columns.
    train_dataset.columns = ['num', 'date_time', 'usage', 'temp', 'wind', 'humidity',
                             'precipitation', 'sunshine', 'operation_non_elec', 'solar']
    test_dataset.columns = ['num', 'date_time', 'temp', 'wind', 'humidity',
                            'precipitation', 'sunshine', 'operation_non_elec', 'solar']

    # 2. Cast the dtypes of the train/test columns.
    train_dtypes_list = ['category', 'object', 'float64', 'float64', 'float64', 'float64',
                         'float64', 'float64', 'category', 'category']
    test_dtypes_list = ['category', 'object', 'float64', 'float64', 'float64', 'float64',
                        'float64', 'category', 'category']

    train_dataset = train_dataset.astype(dict(zip(train_dataset.columns, train_dtypes_list)))
    test_dataset = test_dataset.astype(dict(zip(test_dataset.columns, test_dtypes_list)))

    train_dataset['date_time'] = pd.to_datetime(train_dataset['date_time'], format='%Y-%m-%d %H')
    test_dataset['date_time'] = pd.to_datetime(test_dataset['date_time'], format='%Y-%m-%d %H')

    # 3-1. Fill test values: take operation_non_elec / solar from train
    #      (first row of each building); the merge key is the shared `num`.
    train_operation_non_elec_solar = (
        train_dataset.loc[:, ['num', 'operation_non_elec', 'solar']]
        .drop_duplicates(subset='num')
        .reset_index(drop=True)
    )
    test_dataset = test_dataset.drop(columns=['operation_non_elec', 'solar'])
    test_dataset = pd.merge(test_dataset, train_operation_non_elec_solar, how='left')

    # 3-2. Fill test values: linear interpolation per building of the columns
    #      at positions 2..6 (temp, wind, humidity, precipitation, sunshine).
    test_per_building = split_by_building(test_dataset)
    for rows in test_per_building:
        for column in range(2, 7):
            rows.iloc[:, column] = rows.iloc[:, column].interpolate(method='linear')
    test_dataset = pd.concat(test_per_building, axis=0)

    # 4. Add day-of-week and hour features.
    train_dataset['dayofweek'] = train_dataset['date_time'].dt.dayofweek
    test_dataset['dayofweek'] = test_dataset['date_time'].dt.dayofweek

    train_dataset['hour'] = train_dataset['date_time'].dt.hour
    test_dataset['hour'] = test_dataset['date_time'].dt.hour

    # 5-1. Derived features: discomfort index and wind-chill temperature.
    train_dataset['discomfort_index'] = calculate_discomfort_index(train_dataset.temp, train_dataset.humidity)
    train_dataset['windchill_temp'] = calculate_windchill_temp(train_dataset.temp, train_dataset.wind)
    test_dataset['discomfort_index'] = calculate_discomfort_index(test_dataset.temp, test_dataset.humidity)
    test_dataset['windchill_temp'] = calculate_windchill_temp(test_dataset.temp, test_dataset.wind)

    # 5-2. Derived features: rolling mean of the sunshine column.
    train_dataset = add_sunshine_rolling(train_dataset)
    test_dataset = add_sunshine_rolling(test_dataset)

    # 6. One-hot encode the day of week. All seven columns are kept (no
    #    reference level is dropped) with the candidate models in mind.
    train_dataset = add_dayofweek_onehot(train_dataset)
    test_dataset = add_dayofweek_onehot(test_dataset)

    # Split into X_train, X_test and y_train, one entry per building.
    X_train = train_dataset.drop(columns=['date_time', 'dayofweek', 'usage', 'sunshine', 'operation_non_elec', 'solar'])
    X_test = test_dataset.drop(columns=['date_time', 'dayofweek', 'sunshine', 'operation_non_elec', 'solar'])

    X_train_list = [X_train.loc[X_train.num == number, :].drop(columns=['num']).copy() for number in building_numbers]
    X_test_list = [X_test.loc[X_test.num == number, :].drop(columns=['num']).copy() for number in building_numbers]

    # The mask is taken from the same frame that is being indexed.
    y_train_list = [
        train_dataset.loc[train_dataset.num == number, 'usage'].copy().reset_index(drop=True)
        for number in building_numbers
    ]

    # 7. Replace outliers in the power usage. Single points become the mean
    #    of their neighbours; longer runs are linearly interpolated between
    #    the surrounding values (rounded to 3 decimals).
    y_train_list[0][204] = (y_train_list[0][203] + y_train_list[0][205]) / 2
    y_train_list[0][1033] = (y_train_list[0][1032] + y_train_list[0][1034]) / 2
    y_train_list[8][1427] = (y_train_list[8][1426] + y_train_list[8][1428]) / 2
    y_train_list[15][634] = (y_train_list[15][633] + y_train_list[15][635]) / 2
    # Two consecutive points both receive the mean of the outer neighbours.
    y_train_list[24][994] = (y_train_list[24][993] + y_train_list[24][996]) / 2
    y_train_list[24][995] = (y_train_list[24][993] + y_train_list[24][996]) / 2
    # .iloc makes the half-open positional range explicit (rows 1644..1648).
    y_train_list[26].iloc[1644:1649] = np.round(np.linspace(y_train_list[26][1643], y_train_list[26][1649], 7)[1:-1], 3)
    y_train_list[30][257] = (y_train_list[30][256] + y_train_list[30][258]) / 2
    y_train_list[32][257] = (y_train_list[32][256] + y_train_list[32][258]) / 2
    y_train_list[35][438] = (y_train_list[35][437] + y_train_list[35][439]) / 2
    # Rows 1733..1736.
    y_train_list[35].iloc[1733:1737] = np.round(np.linspace(y_train_list[35][1732], y_train_list[35][1737], 6)[1:-1], 3)
    y_train_list[44][817] = (y_train_list[44][816] + y_train_list[44][818]) / 2
    y_train_list[51][258] = (y_train_list[51][257] + y_train_list[51][259]) / 2
    y_train_list[54][1643] = (y_train_list[54][1642] + y_train_list[54][1644]) / 2
    y_train_list[59][721] = (y_train_list[59][720] + y_train_list[59][722]) / 2

    # Return the preprocessing result.
    return X_train_list, X_test_list, y_train_list

In [4]:
# Run the preprocessing once; each result is a list with one entry per building.
(X_train_list,
 X_test_list,
 y_train_list) = preprocessing_dataset(train_energy, test_energy)

In [5]:
# Print the column/dtype summary of the first building's feature table.
first_building_features = X_train_list[0]
first_building_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2040 entries, 0 to 2039
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   temp              2040 non-null   float64
 1   wind              2040 non-null   float64
 2   humidity          2040 non-null   float64
 3   precipitation     2040 non-null   float64
 4   hour              2040 non-null   int64  
 5   discomfort_index  2040 non-null   float64
 6   windchill_temp    2040 non-null   float64
 7   sunshine_rolling  2040 non-null   float64
 8   Monday            2040 non-null   float64
 9   Tuesday           2040 non-null   float64
 10  Wednesday         2040 non-null   float64
 11  Thursday          2040 non-null   float64
 12  Friday            2040 non-null   float64
 13  Saturday          2040 non-null   float64
 14  Sunday            2040 non-null   float64
dtypes: float64(14), int64(1)
memory usage: 255.0 KB


# Modeling

In [6]:
import time

# score function, model selection
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

# model
from  xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

# save model
import joblib

In [7]:
# 평가 지표

def smape(y_true, y_pred):
    """Return the SMAPE evaluation metric, in percent.

    Computed as ``100 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The metric; 0 means a perfect prediction.
    """
    # Accept lists / Series as well as arrays; values are compared by position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)

    # When truth and prediction are both 0 the term is 0/0; count it as zero
    # error instead of letting a NaN poison the whole mean.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 * np.mean(ratio)

# Wrap smape as a scorer; lower is better, so the scorer reports negated values.
smape_score = make_scorer(score_func = smape,
                          greater_is_better = False)

In [8]:
# Convert the per-building frames / series into plain NumPy arrays.
X_train_array, y_train_array, X_test_array = [], [], []
for building in range(60):
    X_train_array.append(np.asarray(X_train_list[building]))
    y_train_array.append(np.asarray(y_train_list[building]))
    X_test_array.append(np.asarray(X_test_list[building]))

## Xgboost

In [9]:
# Base XGBoost regressor; only the number of trees is searched.
xgb_reg = XGBRegressor(learning_rate = 0.1, random_state = 20152410)

# Candidate tree counts: 10, 20, ..., 300.
xgb_params = {'n_estimators' : list(range(10, 301, 10))}

# Shared settings for every search: 4-fold CV scored with the SMAPE scorer.
xgb_search_settings = dict(estimator = xgb_reg,
                           param_grid = xgb_params,
                           scoring = smape_score,
                           cv = 4,
                           verbose = 2,
                           n_jobs = 3)

# One independent grid search per building.
xgb_reg_grid = [GridSearchCV(**xgb_search_settings) for _ in range(60)]

In [10]:
# start = time.time()

# print("학습 시작")
# print('\n')

# for index in range(60):
#     print("{}번째 모델 학습중".format(index + 1))
#     xgb_reg_grid[index].fit(X_train_array[index], y_train_array[index])

# end = time.time() - start

# print('\n')
# print("학습 완료")
# print("학습 소요 시간 : {}분 {}초".format(round(end // 60), round(end % 60)))

# 학습 소요 시간 : 36분 12초

In [11]:
# The fitted XGBoost grid searches were saved once with the commented-out
# dump below; the notebook now reloads them instead of refitting.
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source.
# joblib.dump(xgb_reg_grid, './model/01_xgb_reg_grid_list.pkl')
xgb_reg_grid = joblib.load('./model/01_xgb_reg_grid_list.pkl')

In [12]:
# For every building keep the top-ranked row of the CV table, starting at the
# column in position 4 (the first four columns are left out).
xgb_best_rows = []
for building in range(60):
    xgb_cv_table = pd.DataFrame(xgb_reg_grid[building].cv_results_)
    xgb_ranked = xgb_cv_table.sort_values(by = 'rank_test_score', ascending = True)
    xgb_best_rows.append(xgb_ranked.iloc[:1, 4:])

xgb_best_cv_result = pd.concat(xgb_best_rows, axis = 0).reset_index(drop = True)

xgb_best_cv_result

Unnamed: 0,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,290,{'n_estimators': 290},-0.648834,-0.306463,-0.336074,-0.286409,-0.394445,0.14793,1
1,80,{'n_estimators': 80},-1.510443,-1.250527,-1.468844,-2.406797,-1.659153,0.442798,1
2,50,{'n_estimators': 50},-1.176605,-1.006851,-3.471435,-3.023296,-2.169547,1.091054,1
3,40,{'n_estimators': 40},-14.687145,-11.941101,-9.690267,-11.21309,-11.882901,1.811316,1
4,70,{'n_estimators': 70},-3.945959,-3.858506,-4.02875,-5.755308,-4.397131,0.786451,1
5,80,{'n_estimators': 80},-3.282157,-2.213186,-1.826528,-3.275161,-2.649258,0.64408,1
6,70,{'n_estimators': 70},-2.561478,-2.685792,-3.059706,-5.134698,-3.360419,1.040663,1
7,70,{'n_estimators': 70},-4.006715,-2.594129,-2.453516,-4.323358,-3.34443,0.829699,1
8,60,{'n_estimators': 60},-1.723713,-1.641926,-1.359859,-1.573528,-1.574756,0.134984,1
9,50,{'n_estimators': 50},-2.300585,-2.995502,-4.244297,-5.701773,-3.810539,1.295102,1


In [13]:
# Scores are stored negated, so flip the sign of the average best CV score.
xgb_cv_smape = (-1) * xgb_best_cv_result.mean_test_score.mean()
print("XGBRegressor CV SMAPE : {}".format(xgb_cv_smape))

XGBRegressor CV SMAPE : 3.447886183089466


## Random Forest

In [14]:
# Base random forest regressor; only the number of trees is searched.
rf_reg = RandomForestRegressor(random_state = 20152410)

# Candidate tree counts: 30, 60, ..., 300.
rf_params = {'n_estimators' : list(range(30, 301, 30))}

# Shared settings for every search: 4-fold CV scored with the SMAPE scorer.
rf_search_settings = dict(estimator = rf_reg,
                          param_grid = rf_params,
                          scoring = smape_score,
                          cv = 4,
                          verbose = 2,
                          n_jobs = 3)

# One independent grid search per building.
rf_reg_grid = [GridSearchCV(**rf_search_settings) for _ in range(60)]

In [15]:
# start = time.time()

# print("학습 시작")
# print('\n')

# for index in range(60):
#     print("{}번째 모델 학습중".format(index + 1))
#     rf_reg_grid[index].fit(X_train_array[index], y_train_array[index])

# end = time.time() - start

# print('\n')
# print("학습 완료")
# print("학습 소요 시간 : {}분 {}초".format(round(end // 60), round(end % 60)))

# 학습 소요 시간 : 29분 15초

In [16]:
# The fitted random forest grid searches were saved once with the
# commented-out dump below; the notebook now reloads them instead of refitting.
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source.
# joblib.dump(rf_reg_grid, './model/02_rf_reg_grid_list.pkl')
rf_reg_grid = joblib.load('./model/02_rf_reg_grid_list.pkl')

In [17]:
# For every building keep the top-ranked row of the CV table, starting at the
# column in position 4 (the first four columns are left out).
rf_best_rows = []
for building in range(60):
    rf_cv_table = pd.DataFrame(rf_reg_grid[building].cv_results_)
    rf_ranked = rf_cv_table.sort_values(by = 'rank_test_score', ascending = True)
    rf_best_rows.append(rf_ranked.iloc[:1, 4:])

rf_best_cv_result = pd.concat(rf_best_rows, axis = 0).reset_index(drop = True)

rf_best_cv_result

Unnamed: 0,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,60,{'n_estimators': 60},-0.658851,-0.300757,-0.334933,-0.279668,-0.393552,0.154434,1
1,90,{'n_estimators': 90},-1.531562,-1.243345,-1.393231,-2.132459,-1.575149,0.337521,1
2,90,{'n_estimators': 90},-1.356395,-1.235809,-3.401806,-3.280674,-2.318671,1.024353,1
3,240,{'n_estimators': 240},-15.873337,-11.808458,-10.184596,-9.932099,-11.949622,2.377035,1
4,270,{'n_estimators': 270},-3.898779,-3.875942,-4.062789,-6.051495,-4.472251,0.914621,1
5,210,{'n_estimators': 210},-2.942056,-1.981586,-1.677612,-3.374148,-2.49385,0.690013,1
6,150,{'n_estimators': 150},-2.454173,-2.510228,-2.798308,-4.959093,-3.180451,1.035166,1
7,210,{'n_estimators': 210},-4.118575,-2.549939,-2.334811,-4.573735,-3.394265,0.968388,1
8,300,{'n_estimators': 300},-1.744273,-1.57591,-1.404583,-1.531415,-1.564045,0.121568,1
9,150,{'n_estimators': 150},-2.388138,-2.942945,-4.08555,-5.359082,-3.693929,1.139646,1


In [18]:
# Scores are stored negated, so flip the sign of the average best CV score.
rf_cv_smape = (-1) * rf_best_cv_result.mean_test_score.mean()
print("RandomForestRegressor CV SMAPE : {}".format(rf_cv_smape))

RandomForestRegressor CV SMAPE : 3.4374174056523015


## Lightgbm

In [19]:
# Base LightGBM regressor; only the number of trees is searched.
lgbm_reg = LGBMRegressor(learning_rate = 0.1, random_state = 20152410)

# Candidate tree counts: 10, 20, ..., 400.
lgbm_params = {'n_estimators' : list(range(10, 401, 10))}

# Shared settings for every search: 4-fold CV scored with the SMAPE scorer.
lgbm_search_settings = dict(estimator = lgbm_reg,
                            param_grid = lgbm_params,
                            scoring = smape_score,
                            cv = 4,
                            verbose = 2,
                            n_jobs = 3)

# One independent grid search per building.
lgbm_reg_grid = [GridSearchCV(**lgbm_search_settings) for _ in range(60)]

In [20]:
# start = time.time()

# print("학습 시작")
# print('\n')

# for index in range(60):
#     print("{}번째 모델 학습중".format(index + 1))
#     lgbm_reg_grid[index].fit(X_train_array[index], y_train_array[index])

# end = time.time() - start

# print('\n')
# print("학습 완료")
# print("학습 소요 시간 : {}분 {}초".format(round(end // 60), round(end % 60)))

# 학습 소요 시간 : 40분 6초

In [21]:
# The fitted LightGBM grid searches were saved once with the commented-out
# dump below; the notebook now reloads them instead of refitting.
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source.
# joblib.dump(lgbm_reg_grid, './model/03_lgbm_reg_grid_list.pkl')
lgbm_reg_grid = joblib.load('./model/03_lgbm_reg_grid_list.pkl')

In [22]:
# For every building keep the top-ranked row of the CV table, starting at the
# column in position 4 (the first four columns are left out).
lgbm_best_rows = []
for building in range(60):
    lgbm_cv_table = pd.DataFrame(lgbm_reg_grid[building].cv_results_)
    lgbm_ranked = lgbm_cv_table.sort_values(by = 'rank_test_score', ascending = True)
    lgbm_best_rows.append(lgbm_ranked.iloc[:1, 4:])

lgbm_best_cv_result = pd.concat(lgbm_best_rows, axis = 0).reset_index(drop = True)

lgbm_best_cv_result

Unnamed: 0,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,90,{'n_estimators': 90},-0.657112,-0.299437,-0.325078,-0.276549,-0.389544,0.155432,1
1,70,{'n_estimators': 70},-1.487704,-1.371442,-1.392156,-2.165699,-1.60425,0.327105,1
2,10,{'n_estimators': 10},-1.530382,-1.242177,-3.240173,-2.220339,-2.058268,0.769389,1
3,30,{'n_estimators': 30},-16.295616,-11.177342,-10.009829,-9.82585,-11.827159,2.631411,1
4,30,{'n_estimators': 30},-3.864359,-3.919177,-3.840225,-5.532845,-4.289151,0.718616,1
5,70,{'n_estimators': 70},-3.249418,-2.35024,-1.987659,-3.670196,-2.814378,0.674643,1
6,60,{'n_estimators': 60},-2.987861,-3.056344,-3.055234,-5.085289,-3.546182,0.889037,1
7,60,{'n_estimators': 60},-4.272655,-2.738498,-2.382423,-4.457542,-3.46278,0.913401,1
8,20,{'n_estimators': 20},-1.784483,-1.482757,-1.387683,-1.363528,-1.504613,0.167617,1
9,50,{'n_estimators': 50},-3.078742,-3.89129,-4.59205,-5.863481,-4.356391,1.02171,1


In [23]:
# Scores are stored negated, so flip the sign of the average best CV score.
lgbm_cv_smape = (-1) * lgbm_best_cv_result.mean_test_score.mean()
print("LGBMRegressor CV SMAPE : {}".format(lgbm_cv_smape))

LGBMRegressor CV SMAPE : 3.453547802512631


## test data prediction

weighted ensemble은 건물별로 각 모델의 CV SMAPE의 역수를 가중치로 사용해 세 모델 예측값의 가중평균을 계산했습니다.

SMAPE의 역수를 가중치로 사용한 이유는 SMAPE 값이 높을수록 좋지 않은 모델이므로, 오차가 작은 모델에 더 큰 가중치를 주기 위해서입니다.

In [24]:
# Predict each building's test rows with that building's tuned model.
# The grid searches were fitted on the NumPy arrays in X_train_array, so the
# matching X_test_array is used here rather than the DataFrames in
# X_test_list; the column order is the same, and the input type now matches
# what the models were trained on.
xgb_y_test_pred = [xgb_reg_grid[index].predict(X_test_array[index]) for index in range(60)]
rf_y_test_pred = [rf_reg_grid[index].predict(X_test_array[index]) for index in range(60)]
lgbm_y_test_pred = [lgbm_reg_grid[index].predict(X_test_array[index]) for index in range(60)]

# Stack the per-building predictions in building order.
xgb_y_test_pred_all = np.concatenate(xgb_y_test_pred, axis = 0)
rf_y_test_pred_all = np.concatenate(rf_y_test_pred, axis = 0)
lgbm_y_test_pred_all = np.concatenate(lgbm_y_test_pred, axis = 0)

In [25]:
# ensemble

# Per-building weights: the inverse of each model's (sign-flipped) best mean
# CV score, one column per model in the order xgb, rf, lgbm.
inverse_score_columns = [
    1 / ((-1) * np.asarray(best_cv_result.mean_test_score).reshape(-1, 1))
    for best_cv_result in (xgb_best_cv_result, rf_best_cv_result, lgbm_best_cv_result)
]
ensemble_weight = np.concatenate(inverse_score_columns, axis = 1)

# Weighted average of the three models' predictions for every building.
weighted_ensemble_y_test_pred = []
for index in range(60):
    weighted_sum = (ensemble_weight[index, 0] * xgb_y_test_pred[index] +
                    ensemble_weight[index, 1] * rf_y_test_pred[index] +
                    ensemble_weight[index, 2] * lgbm_y_test_pred[index])
    weighted_ensemble_y_test_pred.append(weighted_sum / np.sum(ensemble_weight[index, :]))

# Unweighted mean of the three models, and the stacked weighted ensemble.
simple_ensemble_y_test_pred_all = (xgb_y_test_pred_all + rf_y_test_pred_all + lgbm_y_test_pred_all) / 3
weighted_ensemble_y_test_pred_all = np.concatenate(weighted_ensemble_y_test_pred, axis = 0)

In [26]:
# Five independent copies of the submission template, one per prediction set.
(sample_submission1,
 sample_submission2,
 sample_submission3,
 sample_submission4,
 sample_submission5) = [sample_submission.copy() for _ in range(5)]

In [27]:
# Fill the 'answer' column of each copy with its prediction vector.
submission_answers = [
    (sample_submission1, xgb_y_test_pred_all),
    (sample_submission2, rf_y_test_pred_all),
    (sample_submission3, lgbm_y_test_pred_all),
    (sample_submission4, simple_ensemble_y_test_pred_all),
    (sample_submission5, weighted_ensemble_y_test_pred_all),
]
for submission, answer in submission_answers:
    submission['answer'] = answer

In [28]:
# sample_submission1.to_csv('./submission/01_xgb_reg_grid_list.csv',
#                          index = False)

# sample_submission2.to_csv('./submission/02_rf_reg_grid_list.csv',
#                          index = False)

# sample_submission3.to_csv('./submission/03_lgbm_reg_grid_list.csv',
#                          index = False)

# sample_submission4.to_csv('./submission/04_simple_ensemble.csv',
#                          index = False)

# sample_submission5.to_csv('./submission/05_weighted_ensemble.csv',
#                          index = False)

## dacon website test score

sample_submission1 (xgboost) : 7.9461635351

sample_submission2 (random forest) : 7.6233149001

sample_submission3 (lgbm): 7.1884082184

sample_submission4 (simple ensemble): 7.2497439626

sample_submission5 (weighted ensemble): 7.2533592019