# 전처리 ver.1을 이용해 모델링

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Folder that holds the competition CSV files.
directory = './energy'

# Both files are read with the same EUC-KR encoding.
csv_options = {'encoding': 'euc-kr'}
train_energy = pd.read_csv(directory + '/train.csv', **csv_options)
test_energy = pd.read_csv(directory + '/test.csv', **csv_options)

In [3]:
def preprocessing_dataset(train, test):
    """Preprocess the raw train/test frames into model-ready features.

    Steps, applied to copies so the inputs are left untouched:

    1. Rename the columns positionally and cast them to their working dtypes.
    2. Fill the test set's ``operation_non_elec`` and ``solar`` columns from
       the per-building values found in the train set.
    3. Linearly interpolate missing weather values in the test set, one
       building at a time.
    4. Add the day of week, the discomfort index, the wind-chill temperature
       and a 4-row trailing mean of ``sunshine`` (computed per building).
    5. One-hot encode the building number and the day of week, keeping every
       category (no reference level is dropped).

    Rows are assumed to be in chronological order within each building; the
    interpolation and the rolling mean both rely on row order.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw training data with 10 columns, in the order: building number,
        timestamp (``'%Y-%m-%d %H'``), usage, temperature, wind, humidity,
        precipitation, sunshine, operation_non_elec, solar.
    test : pandas.DataFrame
        Raw test data with the same columns except usage (9 columns).

    Returns
    -------
    X_train : pandas.DataFrame
        Training features, indexed like ``train``.
    X_test : pandas.DataFrame
        Test features, with a fresh 0..n-1 index in the row order of ``test``.
    y_train : pandas.Series
        The ``usage`` target, indexed like ``train``.

    Raises
    ------
    ValueError
        If a frame does not have the expected number of columns, or if the
        test set contains a building number that is absent from the train set.
    """

    def calculate_discomfort_index(temp, humidity):
        """Return the discomfort index; humidity is divided by 100 before use."""
        discomfort_index = 9 * temp / 5 - 0.55 * (1 - humidity / 100) * (9 * temp / 5 - 26) + 32
        return discomfort_index

    def calculate_windchill_temp(temp, wind):
        """Return the wind-chill temperature from temperature and wind speed."""
        windchill_temp = 13.12 + 0.6215 * temp - 11.37 * (wind ** 0.16) + 0.3965 * (wind ** 0.16) * temp
        return windchill_temp

    def transform_per_building(dataset, column, func):
        """Apply ``func`` to ``column`` separately for every building number."""
        return dataset.groupby('num', observed=True)[column].transform(func)

    def one_hot(values, categories, column_names, index):
        """Return a float 0/1 indicator frame with one column per category.

        The frame carries ``index`` so that a later column-wise concat lines
        up with the source rows whatever their index labels are.
        """
        values = np.asarray(values)
        matches = values[:, None] == np.asarray(categories)[None, :]
        unknown = ~matches.any(axis=1)
        if unknown.any():
            raise ValueError(
                'Found unknown categories {} during one-hot encoding'.format(
                    pd.unique(values[unknown]).tolist()
                )
            )
        return pd.DataFrame(matches.astype('float64'), columns=column_names, index=index)

    train_columns_name = ['num', 'date_time', 'usage', 'temp', 'wind', 'humidity',
                          'precipitation', 'sunshine', 'operation_non_elec', 'solar']
    test_columns_name = [name for name in train_columns_name if name != 'usage']
    weather_columns = ['temp', 'wind', 'humidity', 'precipitation', 'sunshine']
    categorical_columns = ['num', 'operation_non_elec', 'solar']
    dayofweek_label = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Work on copies so the caller's frames are not modified.
    train_dataset = train.copy()
    test_dataset = test.copy()

    # 1. Rename the columns (raises ValueError on a column-count mismatch).
    train_dataset.columns = train_columns_name
    test_dataset.columns = test_columns_name

    # 2. Cast dtypes: categorical identifiers/flags, float measurements,
    #    and the timestamp parsed from its 'YYYY-MM-DD HH' text form.
    def working_dtypes(columns):
        dtypes = {name: 'float64' for name in columns}
        dtypes.update({name: 'category' for name in categorical_columns})
        dtypes['date_time'] = 'object'
        return dtypes

    train_dataset = train_dataset.astype(working_dtypes(train_columns_name))
    test_dataset = test_dataset.astype(working_dtypes(test_columns_name))

    train_dataset['date_time'] = pd.to_datetime(train_dataset['date_time'], format='%Y-%m-%d %H')
    test_dataset['date_time'] = pd.to_datetime(test_dataset['date_time'], format='%Y-%m-%d %H')

    # 3a. Replace the test set's operation_non_elec / solar columns with the
    #     first value recorded for each building in the train set.
    building_attributes = (
        train_dataset.loc[:, ['num', 'operation_non_elec', 'solar']]
        .drop_duplicates(subset='num')
        .reset_index(drop=True)
    )
    test_dataset = test_dataset.drop(columns=['operation_non_elec', 'solar'])
    test_dataset = pd.merge(test_dataset, building_attributes, how='left', on='num')

    # 3b. Linear interpolation of missing weather values, building by building
    #     so that values never leak across building boundaries.
    for column in weather_columns:
        test_dataset[column] = transform_per_building(
            test_dataset, column, lambda series: series.interpolate(method='linear')
        )

    # 4. Day of week (Monday=0 ... Sunday=6).
    train_dataset['dayofweek'] = train_dataset['date_time'].dt.dayofweek
    test_dataset['dayofweek'] = test_dataset['date_time'].dt.dayofweek

    # 5. Derived variables: discomfort index, wind chill, trailing sunshine mean.
    for dataset in (train_dataset, test_dataset):
        dataset['discomfort_index'] = calculate_discomfort_index(dataset.temp, dataset.humidity)
        dataset['windchill_temp'] = calculate_windchill_temp(dataset.temp, dataset.wind)
        dataset['sunshine_rolling'] = transform_per_building(
            dataset,
            'sunshine',
            lambda series: series.rolling(window=4, min_periods=1, center=False).mean(),
        )

    # 6. One-hot encode the building number. The categories come from the
    #    train set and none is dropped, so every building gets its own column.
    building_categories = list(train_dataset['num'].cat.categories)
    building_labels = ['num' + str(category) for category in building_categories]
    num_train_matrix = one_hot(train_dataset['num'], building_categories, building_labels, train_dataset.index)
    num_test_matrix = one_hot(test_dataset['num'], building_categories, building_labels, test_dataset.index)

    # 7. One-hot encode the day of week over the fixed 0..6 range.
    dayofweek_categories = list(range(7))
    dayofweek_train_matrix = one_hot(train_dataset['dayofweek'], dayofweek_categories, dayofweek_label, train_dataset.index)
    dayofweek_test_matrix = one_hot(test_dataset['dayofweek'], dayofweek_categories, dayofweek_label, test_dataset.index)

    # Join the indicator columns; all frames share the same index, so the
    # concat is a plain side-by-side join.
    train_dataset = pd.concat([train_dataset, num_train_matrix, dayofweek_train_matrix], axis=1)
    test_dataset = pd.concat([test_dataset, num_test_matrix, dayofweek_test_matrix], axis=1)

    # Split into features and target; the encoded source columns are dropped.
    X_train = train_dataset.drop(columns=['date_time', 'num', 'dayofweek', 'usage'])
    X_test = test_dataset.drop(columns=['date_time', 'num', 'dayofweek'])

    y_train = train_dataset.usage

    return X_train, X_test, y_train

In [4]:
# Run the version-1 preprocessing on the raw frames to obtain the train/test
# feature matrices and the training target.
X_train, X_test, y_train = preprocessing_dataset(train = train_energy, test = test_energy)

In [5]:
# Report the dimensions of the preprocessed features and target.
print(f'X_train shape : {X_train.shape}')
print(f'X_test shape : {X_test.shape}')
print(f'y_train shape : {y_train.shape}')

X_train shape : (122400, 77)
X_test shape : (10080, 77)
y_train shape : (122400,)


# Modeling

In [6]:
import time

# metrics, model selection
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

# model
import xgboost
from sklearn.linear_model import Lasso

# save
import joblib

In [7]:
# 평가 지표

def smape(y_true, y_pred):
    """Return the SMAPE evaluation metric, in percent.

    Computed as ``100 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))``.
    The ratio is not doubled, so each term lies between 0 and 1 and the
    result between 0 and 100.

    A term whose truth and prediction are both zero is a perfect prediction
    and contributes 0, rather than the NaN that a literal 0/0 would produce.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The mean of the per-element ratios, multiplied by 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    absolute_error = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0 they were initialised with.
    ratio = np.divide(
        absolute_error,
        scale,
        out=np.zeros_like(absolute_error),
        where=scale != 0,
    )
    return 100 * np.mean(ratio)

# Wrap smape as a scorer for GridSearchCV. It is declared as a loss
# (greater_is_better = False), which is why the CV score tables below hold
# negative values and are multiplied by -1 before being reported.
smape_score = make_scorer(smape, greater_is_better = False)

## Xgboost

In [8]:
# Convert the frames to plain arrays once; the Lasso section reuses y_train_array.
X_train_array, y_train_array, X_test_array = (
    np.asarray(frame) for frame in (X_train, y_train, X_test)
)

# Only the number of boosting rounds is searched: 100, 200, ..., 1000.
xgb_params = {'n_estimators': [100 * step for step in range(1, 11)]}

xgb_reg = xgboost.XGBRegressor(learning_rate=0.3, random_state=20152410)

# 4-fold grid search scored with the SMAPE scorer, run on 3 workers.
xgb_reg_grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=xgb_params,
    scoring=smape_score,
    cv=4,
    n_jobs=3,
    verbose=2,
)

In [9]:
# start = time.time()

# print("학습 시작")
# print('\n')

# xgb_reg_grid.fit(X_train_array, y_train_array)

# end = time.time() - start

# print('\n')
# print("학습 완료")
# print("학습 소요 시간 : {}분 {}초".format(round(end // 60), round(end % 60)))
# 학습 소요 시간 : 65분 34초

In [10]:
# Reload the fitted grid search from disk instead of refitting; the dump call
# that produced the file is kept commented out for reference.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts that come from a trusted source.
# joblib.dump(xgb_reg_grid, './model/xgb_reg_grid.pkl')
xgb_reg_grid = joblib.load('./model/xgb_reg_grid.pkl')

In [11]:
# Tabulate the cross-validation results, best-ranked candidate first.
xgb_cv_result = pd.DataFrame(xgb_reg_grid.cv_results_)
xgb_cv_result = xgb_cv_result.sort_values(by='rank_test_score', ascending=True)
xgb_cv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
9,443.342993,68.017164,0.546011,0.036314,1000,{'n_estimators': 1000},-29.018253,-23.070317,-24.915218,-23.007394,-25.002795,2.441704,1
8,445.186995,2.223837,0.558251,0.028084,900,{'n_estimators': 900},-29.029382,-23.077181,-24.919604,-23.035267,-25.015359,2.439203,2
7,393.778245,0.891623,0.485508,0.053207,800,{'n_estimators': 800},-29.048295,-23.081616,-24.92076,-23.05957,-25.02756,2.441178,3
6,344.435498,1.508994,0.389005,0.034665,700,{'n_estimators': 700},-29.074621,-23.085082,-24.937589,-23.080107,-25.04435,2.447012,4
5,295.057245,1.746091,0.384257,0.011648,600,{'n_estimators': 600},-29.09858,-23.11234,-24.946037,-23.120138,-25.069274,2.443318,5
4,247.169747,1.498941,0.350254,0.029974,500,{'n_estimators': 500},-29.141056,-23.110129,-24.960457,-23.166628,-25.094568,2.451887,6
3,197.730998,1.118531,0.304001,0.039337,400,{'n_estimators': 400},-29.200519,-23.131318,-24.975214,-23.219882,-25.131733,2.461522,7
2,150.194497,1.513229,0.205759,0.013848,300,{'n_estimators': 300},-29.266646,-23.168835,-25.019517,-23.315441,-25.19261,2.46207,8
1,99.122747,0.550468,0.185756,0.014937,200,{'n_estimators': 200},-29.360005,-23.212176,-25.056078,-23.414247,-25.260626,2.472448,9
0,52.979995,1.941786,0.112755,0.022671,100,{'n_estimators': 100},-29.462815,-23.227658,-25.077624,-23.563388,-25.332871,2.484179,10


In [12]:
# Scores are stored negated (loss scorer), so flip the sign and take the smallest SMAPE.
xgb_cv_smape = np.min(-xgb_cv_result['mean_test_score'])
print(f"XGBRegressor CV SMAPE : {xgb_cv_smape}")

XGBRegressor CV SMAPE : 25.002795374262252


## Lasso

In [13]:
# The linear model is fitted without one indicator column from each one-hot
# group (the last building and the last weekday) - presumably to avoid
# perfectly collinear dummies.
reference_dummies = ['num60', 'Sunday']
X_train_linear, X_test_linear = (
    np.asarray(frame.drop(columns=reference_dummies)) for frame in (X_train, X_test)
)

# 100 log-spaced regularisation strengths between 1e-2 and 1e1.
lasso_params = {'alpha': list(np.logspace(-2, 1, num=100))}

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this line needs an older release - confirm the pinned version.
lasso_reg = Lasso(normalize=True)

# 4-fold grid search scored with the SMAPE scorer, run on 3 workers.
lasso_reg_grid = GridSearchCV(
    estimator=lasso_reg,
    param_grid=lasso_params,
    scoring=smape_score,
    cv=4,
    n_jobs=3,
    verbose=2,
)

In [14]:
# start = time.time()

# print("학습 시작")
# print('\n')

# lasso_reg_grid.fit(X_train_linear, y_train_array)

# end = time.time() - start

# print('\n')
# print("학습 완료")
# print("학습 소요 시간 : {}분 {}초".format(round(end // 60), round(end % 60)))
# 학습 소요 시간 : 9분 53초

In [15]:
# Reload the fitted grid search from disk instead of refitting; the dump call
# that produced the file is kept commented out for reference.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts that come from a trusted source.
# joblib.dump(lasso_reg_grid, './model/lasso_reg_grid.pkl')
lasso_reg_grid = joblib.load('./model/lasso_reg_grid.pkl')

In [16]:
# Tabulate the cross-validation results, best-ranked candidate first.
lasso_cv_result = pd.DataFrame(lasso_reg_grid.cv_results_)
lasso_cv_result = lasso_cv_result.sort_values(by='rank_test_score', ascending=True)
lasso_cv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
27,4.508749,1.503798,0.020257,0.005968,0.065793,{'alpha': 0.06579332246575682},-30.116349,-22.424827,-26.119324,-24.975423,-25.908981,2.772952,1
26,5.280498,1.399438,0.016760,0.002488,0.061359,{'alpha': 0.06135907273413173},-30.117314,-22.354593,-26.273502,-24.922701,-25.917028,2.803975,2
28,4.734992,1.278063,0.017501,0.003845,0.070548,{'alpha': 0.07054802310718646},-30.119406,-22.496850,-26.028665,-25.031619,-25.919135,2.745633,3
25,5.247995,1.105707,0.017003,0.004298,0.057224,{'alpha': 0.05722367659350217},-30.123929,-22.295800,-26.438974,-24.874698,-25.933350,2.835891,4
29,4.790754,1.474213,0.016007,0.004122,0.075646,{'alpha': 0.07564633275546291},-30.126485,-22.569435,-25.964075,-25.092955,-25.938237,2.720543,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,0.745495,0.014112,0.012756,0.001301,4.037017,{'alpha': 4.037017258596554},-32.500751,-29.721839,-25.233762,-30.458503,-29.478714,2.653838,85
85,0.758241,0.013792,0.012256,0.000434,3.764936,{'alpha': 3.7649358067924674},-32.500751,-29.721839,-25.233762,-30.458503,-29.478714,2.653838,85
84,0.714245,0.008872,0.012757,0.001483,3.511192,{'alpha': 3.511191734215131},-32.500751,-29.721839,-25.233762,-30.458503,-29.478714,2.653838,85
90,0.742744,0.034524,0.013010,0.002545,5.336699,{'alpha': 5.336699231206313},-32.500751,-29.721839,-25.233762,-30.458503,-29.478714,2.653838,85


In [17]:
# Scores are stored negated (loss scorer), so flip the sign and take the smallest SMAPE.
lasso_cv_smape = np.min(-lasso_cv_result['mean_test_score'])
print(f"Lasso CV SMAPE : {lasso_cv_smape}")

Lasso CV SMAPE : 25.908980512288654


## test data prediction

In [18]:
# Submission template; each model fills its own copy of the 'answer' column.
sample_submission = pd.read_csv(directory + '/sample_submission.csv', encoding='euc-kr')

# Submission 6: XGBoost, using the best estimator found by its grid search.
xgb_y_test_pred = xgb_reg_grid.best_estimator_.predict(X_test_array)
sample_submission6 = sample_submission.copy()
sample_submission6.loc[:, 'answer'] = xgb_y_test_pred

# Submission 7: Lasso, predicted on the reduced (reference-dummy-free) features.
lasso_y_test_pred = lasso_reg_grid.best_estimator_.predict(X_test_linear)
sample_submission7 = sample_submission.copy()
sample_submission7.loc[:, 'answer'] = lasso_y_test_pred

In [19]:
# sample_submission6.to_csv('./submission/06_xgb_reg_grid.csv',
#                           index = False)

# sample_submission7.to_csv('./submission/07_lasso_reg_grid.csv',
#                           index = False)

## dacon website test score

sample_submission6 (xgb with version1 preprocessing) : 32.113861307

sample_submission7 (lasso with version1 preprocessing) : 33.1894393113