# 모델링 - part 1
**사용모델**  
- XGBoost  
- LightGBM  
- RandomForest  
- GradientBoost  
- ElasticNet (Linear Regression)  
- DNN (Deep Neural Network)  

**GridSearchCV를 통해 최적 하이퍼파라미터 설정**

In [1]:
# Stretch the Jupyter notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:100% !important;}</style>"))

# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder
from tensorflow import keras

# 0. 사용 함수 정의

In [2]:
def rmse(ground_truth, pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        ground_truth: True target values.
        pred: Predicted values, passed unchanged to
            ``mean_squared_error`` together with ``ground_truth``.

    Returns:
        The square root of ``mean_squared_error(ground_truth, pred)``.
    """
    return np.sqrt(mean_squared_error(ground_truth, pred))

def time_feature(df):
    """Add calendar features derived from the ``time`` column.

    The ``time`` column is parsed with ``pd.to_datetime`` and four new
    columns (``month``, ``day``, ``hour``, ``minute``) are written next to it.

    Args:
        df: DataFrame with a ``time`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same ``df`` object; it is modified in place, not copied.
    """
    df['time'] = pd.to_datetime(df['time'])

    # The vectorised ``.dt`` accessor extracts each component in one pass
    # instead of calling a Python lambda once per row.
    timestamps = df['time'].dt
    df['month'] = timestamps.month
    df['day'] = timestamps.day
    df['hour'] = timestamps.hour
    df['minute'] = timestamps.minute

    return df

def onehot(df_train, df_test, cols):
    """One-hot encode ``cols`` and prepend the ``volt`` column.

    The listed columns of both frames are converted to strings in place.
    The encoder is fitted on the training frame only and then applied to the
    test frame.

    Args:
        df_train: Training DataFrame; must contain ``volt`` and every column
            in ``cols``.
        df_test: Test DataFrame with the same columns.
        cols: Names of the categorical columns to encode.

    Returns:
        A ``(train_array, test_array)`` tuple of dense arrays whose first
        column is ``volt`` followed by the one-hot columns.
    """
    # Cast every categorical column to str on both frames (mutates the inputs).
    for col in cols:
        for frame in (df_train, df_test):
            frame[col] = frame[col].apply(str)

    # NOTE(review): the encoder uses default settings; presumably a category
    # that appears only in df_test is rejected — confirm against sklearn docs.
    encoder = OneHotEncoder()
    train_encoded = encoder.fit_transform(df_train[cols]).toarray()
    test_encoded = encoder.transform(df_test[cols]).toarray()

    # NOTE(review): 'volt' is hard-coded as the only numeric feature kept.
    train_volt = df_train['volt'].values.reshape(-1, 1)
    test_volt = df_test['volt'].values.reshape(-1, 1)

    return np.hstack([train_volt, train_encoded]), np.hstack([test_volt, test_encoded])

def data2seq(df, col, length=4):
    """Build a trailing-window matrix from one column.

    Row ``i`` of the result holds the values at index labels
    ``i-length+1 .. i`` (oldest first, current value in the last column).
    Positions that would fall before the first row are filled with the value
    at label ``0``.

    Args:
        df: DataFrame whose index contains the integer labels
            ``0 .. len(df)-1`` (e.g. after ``reset_index(drop=True)``);
            values are looked up by label, as in the row-by-row version.
        col: Name of the column to window.
        length: Window size, i.e. number of output columns.

    Returns:
        A float array of shape ``(len(df), length)``. An empty frame yields
        an empty ``(0, length)`` array.
    """
    n_rows = df.shape[0]
    res = np.zeros((n_rows, length))
    if n_rows == 0:
        # Nothing to window; avoids looking up label 0 on an empty frame.
        return res

    # One label-based lookup for the whole column instead of one per cell.
    values = df.loc[np.arange(n_rows), col].to_numpy(dtype=float)
    init = values[0]

    for lag in range(length):
        target = length - 1 - lag
        # The first `lag` rows have no value that far back: pad with init.
        res[:lag, target] = init
        if lag < n_rows:
            res[lag:, target] = values[:n_rows - lag]

    return res

# 1. 데이터 전처리

In [3]:
# Extract the columns needed for modelling.

data = pd.read_excel('전체 데이터 통합 파일(0624).xlsx', index_col=0)
data.reset_index(inplace=True)

# Rows that have a '투입 후 운전용량' value use it as the capacity; all other
# rows fall back to '투입 전 운전용량'.
d1 = data.loc[data['투입 후 운전용량'].notnull(), ['일시', '모선전압', '투입 후 운전용량']]
d2 = data.loc[data['투입 후 운전용량'].isnull(), ['일시', '모선전압', '투입 전 운전용량']]

# Give both subsets the same short column names.
cols = ['time', 'volt', 'cap']
d1.columns = cols
d2.columns = cols

# Stack the two subsets under a fresh 0..n-1 index.
# NOTE(review): every d1 row is placed before every d2 row, so d12 is not
# necessarily in chronological order — confirm before relying on row order.
d12 = pd.concat([d1, d2], ignore_index=True)
d12

Unnamed: 0,time,volt,cap
0,2019-01-02 08:51:00,351.1,724.1
1,2019-01-02 18:07:00,356.5,845.8
2,2019-01-05 09:21:00,348.9,716.6
3,2019-01-05 09:39:00,349.3,615.0
4,2019-01-05 18:43:00,355.7,829.7
...,...,...,...
451323,2021-01-18 10:00:00,348.2,507.0
451324,2021-01-18 10:02:00,348.0,506.9
451325,2021-01-18 10:03:00,348.0,506.9
451326,2021-01-18 10:04:00,348.0,506.6


In [5]:
# Remove rows that contain outliers (volt below 340).

ind = d12.index[d12['volt'] < 340]

d12.drop(index=ind, inplace=True)
# Re-number the remaining rows 0..n-1.
d12.reset_index(drop=True, inplace=True)

# 2. 모델링 - 현재 시점의 전압
## (0) 데이터 준비

In [None]:
# Single feature: the current voltage as an (n, 1) column.
X = d12['volt'].to_numpy().reshape(-1, 1)

# Min-max normalisation.
# NOTE(review): min/max are taken over all rows before splitting, so the
# test rows influence the scaling — confirm this is acceptable.
X = (X - X.min()) / (X.max() - X.min())

y = d12['cap'].to_numpy()

# Hold out 25% as the test split, then 25% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

## (1) XGBoost

In [None]:
# XGBoost regressor; early stopping (5 rounds) is evaluated on the
# validation split.
# NOTE(review): passing early_stopping_rounds to fit() depends on the
# installed xgboost version — confirm before upgrading the library.
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = xgb.predict(X_test)
rmse(y_test, pred)



183.13472084309097

## (2) LightGBM

In [None]:
# LightGBM regressor; early stopping (5 rounds) is evaluated on the
# validation split.
# NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on
# the installed lightgbm version — confirm before upgrading the library.
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
lgbm.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = lgbm.predict(X_test)
rmse(y_test, pred)

183.1191453790413

## (3) RandomForest

In [None]:
# Random forest of 1000 trees, each limited to depth 20.
rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=20,
    random_state=42,
)
rf.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = rf.predict(X_test)
rmse(y_test, pred)

183.1347356996297

## (4) GradientBoosting

In [None]:
# Gradient boosting with shallow (depth-2) trees; each stage is fitted on an
# 80% subsample.
gb = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=2,
    random_state=42,
)
gb.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = gb.predict(X_test)
rmse(y_test, pred)

183.15261537862463

## (5) ElasticNet

In [None]:
# Linear baseline: ElasticNet with an equal L1/L2 mix (l1_ratio = 0.5).
en = ElasticNet(
    alpha=0.05,
    l1_ratio=0.5,
    random_state=42,
)
en.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = en.predict(X_test)
rmse(y_test, pred)

184.12182015320366

## (6) DNN

In [None]:
# Fully connected network: 256-128-64-10 ReLU hidden units and one linear
# output, fed the single voltage feature.
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(1,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1),
])

model.compile(loss='mse', optimizer='adam')

# Early stopping with a patience of 10 epochs; the best weights are restored.
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
]

model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=2000,
    batch_size=14,
    callbacks=callbacks,
    verbose=0,
)

# RMSE on the held-out test split.
pred = model.predict(X_test)
rmse(y_test, pred)

184.9000883751361

# 3. 모델링 - 현재 시점의 전압 + 시간
## (0) 데이터 준비

In [None]:
# Features: current voltage plus month/day/hour/minute of the timestamp.
# time_feature() adds the calendar columns to d12 itself.
X = time_feature(d12)[['volt', 'month', 'day', 'hour', 'minute']].to_numpy().reshape(-1, 5)

# Column-wise min-max normalisation.
# NOTE(review): min/max are taken over all rows before splitting, so the
# test rows influence the scaling — confirm this is acceptable.
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

y = d12['cap'].to_numpy()

# Hold out 25% as the test split, then 25% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

## (1) XGBoost

In [None]:
# XGBoost on voltage + time features; early stopping (5 rounds) is evaluated
# on the validation split.
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = xgb.predict(X_test)
rmse(y_test, pred)



79.07464433065839

## (2) LightGBM

In [None]:
# LightGBM on voltage + time features; early stopping (5 rounds) is evaluated
# on the validation split.
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
lgbm.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = lgbm.predict(X_test)
rmse(y_test, pred)

124.93373321851269

## (3) RandomForest

In [None]:
# Random forest (1000 trees, depth <= 20) on voltage + time features.
rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=20,
    random_state=42,
)
rf.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = rf.predict(X_test)
rmse(y_test, pred)

78.54837146104732

## (4) GradientBoost

In [None]:
# Gradient boosting (depth-2 trees, 80% subsample) on voltage + time features.
gb = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=2,
    random_state=42,
)
gb.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = gb.predict(X_test)
rmse(y_test, pred)

148.6826154371397

## (5) ElasticNet

In [None]:
# ElasticNet linear baseline on voltage + time features.
en = ElasticNet(
    alpha=0.05,
    l1_ratio=0.5,
    random_state=42,
)
en.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = en.predict(X_test)
rmse(y_test, pred)

180.35480624460658

## (6) DNN

In [None]:
# Fully connected network over the 5 voltage + time features:
# 256-128-64-10 ReLU hidden units and one linear output.
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(5,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1),
])

model.compile(loss='mse', optimizer='adam')

# Early stopping with a patience of 10 epochs; the best weights are restored.
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
]

model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=2000,
    batch_size=14,
    callbacks=callbacks,
    verbose=0,
)

# RMSE on the held-out test split.
pred = model.predict(X_test)
rmse(y_test, pred)

138.97219639241055

# 4. 모델링 - 현재 시점의 전압 + 과거 시점의 전압
## (0) 데이터 준비

In [20]:
# Features: a window of 4 voltage values per row (current value in the last
# column, earlier rows before it).
X = data2seq(d12, 'volt', 4)

# Min-max normalisation with one global min/max for all window columns.
# NOTE(review): min/max are taken over all rows before splitting, so the
# test rows influence the scaling — confirm this is acceptable.
X = (X - X.min()) / (X.max() - X.min())

y = d12['cap'].to_numpy()

# Hold out 25% as the test split, then 25% of the remainder for validation.
# NOTE(review): the split is random while features come from neighbouring
# rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

## (1) XGBoost

In [10]:
# XGBoost on the 4-step voltage window; early stopping (5 rounds) is
# evaluated on the validation split.
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = xgb.predict(X_test)
rmse(y_test, pred)



182.22298063754238

## (2) LightGBM

In [11]:
# LightGBM on the 4-step voltage window; early stopping (5 rounds) is
# evaluated on the validation split.
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
lgbm.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = lgbm.predict(X_test)
rmse(y_test, pred)

179.03714528347442

## (3) RandomForest

In [12]:
# Random forest (1000 trees, depth <= 20) on the 4-step voltage window.
rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=20,
    random_state=42,
)
rf.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = rf.predict(X_test)
rmse(y_test, pred)

182.14688835293228

## (4) GradientBoost

In [13]:
# Gradient boosting (depth-2 trees, 80% subsample) on the 4-step voltage
# window.
gb = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=2,
    random_state=42,
)
gb.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = gb.predict(X_test)
rmse(y_test, pred)

182.07710465762452

## (5) ElasticNet

In [14]:
# ElasticNet linear baseline on the 4-step voltage window.
en = ElasticNet(
    alpha=0.05,
    l1_ratio=0.5,
    random_state=42,
)
en.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = en.predict(X_test)
rmse(y_test, pred)

184.0955221000567

## (6) DNN

In [16]:
# Fully connected network over the 4-step voltage window:
# 256-128-64-10 ReLU hidden units and one linear output.
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(4,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1),
])

model.compile(loss='mse', optimizer='adam')

# Early stopping with a patience of 10 epochs; the best weights are restored.
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
]

model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=2000,
    batch_size=14,
    callbacks=callbacks,
    verbose=0,
)

# RMSE on the held-out test split.
pred = model.predict(X_test)
rmse(y_test, pred)

183.08677988949583

# 5. 모델링 - 현재 시점의 전압 + 과거 시점의 전압 + 과거 시점의 투입용량
## (0) 데이터 준비

In [25]:
# Voltage window of 4 values per row (current value in the last column).
seq_v = data2seq(d12, 'volt', 4)

# Capacity window of 5 values with the last column sliced off. That column
# is the current capacity, i.e. the prediction target, so 4 earlier values
# remain as features.
seq_c = data2seq(d12, 'cap', 5)[:, :-1]

# Min-max normalisation, done separately for the voltage and capacity blocks.
# NOTE(review): min/max are taken over all rows before splitting, so the
# test rows influence the scaling — confirm this is acceptable.
seq_v = (seq_v - seq_v.min()) / (seq_v.max() - seq_v.min())
seq_c = (seq_c - seq_c.min()) / (seq_c.max() - seq_c.min())

# 8 features per row: 4 voltage values followed by 4 capacity values.
X = np.hstack([seq_v, seq_c])
y = d12['cap'].to_numpy()

# Hold out 25% as the test split, then 25% of the remainder for validation.
# NOTE(review): the split is random while features come from neighbouring
# rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

## (1) XGBoost

In [28]:
# XGBoost on the voltage + past-capacity windows; early stopping (5 rounds)
# is evaluated on the validation split.
xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = xgb.predict(X_test)
rmse(y_test, pred)



14.319706559942455

## (2) LightGBM

In [29]:
# LightGBM on the voltage + past-capacity windows; early stopping (5 rounds)
# is evaluated on the validation split.
lgbm = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=1,
    max_depth=20,
    random_state=42,
)
lgbm.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# RMSE on the held-out test split.
pred = lgbm.predict(X_test)
rmse(y_test, pred)

14.010354640932356

## (3) RandomForest

In [30]:
# Random forest (1000 trees, depth <= 20) on the voltage + past-capacity
# windows.
rf = RandomForestRegressor(
    n_estimators=1000,
    max_depth=20,
    random_state=42,
)
rf.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = rf.predict(X_test)
rmse(y_test, pred)

14.13713552229093

## (4) GradientBoost

In [31]:
# Gradient boosting (depth-2 trees, 80% subsample) on the voltage +
# past-capacity windows.
gb = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.8,
    max_depth=2,
    random_state=42,
)
gb.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = gb.predict(X_test)
rmse(y_test, pred)

19.717901725571153

## (5) ElasticNet

In [32]:
# ElasticNet linear baseline on the voltage + past-capacity windows.
en = ElasticNet(
    alpha=0.05,
    l1_ratio=0.5,
    random_state=42,
)
en.fit(X_train, y_train)

# RMSE on the held-out test split.
pred = en.predict(X_test)
rmse(y_test, pred)

32.53824411604585

## (6) DNN

In [34]:
# Fully connected network over the 8 window features (4 voltage + 4 past
# capacity): 256-128-64-10 ReLU hidden units and one linear output.
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(8,)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1),
])

model.compile(loss='mse', optimizer='adam')

# Early stopping with a patience of 10 epochs; the best weights are restored.
callbacks = [
    keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
]

model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=2000,
    batch_size=14,
    callbacks=callbacks,
    verbose=0,
)

# RMSE on the held-out test split.
pred = model.predict(X_test)
rmse(y_test, pred)

13.971193168654366