# 모델링
**사용모델**  
- XGBoost  
- GradientBoost  
- RandomForest  
- LinearRegression (Ridge, Lasso)  

**GridSearchCV를 통해 최적 하이퍼파라미터 설정**

In [1]:
# Stretch the Jupyter notebook container to the full browser width.
# display/HTML are imported from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:100% !important;}</style>"))

# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder

## 0. 함수 정의

In [32]:
def rmse(ground_truth, pred) :
    """Return the root of the mean squared error between targets and predictions.

    Parameters
    ----------
    ground_truth : array-like
        True target values, passed straight to ``mean_squared_error``.
    pred : array-like
        Predicted values, passed straight to ``mean_squared_error``.

    Returns
    -------
    The square root (``np.sqrt``) of ``mean_squared_error(ground_truth, pred)``.
    """
    return np.sqrt(mean_squared_error(ground_truth, pred))

def time_feature(df) :
    """Add month/day/hour/minute columns derived from the ``time`` column.

    The frame is modified in place and also returned: ``df['time']`` is
    replaced by its ``pd.to_datetime`` conversion, and the columns ``month``,
    ``day``, ``hour`` and ``minute`` are written (overwriting any existing
    columns with those names).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the extra columns.
    """
    df['time'] = pd.to_datetime(df['time'])

    # Use the vectorized datetime accessor instead of a per-row Python lambda;
    # it yields the same calendar fields in a single pass per column.
    stamp = df['time'].dt
    df['month'] = stamp.month
    df['day'] = stamp.day
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute

    return df

def onehot(df_train, df_test, cols) :
    """One-hot encode ``cols`` and prepend the ``volt`` column.

    The encoder is fit on the training frame only and then applied to the
    test frame, so both outputs share the same column layout. Values are
    converted to strings before encoding. Unlike the previous version, the
    conversion is done on copies, so the callers' DataFrames are left
    untouched (their ``cols`` columns are no longer overwritten with strings).

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing a ``volt`` column and every column named in ``cols``.
    cols : list of str
        Names of the columns to one-hot encode.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_matrix, test_matrix)``; column 0 is ``volt`` and the
        remaining columns are the dense one-hot indicators.
    """
    # Stringify on copies: Series.map(str) is the element-wise str() the
    # original applied, but it does not write back into the input frames.
    train_cat = df_train[cols].apply(lambda column : column.map(str))
    test_cat = df_test[cols].apply(lambda column : column.map(str))

    oh = OneHotEncoder()

    # Fit on train only; reuse the fitted categories for test.
    train_oh = oh.fit_transform(train_cat).toarray()
    test_oh = oh.transform(test_cat).toarray()

    df_train_new = np.hstack([df_train['volt'].values.reshape(-1,1), train_oh])
    df_test_new = np.hstack([df_test['volt'].values.reshape(-1,1), test_oh])

    return df_train_new, df_test_new

## 1. 데이터 불러오기

In [4]:
train = pd.read_csv('train.csv') # training data
test = pd.read_csv('test.csv') # test data

# Array that collects the scores: one row per feature setup, one column per model

results = np.zeros((5, 6))

## 2. 기본

In [6]:
# Build the training data: the only feature is `volt`, the target is `cap`

X = train['volt'].values.reshape(-1,1)
y = train['cap'].values

X_test = test['volt'].values.reshape(-1,1)
y_test = test['cap'].values

# Split the training data into train/validation parts with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

### 2.1. XGBoost

In [10]:
# Best hyperparameters found by grid search.
# The validation split is passed as eval_set to drive early stopping.
# NOTE(review): newer XGBoost releases may expect early_stopping_rounds on the
# constructor instead of fit() — confirm against the installed version.
xgb = XGBRegressor(n_estimators = 100, learning_rate = 0.05, subsample = 1, max_depth = 20, random_state = 42)
xgb.fit(X_train, y_train, early_stopping_rounds = 5, eval_set = [(X_val, y_val)], verbose = False)

# Test-set RMSE, stored in row 0 ('Basic'), column 0 (XGBoost)
pred = xgb.predict(X_test)
score = rmse(y_test, pred)

results[0,0] = score

# Bare expression so the notebook shows the score
score

183.52429099397133

### 2.2. GradientBoost

In [11]:
# Best hyperparameters found by grid search.
# Gradient boosting fitted on the training split only.
gb = GradientBoostingRegressor(learning_rate = 0.01, n_estimators = 100, subsample = 0.8, max_depth = 2,random_state = 42)
gb.fit(X_train, y_train)

# Test-set RMSE, stored in row 0 ('Basic'), column 1 (GradientBoosting)
pred = gb.predict(X_test)
score = rmse(y_test, pred)

results[0,1] = score

# Bare expression so the notebook shows the score
score

184.22256146757067

### 2.3. RandomForest

In [15]:
# Random forest (1000 trees, depth capped at 20) fitted on the training split
rf = RandomForestRegressor(n_estimators = 1000, max_depth = 20, random_state = 42)
rf.fit(X_train, y_train)

# Test-set RMSE, stored in row 0 ('Basic'), column 2 (RandomForest)
pred = rf.predict(X_test)
score = rmse(y_test, pred)

results[0,2] = score

# Bare expression so the notebook shows the score
score

183.44229924915967

### 2.4. LinearRegression

In [16]:
# Ordinary least squares with default settings, fitted on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)

# Test-set RMSE, stored in row 0 ('Basic'), column 3 (LinearRegression)
pred = lr.predict(X_test)
score = rmse(y_test, pred)

results[0,3] = score

# Bare expression so the notebook shows the score
score

184.54553716460083

In [17]:
# Ridge regression with default settings, fitted on the training split
r = Ridge()
r.fit(X_train, y_train)

# Test-set RMSE, stored in row 0 ('Basic'), column 4 (Ridge)
pred = r.predict(X_test)
score = rmse(y_test, pred)

results[0,4] = score

# Bare expression so the notebook shows the score
score

184.54553742968582

In [18]:
# Lasso regression with default settings, fitted on the training split
l = Lasso()
l.fit(X_train, y_train)

# Test-set RMSE, stored in row 0 ('Basic'), column 5 (Lasso)
pred = l.predict(X_test)
score = rmse(y_test, pred)

results[0,5] = score

# Bare expression so the notebook shows the score
score

184.55361788815364

## 3. Feature 추가 (PolynomialFeatures)

In [19]:
# Build the training data: start again from the raw `volt` feature and `cap` target

X = train['volt'].values.reshape(-1,1)
y = train['cap'].values

X_test = test['volt'].values.reshape(-1,1)
y_test = test['cap'].values

# Expand `volt` into polynomial terms up to degree 3 without a bias column.
# The transformer is fit on the training data and reused for the test data.
poly = PolynomialFeatures(degree = 3, include_bias = False)
X_poly = poly.fit_transform(X)
X_test = poly.transform(X_test)

# Train/validation split of the expanded features with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X_poly, y, random_state = 42)

### 3.1. XGBoost

In [20]:
# XGBoost on the polynomial features; the validation split drives early stopping.
# NOTE(review): newer XGBoost releases may expect early_stopping_rounds on the
# constructor instead of fit() — confirm against the installed version.
xgb = XGBRegressor(n_estimators = 100, learning_rate = 0.05, subsample = 1, max_depth = 20, random_state = 42)
xgb.fit(X_train, y_train, early_stopping_rounds = 5, eval_set = [(X_val, y_val)], verbose = False)

# Test-set RMSE, stored in row 1 ('PF'), column 0 (XGBoost)
pred = xgb.predict(X_test)
score = rmse(y_test, pred)

results[1,0] = score

# Bare expression so the notebook shows the score
score

183.52429099397133

### 3.2. GradientBoost

In [21]:
# Gradient boosting on the polynomial features, fitted on the training split
gb = GradientBoostingRegressor(learning_rate = 0.01, n_estimators = 100, subsample = 0.8, max_depth = 2,random_state = 42)
gb.fit(X_train, y_train)

# Test-set RMSE, stored in row 1 ('PF'), column 1 (GradientBoosting)
pred = gb.predict(X_test)
score = rmse(y_test, pred)

results[1,1] = score

# Bare expression so the notebook shows the score
score

184.22256146757067

### 3.3. RandomForest

In [22]:
# Random forest (1000 trees, depth capped at 20) on the polynomial features
rf = RandomForestRegressor(n_estimators = 1000, max_depth = 20, random_state = 42)
rf.fit(X_train, y_train)

# Test-set RMSE, stored in row 1 ('PF'), column 2 (RandomForest)
pred = rf.predict(X_test)
score = rmse(y_test, pred)

results[1,2] = score

# Bare expression so the notebook shows the score
score

183.4422997592526

### 3.4. LinearRegression

In [23]:
# Ordinary least squares on the polynomial features
lr = LinearRegression()
lr.fit(X_train, y_train)

# Test-set RMSE, stored in row 1 ('PF'), column 3 (LinearRegression)
pred = lr.predict(X_test)
score = rmse(y_test, pred)

results[1,3] = score

# Bare expression so the notebook shows the score
score

184.16103604500944

In [24]:
# Ridge regression with default settings on the polynomial features
r = Ridge()
r.fit(X_train, y_train)

# Test-set RMSE, stored in row 1 ('PF'), column 4 (Ridge)
pred = r.predict(X_test)
score = rmse(y_test, pred)

results[1,4] = score

# Bare expression so the notebook shows the score
score

  return linalg.solve(A, Xy, sym_pos=True,


184.17878295228206

In [25]:
# Lasso regression with default settings on the polynomial features
l = Lasso()
l.fit(X_train, y_train)

# Test-set RMSE, stored in row 1 ('PF'), column 5 (Lasso)
pred = l.predict(X_test)
score = rmse(y_test, pred)

results[1,5] = score

# Bare expression so the notebook shows the score
score

  model = cd_fast.enet_coordinate_descent(


184.50261554393012

## 4. Feature 추가 (Time Feature)

In [27]:
# Build the training data: add the time-derived columns to both frames

train_new = time_feature(train)
test_new = time_feature(test)

# Feature matrix = volt plus the four time components; target = cap
X = train_new[['volt', 'month', 'day', 'hour', 'minute']].values.reshape(-1,5)
y = train_new['cap'].values

X_test = test_new[['volt', 'month', 'day', 'hour', 'minute']].values.reshape(-1,5)
y_test = test_new['cap'].values

# Train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

### 4.1. XGBoost

In [29]:
# XGBoost on volt + time features; the validation split drives early stopping.
# NOTE(review): newer XGBoost releases may expect early_stopping_rounds on the
# constructor instead of fit() — confirm against the installed version.
xgb = XGBRegressor(n_estimators = 1000, learning_rate = 0.05, subsample = 1, max_depth = 20, random_state = 42)
xgb.fit(X_train, y_train, early_stopping_rounds = 5, eval_set = [(X_val, y_val)], verbose = False)

# Test-set RMSE, stored in row 2 ('TF'), column 0 (XGBoost)
pred = xgb.predict(X_test)
score = rmse(y_test, pred)

results[2,0] = score

# Bare expression so the notebook shows the score
score

78.02599700067795

### 4.2. GradientBoost

In [30]:
# Gradient boosting (1000 estimators) on volt + time features
gb = GradientBoostingRegressor(learning_rate = 0.01, n_estimators = 1000, subsample = 0.8, max_depth = 2,random_state = 42)
gb.fit(X_train, y_train)

# Test-set RMSE, stored in row 2 ('TF'), column 1 (GradientBoosting)
pred = gb.predict(X_test)
score = rmse(y_test, pred)

results[2,1] = score

# Bare expression so the notebook shows the score
score

140.25109107553752

### 4.3. RandomForest

In [33]:
# Random forest (1000 trees, no explicit depth limit) on volt + time features
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)

# Test-set RMSE, stored in row 2 ('TF'), column 2 (RandomForest)
pred = rf.predict(X_test)
score = rmse(y_test, pred)

results[2,2] = score

# Bare expression so the notebook shows the score
score

74.05241185735645

### 4.4. LinearRegression

In [34]:
# Ordinary least squares on volt + time features
lr = LinearRegression()
lr.fit(X_train, y_train)

# Test-set RMSE, stored in row 2 ('TF'), column 3 (LinearRegression)
pred = lr.predict(X_test)
score = rmse(y_test, pred)

results[2,3] = score

# Bare expression so the notebook shows the score
score

181.0038467647388

In [35]:
# Ridge regression with default settings on volt + time features
r = Ridge()
r.fit(X_train, y_train)

# Test-set RMSE, stored in row 2 ('TF'), column 4 (Ridge)
pred = r.predict(X_test)
score = rmse(y_test, pred)

results[2,4] = score

# Bare expression so the notebook shows the score
score

181.00384684328515

In [36]:
# Lasso regression with default settings on volt + time features
l = Lasso()
l.fit(X_train, y_train)

# Test-set RMSE, stored in row 2 ('TF'), column 5 (Lasso)
pred = l.predict(X_test)
score = rmse(y_test, pred)

results[2,5] = score

# Bare expression so the notebook shows the score
score

181.0076785543828

## 5. 전처리 과정 추가 (Time Feature + OneHotEncoding)

In [37]:
# Build the training data: recompute the time-derived columns on both frames

train_new = time_feature(train)
test_new = time_feature(test)

# volt plus one-hot encoded month/day/hour/minute
X, X_test = onehot(train_new, test_new, ['month', 'day', 'hour', 'minute'])

y = train_new['cap'].values
y_test = test_new['cap'].values

# Train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

### 5.1. XGBoost

In [40]:
# XGBoost on volt + one-hot time features; the validation split drives early stopping.
# NOTE(review): newer XGBoost releases may expect early_stopping_rounds on the
# constructor instead of fit() — confirm against the installed version.
xgb = XGBRegressor(n_estimators = 1000, learning_rate = 0.05, subsample = 1, max_depth = 20, random_state = 42)
xgb.fit(X_train, y_train, early_stopping_rounds = 5, eval_set = [(X_val, y_val)], verbose = False)

# Test-set RMSE, stored in row 3 ('TF + OH'), column 0 (XGBoost)
pred = xgb.predict(X_test)
score = rmse(y_test, pred)

results[3,0] = score

# Bare expression so the notebook shows the score
score

83.90436385449279

### 5.2. GradientBoost

In [41]:
# Gradient boosting (1000 estimators) on volt + one-hot time features
gb = GradientBoostingRegressor(learning_rate = 0.01, n_estimators = 1000, subsample = 0.8, max_depth = 2,random_state = 42)
gb.fit(X_train, y_train)

# Test-set RMSE, stored in row 3 ('TF + OH'), column 1 (GradientBoosting)
pred = gb.predict(X_test)
score = rmse(y_test, pred)

results[3,1] = score

# Bare expression so the notebook shows the score
score

156.2615034353547

### 5.3. RandomForest

In [43]:
# Random forest (1000 trees, no explicit depth limit) on volt + one-hot time features
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)

# Test-set RMSE, stored in row 3 ('TF + OH'), column 2 (RandomForest)
pred = rf.predict(X_test)
score = rmse(y_test, pred)

results[3,2] = score

# Bare expression so the notebook shows the score
score

83.08500084513624

### 5.4. LinearRegression

In [44]:
# Ordinary least squares on volt + one-hot time features
lr = LinearRegression()
lr.fit(X_train, y_train)

# Test-set RMSE, stored in row 3 ('TF + OH'), column 3 (LinearRegression)
pred = lr.predict(X_test)
score = rmse(y_test, pred)

results[3,3] = score

# Bare expression so the notebook shows the score
score

150.60705917578284

In [45]:
# Ridge regression with a small penalty (alpha = 0.01) on volt + one-hot time features
r = Ridge(alpha = 0.01)
r.fit(X_train, y_train)

# Test-set RMSE, stored in row 3 ('TF + OH'), column 4 (Ridge)
pred = r.predict(X_test)
score = rmse(y_test, pred)

results[3,4] = score

# Bare expression so the notebook shows the score
score

150.60748839313294

In [46]:
# Lasso regression with a small penalty (alpha = 0.01) on volt + one-hot time features
l = Lasso(alpha = 0.01)
l.fit(X_train, y_train)

# Test-set RMSE, stored in row 3 ('TF + OH'), column 5 (Lasso)
pred = l.predict(X_test)
score = rmse(y_test, pred)

results[3,5] = score

# Bare expression so the notebook shows the score
score

150.631670209021

## 6. 전처리 과정 추가 2 (PolynomialFeatures + Time Feature + OnehotEncoding)

In [56]:
# Build the training data: recompute the time-derived columns on both frames

train_new = time_feature(train)
test_new = time_feature(test)

# volt (column 0) plus one-hot encoded month/day/hour/minute
X, X_test = onehot(train_new, test_new, ['month', 'day', 'hour', 'minute'])

y = train_new['cap'].values
y_test = test_new['cap'].values

# Polynomial expansion (degree 3, no bias) of the volt column only;
# fit on the training data and reused for the test data
poly = PolynomialFeatures(degree = 3, include_bias = False)
X_poly = poly.fit_transform(X[:,0].reshape(-1,1))
X_test_poly = poly.transform(X_test[:,0].reshape(-1,1))

# Replace the single volt column with its polynomial terms, keeping the one-hot columns
X = np.hstack([X_poly, X[:,1:]])
X_test = np.hstack([X_test_poly, X_test[:,1:]])

# Train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

In [57]:
# Show the shapes of the assembled train and test feature matrices
X.shape, X_test.shape

((361063, 130), (90265, 130))

### 6.1. XGBoost

In [58]:
# XGBoost on polynomial volt + one-hot time features; the validation split drives early stopping.
# NOTE(review): newer XGBoost releases may expect early_stopping_rounds on the
# constructor instead of fit() — confirm against the installed version.
xgb = XGBRegressor(n_estimators = 1000, learning_rate = 0.05, subsample = 1, max_depth = 20, random_state = 42)
xgb.fit(X_train, y_train, early_stopping_rounds = 5, eval_set = [(X_val, y_val)], verbose = False)

# Test-set RMSE, stored in row 4 ('PF + TF + OH'), column 0 (XGBoost)
pred = xgb.predict(X_test)
score = rmse(y_test, pred)

results[4,0] = score

# Bare expression so the notebook shows the score
score

83.90436385449279

### 6.2. GradientBoost

In [65]:
# Gradient boosting (1000 estimators) on polynomial volt + one-hot time features
gb = GradientBoostingRegressor(learning_rate = 0.01, n_estimators = 1000, subsample = 0.8, max_depth = 2,random_state = 42)
gb.fit(X_train, y_train)

# Test-set RMSE, stored in row 4 ('PF + TF + OH'), column 1 (GradientBoosting)
pred = gb.predict(X_test)
score = rmse(y_test, pred)

results[4,1] = score

# Bare expression so the notebook shows the score
score

156.26150343535474

### 6.3. RandomForest

In [66]:
# Random forest (1000 trees, no explicit depth limit) on polynomial volt + one-hot time features
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(X_train, y_train)

# Test-set RMSE, stored in row 4 ('PF + TF + OH'), column 2 (RandomForest)
pred = rf.predict(X_test)
score = rmse(y_test, pred)

results[4,2] = score

# Bare expression so the notebook shows the score
score

82.93697996938943

### 6.4. LinearRegression

In [61]:
# Ordinary least squares on polynomial volt + one-hot time features
lr = LinearRegression()
lr.fit(X_train, y_train)

# Test-set RMSE, stored in row 4 ('PF + TF + OH'), column 3 (LinearRegression)
pred = lr.predict(X_test)
score = rmse(y_test, pred)

results[4,3] = score

# Bare expression so the notebook shows the score
score

150.49718812340376

In [62]:
# Ridge regression with default settings on polynomial volt + one-hot time features
r = Ridge()
r.fit(X_train, y_train)

# Test-set RMSE, stored in row 4 ('PF + TF + OH'), column 4 (Ridge)
pred = r.predict(X_test)
score = rmse(y_test, pred)

results[4,4] = score

# Bare expression so the notebook shows the score
score

  return linalg.solve(A, Xy, sym_pos=True,


150.49816210451803

In [63]:
# Lasso regression with default settings on polynomial volt + one-hot time features
l = Lasso()
l.fit(X_train, y_train)

# Test-set RMSE, stored in row 4 ('PF + TF + OH'), column 5 (Lasso)
pred = l.predict(X_test)
score = rmse(y_test, pred)

results[4,5] = score

# Bare expression so the notebook shows the score
score

  model = cd_fast.enet_coordinate_descent(


153.5876014716225

## 결과

In [67]:
# Wrap the score array in a labelled table: rows are the feature setups,
# columns are named after the class of each fitted model object.
df_results = pd.DataFrame(
    results,
    index = ['Basic', 'PF', 'TF', 'TF + OH', 'PF + TF + OH'],
    columns = [type(model).__name__ for model in (xgb, gb, rf, lr, r, l)],
)
# Bare expression so the notebook renders the table
df_results

Unnamed: 0,XGBRegressor,GradientBoostingRegressor,RandomForestRegressor,LinearRegression,Ridge,Lasso
Basic,183.524291,184.222561,183.442299,184.545537,184.545537,184.553618
PF,183.524291,184.222561,183.4423,184.161036,184.178783,184.502616
TF,78.025997,140.251091,74.052412,181.003847,181.003847,181.007679
TF + OH,83.904364,156.261503,83.085001,150.607059,150.607488,150.63167
PF + TF + OH,83.904364,156.261503,82.93698,150.497188,150.498162,153.587601
