### sklearn.linear_model.ElasticNet

#### 릿지회귀와 라쏘회귀를 절충한 모델로, 규제항은 릿지와 랐의 규제항을 단순히 더하여 사용
- 두 규제항의 혼합정도를 혼합비율 r을 사용하여 조절
- r=0일 경우 릿지회귀
- r=1일 경우 라쏘회귀

#### 주요 Hyperparameter
- alpha : 기본값 1로, 0에 가까울수록 규제가 없고(선형회귀와 유사한 결과) 값이 커질수록 규제가 강해져 회귀계수가 0에 근접(훈련데이터의 정확도는 낮아지지만 일반화에 기여)
- l1_ratio : 릿지와 라쏘의 규제 비율에 대한 가중 정도(조절 어려움)

##### ElasticNet(alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')

# 분석 코드

In [1]:
# 라이브러리 및 데이터 로드
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

df = pd.read_csv('../input/big-data-certification-study/house_price.csv', encoding='utf-8')
df.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [2]:
# 데이터셋 분리
X = df.drop(columns=['house_value'])
y = df[['house_value']]

df.shape, X.shape, y.shape

((17689, 6), (17689, 5), (17689, 1))

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
# 정규화 - Min-Max Scaling
scaler=MinMaxScaler()
scaler.fit(X_train) # feature data의 train data만 fit
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [5]:
# 모델 적용/예측 및 정확도
model=ElasticNet()
model.fit(scaled_X_train, y_train)
pred_x = model.predict(scaled_X_train)
model.score(scaled_X_train, y_train)

0.05038517013198185

In [6]:
pred_y = model.predict(scaled_X_test)
model.score(scaled_X_test, y_test)

0.05184820641905341

In [7]:
# RMSE(Root Mean Squared Error)
rmse_train=np.sqrt(mean_squared_error(y_train,pred_x)) 
rmse_test=np.sqrt(mean_squared_error(y_test,pred_y)) 
print('Train RMSE :', round(rmse_train), '\nTest  RMSE :', round(rmse_test))

Train RMSE : 93009 
Test  RMSE : 93090


In [8]:
# Hyperparameter Tuning
# Grid Search
g_param = {'alpha':[0.0,1e-6,1e-5,1e-4,1e-3,1e-2,0.1,0.5,1.0,2.0,3.0]} 
g_search= GridSearchCV(ElasticNet(), g_param, cv=5, return_train_score=True) 
g_search.fit(scaled_X_train, y_train)

GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.0, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1,
                                   0.5, 1.0, 2.0, 3.0]},
             return_train_score=True)

In [9]:
print('Best Parameter :', g_search.best_params_) 
print('Best Score :',round(g_search.best_score_,4)) 
print('Test Score :',round(g_search.score(scaled_X_test, y_test),4))

Best Parameter : {'alpha': 1e-05}
Best Score : 0.5703
Test Score : 0.5826


In [10]:
# Randomized Search 
r_param={'alpha':randint(low=1e-5, high=10)} 
r_search=RandomizedSearchCV(ElasticNet(), param_distributions=r_param, cv=5, n_iter=100, return_train_score=True) 
r_search.fit(scaled_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=ElasticNet(), n_iter=100,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f30a78b6250>},
                   return_train_score=True)

In [11]:
print('Best Parameter :', r_search.best_params_) 
print('Best Score :',round(r_search.best_score_,4)) 
print('Test Score :',round(r_search.score(scaled_X_test, y_test),4))

Best Parameter : {'alpha': 0}
Best Score : 0.5703
Test Score : 0.5826
