### sklearn.linear_model.Ridge

#### 선형회귀분석의 기본원리를 따르나 가중치(회귀계수) 값을 최대한 작게(0에 가깝게) 만들어 모든 독립변수(Feature)가 종속변수(Label)에 미치는 영향을 최소화하는 제약(Regularization)을 반영한 회귀모델
- 다항곡선도 추정 가능
- 훈련데이터의 과대적합되지 않도록 특성의 영향을 제약한 모델

#### 주요 Hyperparameter
- alpha : 기본값 1로, 0에 가까울수록 규제가 없고(선형회귀와 유사한 결과) 값이 커질수록 규제가 강해져 회귀계수가 0에 근접

##### Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None)

# 분석 코드

In [1]:
# 라이브러리 및 데이터 로드
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

df = pd.read_csv('../input/big-data-certification-study/house_price.csv', encoding='utf-8')
df.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [2]:
# 데이터셋 분리
X = df.drop(columns=['house_value'])
y = df[['house_value']]

df.shape, X.shape, y.shape

((17689, 6), (17689, 5), (17689, 1))

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# stratify : y 범주의 비율에 따라 데이터 분리

In [4]:
# 정규화 - Min-Max Scaling
scaler=MinMaxScaler()
scaler.fit(X_train) # feature data의 train data만 fit
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [5]:
# 모델 적용/예측 및 정확도
model=Ridge()
model.fit(scaled_X_train, y_train)
pred_x = model.predict(scaled_X_train)
model.score(scaled_X_train, y_train)

0.5706637113374347

In [6]:
pred_y = model.predict(scaled_X_test)
model.score(scaled_X_test, y_test)

0.5826346815681084

In [7]:
# RMSE(Root Mean Squared Error)
rmse_train=np.sqrt(mean_squared_error(y_train,pred_x)) 
rmse_test=np.sqrt(mean_squared_error(y_test,pred_y)) 
print('Train RMSE :', round(rmse_train), '\nTest  RMSE :', round(rmse_test))

Train RMSE : 62539 
Test  RMSE : 61762


In [8]:
# Hyperparameter Tuning
# Grid Search
g_param = {'alpha':[1e-4,1e-3,1e-2,0.1,0.5,1.0,5.0,10.0]} 
g_search= GridSearchCV(Ridge(), g_param, cv=5, return_train_score=True) 
g_search.fit(scaled_X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 5.0,
                                   10.0]},
             return_train_score=True)

In [9]:
print('Best Parameter :', g_search.best_params_) 
print('Best Score :',round(g_search.best_score_,4)) 
print('Test Score :',round(g_search.score(scaled_X_test, y_test),4))

Best Parameter : {'alpha': 0.01}
Best Score : 0.5703
Test Score : 0.5826


In [10]:
# Randomized Search 
r_param={'alpha':randint(low=1e-4, high=100)} 
r_search=RandomizedSearchCV(Ridge(), param_distributions=r_param, cv=5, n_iter=100, return_train_score=True) 
r_search.fit(scaled_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=Ridge(), n_iter=100,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe78fe00550>},
                   return_train_score=True)

In [11]:
print('Best Parameter :', r_search.best_params_) 
print('Best Score :',round(r_search.best_score_,4)) 
print('Test Score :',round(r_search.score(scaled_X_test, y_test),4))

Best Parameter : {'alpha': 0}
Best Score : 0.5703
Test Score : 0.5826
