In [1]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

# Load the Boston dataset (the first CSV column becomes the row index) and
# rename the target column 'medv' (house price) to 'price' in the same chain.
bostondf = (
    pd.read_csv('./datasets/Boston.csv', index_col=0)
    .rename(columns={'medv': 'price'})
)

# Split the frame into the feature matrix and the target vector.
X_data = bostondf.drop(columns=['price'])
y_target = bostondf['price']

In [2]:
# Print the fold-averaged RMSE of a regularized linear model for each alpha and
# return the fitted regression coefficients as a DataFrame.
def get_linear_reg_eval(model_name, params = None, X_data_n=None, y_target_n=None, verbose=True):
    """Evaluate a regularized linear regressor over several alpha values.

    For every alpha in ``params`` the model is scored with 5-fold
    cross-validation (``neg_mean_squared_error``), the mean of the per-fold
    RMSE values is printed, and the model is then refit on the full data so
    that its coefficients can be collected.

    Parameters
    ----------
    model_name : str
        One of 'Ridge', 'Lasso' or 'ElasticNet'. ElasticNet is always built
        with ``l1_ratio=0.7``.
    params : iterable
        Alpha values to evaluate. An empty iterable yields an empty DataFrame.
    X_data_n : DataFrame
        Feature matrix; its ``columns`` become the index of the result.
    y_target_n : array-like
        Target values passed to cross-validation and to ``fit``.
    verbose : bool, default True
        Controls only the model-name header line. The per-alpha RMSE line is
        always printed.

    Returns
    -------
    pandas.DataFrame
        One column per alpha, named ``'alpha:<value>'``, indexed by feature.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    TypeError
        If ``params`` is None.
    """
    supported_models = ('Ridge', 'Lasso', 'ElasticNet')
    # Fail fast: an unknown name would otherwise leave `model` unbound and
    # surface as a confusing UnboundLocalError inside the loop.
    if model_name not in supported_models:
        raise ValueError(
            'Unsupported model_name {0!r}; expected one of {1}'.format(model_name, supported_models)
        )
    # The default of None is not iterable; report that explicitly.
    if params is None:
        raise TypeError('params must be an iterable of alpha values, got None')

    if verbose : print('###### ', model_name, '######')

    # Collect one coefficient Series per alpha and build the DataFrame once,
    # instead of inserting columns into a DataFrame inside the loop.
    coeff_by_alpha = {}
    for param in params:
        if model_name == 'Ridge':
            model = Ridge(alpha=param)
        elif model_name == 'Lasso':
            model = Lasso(alpha=param)
        else:
            model = ElasticNet(alpha=param, l1_ratio=0.7)

        neg_mse_scores = cross_val_score(model, X_data_n, y_target_n, scoring='neg_mean_squared_error', cv = 5)
        # Scores are negated MSE values: flip the sign, take the root per fold,
        # then average across folds.
        avg_rmse = np.mean(np.sqrt(-1*neg_mse_scores))
        print('alpha {0}일때 5 폴드 세트의 평균 RMSE: {1:.3f}'.format(param, avg_rmse))

        # cross_val_score returns only the evaluation metric, so fit the model
        # again on the full data to extract the regression coefficients.
        model.fit(X_data_n, y_target_n)
        colname = 'alpha:' + str(param)
        coeff_by_alpha[colname] = pd.Series(data=model.coef_, index=X_data_n.columns)
    return pd.DataFrame(coeff_by_alpha)
# end of get_linear_reg_eval

In [3]:
# Alpha values to try for Lasso; get_linear_reg_eval() prints the mean RMSE
# for each one and returns the per-feature regression coefficients.
lasso_alphas = [0.07, 0.1, 0.5, 1, 3]
coeff_lasso_df = get_linear_reg_eval(
    'Lasso',
    params=lasso_alphas,
    X_data_n=X_data,
    y_target_n=y_target,
)

######  Lasso ######
alpha 0.07일때 5 폴드 세트의 평균 RMSE: 5.612
alpha 0.1일때 5 폴드 세트의 평균 RMSE: 5.615
alpha 0.5일때 5 폴드 세트의 평균 RMSE: 5.669
alpha 1일때 5 폴드 세트의 평균 RMSE: 5.776
alpha 3일때 5 폴드 세트의 평균 RMSE: 6.189


In [4]:
# Display the Lasso coefficients sorted in descending order by the column
# that belongs to the first alpha value.
sort_column = 'alpha:{0}'.format(lasso_alphas[0])
coeff_lasso_df.sort_values(sort_column, ascending=False)

Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
rm,3.789725,3.703202,2.498212,0.949811,0.0
chas,1.434343,0.95519,0.0,0.0,0.0
rad,0.270936,0.274707,0.277451,0.264206,0.061864
zn,0.049059,0.049211,0.049544,0.049165,0.037231
black,0.010248,0.010249,0.009469,0.008247,0.00651
nox,-0.0,-0.0,-0.0,-0.0,0.0
age,-0.011706,-0.010037,0.003604,0.02091,0.042495
tax,-0.01429,-0.01457,-0.015442,-0.015212,-0.008602
indus,-0.04212,-0.036619,-0.005253,-0.0,-0.0
crim,-0.098193,-0.097894,-0.083289,-0.063437,-0.0


In [5]:
# Alpha values to try for ElasticNet; get_linear_reg_eval() builds the model
# with l1_ratio fixed at 0.7, prints the mean RMSE for each alpha and returns
# the per-feature regression coefficients.
elastic_alphas = [0.07, 0.1, 0.5, 1, 3]
coeff_elastic_df = get_linear_reg_eval(
    'ElasticNet',
    params=elastic_alphas,
    X_data_n=X_data,
    y_target_n=y_target,
)

######  ElasticNet ######
alpha 0.07일때 5 폴드 세트의 평균 RMSE: 5.542
alpha 0.1일때 5 폴드 세트의 평균 RMSE: 5.526
alpha 0.5일때 5 폴드 세트의 평균 RMSE: 5.467
alpha 1일때 5 폴드 세트의 평균 RMSE: 5.597
alpha 3일때 5 폴드 세트의 평균 RMSE: 6.068


In [6]:
# Display the ElasticNet coefficients sorted in descending order by the
# column that belongs to the first alpha value.
sort_column2 = 'alpha:{0}'.format(elastic_alphas[0])
coeff_elastic_df.sort_values(sort_column2, ascending=False)

Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
rm,3.574162,3.414154,1.918419,0.938789,0.0
chas,1.330724,0.979706,0.0,0.0,0.0
rad,0.27888,0.283443,0.300761,0.289299,0.146846
zn,0.050107,0.050617,0.052878,0.052136,0.038268
black,0.010122,0.010067,0.009114,0.00832,0.00702
age,-0.010116,-0.008276,0.00776,0.020348,0.043446
tax,-0.014522,-0.014814,-0.016046,-0.016218,-0.011417
indus,-0.044855,-0.042719,-0.023252,-0.0,-0.0
crim,-0.099468,-0.099213,-0.08907,-0.073577,-0.019058
nox,-0.175072,-0.0,-0.0,-0.0,-0.0
