#### 보스턴 집값 예측 모델
- 데이터셋 : boston.csv
- 학습방법 : 지도학습 >> 회귀
- 피쳐/독립 : 13개
- 타겟/종속 : 1개

[1] 데이터 준비

In [103]:
# 모듈 로딩
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, minmax_scale, RobustScaler
from sklearn.model_selection import train_test_split

In [104]:
# 데이터
DATA_FILE = r'C:\VSCode\KDT\머신러닝\DAY05\boston.csv'

In [105]:
data_df = pd.read_csv(DATA_FILE)
data_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [106]:
# 데이터 기본 정보 확인
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


[2] 전처리
* [2-1] 데이터 정제

In [107]:
### 결측치, 중북값, 이상치, 컬럼별 고유값 추출로 이상 데이터 체크

- [2-2] 표준화 & 정규화 ==> 진행여부에 따라 성능의 변화는 경우에 따라 다름!!
    * 정규분포 데이터셋을 기반으로 한 모델 ==> StandardScaler, MinMaxScaler ,Log 변환
    * 피쳐의 값의 범위 차이를 줄이기 ==> 피쳐 스케일링, MinMaxScaler, RobustScaler...
    * 범주형 피쳐 ==> 수치화 인코딩 OneHotEncoder, OrdinalEncoder
    * 문자열 타겟 ==> 정수형 라벨 인코딩 LabelEncoder

- [2-3] 피쳐와 타겟 분리

In [108]:
featureDF = data_df.iloc[:,:-1]
targetSR = data_df['MEDV']

In [109]:
print(f'featureDF : {featureDF.shape}  targetSR : {targetSR.shape}')

featureDF : (506, 13)  targetSR : (506,)


[3] 학습 준비

[3-1] 학습용 & 테스트용 데이터셋 분리

In [110]:
X_train,X_test,y_train,y_test = train_test_split(featureDF,
                                                 targetSR,
                                                 random_state=10) # 분류가 아니라서 Stratify 필요 없음

In [111]:
print(f'X_train : {X_train.shape}  y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}  y_test : {y_test.shape}')

X_train : (379, 13)  y_train : (379,)
X_test : (127, 13)  y_test : (127,)


[3-2] 학습용 데이터셋으로 스케일러 생성

In [112]:
### - 수치 피쳐 값의 범위 차가 큼 ==> Scaling 진행
ssScaler = StandardScaler()
ssScaler.fit(X_train)

In [113]:
X_train_scaled = ssScaler.transform(X_train)
X_test_scaled = ssScaler.transform(X_test)

[4] 학습 진행 ==> 교차검증으로 진행

In [114]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [115]:
# 모델 인스턴스 생성
ridge_model = Ridge(alpha=1.0) # 기본값 : 알파 = 1.0

In [116]:
### 모델의 성능을 좌우하는 Hyper-parameter 제어 즉, 튜닝
alpha_values = [0.,1.,10,100]

for value in alpha_values:
    # 모델 인스턴스 생성
    ridge_model = Ridge(alpha=value)


    # 학습 진행
    # - cv : 3개
    # - scoring : 'neg_mean_squared_error', 'r2'
    # - return_train_score = True
    result = cross_validate(ridge_model,X_train_scaled,y_train,
                            cv=3,scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True,return_estimator=True)
    
    
    resultDF=pd.DataFrame(result)[['test_r2','train_r2']]

    resultDF['diff'] = abs(resultDF['test_r2']-resultDF['train_r2'])

    print(result['estimator'][0].coef_)
    # best_idx = resultDF['diff'].sort_values()[0]
    # print(best_idx)
    # print(f'[Ridge(alpha={value}]')
    # print(resultDF,end='\n\n')

[-1.41407793  1.56590993  0.15536906  0.65522098 -2.36200159  2.31948624
  0.1173831  -3.59071105  2.71475429 -2.33252925 -1.88390034  1.04036915
 -3.50250877]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]


[-1.23221033  1.29302258 -0.12737786  0.70280521 -1.80949922  2.48028701
 -0.00860666 -2.99831755  1.75466332 -1.51704375 -1.73434856  1.00368486
 -3.30809117]
[-0.78141029  0.70910255 -0.46407849  0.72503917 -0.69294458  2.41757287
 -0.24148703 -1.21831206  0.28616643 -0.63423538 -1.31602563  0.78528977
 -2.39571659]


In [117]:
resultDF =  pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2
0,0.007987,0.009629,Ridge(alpha=100),-18.988666,-24.050879,0.724036,0.708269
1,0.002867,0.002025,Ridge(alpha=100),-25.390202,-21.95231,0.725993,0.686628
2,0.008802,0.005867,Ridge(alpha=100),-26.468497,-20.660563,0.627335,0.744452


In [118]:

### 모델의 성능을 좌우하는 Hyper-parameter 제어 즉, 튜닝
alpha_values = [0.,1.,10,100]

for value in alpha_values:
    # 모델 인스턴스 생성
    ridge_model = Lasso(alpha=value,max_iter=3)


    # 학습 진행
    # - cv : 3개
    # - scoring : 'neg_mean_squared_error', 'r2'
    # - return_train_score = True
    result = cross_validate(ridge_model,X_train_scaled,y_train,
                            cv=3,scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True,return_estimator=True)
    
    
    resultDF=pd.DataFrame(result)[['test_r2','train_r2']]

    resultDF['diff'] = abs(resultDF['test_r2']-resultDF['train_r2'])

    print(result['estimator'][0].coef_)
    # best_idx = resultDF['diff'].sort_values()[0]
    # print(best_idx)
    # print(f'[Ridge(alpha={value}]')
    # print(resultDF,end='\n\n')

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[-0.76918209  1.30798802 -1.3660128   0.70871821 -1.12810945  3.13078874
  0.20140226 -3.18951128  0.40006951 -1.02796444 -1.33246342  1.05170534
 -2.85931196]
[-0.12685525  0.         -0.68948499  0.         -0.35867851  3.50097227
 -0.         -0.         -0.02775436 -0.34045443 -1.07046702  0.47097032
 -2.11146537]


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]


- 하이퍼파라미터 튜닝과 교차 검정을 동시에 진행

In [119]:
from sklearn.model_selection import GridSearchCV

In [120]:
# Ridge의 Hyper-parameter 값 설정
parmas = {'alpha':[0.,0.1,0.5,1.0],
          'max_iter':[3,5]}

#==> 0,3 => Model    #==> 0., 5 => Model
#==> 0.1,3 => Model    #==> 0.1., 5 => Model
#==> 0.5,3 => Model    #==> 0.5, 5 => Model
#==> 1.0,3 => Model    #==> 1.0, 5 => Model
#==> 8개의 Ridge 모델 생성


In [121]:
# 인스턴스 생성
rModel = Ridge()

# GridSearchCV 인스턴스 생성
serchCV = GridSearchCV(rModel,param_grid=parmas,cv=3,verbose=True,return_train_score=True)

In [122]:
# 학습 진행
serchCV.fit(X_train_scaled,y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [123]:
# fit() 진행 후 모델 파라미터 확인
serchCV.best_params_

{'alpha': 1.0, 'max_iter': 3}

In [126]:
bestModel = serchCV.best_estimator_

In [129]:
resultDF = pd.DataFrame(serchCV.cv_results_)
resultDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.004859,0.000529,0.00701,0.006008,0.0,3,"{'alpha': 0.0, 'max_iter': 3}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
1,0.009067,0.004234,0.004822,0.002592,0.0,5,"{'alpha': 0.0, 'max_iter': 5}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
2,0.004061,0.000882,0.004337,0.002641,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
3,0.005275,0.000545,0.004212,0.001607,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
4,0.009116,0.003517,0.002715,0.000215,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
5,0.016661,0.013253,0.006226,0.001925,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
6,0.003035,0.000741,0.002839,0.000651,1.0,3,"{'alpha': 1.0, 'max_iter': 3}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
7,0.006132,0.002397,0.006751,0.003468,1.0,5,"{'alpha': 1.0, 'max_iter': 5}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
