### 교차 검증 단순화
- scikit-learn.model_selection 모듈 내에 모델 검증관련 기능 활용
- 교차 검증 데이터 기반 검증 결과 처리

(1) 모듈 로딩 및 데이터 준비 <hr>

In [1]:
import pandas as pd

# 생선 데이터 준비
fishDF = pd.read_csv('../DATA/fish.csv')

# 붓꽃 데이터 준비
irisDF = pd.read_csv('../DATA/iris.csv')

In [2]:
fishDF.head(3)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961


In [3]:
irisDF.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa


(2) 데이터 준비 => 피쳐 / 타겟 분리 <hr>

In [4]:
# 타겟 : Weight, 피쳐 : Length, Diagonal, Height, Width
fish_targetSR = fishDF[fishDF.columns[1]]
fish_featureDF = fishDF[fishDF.columns[2:]]

fish_targetSR.head(2), fish_featureDF.head(2)

(0    242.0
 1    290.0
 Name: Weight, dtype: float64,
    Length  Diagonal  Height   Width
 0    25.4      30.0   11.52  4.0200
 1    26.3      31.2   12.48  4.3056)

In [5]:
# 타겟 : species, 피쳐 : sepal_length, sepal_width, petal_length, petal_width
iris_targetSR = irisDF['species']
iris_featureDF = irisDF[irisDF.columns[:-1]]

iris_targetSR.head(2), iris_featureDF.head(2)

(0    setosa
 1    setosa
 Name: species, dtype: object,
    sepal_length  sepal_width  petal_length  petal_width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2)

In [6]:
# 학습용, 테스트용 데이터 분리
from sklearn.model_selection import train_test_split

# 생선 데이터
fish_x_train, fish_x_test, fish_y_train, fish_y_test = train_test_split(fish_featureDF, fish_targetSR, 
                                                                        test_size=0.2, random_state=5)

# 붗꽃 데이터
iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(iris_featureDF, iris_targetSR, 
                                                                        test_size=0.2, random_state=5, stratify=iris_targetSR)

(3) 데이터 전처리 <hr>
- 피쳐 스케일링

In [11]:
from sklearn.preprocessing import StandardScaler

# 생선 데이터셋 피쳐 스케일링
fish_scaler = StandardScaler()
fish_scaler.fit(fish_x_train)

scaled_fish_x_train = fish_scaler.transform(fish_x_train)
scaled_fish_x_test = fish_scaler.transform(fish_x_test)

# 붓꽃 데이터셋 피쳐 스케일링
iris_scaler = StandardScaler()
iris_scaler.fit(iris_x_train)

scaled_iris_x_train = iris_scaler.transform(iris_x_train)
scaled_iris_x_test = iris_scaler.transform(iris_x_test)

(4) 학습 <hr>

(4-1) 생선 무게 예측 모델 (회귀)

In [13]:
# 모듈 로딩
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

In [14]:
# 교차 검증으로 학습 진행
# => 준비 : 모델 인스턴스, 학습용 피쳐 데이터, 학습용 라벨 데이터
lr_model = LinearRegression()

In [34]:
# 학습/검증에 대한 모든 결과에 대한 처리
result = cross_validate(lr_model, 
                        scaled_fish_x_train, fish_y_train, 
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True,
                        return_estimator=True)
print(result)

{'fit_time': array([9.99212265e-04, 1.08361244e-03, 1.99437141e-03, 9.98258591e-04,
       7.20024109e-05]), 'score_time': array([0.00206709, 0.00133729, 0.00111938, 0.0009973 , 0.00099444]), 'estimator': [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])}


In [35]:
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error
0,0.000999,0.002067,LinearRegression(),0.921047,0.874264,-8767.849023,-16078.447836
1,0.001084,0.001337,LinearRegression(),0.843854,0.887794,-17815.750939,-13972.578669
2,0.001994,0.001119,LinearRegression(),0.885924,0.880611,-12344.878251,-15268.424725
3,0.000998,0.000997,LinearRegression(),0.64672,0.902975,-22006.47049,-13223.981095
4,7.2e-05,0.000994,LinearRegression(),0.790319,0.898336,-39450.526087,-10586.0104


In [36]:
best_model = resultDF.iloc[0]['estimator']
best_model.coef_, best_model.intercept_

(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [37]:
### CV에서 score만 추출
cross_val_score(lr_model, scaled_fish_x_train, fish_y_train)

array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])

In [38]:
### CV에서 predict만 추출
cross_val_predict(lr_model, scaled_fish_x_test, fish_y_test)

array([ 622.9370399 ,  190.32868097,  367.23105864,  177.61960659,
        217.04446718,  475.09635801,  617.48701967,    2.18489816,
        347.19295855,  623.02091668,  360.15563829,  194.11457966,
        225.89235241, 1243.33282317, 1358.91524558,  355.4838006 ,
        672.73652147,  349.46697396,  719.4008871 ,  142.66810206,
        248.11096427, -197.37549711,   71.76179815,   98.99430885,
        155.66806474, -345.54229042,  537.92791374,  134.89750312,
        625.38205691, -186.9521243 ,  181.81549631,  688.90322353])

### 교차검증과 튜닝까지 한꺼번에 진행
- 단점 : 시간이 오래 걸림

In [40]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

In [49]:
est = LogisticRegression(max_iter=10000, solver='liblinear')
params = {'penalty': ['l1', 'l2'],
          # 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    }

In [50]:
gscv = GridSearchCV(est, param_grid=params, return_train_score=True)
gscv.fit(scaled_iris_x_train, iris_y_train)

In [51]:
cv_resultsDF = pd.DataFrame(gscv.cv_results_)
cv_resultsDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.001923,0.000272,0.000798,0.000399,l1,{'penalty': 'l1'},0.875,1.0,0.958333,0.958333,...,0.941667,0.042492,1,0.9375,0.927083,0.9375,0.947917,0.9375,0.9375,0.006588
1,0.001471,0.000448,0.000599,0.000489,l2,{'penalty': 'l2'},0.875,0.958333,0.958333,0.958333,...,0.908333,0.066667,2,0.90625,0.916667,0.916667,0.927083,0.90625,0.914583,0.007795


In [52]:
gscv.best_params_, gscv.best_index_, gscv.best_score_

({'penalty': 'l1'}, 0, 0.9416666666666668)

In [53]:
best_model = gscv.best_estimator_

### 데이터에 적합한 모델 찾기

In [54]:
from sklearn.utils.discovery import all_estimators

In [60]:
models = all_estimators('classifier')

for model_name, model in models:
    # print(model)
    try:
        print(model(scaled_iris_x_train, iris_y_train))
    except Exception as e:
        print(f'오류 발생 : {model}')

오류 발생 : <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
BaggingClassifier(estimator=array([[-0.67338676,  1.45883235, -1.31265713, -1.32669067],
       [-1.04981414,  0.99995979, -1.42826729, -1.19533516],
       [-1.30076572, -0.14722161, -1.37046221, -1.19533516],
       [ 0.58137117,  0.54108723,  0.53710558,  0.5122865 ],
       [ 0.70684697, -0.60609417,  1.05735134,  1.16906405],
       [ 0.95779855, -0.37665789,  0.47930049,  0.11821996],
       [ 2.33803228, -0.14722161,  1.34...
       [-0.17148358, -0.60609417,  0.19027507,  0.11821996],
       [ 0.45589538, -0.37665789,  0.30588524,  0.11821996],
       [-1.6771931 , -1.75327557, -1.42826729, -1.19533516],
       [-0.79886255,  2.37657747, -1.31265713, -1.45804618],
       [-0.29695938, -0.14722161,  0.42149541,  0.38093098]]),
                  n_estimators=48         setosa
40         setosa
45         setosa
56     versicolor
128     virginica
          ...    
99     versicolor
97     versicolor
41        