### 교차검증 단순화
- sklearn의 model_selection 모듈 내에 모델 검증 관련 기능 활용
- 교차검증 데이터 기반 검증결과 처리

1. 모듈 로딩 & 데이터 준비
<hr>

In [48]:
import pandas as pd

# Read the fish CSV and the iris CSV (in that order) into DataFrames.
fish, iris = map(pd.read_csv, ('../data/fish.csv', '../data/iris.csv'))

In [49]:
# Preview the first five rows of the fish data.
fish.head(n=5)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


In [50]:
# Preview the first five rows of the iris data.
iris.head(n=5)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


2. 데이터 준비: feature, target 분리

In [51]:
# Target: the column at position 1 (Weight, per the preview above).
# Features: every column from position 2 onward; column 0 is not used.
fish_target = fish.loc[:, fish.columns[1]]
fish_feature = fish.loc[:, fish.columns[2:]]

In [52]:
# Preview the first two rows of the fish feature columns.
fish_feature.head(n=2)

Unnamed: 0,Length,Diagonal,Height,Width
0,25.4,30.0,11.52,4.02
1,26.3,31.2,12.48,4.3056


In [53]:
# Target: the column at position 4 (variety, per the preview above).
# Features: the first four columns.
iris_target = iris.loc[:, iris.columns[4]]
iris_feature = iris.loc[:, iris.columns[:4]]

In [54]:
# Preview the first two rows of the iris feature columns.
iris_feature.head(n=2)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


In [55]:
# Split each data set into a training part and a test part
from sklearn.model_selection import train_test_split

# fish -> train / test split, used for the regression task
fxtrain, fxtest, fytrain, fytest = train_test_split(
    fish_feature,
    fish_target,
    test_size=0.2,
    random_state=5,
)

# iris -> train / test split, used for the classification task;
# stratify is set to the target column
ixtrain, ixtest, iytrain, iytest = train_test_split(
    iris_feature,
    iris_target,
    test_size=0.2,
    random_state=5,
    stratify=iris_target,
)

3. 데이터 전처리
<hr>
- feature scale


In [56]:
from sklearn.preprocessing import StandardScaler

# Fish: fit the scaler on the training features only, then apply the same
# fitted scaler to the test features.
fishscaler = StandardScaler()
scaledfish_Train = fishscaler.fit_transform(fxtrain)
scaledfish_Test = fishscaler.transform(fxtest)

# Iris: same procedure with its own scaler instance.
irisstd = StandardScaler()
scalediris_Train = irisstd.fit_transform(ixtrain)
scalediris_Test = irisstd.transform(ixtest)

4. 학습
<hr>
- 생선 무게 예측 모델

In [57]:
from sklearn. model_selection import cross_val_score, cross_val_predict, cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression

In [58]:
# Train through cross-validation
# Needed: a model instance, the training feature data, the training label data

# Model instance handed to the cross-validation helpers in the following cells
lr_md = LinearRegression()


In [59]:
# Cross-validate and keep every train / validation result
# (fit/score times, fitted estimators, train and test scores per fold)

result = cross_validate(lr_md,
                        scaledfish_Train,
                        fytrain,
                        cv=5,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True,
                        return_estimator=True,
                        )
# cv=5 means the training data is divided into 5 folds, so every array
# in the result holds one entry per fold.

print(result)

{'fit_time': array([0.00099897, 0.0014987 , 0.00099945, 0.00099897, 0.00099993]), 'score_time': array([0.00200009, 0.0010047 , 0.00199986, 0.00200081, 0.00100017]), 'estimator': [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])}


In [60]:
# Tabulate the cross-validation result: one row per fold.
resultDF = pd.DataFrame(data=result)

In [61]:
# Pick the estimator from the fold with the highest validation R^2 instead of
# always taking fold 0, so the name "best_model" holds for any run.
best_fold = resultDF['test_r2'].idxmax()
best_model = resultDF.loc[best_fold, 'estimator']
best_model.coef_, best_model.intercept_

(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [62]:
## Extract only the per-fold CV scores

cross_val_score(lr_md, X=scaledfish_Train, y=fytrain)

array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])

In [63]:
## Extract the cross-validated predictions
cross_val_predict(lr_md, X=scaledfish_Train, y=fytrain)

array([ 9.09792517e+01,  9.85612151e+01,  3.87029719e+02,  1.13011547e+02,
        6.81676563e+02,  2.82456988e+02,  5.34379642e+02,  3.61848302e+02,
        6.12934598e+02,  1.70756130e+02,  5.53222970e+02,  1.69433076e+01,
       -2.53895688e+01,  8.14926155e+02,  6.97225129e+01,  3.38157931e+02,
        4.76306355e+02,  7.67659158e+02,  6.55686457e+02,  1.80300946e+02,
        8.45315559e+02,  2.92145322e+02,  6.08539351e+02,  9.02782406e+02,
        6.99788981e+02,  9.40316876e+02,  7.47628344e+02,  3.28419355e+02,
        7.89622699e+02,  9.09130831e+02, -1.98986854e+02,  1.81089559e+02,
        6.36731679e+02, -1.09209894e+02,  3.57087822e+02,  7.88250361e+02,
        3.25180589e+02,  6.56473977e+02, -2.37032025e+02,  4.55882834e+01,
        9.57130255e+01, -2.10830505e+02,  1.28969696e+02, -2.21199132e+02,
       -1.10282630e+02,  6.39911566e+02,  2.12288357e+02,  2.41098815e+02,
        2.61932359e+02, -2.58301758e+02,  2.93250859e+01,  8.87950700e+02,
        2.46460034e+02,  

## 교차검증, 튜닝까지 한번에 진행
- 단점: 시간이 매우 오래 걸린다

In [64]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [71]:
from sklearn.linear_model import LogisticRegression

# Candidate values for the grid search: the two penalty types.
params = dict(penalty=['l1', 'l2'])
est = LogisticRegression(solver='liblinear', max_iter=1000)

# Grid search with cross-validation over the penalty values; training
# scores are kept alongside the test scores.
gscv = GridSearchCV(
    estimator=est,
    param_grid=params,
    return_train_score=True,
)
gscv.fit(scalediris_Train, iytrain)

In [72]:
# Tabulate the grid-search results: one row per parameter candidate.
cv_resultsDF = pd.DataFrame(data=gscv.cv_results_)

In [73]:
# Display the grid-search results table.
cv_resultsDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.001199,0.000406,0.0006,0.00049,l1,{'penalty': 'l1'},0.875,1.0,0.958333,0.958333,...,0.941667,0.042492,1,0.9375,0.927083,0.9375,0.947917,0.9375,0.9375,0.006588
1,0.000801,0.000401,0.000401,0.000491,l2,{'penalty': 'l2'},0.875,0.958333,0.958333,0.958333,...,0.916667,0.052705,2,0.90625,0.916667,0.916667,0.927083,0.90625,0.914583,0.007795


In [74]:
# Report the best parameter set, its row index in cv_results_, and its score.
print(
    gscv.best_params_,
    gscv.best_index_,
    gscv.best_score_,
)

{'penalty': 'l1'} 0 0.9416666666666668


## 데이터에 적합한 모델 찾기

In [75]:
from sklearn.utils.discovery import all_estimators

In [80]:
# Collect the scikit-learn classifiers as (name, class) pairs.
models = all_estimators(type_filter='classifier')

# Try to fit each classifier, built with its default constructor arguments,
# on the scaled iris training data.
for model_name, model in models:
    try:
        print(model().fit(scalediris_Train, iytrain))
    except Exception as e:
        # Best-effort sweep: some classifiers need constructor arguments or
        # reject negative inputs. Report which one failed and keep going.
        print(f'{model_name}: {e}')

AdaBoostClassifier()
BaggingClassifier()
BernoulliNB()
CalibratedClassifierCV()
Negative values in data passed to CategoricalNB (input X)
__init__() missing 1 required positional argument: 'base_estimator'
Negative values in data passed to ComplementNB (input X)
DecisionTreeClassifier()
DummyClassifier()
ExtraTreeClassifier()
ExtraTreesClassifier()
GaussianNB()
GaussianProcessClassifier()
GradientBoostingClassifier()
HistGradientBoostingClassifier()
KNeighborsClassifier()
LabelPropagation()
LabelSpreading()
LinearDiscriminantAnalysis()




LinearSVC()
LogisticRegression()
LogisticRegressionCV()
MLPClassifier()
__init__() missing 1 required positional argument: 'estimator'
Negative values in data passed to MultinomialNB (input X)
NearestCentroid()
NuSVC()
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
PassiveAggressiveClassifier()
Perceptron()
QuadraticDiscriminantAnalysis()
RadiusNeighborsClassifier()
RandomForestClassifier()
RidgeClassifier()
RidgeClassifierCV()
SGDClassifier()
SVC()
__init__() missing 1 required positional argument: 'estimators'
__init__() missing 1 required positional argument: 'estimators'
