# iris 데이터에 XGBoost를 적용해 보자.

In [None]:
#import packages
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

# Split it into the feature matrix (X) and the target labels (y).
X, y = iris.data, iris.target

In [None]:
#import XGboost
from xgboost import XGBClassifier

# Define the XGBoost classification model with its default hyperparameters.
clf = XGBClassifier()

- xgboost의 단점은 하이퍼 파라미터가 복잡하다는 것이다.

# 1.Grid SearchCV
- 사용자가 하이퍼 파라미터마다 후보 값 리스트를 입력하면, 가능한 모든 하이퍼 파라미터 조합에 대해 예측 성능을 측정한다. 사용자가 일일이 하이퍼 파라미터를 설정하고 예측 성능을 비교하며 최적의 파라미터를 찾는 수고를 덜어 주고, 이 과정을 한꺼번에 진행해 준다.

In [None]:
#Importing packages from sklearn

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

# Candidate values per hyperparameter; the grid search tries every
# combination of them.
param_grid = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[1, 3, 5, 7],
    reg_lambda=[.01, .1, .5],
)

# Grid search over `clf`, scored by accuracy with cv=5.
model = model_selection.GridSearchCV(
    clf,
    param_grid,
    scoring="accuracy",
    verbose=10,
    n_jobs=1,
    cv=5,
)

# Fit the search on the full dataset.
model.fit(X, y)

# Report the best parameter set, the matching estimator and the best score,
# one per line.
print(model.best_params_, model.best_estimator_, model.best_score_, sep="\n")

# 2. RandomizedSearchCV

- 그리드 서치에서는 param_grid와 같이 매개변수마다 특정 값을 지정해 주었다. 만약 변수 범위가 너무 다양하다면 하나하나 작성해 주기가 너무 힘들다.

- 하이퍼 파라미터 검색 범위가 너무 클 때 사용하는 방식이 Randomized Search이다.


In [None]:
# Candidate values per hyperparameter; the randomized search samples
# combinations from these lists.
param_grid = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[1, 3, 5, 7],
    reg_lambda=[.01, .1, .5],
)

# Randomized search over `clf`: 10 sampled settings, scored by accuracy
# with cv=5.
model = model_selection.RandomizedSearchCV(
    clf,
    param_grid,
    scoring="accuracy",
    verbose=10,
    n_jobs=1,
    cv=5,
    n_iter=10,
)

# Fit the search on the full dataset.
model.fit(X, y)

# Report the best parameter set, the matching estimator and the best score,
# one per line.
print(model.best_params_, model.best_estimator_, model.best_score_, sep="\n")

## 3. Bayesian optimization

- 참고 : https://wooono.tistory.com/102

Bayesian Optimization 은 어느 입력값(x)를 받는 미지의 목적 함수 (f(x))를 판단하고 결정하여, 해당 함숫값 (f(x))을 최대로 만드는 최적해를 찾는 것을 목적으로 합니다.

즉, 목적 함수(탐색대상함수)와 하이퍼파라미터 쌍(pair)을 대상으로 Surrogate Model(대체 모델) 을 만들고,
순차적으로 하이퍼 파라미터를 업데이트해 가면서 평가를 통해 최적의 하이퍼파라미터 조합을 탐색합니다.
이 때의 목적 함수를 black-box function 이라고 합니다.
Bayesian Optimization 에는 두 가지 필수 요소가 존재합니다.

먼저 Surrogate Model 은, 현재까지 조사된 입력값-함숫결과값 점들 $(x_1, f(x_1)),...,(x_t, f(x_t))$ 을 바탕으로, 미지의 목적 함수의 형태에 대한 확률적인 추정을 수행하는 모델을 지칭합니다. 그리고 Acquisition Function 은, 목적 함수에 대한 현재까지의 확률적 추정 결과를 바탕으로, ‘최적 입력값을 찾는 데 있어 가장 유용할 만한’ 다음 입력값 후보를 추천해 주는 함수를 지칭합니다.

<img src='https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fb33tsP%2FbtraMpvxJG0%2FSn7uQK7k910IQ7cP3ZM9vk%2Fimg.png'>

In [None]:
def black_box_function(x, y):
    """Toy objective to maximize; treat its internals as unknown.

    It stands in for an expensive function whose output-generating
    process cannot be inspected. The value is ``-x**2 - (y - 1)**2 + 1``.
    """
    x_penalty = x ** 2
    y_penalty = (y - 1) ** 2
    # Same evaluation order as the closed form: (-x^2 - (y-1)^2) + 1.
    return -x_penalty - y_penalty + 1

In [None]:
# Install the package that provides the `bayes_opt` module imported below.
!pip install bayesian-optimization

In [None]:
from bayes_opt import BayesianOptimization

# Search bounds for each argument of the objective function.
pbounds = dict(x=(2, 4), y=(-3, 3))

# Optimizer that maximizes black_box_function inside those bounds,
# seeded with random_state=1.
optimizer = BayesianOptimization(f=black_box_function, pbounds=pbounds, random_state=1)

- n_iter: How many steps of Bayesian optimization you want to perform. The more steps, the more likely you are to find a good maximum.

- init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.

In [None]:
# 2 random exploration steps followed by 3 Bayesian optimization steps.
optimizer.maximize(init_points=2, n_iter=3)

In [None]:
# Show the optimizer's `max` attribute (the best result recorded so far).
print(optimizer.max)

In [None]:
# Print every entry of optimizer.res together with its index.
for step, record in enumerate(optimizer.res):
    print(f"Iteration {step}: \n\t{record}")

In [None]:
# Change the bounds of x to (-2, 3), then run 5 more optimization steps
# with no additional random exploration.
optimizer.set_bounds(new_bounds=dict(x=(-2, 3)))
optimizer.maximize(init_points=0, n_iter=5)


# 4.Hyperopt

- HyperOpt는 자동화된 하이퍼파라미터 튜닝 프레임워크로서, fmin()이라는 함수 안에는 3가지의 파라미터가 있다:

    - Objective Function: 최소화할 손실 함수
    - Domain Space: 탐색 범위. 베이지안 최적화에서는 이 범위가 각  하이퍼파라미터에 대해 통계 분포를 만들어낸다.
    - Optimization Algorithm : 최적의 조합을 찾기 위한 알고리즘

참고 : https://velog.io/@emseoyk/%ED%95%98%EC%9D%B4%ED%8D%BC%ED%8C%8C%EB%9D%BC%EB%AF%B8%ED%84%B0-%ED%8A%9C%EB%8B%9D

In [None]:
# Install scikit-optimize (provides the `skopt` imports in the next cell).
# NOTE(review): the next cell also imports `hyperopt`, which is not installed
# anywhere in this notebook — confirm it is preinstalled in the runtime.
!pip install scikit-optimize

In [None]:
#importing packages 

from hyperopt import hp,fmin, tpe, Trials

from hyperopt.pyll.base import scope

from functools import partial

from skopt import space

from skopt import gp_minimize

# Objective for hyperopt: 5-fold stratified cross-validation of an
# XGBClassifier built from the sampled hyperparameters.
def optimize(params, x, y):
    """Return the negated mean cross-validated accuracy for ``params``.

    Args:
        params: Mapping of keyword arguments passed to ``XGBClassifier``.
        x: Feature array, indexed by row with the fold indices.
        y: Target array, indexed the same way as ``x``.

    Returns:
        ``-1.0 * mean(fold accuracies)`` so that a minimizer maximizes
        accuracy.
    """
    classifier = XGBClassifier(**params)
    splitter = model_selection.StratifiedKFold(n_splits=5)

    fold_scores = []
    for train_rows, test_rows in splitter.split(X=x, y=y):
        # Refit on this fold's training rows, then score the held-out rows.
        classifier.fit(x[train_rows], y[train_rows])
        predicted = classifier.predict(x[test_rows])
        fold_scores.append(metrics.accuracy_score(y[test_rows], predicted))

    return -1.0 * np.mean(fold_scores)

# Search space: quantized values cast to int for the tree/ensemble sizes,
# uniform draws for the continuous hyperparameters.
param_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 3, 20, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 8, 1)),
    "n_estimators": scope.int(hp.quniform("n_estimators", 100, 1500, 1)),
    "learning_rate": hp.uniform("learning_rate", 0.01, 1),
    "reg_lambda": hp.uniform("reg_lambda", 0.01, 1),
    "gamma": hp.uniform("gamma", 0.01, 1),
    "subsample": hp.uniform("subsample", 0.01, 1),
}

# Bind the dataset so fmin only has to supply the sampled parameters.
optimization_fuction = partial(optimize, x=X, y=y)

# Trials object handed to fmin to record the evaluations.
trials = Trials()

# Minimize the objective with the TPE algorithm for 15 evaluations.
result = fmin(
    fn=optimization_fuction,
    space=param_space,
    algo=tpe.suggest,
    max_evals=15,
    trials=trials,
)

# Print the best hyperparameter set found.
print(result)

# 5. Optuna

- Optuna는 ML 알고리즘의 하이퍼파라미터 튜닝을 자동화해주는 오픈소스 툴입니다. 유사한 툴로 Hyperopt가 있지만 사용성과 문서화, 시각화 제공 여부 등에서 Optuna의 손을 들어주는 경우가 많음.

- 하이퍼파라미터 튜닝에 쓰고 있는 최신 Automl 기법입니다.
- 빠르게 튜닝이 가능하다는 장점이 있음.
- 하이퍼파라미터 튜닝 방식을 지정할수 있다. -> 직관적인 api인 튜닝된 lightgbm도 제공해줍니다.

- 다른 라이브러리들에 비해 직관적인 장점이 있어 코딩하기 용이합니다.


In [None]:
# Install Optuna, which the following cells import.
!pip install optuna

In [None]:
#import packages
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

# Split it into the feature matrix (X) and the target labels (y).
X, y = iris.data, iris.target

In [None]:
# Print the installed Optuna version.
import optuna
print(optuna.__version__)

In [None]:
#importing packages

import optuna

from functools import partial

# Objective for Optuna: 5-fold stratified cross-validation of an
# XGBClassifier built from the hyperparameters suggested by the trial.
def optimize(trial, x, y):
    """Return the negated mean cross-validated accuracy for one trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x: Feature array, indexed by row with the fold indices.
        y: Target array, indexed the same way as ``x``.

    Returns:
        ``-1.0 * mean(fold accuracies)``; pair it with a study created
        with ``direction='minimize'``.
    """
    # The parameter set is declared inside the objective.
    # suggest_float replaces the deprecated suggest_uniform.
    reg_lambda = trial.suggest_float('reg_lambda', 0.01, 1)
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    # XGBoost has no `max_features` parameter, so tuning it had no effect;
    # its per-tree feature subsampling ratio is `colsample_bytree`.
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.01, 1)

    clf = XGBClassifier(
        n_estimators=n_estimators,
        reg_lambda=reg_lambda,
        max_depth=max_depth,
        colsample_bytree=colsample_bytree,
    )

    kf = model_selection.StratifiedKFold(n_splits=5)

    accuracies = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        # Refit on this fold's training rows, then score the held-out rows.
        clf.fit(x[train_idx], y[train_idx])
        preds = clf.predict(x[test_idx])
        accuracies.append(metrics.accuracy_score(y[test_idx], preds))

    return -1.0 * np.mean(accuracies)

# Bind the dataset so Optuna only has to supply the trial object.
optimization_fuction = partial(optimize, x=X, y=y)

# The objective returns negated accuracy, so the study minimizes it.
study = optuna.create_study(direction='minimize')

# Run 15 trials.
study.optimize(optimization_fuction, n_trials=15)

# Print the best hyperparameter set and its objective value
# (the negated mean accuracy).
print(study.best_params)
print(study.best_value)

## Create an objective function

In [None]:
# In Optuna, a Trial is a single execution of the objective function and a
# Study is an optimization session made up of a set of trials.
# In this demo, "alpha" is the hyperparameter to be optimized.
def objective(trial):
    """Return the validation mean squared error of a Lasso model.

    Args:
        trial: Optuna trial used to sample ``alpha``.

    Returns:
        Mean squared error on the validation split; lower is better.
    """
    # Imported locally so the cell runs on its own: the notebook imports
    # these names only in a later cell.
    import sklearn.linear_model
    import sklearn.metrics
    from sklearn.model_selection import train_test_split

    # Sample alpha from [0.0, 2.0]; suggest_float replaces the deprecated
    # suggest_uniform.
    alpha = trial.suggest_float('alpha', 0.0, 2.0)

    # Data loading and train/validation split (fixed seed).
    X, y = load_iris(return_X_y=True)
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

    # Model training and evaluation.
    model = sklearn.linear_model.Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    error = sklearn.metrics.mean_squared_error(y_val, y_pred)

    # Output: evaluation score.
    return error

## Create a study for the ML model and optimize it

In [None]:
# In Optuna, the study object manages the optimization.
# optuna.create_study returns a study object, which is later used to
# analyze the optimization outcome.
study = optuna.create_study(direction='minimize') # 'minimize' for minimization, 'maximize' for maximization.
# Start the optimization by passing the objective function to study.optimize;
# 50 trials are run.
study.optimize(objective, n_trials=50)

## Visualize the above hyperparameter optimization study

In [None]:
#importing all the plot functions
from optuna.visualization import plot_edf
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

In [None]:
# Visualize the optimization history of the study
# (see optuna.visualization.plot_optimization_history for details).
plot_optimization_history(study)

In [None]:
# Visualize high-dimensional parameter relationships
# (see optuna.visualization.plot_parallel_coordinate for details).
plot_parallel_coordinate(study)

In [None]:
# Visualize individual hyperparameters as a slice plot
# (see optuna.visualization.plot_slice for details).
plot_slice(study)

In [None]:
# Visualize parameter importances
# (see optuna.visualization.plot_param_importances for details).
# In this case, the study has only one parameter.
plot_param_importances(study)

In [None]:
# Visualize the empirical distribution function of the study
# (see optuna.visualization.plot_edf for details).
plot_edf(study)

In [None]:
import optuna
import sklearn
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

def objective(trial):
    """Return the mean 4-fold accuracy of the classifier chosen by the trial.

    The trial first picks a classifier family ('SVC' or 'XGBoost') and then
    samples the hyperparameter that belongs to that family.

    Args:
        trial: Optuna trial used to sample the classifier and its parameter.

    Returns:
        Mean cross-validated accuracy on iris; higher is better.
    """
    # Import the submodules explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.svm` / `sklearn.datasets` are loaded.
    import sklearn.datasets
    import sklearn.svm

    iris = sklearn.datasets.load_iris()
    x, y = iris.data, iris.target

    # The second choice builds an XGBClassifier, so it is labelled 'XGBoost'
    # (it was previously mislabelled 'RandomForest').
    classifier_name = trial.suggest_categorical('classifier', ['SVC', 'XGBoost'])
    if classifier_name == 'SVC':
        # Log-scale search over C; replaces the deprecated suggest_loguniform.
        svc_c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        classifier_obj = sklearn.svm.SVC(C=svc_c, gamma='auto')
    else:
        # Sample the integer depth directly on a log scale instead of
        # truncating a log-uniform float.
        xgb_max_depth = trial.suggest_int('xgb_max_depth', 1, 32, log=True)
        classifier_obj = XGBClassifier(max_depth=xgb_max_depth, n_estimators=10)

    accuracy = cross_val_score(classifier_obj, x, y, cv=4).mean()
    return accuracy

# The objective returns accuracy, so the study maximizes it.
study = optuna.create_study(direction='maximize')


# Run 200 trials of the objective.
study.optimize(objective, n_trials=200)

In [None]:
# Fetch the best trial, then report its parameters and its objective value.
best_trial = study.best_trial
print(best_trial.params)

optuna_acc = best_trial.value
print(optuna_acc)

## Visualization

In [None]:
# A notebook cell only renders the value of its last expression, so the
# first figure would be silently dropped; show each figure explicitly.
optuna.visualization.plot_param_importances(study).show()

optuna.visualization.plot_optimization_history(study).show()

In [None]:
#importing all the required packages
import optuna
from sklearn.datasets import load_iris
import sklearn.linear_model
import sklearn.metrics
from sklearn.model_selection import train_test_split

## Import all the required libraries

# 6.Pycaret

- AutoML을 하게 해주는 파이썬 라이브러리
- scikit-learn 패키지를 기반으로 하고 있으며 Classification, Regression, Clustering, Anomaly Detection 등 다양한 모델을 지원함.

- 공식문서에 설명이 매우 잘 되어 있고, 몇 줄의 코드로 쉽게 구현이 가능하기 때문에 유용하게 사용할 수 있음.

In [None]:
#import packages
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

# Split it into the feature matrix (X) and the target labels (y).
X, y = iris.data, iris.target

In [None]:
# Install Optuna, then print the installed version.
!pip install optuna
import optuna
print(optuna.__version__)

In [None]:
#importing packages

import optuna

from functools import partial

# Objective for Optuna: 5-fold stratified cross-validation of an
# XGBClassifier built from the hyperparameters suggested by the trial.
def optimize(trial, x, y):
    """Return the negated mean cross-validated accuracy for one trial.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x: Feature array, indexed by row with the fold indices.
        y: Target array, indexed the same way as ``x``.

    Returns:
        ``-1.0 * mean(fold accuracies)``; pair it with a study created
        with ``direction='minimize'``.
    """
    # The parameter set is declared inside the objective.
    # suggest_float replaces the deprecated suggest_uniform.
    reg_lambda = trial.suggest_float('reg_lambda', 0.01, 1)
    n_estimators = trial.suggest_int('n_estimators', 100, 1500)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    # XGBoost has no `max_features` parameter, so tuning it had no effect;
    # its per-tree feature subsampling ratio is `colsample_bytree`.
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.01, 1)

    clf = XGBClassifier(
        n_estimators=n_estimators,
        reg_lambda=reg_lambda,
        max_depth=max_depth,
        colsample_bytree=colsample_bytree,
    )

    kf = model_selection.StratifiedKFold(n_splits=5)

    accuracies = []
    for train_idx, test_idx in kf.split(X=x, y=y):
        # Refit on this fold's training rows, then score the held-out rows.
        clf.fit(x[train_idx], y[train_idx])
        preds = clf.predict(x[test_idx])
        accuracies.append(metrics.accuracy_score(y[test_idx], preds))

    return -1.0 * np.mean(accuracies)

# Bind the dataset so Optuna only has to supply the trial object.
optimization_fuction = partial(optimize, x=X, y=y)

# The objective returns negated accuracy, so the study minimizes it.
study = optuna.create_study(direction='minimize')

# Run 15 trials.
study.optimize(optimization_fuction, n_trials=15)

# Print the best hyperparameter set and its objective value
# (the negated mean accuracy).
print(study.best_params)
print(study.best_value)

1) 설치

In [None]:
# Install PyCaret and SHAP.
!pip install pycaret
!pip install shap # needed when using interpret_model

2) Data load

In [None]:
# Load the 'diabetes' sample dataset through PyCaret's dataset helper.
from pycaret.datasets import get_data
diabetes = get_data('diabetes')

3) Set up
- 학습 데이터가 무엇인지, 목표 클래스는 무엇인지 설정

- 엔터를 한 번 꼭 누를 것!


### Numba
 - Python 및 NumPy 코드의 하위 집합을 빠른 기계 코드로 변환하는 오픈소스 JIT 컴파일러.

 - Just In Time 컴파일러를 사용해 파이썬 코드 내에서 일반 코드 및 Numpy를 아주 빠른 속도로 처리 가능한 기능을 제공.

In [None]:
# Upgrade Numba to the latest available version.
!pip install numba --upgrade

In [None]:
# Set up the PyCaret classification experiment on the diabetes data, with
# the 'Class variable' column as the target. The star import brings in the
# functions the following cells call (compare_models, create_model, ...).
from pycaret.classification import *
data = setup(diabetes, target = 'Class variable')

4) 모델 비교

In [None]:
# Compare the available models using PyCaret's default settings.
compare_models()

5) 모델 생성

In [None]:
# Create a model from the 'lr' estimator ID
# (presumably logistic regression — confirm against the PyCaret model list).
lr = create_model('lr')

6) 모델 튜닝

In [None]:
# Tune the lr model with tune_model's default settings.
tuned_lr = tune_model(lr)

7) Bagging

In [None]:
# Build a bagging ensemble from the lr model.
bagged_lr = ensemble_model(lr, method = 'Bagging')

8) Stacking

In [None]:
# Create the two base models for stacking from the 'lda' and 'rf' IDs.
lda = create_model('lda')
rf = create_model('rf')
# Stack them, with the previously created lr model as the meta model.
stacker = stack_models(estimator_list = [lda, rf], meta_model = lr)

9) Plot

In [None]:
# Plot the lr model using plot_model's default plot type.
plot_model(lr)

10) 모델 저장

In [None]:
# Save the lr model under the name 'lr_saved'.
save_model(lr, 'lr_saved')

11) Model load

In [None]:
# Load the model that was saved as 'lr_saved'.
lr_saved = load_model('lr_saved')