# Bayesian-Optimization

## 패키지 로딩 및 데이터 로딩

### 패키지 로딩

In [1]:
# data handling
import numpy as np
import pandas as pd

# imputation
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# scoring
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# use model
from catboost import CatBoostClassifier

# bayesian optimization
from bayes_opt import BayesianOptimization

# make events
from itertools import product

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

### 데이터 로딩

In [2]:
# Read the training table and discard the leftover 'Unnamed: 0' column.
data = pd.read_csv('C:/bad_web/data/train_dataset.csv')
data.drop(columns=['Unnamed: 0'], inplace=True)

# Encode the target column: 'malicious' -> -1, every other value -> 1 (benign).
data['Result_v1'] = data['Result_v1'].apply(
    lambda label: -1 if label == 'malicious' else 1
)

## Config
- 데이터 셋 모델링 함수 구축
- 앞서 추출한, 성능이 가장 높게 나온 경우의 수로 데이터셋 구성

In [3]:
# Build the modelling dataset
def select_dataset(data,config) :
    """
    Build a modelling dataset from ``data`` according to ``config``.

    The input frame is never modified: every step works on a re-indexed
    copy, so the caller's index and rows stay as they were.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table. It must contain every column that the flags below ask
        to remove, and ``Result_v1`` when ``config['missing_imput']`` is True.
    config : dict
        Flags (expected to be booleans):

        * ``one_column_on`` -- when False, remove ``url_chinese_present``
          and ``html_num_tags('applet')``.
        * ``url_len_off``, ``url_entropy_off``, ``url_port_off``,
          ``iframe_off``, ``head_off``, ``body_off`` -- when False, remove
          the matching column.
          NOTE(review): removing on ``*_off == False`` reads inverted
          relative to the flag names; confirm this matches the experiment
          that produced the config.
        * ``one_distribute_on`` -- when False, remove ``url_query_len`` and
          ``url_num_query_para``.
        * ``drop_duplicate`` -- when True, drop duplicated rows.
        * ``missing_imput`` -- when True, fill missing feature values with
          ``IterativeImputer`` (random-forest estimator); otherwise drop
          every row that contains a missing value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns and a fresh ``0..n-1`` index.

    Raises
    ------
    KeyError
        If ``config`` lacks one of the flags.
    ValueError
        If a column scheduled for removal is not present in ``data``.
    """

    # Re-index a copy instead of resetting the caller's frame in place.
    data = data.reset_index(drop = True)

    # one_columns
    one_columns = ['url_chinese_present',"html_num_tags('applet')"]

    # one_distribute
    one_distribute = ['url_query_len','url_num_query_para']

    # (flag, columns removed when the flag equals False), in evaluation order
    removal_rules = [
        ('one_column_on', one_columns),
        ('url_len_off', ['url_len']),
        ('url_entropy_off', ['url_entropy']),
        ('url_port_off', ['url_port']),
        ('iframe_off', ["html_num_tags('iframe')"]),
        ('head_off', ["html_num_tags('head')"]),
        ('body_off', ["html_num_tags('body')"]),
        ('one_distribute_on', one_distribute),
    ]

    # all_columns
    all_columns = list(data.columns)
    for flag, columns in removal_rules :
        # Equality test kept on purpose so flag semantics match the original.
        if config[flag] == False :  # noqa: E712
            for column in columns :
                all_columns.remove(column)

    data = data[all_columns]

    if config['drop_duplicate'] == True :  # noqa: E712
        # Non-inplace calls avoid mutating a column-sliced frame.
        data = data.drop_duplicates().reset_index(drop = True)

    if config['missing_imput'] == True :  # noqa: E712
        X = data.drop(columns = ["Result_v1"])
        y = data["Result_v1"]
        imp = IterativeImputer(estimator = RandomForestRegressor(verbose=0, random_state = 42),
                               max_iter=10,
                               verbose = 0,
                               imputation_order= 'ascending',
                               random_state=0)
        # fit_transform output is re-wrapped with the original feature names.
        X_imp = pd.DataFrame(imp.fit_transform(X), columns = X.columns)
        data = pd.concat([X_imp,y], axis = 1)

    else :
        data = data.dropna().reset_index(drop = True)

    return data

In [5]:
# Config set to the combination of options that gave the highest score
# | iter                                                                     | f1
# | (False, False, True, False, True, True, False, True, False, True, 'NONE')| 0.965608466
config = {
    'one_column_on': False,
    'url_len_off': False,
    'url_entropy_off': True,
    'url_port_off': False,
    'iframe_off': True,
    'head_off': True,
    'body_off': False,
    'one_distribute_on': True,
    'drop_duplicate': False,
    'missing_imput': True,
}

In [6]:
# Build the dataset: apply the column selection, de-duplication and
# missing-value handling chosen in `config`, rebinding `data` to the result.
data = select_dataset(data,config)

In [7]:
# Split features and target: X is every column except 'Result_v1',
# y is the 'Result_v1' column.
X = data.drop(columns = ["Result_v1"])
y = data['Result_v1']

## 5-Fold 교차검증

### 모델 fitting 함수 만들기

In [8]:
def fit_model(X,y,X_test,i,depth,bagging_temperature,learning_rate,subsample):
    """Fit a CatBoost classifier on (X, y) and return its predictions for X_test.

    ``depth`` is truncated to an integer with ``int`` before use; the other
    hyper-parameters are forwarded unchanged. ``i`` is accepted but not used
    inside this function.
    """
    hyper_params = {
        'depth': int(depth),
        'bagging_temperature': bagging_temperature,
        'learning_rate': learning_rate,
        'random_state': 1339,
        'verbose': 0,
        'subsample': subsample,
        'eval_metric': 'F1',
    }
    classifier = CatBoostClassifier(**hyper_params)
    classifier.fit(X, y)
    return classifier.predict(X_test)

### 5-Fold 교차검증 함수 만들기

In [9]:
def CAT_cv(depth,bagging_temperature,learning_rate,subsample):
    """Return the F1 score averaged over a 5-fold split of the global X and y.

    Each fold trains a model through ``fit_model`` with the given
    hyper-parameters and scores its validation predictions with
    ``metrics.f1_score`` (default arguments).
    """
    # K-Fold CV
    n_folds = 5
    splitter = KFold(n_folds, shuffle=True, random_state=1339)

    fold_scores = []
    for fold_no, (train_index, val_index) in enumerate(splitter.split(X, y), start=1):
        train_features = np.array(X.iloc[train_index])
        train_labels = np.array(y.iloc[train_index])
        val_features = np.array(X.iloc[val_index])
        val_labels = np.array(y.iloc[val_index])

        predictions = fit_model(train_features, train_labels, val_features, fold_no,
                                depth, bagging_temperature, learning_rate, subsample)
        fold_scores.append(metrics.f1_score(val_labels, predictions))

    return sum(fold_scores)/n_folds

## Bayesian Optimization

In [10]:
# pbounds: (lower, upper) search range for each hyper-parameter

pbounds = dict(
    depth=(1, 16),
    bagging_temperature=(1, 10),
    learning_rate=(0.01, 1.0),
    subsample=(0.01, 1),
)

In [None]:
# Fit the Bayesian optimizer: maximize CAT_cv over the ranges in pbounds,
# with 5 initial points followed by 10 further iterations.
# NOTE(review): whether `maximize` accepts the `acq` / `xi` keywords depends on
# the installed bayes_opt release -- confirm against the version in use.
bo = BayesianOptimization(f = CAT_cv, pbounds = pbounds, random_state = 1,verbose = 2)
bo.maximize(init_points = 5, n_iter = 10,acq = 'ei',xi = 0.01)