# SUBMIT

## 패키지 및 데이터 불러오기

### 패키지 불러오기

In [90]:
# Data Handling
import pandas as pd
import numpy as np

# imputation
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# visualization
import matplotlib.pyplot as plt

# catboost classifier package
from catboost import CatBoostClassifier

# ensemble model
from sklearn.ensemble import VotingClassifier

### 데이터 불러오기

In [91]:
# Load the train / test data and the sample submission file
train = pd.read_csv("C:/bad_web/data/train_dataset.csv")
test = pd.read_csv("C:/bad_web/data/test_dataset_v01.csv")
submission = pd.read_csv("C:/bad_web/data/sample_submission.csv")

# Remove the unneeded 'Unnamed: 0' column from both frames
train = train.drop(columns='Unnamed: 0')
test = test.drop(columns='Unnamed: 0')

# Encode the target: 'malicious' becomes -1, every other label becomes 1
train['Result_v1'] = train['Result_v1'].map(
    lambda label: -1 if label == 'malicious' else 1
)

## 데이터 셋 모델링

In [94]:
# 데이터 셋 모델링
def _impute_missing(features):
    """Return *features* with missing values filled by an IterativeImputer.

    The imputer uses a RandomForestRegressor as its estimator and is fitted
    on *features* itself. Column labels and the index of *features* are kept
    on the returned frame.
    """
    imputer = IterativeImputer(
        estimator=RandomForestRegressor(verbose=0, random_state=42),
        max_iter=10,
        verbose=0,
        imputation_order='ascending',
        random_state=0,
    )
    return pd.DataFrame(
        imputer.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )


def select_dataset(data, config, train):
    """Build a modelling frame from *data* according to *config*.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a re-indexed copy is processed.
    config : dict
        Boolean switches. All keys listed here are required.

        Column switches -- the column(s) are REMOVED when the value is
        falsy and kept otherwise (note this also holds for the ``*_off``
        keys, whose name suggests the opposite):

        * ``one_column_on``     -> 'url_chinese_present',
          "html_num_tags('applet')"
        * ``url_len_off``       -> 'url_len'
        * ``url_entropy_off``   -> 'url_entropy'
        * ``url_port_off``      -> 'url_port'
        * ``iframe_off``        -> "html_num_tags('iframe')"
        * ``head_off``          -> "html_num_tags('head')"
        * ``body_off``          -> "html_num_tags('body')"
        * ``one_distribute_on`` -> 'url_query_len', 'url_num_query_para'

        Row / value switches:

        * ``drop_duplicate`` -- drop duplicated rows and renumber the index.
        * ``missing_imput``  -- impute missing values with an
          IterativeImputer.
    train : bool
        True when *data* contains the target column 'Result_v1'; the target
        is then excluded from imputation and appended back unchanged.
        False when *data* holds features only.

    Returns
    -------
    pandas.DataFrame
        The selected (and optionally de-duplicated / imputed) frame.

    Raises
    ------
    KeyError
        If a required key is missing from *config*.
    ValueError
        If a column scheduled for removal is not present in *data*.

    Notes
    -----
    With ``train=False`` a separate imputer is fitted on *data* itself, so
    train and test frames are imputed by independently fitted models.
    """
    # Work on a re-indexed copy so the caller's frame is left untouched.
    data = data.reset_index(drop=True)

    one_columns = ['url_chinese_present', "html_num_tags('applet')"]
    one_distribute = ['url_query_len', 'url_num_query_para']

    # (config key, columns removed when that key is falsy), in evaluation order.
    removal_rules = [
        ('one_column_on', one_columns),
        ('url_len_off', ['url_len']),
        ('url_entropy_off', ['url_entropy']),
        ('url_port_off', ['url_port']),
        ('iframe_off', ["html_num_tags('iframe')"]),
        ('head_off', ["html_num_tags('head')"]),
        ('body_off', ["html_num_tags('body')"]),
        ('one_distribute_on', one_distribute),
    ]

    all_columns = list(data.columns)
    for key, columns in removal_rules:
        if not config[key]:
            for column in columns:
                # list.remove raises ValueError on an unknown column, which
                # surfaces a mismatch between config and data early.
                all_columns.remove(column)

    data = data[all_columns]

    if config['drop_duplicate']:
        # Non in-place calls avoid mutating a frame derived by selection.
        data = data.drop_duplicates().reset_index(drop=True)

    if config['missing_imput']:
        if train:
            # Keep the target out of the imputation and re-attach it as is.
            target = data["Result_v1"]
            imputed = _impute_missing(data.drop(columns=["Result_v1"]))
            data = pd.concat([imputed, target], axis=1)
        else:
            data = _impute_missing(data)

    return data

In [95]:
# Dataset configuration consumed by select_dataset.
# For the column switches a False value removes the column(s); True keeps them.
config = {
    'one_column_on': True,
    'url_len_off': True,
    'url_entropy_off': True,
    'url_port_off': True,
    'iframe_off': True,
    'head_off': True,
    'body_off': True,
    'one_distribute_on': True,
    'drop_duplicate': False,   # keep duplicated rows
    'missing_imput': True,     # impute missing values
}

In [None]:
# Build the modelling frames; only the train frame carries the target column
train = select_dataset(train, config, train=True)
test = select_dataset(test, config, train=False)

# Split the train frame into target and features
y = train['Result_v1']
X = train.drop(columns='Result_v1')

## 베이지안 옵티마이저 결과를 이용한 모델 학습 및 예측

In [98]:
# 베이지안 옵티마이저 결과
# |   iter    |  target   | baggin... |   depth   | learni... | subsample |
# -------------------------------------------------------------------------
# |  1        |  0.9686   |  4.753    |  11.8     |  0.01011  |  0.3093   |
# |  2        |  0.9611   |  2.321    |  2.385    |  0.1944   |  0.3521   |
# |  3        |  0.9632   |  4.571    |  9.082    |  0.425    |  0.6884   |
# |  4        |  0.968    |  2.84     |  14.17    |  0.03711  |  0.6738   |
# |  5        |  0.9658   |  4.756    |  9.38     |  0.149    |  0.2061   |
# |  6        |  0.961    |  1.069    |  3.428    |  0.7123   |  0.7671   |
# |  7        |  0.9686   |  5.675    |  14.44    |  0.4161   |  0.1487   |
# |  8        |  0.9682   |  7.914    |  12.06    |  0.02387  |  0.1831   |

In [118]:
# Define the individual models of the ensemble
# (hyper-parameters taken from rows 8 and 7 of the optimizer log, depth as an integer)
cat1 = CatBoostClassifier(
    bagging_temperature=7.914,
    depth=12,
    learning_rate=0.02387,
    subsample=0.1831,
    random_state=1339,
    verbose=0,
    eval_metric='F1',
)
cat2 = CatBoostClassifier(
    bagging_temperature=5.675,
    depth=14,
    learning_rate=0.4161,
    subsample=0.1487,
    random_state=1339,
    verbose=0,
    eval_metric='F1',
)

# Define the soft-voting ensemble over both models
eclf = VotingClassifier(
    estimators=[('cat1', cat1), ('cat2', cat2)],
    voting='soft',
)

# Fit the ensemble on the full training set
eclf.fit(X, y)

# Predict the test set and write the result into the submission template
predict = eclf.predict(test)
submit = pd.read_csv("C:/bad_web/data/sample_submission.csv")
submit['expected'] = predict
submit.to_csv("C:/bad_web/cat_esb_best2.csv", index=False)