In [8]:
# HashingVetorizer와 TF-IDF를 사용한 자연어 처리
# 텍스트를 숫자로 정보로 변환을 위한 처리 기술
# 토큰 : 텍스트를 처리하는 단위(Unit)
# 토큰을 단어Word 문장Sentence 문자Character로 지정가능하다
# CountVectorizer : 텍스트 -> 텍스트 토큰(Textual token)
# HashVectorzier : CountVectorizer 함수의 변형 (더 확작성 좋다)
# 해석상 Interpretability
# 해시 충돌 Hashing Collision
# 문서의 단어의 수를 세는 것만으로는 오도Misleading 을 발생할 수 있다
# Stop word는 빈도수가 높지만 정보를 가지고 있지 않기 때문이다.
# Stop word : the, a 같은 단어
# 이런 단어를 제거하기 위해서 토큰에 가중치 weight 를 부여 Stop word 단어는 빈도를 상쇄:
# 단어 빈도와 역문서 빈도 : TF-IDF , Term-Frequency, Inverse-Document-Frequency
# 이벤트 로그나 대화 내용 등등 데이터 활용 가능

In [9]:
with open('./data/anonops_short.txt', encoding='utf8') as f:
    anonops_chat_logs = f.readlines()

In [10]:
# TV 채널의 대화 로그 파일
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

my_vector = HashingVectorizer(input='content', ngram_range=(1,2))
X_train_counts = my_vector.fit_transform(anonops_chat_logs,)
tf_transformer = TfidfTransformer(use_idf=True,).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf

<180830x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 3158166 stored elements in Compressed Sparse Row format>

In [11]:
print(X_train_tf)

  (0, 938273)	0.10023429482560929
  (0, 871172)	-0.33044470291777067
  (0, 755834)	-0.2806123960092745
  (0, 556974)	-0.2171490773135763
  (0, 548264)	-0.09851435603064428
  (0, 531189)	-0.2566310842337745
  (0, 522961)	-0.3119912982467716
  (0, 514190)	-0.2527659565181208
  (0, 501800)	-0.33044470291777067
  (0, 499727)	-0.18952297847436425
  (0, 488876)	0.13502094828386488
  (0, 377854)	0.22710724511856722
  (0, 334594)	-0.25581186158424035
  (0, 256577)	0.20949022238574433
  (0, 197273)	-0.30119674850360456
  (0, 114899)	0.09713499033205285
  (0, 28523)	-0.3060506288368513
  (1, 960098)	0.09780838928665199
  (1, 955748)	-0.2747271490090429
  (1, 952302)	0.26070217969901804
  (1, 938273)	0.12095603891963835
  (1, 937092)	-0.2947114257264502
  (1, 927866)	0.21727726371674563
  (1, 820768)	-0.11065660403137358
  (1, 772066)	-0.14344517367198276
  :	:
  (180828, 329790)	0.06808618130417012
  (180828, 312887)	-0.08249409552977467
  (180828, 209871)	0.17685927011939476
  (180828, 193711)	

In [12]:
# 초매개변수 조정
# 격자 검색 grid search
# 전수 공격 접근 방식 : brute-force
# 베이즈 최적화 Bayesian optimization

In [13]:
from sklearn import datasets

wine_dataset = datasets.load_wine()
X = wine_dataset.data
y = wine_dataset.target
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [14]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [15]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV

n_iterations = 50
estimator = xgb.XGBClassifier(
    n_jobs = -1,
    objective = 'multi:softmax',
    eval_metric = 'merror',
    verbosity = 0,
    num_class = len(set(y))
)

In [16]:
search_space = {
    "learning_rate": (0.01, 1.0),
    "min_child_weight": (0, 10),
    "max_depth": (1, 50),
    "max_delta_step": (0, 10),
    "subsample": (0.01, 1.0),
    "colsample_bytree": (0.01, 1.0),
    "colsample_bylevel": (0.01, 1.0),
    "reg_lambda": (1e-9, 1000),
    "reg_alpha": (1e-9, 1.0),
    "gamma": (1e-9, 0.5),
    "min_child_weight": (0, 5),
    "n_estimators": (5, 5000),
    "scale_pos_weight": (1e-6, 500),
}

In [17]:
# 교차검증 cross-validation 유형 지정
cv = StratifiedKFold(n_splits=3, shuffle=True)

In [18]:
bayes_cv_tuner = BayesSearchCV(
    estimator = estimator,
    search_spaces = search_space,
    scoring = 'accuracy',
    cv = cv,
    n_jobs = -1,
    n_iter = n_iterations,
    verbose = 0,
    refit = True
)

In [28]:
import pandas as pd
import numpy as np

# 콜백함수 callback function
# 진행 상황을 출력

def print_status(optimal_result):
    model_tested = pd.DataFrame(bayes_cv_tuner.cv_results_)
    best_parameters_so_far = pd.Series(bayes_cv_tuner.best_params_)
    print(f'Model #{len(model_tested):03}')
    print(f'Best accuracy: {np.round(bayes_cv_tuner.best_score_, 3)}')
    print(f'Best parameters: {bayes_cv_tuner.best_params_}\n')

    clf_type = bayes_cv_tuner.estimator.__class__.__name__
    print(clf_type) # 원래는 파일 생성

In [30]:
# 파일이 깨진듯
bayes_cv_tuner.fit(X, y)



BayesSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
              estimator=XGBClassifier(base_score=None, booster=None,
                                      colsample_bylevel=None,
                                      colsample_bynode=None,
                                      colsample_bytree=None,
                                      enable_categorical=False,
                                      eval_metric='merror', gamma=None,
                                      gpu_id=None, importance_type=None,
                                      interaction_constraints=None,
                                      learning_rate=None, max_delta_step=N...
                                      validate_parameters=None, ...),
              n_jobs=-1, scoring='accuracy',
              search_spaces={'colsample_bylevel': (0.01, 1.0),
                             'colsample_bytree': (0.01, 1.0),
                             'gamma': (1e-09, 0.5),
                     

In [31]:
bayes_cv_tuner.cv_results_

{'mean_fit_time': array([4.21828771, 4.59623861, 1.34322611, 0.12021041, 2.15893618,
        3.68327022, 4.00386977, 2.97931083, 0.7329812 , 4.70753765,
        0.01262641, 0.01529082, 1.83944122, 0.66886632, 4.43755126,
        2.26539342, 0.01239951, 1.33023866, 2.89529681, 2.72121819,
        0.01130319, 5.12550481, 0.24929078, 4.44738317, 3.14833554,
        4.53815198, 0.01495949, 3.09329907, 5.46401668, 3.59396935,
        3.2769479 , 0.01312415, 3.33611472, 1.87134918, 5.73249737,
        6.24024487, 1.29704452, 3.21142499, 2.98610147, 4.22936638,
        5.61118491, 4.46032166, 4.24184243, 4.04643861, 0.01479499,
        0.43826501, 3.48437985, 0.01362809, 3.97385192, 1.04976002]),
 'std_fit_time': array([4.40629183e-02, 4.59053627e-02, 1.50435943e-01, 1.77578426e-03,
        1.48159626e-02, 2.22408915e-02, 4.60358661e-02, 3.27613504e-02,
        8.69022313e-03, 4.79803083e-02, 4.68353622e-04, 4.69122522e-04,
        6.03231894e-03, 1.56318251e-02, 3.12292850e-02, 2.72134195e-0