# QSO VS GALAXY



In [1]:
%matplotlib inline
# from sklearn.ensemble import RandomForestClassifier
import optuna
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG once, up front. The split and the CV folds further
# down pass their own seeds explicitly (random_state=8888 / seed=8888).
np.random.seed(0)
from astropy.table import Table
from astropy.io import fits

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    f1_score, 
    accuracy_score, 
    precision_score,
    recall_score,
    matthews_corrcoef,
    )


# def make_binary_classification_target(y, pos_label, verbose=False):
#     '''Turn multi-class targets into binary classification targets.'''
#     pos_idx = (y==pos_label)
#     y[pos_idx] = 1
#     y[~pos_idx] = 0
#     if verbose:
#         print ('Positive target:\t{}'.format(pos_label))
#         print ('Imbalance ratio:\t{:.3f}'.format((y==0).sum()/(y==1).sum()))
#     y = y.astype(int)
#     return y

In [2]:
from sklearn import preprocessing
# Encoder for the string class labels; it is re-created and fitted in the
# data-loading cell below.
le = preprocessing.LabelEncoder()

# Load QSO&GALAXY binary data

In [10]:
from sklearn import preprocessing

# Encoder that maps the two class names to integer targets.
# NOTE(review): LabelEncoder orders classes alphabetically, so presumably
# GALAXY -> 0 and QSO -> 1; confirm with `le.classes_`.
le = preprocessing.LabelEncoder()
le.fit(['QSO', 'GALAXY'])

# Training table for the QSO-vs-GALAXY binary problem.
df = pd.read_csv('trainingset_binary2_test6.csv')

# Columns of `df` that are fed to the classifier.
features = [
    'iz', 'zy', 'yj', 'jh', 'hk',
    'iw1', 'zw1', 'yw1', 'jw1', 'hw1', 'kw1',
    'w1w2',
]

In [None]:
# Same as test5, so there is no need to retrain.

In [11]:
# Feature matrix: the columns listed in `features`, as a NumPy array.
X = df[features].values
# Integer-encoded targets taken from the 'class' column via the fitted encoder.
y = le.transform(df['class'])

In [12]:
# Hold out 20% of the rows as a test split; random_state pins the shuffle so
# the same split is obtained on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=8888)

In [13]:
# Sanity check: shape of the training feature matrix.
X_train.shape

(186057, 12)

In [14]:
# Sanity check: shape of the training label vector.
y_train.shape

(186057,)

In [15]:
# Class balance of the training labels: shape of the encoded-0 subset, then
# of the encoded-1 subset.
y_train[y_train == 0].shape, y_train[y_train == 1].shape

((89336,), (96721,))

In [16]:
# Sanity check: shape of the held-out label vector.
y_test.shape

(46515,)

# Train model using XGBoost CV

In [10]:
def objective(trial):
    """Score one XGBoost hyper-parameter configuration by 5-fold CV log-loss.

    Draws a configuration from ``trial``, cross-validates it with ``xgb.cv``
    on the notebook-level training split (``X_train`` / ``y_train``) and
    returns the mean validation log-loss from the last row of the CV history.
    The hold-out split (``X_test`` / ``y_test``) is deliberately not used
    here, so it stays unseen during tuning.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters; it is also handed to the
        pruning callback, which may stop an unpromising trial early.

    Returns
    -------
    float
        ``test-logloss-mean`` of the last recorded boosting round (lower is
        better; the study minimises it).
    """
    # Only the training data is needed: candidates are compared by
    # cross-validation, not on the hold-out split.
    dtrain = xgb.DMatrix(X_train, label=y_train)

    param = {
        'verbosity': 1,
        # Two-class problem, so use the binary objective, not a multi:* one.
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'tree_method': 'hist',
        'lambda': trial.suggest_float('lambda', 0.5, 3.0),
        'alpha': trial.suggest_float('alpha', 0, 2.0),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_categorical('eta', [0.01, 0.1, 0.2, 0.3]),  # learning_rate
        'gamma': trial.suggest_float('gamma', 0, 3.0),  # min_split_loss
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
        'max_delta_step': trial.suggest_int('max_delta_step', 1, 8),
    }

    # Reports the per-round CV log-loss to Optuna so the study's pruner can
    # abandon the trial part-way through.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'test-logloss')
    history = xgb.cv(
        param,
        dtrain,
        # Kept fixed: a larger round budget could offset a smaller eta, which
        # would blur the comparison between trials.
        num_boost_round=100,
        nfold=5,
        metrics=['logloss'],
        early_stopping_rounds=10,
        stratified=True,
        seed=8888,
        callbacks=[pruning_callback],
    )
    # Last row of the CV history. NOTE(review): per the xgboost docs, with
    # early stopping the last entry is the best iteration; confirm for the
    # installed version.
    mean_logloss = history['test-logloss-mean'].values[-1]
    return mean_logloss

In [12]:
if __name__ == '__main__':
    # Median-based pruning; no trial is pruned before its 10th reported step.
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
    # Seed the sampler so the hyper-parameter search is reproducible, like the
    # train/test split and the CV folds (both seeded with 8888). Optuna
    # documents TPE as the default single-objective sampler, so naming it
    # explicitly only adds the seed.
    sampler = optuna.samplers.TPESampler(seed=8888)
    study = optuna.create_study(sampler=sampler, pruner=pruner, direction='minimize')
    study.optimize(objective, 
                   #n_trials=50
                   n_trials=500
                  )


[32m[I 2023-05-02 22:01:12,259][0m A new study created in memory with name: no-name-f1c77b5b-0058-4deb-b115-cb0ea4b662a8[0m
[32m[I 2023-05-02 22:01:29,954][0m Trial 0 finished with value: 0.0428942 and parameters: {'lambda': 2.129022935164331, 'alpha': 0.7754099067961129, 'max_depth': 6, 'eta': 0.1, 'gamma': 0.838087354398074, 'grow_policy': 'lossguide', 'min_child_weight': 3, 'subsample': 0.9542725205803843, 'colsample_bytree': 0.8739444008169417, 'max_delta_step': 4}. Best is trial 0 with value: 0.0428942.[0m
[32m[I 2023-05-02 22:01:54,790][0m Trial 1 finished with value: 0.042928 and parameters: {'lambda': 1.5703247114048, 'alpha': 0.08856498335288654, 'max_depth': 8, 'eta': 0.2, 'gamma': 0.4903595064920122, 'grow_policy': 'lossguide', 'min_child_weight': 1, 'subsample': 0.9470349864875071, 'colsample_bytree': 0.8338554197250998, 'max_delta_step': 1}. Best is trial 0 with value: 0.0428942.[0m
[32m[I 2023-05-02 22:02:19,945][0m Trial 2 finished with value: 0.2342302 and pa

[32m[I 2023-05-02 22:05:38,602][0m Trial 45 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:05:42,659][0m Trial 46 pruned. Trial was pruned at iteration 38.[0m
[32m[I 2023-05-02 22:05:44,541][0m Trial 47 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:05:46,438][0m Trial 48 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:05:50,144][0m Trial 49 pruned. Trial was pruned at iteration 40.[0m
[32m[I 2023-05-02 22:05:56,903][0m Trial 50 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:06:02,651][0m Trial 51 finished with value: 0.0427314 and parameters: {'lambda': 2.833936642268488, 'alpha': 0.2570879605588737, 'max_depth': 9, 'eta': 0.2, 'gamma': 0.8598969311610598, 'grow_policy': 'depthwise', 'min_child_weight': 2, 'subsample': 0.8254556532706808, 'colsample_bytree': 0.9458733489411156, 'max_delta_step': 2}. Best is trial 3 with value: 0.0424108.[0m
[32m[I 2023-05-02 22:06:08,817][0m Trial 52 fini

[32m[I 2023-05-02 22:09:46,066][0m Trial 121 pruned. Trial was pruned at iteration 18.[0m
[32m[I 2023-05-02 22:09:51,407][0m Trial 122 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:10:06,287][0m Trial 123 pruned. Trial was pruned at iteration 27.[0m
[32m[I 2023-05-02 22:10:10,791][0m Trial 124 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:10:20,157][0m Trial 125 pruned. Trial was pruned at iteration 18.[0m
[32m[I 2023-05-02 22:10:24,393][0m Trial 126 pruned. Trial was pruned at iteration 33.[0m
[32m[I 2023-05-02 22:10:26,561][0m Trial 127 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:10:31,159][0m Trial 128 pruned. Trial was pruned at iteration 40.[0m
[32m[I 2023-05-02 22:10:36,309][0m Trial 129 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:10:38,073][0m Trial 130 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:10:39,872][0m Trial 131 pruned. Trial was prune

[32m[I 2023-05-02 22:15:18,863][0m Trial 203 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:25,366][0m Trial 204 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:27,063][0m Trial 205 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:29,634][0m Trial 206 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:31,304][0m Trial 207 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:33,262][0m Trial 208 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:35,252][0m Trial 209 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:48,312][0m Trial 210 pruned. Trial was pruned at iteration 23.[0m
[32m[I 2023-05-02 22:15:50,321][0m Trial 211 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:52,384][0m Trial 212 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:15:55,607][0m Trial 213 pruned. Trial was prune

[32m[I 2023-05-02 22:21:05,729][0m Trial 281 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:07,631][0m Trial 282 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:12,089][0m Trial 283 pruned. Trial was pruned at iteration 39.[0m
[32m[I 2023-05-02 22:21:19,038][0m Trial 284 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:20,862][0m Trial 285 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:22,583][0m Trial 286 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:35,144][0m Trial 287 pruned. Trial was pruned at iteration 23.[0m
[32m[I 2023-05-02 22:21:37,166][0m Trial 288 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:39,089][0m Trial 289 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:42,988][0m Trial 290 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:21:45,728][0m Trial 291 pruned. Trial was prune

[32m[I 2023-05-02 22:26:07,153][0m Trial 363 pruned. Trial was pruned at iteration 26.[0m
[32m[I 2023-05-02 22:26:09,353][0m Trial 364 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:26:15,935][0m Trial 365 pruned. Trial was pruned at iteration 51.[0m
[32m[I 2023-05-02 22:26:18,114][0m Trial 366 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:26:20,414][0m Trial 367 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:26:23,552][0m Trial 368 pruned. Trial was pruned at iteration 20.[0m
[32m[I 2023-05-02 22:26:26,388][0m Trial 369 pruned. Trial was pruned at iteration 18.[0m
[32m[I 2023-05-02 22:26:30,491][0m Trial 370 pruned. Trial was pruned at iteration 31.[0m
[32m[I 2023-05-02 22:26:32,602][0m Trial 371 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:26:36,478][0m Trial 372 pruned. Trial was pruned at iteration 30.[0m
[32m[I 2023-05-02 22:26:38,623][0m Trial 373 pruned. Trial was prune

[32m[I 2023-05-02 22:31:01,397][0m Trial 438 finished with value: 0.0426028 and parameters: {'lambda': 2.9874551404626355, 'alpha': 0.31584457698547014, 'max_depth': 9, 'eta': 0.2, 'gamma': 1.1739368294449886, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.8187497841774649, 'colsample_bytree': 0.9371791403641716, 'max_delta_step': 4}. Best is trial 3 with value: 0.0424108.[0m
[32m[I 2023-05-02 22:31:05,415][0m Trial 439 pruned. Trial was pruned at iteration 37.[0m
[32m[I 2023-05-02 22:31:09,243][0m Trial 440 pruned. Trial was pruned at iteration 32.[0m
[32m[I 2023-05-02 22:31:13,197][0m Trial 441 pruned. Trial was pruned at iteration 35.[0m
[32m[I 2023-05-02 22:31:17,202][0m Trial 442 pruned. Trial was pruned at iteration 34.[0m
[32m[I 2023-05-02 22:31:19,739][0m Trial 443 pruned. Trial was pruned at iteration 16.[0m
[32m[I 2023-05-02 22:31:21,789][0m Trial 444 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:31:25,414][0m Tria

[32m[I 2023-05-02 22:34:15,032][0m Trial 490 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:34:17,287][0m Trial 491 pruned. Trial was pruned at iteration 12.[0m
[32m[I 2023-05-02 22:34:19,344][0m Trial 492 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:34:24,900][0m Trial 493 pruned. Trial was pruned at iteration 36.[0m
[32m[I 2023-05-02 22:34:27,224][0m Trial 494 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:34:29,340][0m Trial 495 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:34:31,519][0m Trial 496 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2023-05-02 22:34:37,901][0m Trial 497 finished with value: 0.04254 and parameters: {'lambda': 2.8099764439521437, 'alpha': 0.3900092903899828, 'max_depth': 9, 'eta': 0.2, 'gamma': 1.1567621970055249, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.8085057720887568, 'colsample_bytree': 0.9388749550964745, 'max_delta_step'

In [13]:
# Summary of the search. len(study.trials) counts every trial in the study,
# whatever its final state.
print(f'Number of finished trials: {len(study.trials)}')
print('Best trial:')
trial = study.best_trial

# Objective value (mean CV log-loss) reached by the best trial.
print(f'  Value: {trial.value}')

# Hyper-parameters of the best trial, one per line.
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

Number of finished trials: 500
Best trial:
  Value: 0.0424108
  Params: 
    lambda: 1.2413359401393276
    alpha: 0.16326090314688657
    max_depth: 8
    eta: 0.1
    gamma: 1.1868232671204701
    grow_policy: depthwise
    min_child_weight: 4
    subsample: 0.8780950055491423
    colsample_bytree: 0.9241363413993084
    max_delta_step: 3


#### n_trials=50

Params: 
    lambda: 2.652579588756935
    alpha: 1.4208405720586856
    max_depth: 8
    eta: 0.3
    gamma: 1.0809389255690658
    grow_policy: lossguide
    min_child_weight: 2
    subsample: 0.9033901196294835
    colsample_bytree: 0.8010648180780473
    max_delta_step: 8
   
   
#### n_trials=500
Params: 
    lambda: 0.8545938248212175
    alpha: 0.4512391408998466
    max_depth: 8
    eta: 0.1
    gamma: 0.2038341012648358
    grow_policy: depthwise
    min_child_weight: 1
    subsample: 0.9094012577087329
    colsample_bytree: 0.8079200202937279
    max_delta_step: 2
    
    
#### n_trials=500, test3
Params: 
    lambda: 2.013028734650969
    alpha: 0.7436089160412054
    max_depth: 7
    eta: 0.1
    gamma: 0.1781704703152439
    grow_policy: depthwise
    min_child_weight: 1
    subsample: 0.820997112976606
    colsample_bytree: 0.9745950996848067
    max_delta_step: 3
    
    
#### n_trials=500, test4
Params: 
    lambda: 1.4203472995540372
    alpha: 1.6997113468066785
    max_depth: 9
    eta: 0.2
    gamma: 0.79634077651323
    grow_policy: depthwise
    min_child_weight: 3
    subsample: 0.908673742111451
    colsample_bytree: 0.9750229974864385
    max_delta_step: 5

#### n_trials=500 test5
Params: 
    lambda: 1.2413359401393276
    alpha: 0.16326090314688657
    max_depth: 8
    eta: 0.1
    gamma: 1.1868232671204701
    grow_policy: depthwise
    min_child_weight: 4
    subsample: 0.8780950055491423
    colsample_bytree: 0.9241363413993084
    max_delta_step: 3

In [14]:
# Tabular view of all trials: value, timings, sampled parameters and state.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_eta,params_gamma,params_grow_policy,params_lambda,params_max_delta_step,params_max_depth,params_min_child_weight,params_subsample,state
0,0,0.042894,2023-05-02 22:01:12.268272,2023-05-02 22:01:29.953150,0 days 00:00:17.684878,0.775410,0.873944,0.10,0.838087,lossguide,2.129023,4,6,3,0.954273,COMPLETE
1,1,0.042928,2023-05-02 22:01:29.965689,2023-05-02 22:01:54.789103,0 days 00:00:24.823414,0.088565,0.833855,0.20,0.490360,lossguide,1.570325,1,8,1,0.947035,COMPLETE
2,2,0.234230,2023-05-02 22:01:54.801161,2023-05-02 22:02:19.943776,0 days 00:00:25.142615,1.248677,0.902500,0.01,0.258070,lossguide,1.395564,3,7,5,0.977190,COMPLETE
3,3,0.042411,2023-05-02 22:02:19.953053,2023-05-02 22:02:27.667210,0 days 00:00:07.714157,0.163261,0.924136,0.10,1.186823,depthwise,1.241336,3,8,4,0.878095,COMPLETE
4,4,0.043318,2023-05-02 22:02:27.678245,2023-05-02 22:02:37.806582,0 days 00:00:10.128337,0.158091,0.890529,0.30,2.885271,lossguide,1.528537,6,8,4,0.977175,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,0.090542,2023-05-02 22:34:27.230714,2023-05-02 22:34:29.340239,0 days 00:00:02.109525,0.434947,0.938860,0.20,1.183211,depthwise,2.862477,4,9,1,0.824971,PRUNED
496,496,0.090535,2023-05-02 22:34:29.346444,2023-05-02 22:34:31.519644,0 days 00:00:02.173200,0.329799,0.934419,0.20,1.248255,depthwise,2.904723,4,9,1,0.805107,PRUNED
497,497,0.042540,2023-05-02 22:34:31.525711,2023-05-02 22:34:37.900280,0 days 00:00:06.374569,0.390009,0.938875,0.20,1.156762,depthwise,2.809976,4,9,1,0.808506,COMPLETE
498,498,0.090537,2023-05-02 22:34:37.915485,2023-05-02 22:34:40.046048,0 days 00:00:02.130563,0.447858,0.937877,0.20,1.147958,depthwise,2.830302,4,9,1,0.807937,PRUNED


In [15]:
# Full record of the best trial, including the search distributions used.
study.best_trial

FrozenTrial(number=3, values=[0.0424108], datetime_start=datetime.datetime(2023, 5, 2, 22, 2, 19, 953053), datetime_complete=datetime.datetime(2023, 5, 2, 22, 2, 27, 667210), params={'lambda': 1.2413359401393276, 'alpha': 0.16326090314688657, 'max_depth': 8, 'eta': 0.1, 'gamma': 1.1868232671204701, 'grow_policy': 'depthwise', 'min_child_weight': 4, 'subsample': 0.8780950055491423, 'colsample_bytree': 0.9241363413993084, 'max_delta_step': 3}, distributions={'lambda': UniformDistribution(high=3.0, low=0.5), 'alpha': UniformDistribution(high=2.0, low=0.0), 'max_depth': IntUniformDistribution(high=9, low=3, step=1), 'eta': CategoricalDistribution(choices=(0.01, 0.1, 0.2, 0.3)), 'gamma': UniformDistribution(high=3.0, low=0.0), 'grow_policy': CategoricalDistribution(choices=('depthwise', 'lossguide')), 'min_child_weight': IntUniformDistribution(high=5, low=1, step=1), 'subsample': UniformDistribution(high=1.0, low=0.8), 'colsample_bytree': UniformDistribution(high=1.0, low=0.8), 'max_delta_s

In [16]:
# Persist the complete trial history so the search can be inspected later
# without re-running the 500 trials.
opdf = study.trials_dataframe()
from pathlib import Path
# Create the output directory if it is missing; no error if it already exists.
Path("param_tune_records").mkdir(exist_ok=True)
# NOTE(review): pickle files should only be loaded back from trusted sources.
opdf.to_pickle("param_tune_records/optuna_xgb_qso_vs_gal_cv-logloss-500trials-5.pkl")