In [7]:
%matplotlib inline
# from sklearn.ensemble import RandomForestClassifier
import optuna
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
from astropy.table import Table
from astropy.io import fits

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    f1_score, 
    accuracy_score, 
    precision_score,
    recall_score,
    matthews_corrcoef,
    )

# Load data

In [8]:
# Load the training table from a pickled DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory. Unpickling can execute arbitrary code, so only
# load this file from a trusted location.
df = pd.read_pickle("/store/public/databases/hzqcpa/train/train_dr16q_pswise_redshift.pkl")

In [9]:
# Sanity check: (rows, columns) of the loaded table.
df.shape

(388166, 16)

In [10]:
# Preview the first rows to check the feature columns and the `redshift` column.
df.head()

Unnamed: 0,gr,ri,iz,zy,gw1,rw1,iw1,zw1,yw1,w12,w23,ipk,zpk,ra,dec,redshift
0,0.111539,-0.027499,0.147387,-0.065906,1.231909,1.12037,1.147869,1.000482,1.066388,0.506735,0.877516,-0.0654,-0.0504,0.000629,35.517841,0.845435
3,0.087763,0.166323,0.052244,0.007644,0.367142,0.279379,0.113056,0.060812,0.053167,0.653869,1.669372,-0.0916,-0.0555,0.001914,9.385637,2.024146
4,0.555387,0.196072,0.507199,0.017667,1.970197,1.41481,1.218738,0.711538,0.693872,-0.075955,1.231105,,,0.001978,-0.451088,0.25
6,-0.063477,0.343486,0.175823,-0.10101,0.602331,0.665808,0.322322,0.146499,0.247509,0.42278,1.520391,-0.1219,-0.0507,0.002595,31.328982,1.991313
7,0.142909,0.161818,0.238008,0.141826,0.778385,0.635476,0.473659,0.235651,0.093825,0.65235,2.051777,-0.0594,-0.0741,0.002756,14.974675,2.497


In [11]:
# Columns fed to the regressor as inputs; `redshift` is the regression target.
features = [
    'gr', 'ri', 'iz', 'zy',
    'gw1', 'rw1', 'iw1', 'zw1', 'yw1',
    'w12', 'w23',
    'ipk', 'zpk',
]
X = df.loc[:, features].to_numpy()
y = df['redshift'].to_numpy()

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=8888, test_size=0.2)

In [12]:
# Sanity check: (samples, features) of the training split.
X_train.shape

(310532, 13)

# Tune XGBoost hyperparameters with Optuna (5-fold cross-validation)

In [13]:
def objective(trial):
    """Optuna objective: cross-validated RMSE of an XGBoost regressor.

    Draws one hyper-parameter configuration from ``trial``, runs 5-fold
    ``xgb.cv`` on the module-level ``X_train`` / ``y_train`` arrays and
    returns the mean test-fold RMSE taken from the last row of the CV history.

    Only the training split is used here; the held-out ``X_test`` / ``y_test``
    arrays are deliberately not touched during tuning.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the ``suggest_*`` calls used to sample each hyper-parameter.

    Returns
    -------
    float
        ``test-rmse-mean`` of the final recorded boosting round (lower is
        better; the study is created with ``direction='minimize'``).
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)

    param = {
        # Fixed settings.
        'verbosity': 1,
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'tree_method': 'hist',
        # Search space.
        'lambda': trial.suggest_uniform('lambda', 0.5, 3.0),
        'alpha': trial.suggest_uniform('alpha', 0, 2.0),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'eta': trial.suggest_categorical('eta', [0.001, 0.01, 0.1, 0.2, 0.3]),  # learning_rate
        'gamma': trial.suggest_uniform('gamma', 0, 3.0),  # min_split_loss
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
    }

    history = xgb.cv(
        param,
        dtrain,
        num_boost_round=100,
        nfold=5,
        metrics=['rmse'],
        early_stopping_rounds=10,
        seed=8888,  # fixed so every trial is scored on the same fold assignment
    )
    # NOTE(review): when early stopping triggers, xgb.cv is expected to trim the
    # history to the best iteration, making the last row the best score --
    # confirm for the installed xgboost version.
    mean_rmse = history['test-rmse-mean'].iloc[-1]
    return mean_rmse

In [14]:
if __name__ == '__main__':
    # Create a minimisation study (the objective returns an RMSE) and run an
    # initial batch of 50 trials.
    # The sampler is seeded explicitly so the sequence of suggested
    # hyper-parameters is repeatable across kernel restarts; TPE is the sampler
    # Optuna uses by default for single-objective studies, so only the seed is new.
    sampler = optuna.samplers.TPESampler(seed=8888)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=50)

KeyboardInterrupt: 

In [None]:
# Summarise the study: how many trials it holds, then the best trial's
# objective value and the hyper-parameters that produced it.
n_finished = len(study.trials)
print('Number of finished trials: {}'.format(n_finished))
print('Best trial:')
trial = study.best_trial

best_value = trial.value
print('  Value: {}'.format(best_value))

print('  Params: ')
best_params = trial.params
for key, value in best_params.items():
    print('    {}: {}'.format(key, value))

In [None]:
# Display the full record of the best trial found so far.
study.best_trial

In [11]:
from pathlib import Path

# Snapshot every trial of the study as a DataFrame and pickle it under
# param_tune_records/ (created on first use).
opdf = study.trials_dataframe()
records_dir = Path("param_tune_records")
records_dir.mkdir(exist_ok=True)
opdf.to_pickle("param_tune_records/opt_xgb_GPQ_photoz_cv_500_trials.pkl")

# Show the trials ordered by objective value, best (lowest) first.
opdf.sort_values(by='value')

Unnamed: 0,number,value,datetime_start,datetime_complete,params_alpha,params_colsample_bytree,params_eta,params_gamma,params_grow_policy,params_lambda,params_max_depth,params_min_child_weight,params_subsample,system_attrs__number,state
288,288,0.348489,2020-08-23 00:24:45.561086,2020-08-23 00:25:30.252502,1.538173,0.919036,0.100,0.239166,depthwise,1.591616,9,1,0.925893,288,COMPLETE
291,291,0.348655,2020-08-23 00:27:10.484474,2020-08-23 00:27:54.370588,1.532992,0.898586,0.100,0.115643,depthwise,1.571966,9,1,0.922722,291,COMPLETE
296,296,0.348687,2020-08-23 00:30:29.419725,2020-08-23 00:31:15.600654,1.533145,0.903750,0.100,0.080565,depthwise,1.580392,9,1,0.925991,296,COMPLETE
286,286,0.348721,2020-08-23 00:23:09.127052,2020-08-23 00:23:57.829389,1.512488,0.914347,0.100,0.003490,depthwise,1.622166,9,1,0.923593,286,COMPLETE
427,427,0.348725,2020-08-23 02:16:58.966446,2020-08-23 02:17:53.814892,1.689836,0.897798,0.100,0.142112,lossguide,1.534827,9,1,0.926193,427,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,14,1.231549,2020-08-22 21:07:41.513497,2020-08-22 21:08:14.691767,1.772607,0.863858,0.001,1.789555,depthwise,1.978733,8,4,0.997250,14,COMPLETE
98,98,1.231736,2020-08-22 22:00:49.113739,2020-08-22 22:01:18.227420,1.532151,0.808657,0.001,1.105447,depthwise,1.110389,8,1,0.927498,98,COMPLETE
79,79,1.237899,2020-08-22 21:47:51.159393,2020-08-22 21:47:59.032364,1.765078,0.864137,0.001,1.130900,depthwise,1.062759,3,2,0.983760,79,COMPLETE
0,0,1.238409,2020-08-22 21:02:37.076053,2020-08-22 21:02:48.208201,0.080253,0.562095,0.001,0.863171,lossguide,2.313278,3,2,0.853218,0,COMPLETE


In [12]:
# Continue the existing study for up to 500 further trials; new results are
# appended to the trials already stored in `study`.
# NOTE(review): the saved output shows this run ending in a KeyboardInterrupt,
# so fewer than 500 additional trials were recorded.
study.optimize(objective, n_trials=500)

[I 2020-08-23 03:21:17,198] Finished trial#500 resulted in value: 0.34925939999999994. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 03:21:55,240] Finished trial#501 resulted in value: 0.35003340000000005. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 03:22:47,946] Finished trial#502 resulted in value: 0.34889019999999993. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'e

[I 2020-08-23 03:57:07,689] Finished trial#542 resulted in value: 0.3497428. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 03:58:00,459] Finished trial#543 resulted in value: 0.34910119999999994. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 03:58:57,204] Finished trial#544 resulted in value: 0.349406. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.

[I 2020-08-23 04:32:57,645] Finished trial#584 resulted in value: 0.3501416. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 04:33:52,479] Finished trial#585 resulted in value: 0.3492766. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 04:34:41,684] Finished trial#586 resulted in value: 0.3487426. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.239166443

[I 2020-08-23 05:09:11,694] Finished trial#626 resulted in value: 0.34894119999999995. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 05:09:55,823] Finished trial#627 resulted in value: 0.3524424. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 05:10:50,599] Finished trial#628 resulted in value: 0.3488246. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0

[I 2020-08-23 05:44:12,209] Finished trial#668 resulted in value: 0.3489618. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 05:45:08,665] Finished trial#669 resulted in value: 0.34923859999999995. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 05:46:05,000] Finished trial#670 resulted in value: 0.34912920000000003. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 

[I 2020-08-23 06:22:42,054] Finished trial#710 resulted in value: 0.3492484. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 06:23:49,299] Finished trial#711 resulted in value: 0.34910660000000004. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 06:24:24,539] Finished trial#712 resulted in value: 0.356774. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.

[I 2020-08-23 07:02:26,907] Finished trial#752 resulted in value: 0.3492354. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 07:03:23,368] Finished trial#753 resulted in value: 0.3489168. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 07:04:23,673] Finished trial#754 resulted in value: 0.3492192. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.239166443

[I 2020-08-23 07:39:35,338] Finished trial#794 resulted in value: 0.3488696. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 07:40:48,033] Finished trial#795 resulted in value: 0.3491892. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0.2391664436504224, 'grow_policy': 'depthwise', 'min_child_weight': 1, 'subsample': 0.9258929782937517, 'colsample_bytree': 0.9190362813280736}.
[I 2020-08-23 07:41:42,920] Finished trial#796 resulted in value: 0.34866379999999997. Current best value is 0.34848860000000004 with parameters: {'lambda': 1.5916155532343612, 'alpha': 1.5381730281408985, 'max_depth': 9, 'eta': 0.1, 'gamma': 0

KeyboardInterrupt: 

In [None]:
# Summarise the extended study: how many trials it holds, then the best
# trial's objective value and the hyper-parameters that produced it.
n_finished = len(study.trials)
print('Number of finished trials: {}'.format(n_finished))
print('Best trial:')
trial = study.best_trial

best_value = trial.value
print('  Value: {}'.format(best_value))

print('  Params: ')
best_params = trial.params
for key, value in best_params.items():
    print('    {}: {}'.format(key, value))

In [None]:
# Display the full record of the best trial after the extended search.
study.best_trial

In [None]:
# Snapshot the extended study's trial history and pickle it next to the
# earlier record (the param_tune_records/ directory must already exist).
opdf = study.trials_dataframe()
opdf.to_pickle("param_tune_records/opt_xgb_GPQ_photoz_cv_1000_trialsv2.pkl")

# Show the trials ordered by objective value, best (lowest) first.
opdf.sort_values(by='value')