In [1]:
import pandas as pd
# NOTE(review): BayesianOptimization is not referenced in the visible cells; it is
# imported here (instead of inside the grid-search cell) so every dependency is
# declared in one place. Drop it if no other cell uses it.
from bayes_opt import BayesianOptimization
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [2]:
# Load the tab-separated training table, then split off the `class` column as
# the target: `pop` removes it from X, so X keeps only the remaining columns.
trainset_path = "data/17_selected_features_trainset.tsv"
X = pd.read_csv(trainset_path, sep="\t")
y = X.pop("class")

In [3]:
# Replace the `splice_site` column with the integer codes assigned by a
# LabelEncoder. `le` stays fitted, so the original labels can be recovered
# later through `le.classes_` / `le.inverse_transform`.
le = preprocessing.LabelEncoder()
X["splice_site"] = le.fit_transform(X["splice_site"])

X

Unnamed: 0,recount3_score,antisense_exon_start_ss,antisense_exon_end_ss,nearest_alt_start_ss_dist,nearest_alt_end_ss_dist,MaxEntScan_start_ss,MaxEntScan_end_ss,CpG_island,intron_length,phyloP_score,...,repeat_features_start_site_Satellite repeats,repeat_features_start_site_Unknown,repeat_features_start_site_Type II Transposons,repeat_features_end_site_Tandem repeats,repeat_features_end_site_LTRs,repeat_features_end_site_Type I Transposons/SINE,repeat_features_end_site_Satellite repeats,repeat_features_end_site_Type I Transposons/LINE,splice_site,repeat_features_start_site_Type I Transposons/SINE
0,59151,False,False,170,362,9.09,11.90,False,385,5.65175,...,0,0,0,0,0,0,0,0,2,0
1,61021,False,False,24,232,7.66,3.89,False,499,1.40575,...,0,0,0,0,0,0,0,0,2,0
2,21,False,False,24,246,7.15,7.61,False,277,5.75650,...,0,0,0,0,0,0,0,0,2,0
3,526,False,False,322,232,1.14,3.89,False,168,5.35750,...,0,0,0,0,0,0,0,0,1,0
4,0,False,False,322,232,-12.10,7.63,False,78,3.62350,...,0,0,0,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484602,0,False,False,487,25,-19.72,3.66,True,909,0.58700,...,0,0,0,0,0,0,0,0,2,0
484603,31374,False,False,777,464,9.21,13.74,False,415,2.74250,...,0,0,0,0,0,0,0,0,2,0
484604,67,False,False,572,2963,8.56,2.38,False,2970,-0.89575,...,0,0,0,0,0,0,0,0,2,0
484605,18958,False,False,1178,2,-12.64,5.91,False,2358,2.41200,...,0,0,0,0,0,0,0,0,2,0


In [4]:
# Replace every missing value in every feature column with 0.
# NOTE(review): 0 may be a meaningful value for some features (e.g. scores or
# distances) — confirm that a blanket zero fill is intended for all columns.
X = X.fillna(value=0)

In [5]:
%%time
from bayes_opt import BayesianOptimization
parameter_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [5, 6, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

model = XGBClassifier(tree_method="gpu_hist")

grid_search = GridSearchCV(model, parameter_grid, scoring="matthews_corrcoef", cv=3, verbose=1, n_jobs=12)

grid_search.fit(X, y)

Fitting 3 folds for each of 1296 candidates, totalling 3888 fits
CPU times: user 11.5 s, sys: 1.23 s, total: 12.7 s
Wall time: 1h 5min 13s


In [7]:
# Tabulate the per-candidate cross-validation results (one row per parameter
# combination), persist them as a TSV file, and display the table.
df_grid_search_result = pd.DataFrame(data=grid_search.cv_results_)
df_grid_search_result.to_csv(
    "data/18_hyperparameter_opt_result.tsv",
    sep="\t",
)
df_grid_search_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,4.502714,0.001570,0.271742,0.012913,0.8,0.01,5,1,50,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",0.860016,0.894865,0.790842,0.848574,0.043231,1254
1,4.463886,0.054376,0.292894,0.015541,0.8,0.01,5,1,50,0.9,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",0.859641,0.895526,0.789940,0.848369,0.043836,1260
2,3.944259,0.563497,0.263358,0.014848,0.8,0.01,5,1,50,1.0,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",0.859675,0.895387,0.790379,0.848480,0.043594,1258
3,5.736898,0.092272,0.298172,0.028339,0.8,0.01,5,1,100,0.8,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",0.861579,0.897349,0.795451,0.851460,0.042211,1215
4,5.430502,0.700245,0.310719,0.002166,0.8,0.01,5,1,100,0.9,"{'colsample_bytree': 0.8, 'learning_rate': 0.0...",0.861199,0.897245,0.794912,0.851118,0.042381,1230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,18.302688,0.218331,0.497518,0.000931,1.0,0.3,7,5,150,0.9,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",0.883443,0.913408,0.807783,0.868211,0.044446,601
1292,18.025753,0.424667,0.444883,0.030711,1.0,0.3,7,5,150,1.0,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",0.883171,0.913421,0.809499,0.868697,0.043643,474
1293,20.833965,0.317261,0.424439,0.019300,1.0,0.3,7,5,200,0.8,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",0.882202,0.913004,0.807630,0.867612,0.044239,752
1294,17.223778,0.386428,0.412365,0.015024,1.0,0.3,7,5,200,0.9,"{'colsample_bytree': 1.0, 'learning_rate': 0.3...",0.883590,0.911889,0.806607,0.867362,0.044487,791


In [8]:
# Report the parameter combination that GridSearchCV selected as best.
best_params = grid_search.best_params_
print("Best Parameters:\n", best_params)

Best Parameters:
 {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 200, 'subsample': 0.9}
