In [4]:

import os
from learning.prods import get_clean_data,pick_time_index
from learning.genetic import gen_sol_location

import numpy as np
import xgboost as xgb
import dask.distributed

In [5]:
# Load the cleaned dataset; only the first of the three returned values is
# used in this notebook.
df, _, _ = get_clean_data(0, 64, 1)

# Target columns, and the positional indices of targets vs. features.
ycols = ['Y1', 'Y2']
all_columns = df.columns.tolist()
yinds = [all_columns.index(name) for name in ycols]
xinds = [pos for pos, name in enumerate(all_columns) if name not in ycols]

In [6]:
# Materialise the frame as a dask array with known chunk lengths so that
# `shape[0]` is a concrete row count.
df_arr = df.to_dask_array(lengths=True)
t = df_arr.shape[0]

# First half of the rows is the training set, second half the test set.
tr_slice, ts_slice = slice(0, t // 2), slice(t // 2, t)

# Feature matrices (all non-target columns) for each split.
Xtr_arr0, Xts_arr0 = (df_arr[rows, xinds] for rows in (tr_slice, ts_slice))
# Only the first target column (ycols[0]) is modelled.
ytr_arr, yts_arr = (df_arr[rows, yinds[0]] for rows in (tr_slice, ts_slice))

In [7]:
# Replace NaNs in the training features with zeros, then form the Gram
# matrix X^T X and pull the result into local memory.
zfill = np.where(np.isnan(Xtr_arr0), 0, Xtr_arr0)
hess = (zfill.T @ zfill).compute()

In [8]:
# QR-factorise the Gram matrix and keep the columns whose |R| diagonal entry,
# relative to the largest one, exceeds 1e-2.
q, r = np.linalg.qr(hess)
rdiag = np.abs(r.diagonal())
rdiag = rdiag / rdiag.max()
sel = np.flatnonzero(rdiag > 1e-2)
print(sel.size)

40


In [9]:
# Restrict both splits to the feature columns retained in `sel`.
Xtr_arr, Xts_arr = Xtr_arr0[:, sel], Xts_arr0[:, sel]

In [10]:
# Wrap each (features, target) pair in a distributed DMatrix.
# NOTE(review): `client` is not created in any cell visible in this notebook;
# presumably a dask.distributed Client from an earlier session - confirm.
trset, tsset = (
    xgb.dask.DaskDMatrix(client, features, target)
    for features, target in ((Xtr_arr, ytr_arr), (Xts_arr, yts_arr))
)

In [11]:
# Root-mean-square of the target on each split; `tssc2` is the baseline that
# the test RMSE is compared against when computing R^2 later on.
trsc2 = np.sqrt(np.mean(ytr_arr ** 2).compute())
tssc2 = np.sqrt(np.mean(yts_arr ** 2).compute())

In [12]:
# Display the base XGBoost hyper-parameters.
# NOTE(review): `params` is not assigned in any cell visible in this notebook,
# so this cell depends on kernel state from elsewhere - confirm its origin.
params

{'objective': 'reg:squarederror',
 'gamma': 115.68032306643916,
 'alpha': 1.6664879878980214e-05,
 'colsample_bytree': 0.27428290732370963,
 'eta': 0.031071029771405925,
 'lambda': 0.0004536206441424978,
 'min_child_weight': 59,
 'subsample': 0.3206356962808186,
 'max_depth': 8}

In [13]:
# Derive the DART configuration from the base `params`: the listed keys
# override the base values, everything else is inherited unchanged.
pparams = {
    **params,
    'eta': 0.0001,
    'max_depth': 8,
    'gamma': params['gamma'] * 100,  # 100x the base split-gain threshold
    'booster': 'dart',
    'rate_drop': 0.5,
    'sample_type': 'weighted',
    'skip_drop': 0.1,
}

In [15]:
# Train for 100 rounds, logging RMSE on both the train and test sets.
output = xgb.dask.train(
    client,
    pparams,
    trset,
    evals=((trset, 'train'), (tsset, 'test')),
    num_boost_round=100,
)

# R^2 on the test split, using the lowest test RMSE seen in any round and the
# RMS of the test target as the baseline.
test_rmse_history = output['history']['test']['rmse']
r2ts = 1 - np.amin(test_rmse_history)**2/tssc2**2
print(r2ts)


[17:22:55] task [xgboost.dask-0]:tcp://127.0.0.1:62670 got new rank 0
[17:22:55] task [xgboost.dask-1]:tcp://127.0.0.1:62671 got new rank 1
[17:22:55] task [xgboost.dask-2]:tcp://127.0.0.1:62672 got new rank 2
[17:22:55] task [xgboost.dask-3]:tcp://127.0.0.1:62678 got new rank 3
[17:22:55] task [xgboost.dask-4]:tcp://127.0.0.1:62679 got new rank 4


[0]	train-rmse:12.27945	test-rmse:11.15676
[1]	train-rmse:12.27943	test-rmse:11.15676
[2]	train-rmse:12.27938	test-rmse:11.15676
[3]	train-rmse:12.27937	test-rmse:11.15676
[4]	train-rmse:12.27936	test-rmse:11.15676
[5]	train-rmse:12.27932	test-rmse:11.15675
[6]	train-rmse:12.27930	test-rmse:11.15675
[7]	train-rmse:12.27929	test-rmse:11.15675
[8]	train-rmse:12.27929	test-rmse:11.15675
[9]	train-rmse:12.27926	test-rmse:11.15674
[10]	train-rmse:12.27926	test-rmse:11.15674
[11]	train-rmse:12.27925	test-rmse:11.15674
[12]	train-rmse:12.27924	test-rmse:11.15674
[13]	train-rmse:12.27924	test-rmse:11.15674
[14]	train-rmse:12.27924	test-rmse:11.15674
[15]	train-rmse:12.27924	test-rmse:11.15674
[16]	train-rmse:12.27923	test-rmse:11.15674
[17]	train-rmse:12.27923	test-rmse:11.15674
[18]	train-rmse:12.27923	test-rmse:11.15674
[19]	train-rmse:12.27923	test-rmse:11.15674
[20]	train-rmse:12.27923	test-rmse:11.15674
[21]	train-rmse:12.27922	test-rmse:11.15674
[22]	train-rmse:12.27922	test-rmse:11.1567

In [32]:
# Take the trained model explicitly from the result of `xgb.dask.train`
# (a dict with 'booster' and 'history' entries). Previously `booster` was
# never assigned in any visible cell, so this cell failed on a fresh kernel.
# NOTE(review): assumes the model of interest is the one stored in `output`.
booster = output['booster']

# Per-feature gain; features that were never used in a split are absent.
gain_rank = booster.get_score(importance_type='gain')

# Dense gain vector aligned with the columns of Xtr_arr; unused features stay
# at 0. Keys are expected to look like 'f<column index>' (e.g. 'f12').
gains = np.zeros(Xtr_arr.shape[1])
for key, val in gain_rank.items():
    gains[int(key.replace('f', ''))] += val

In [41]:
# Column indices ordered from highest to lowest gain.
inds = gains.argsort()[::-1]

In [48]:
# Display the base XGBoost hyper-parameters again before deriving a new set.
# NOTE(review): `params` is not assigned in any cell visible in this notebook,
# so this cell depends on kernel state from elsewhere - confirm its origin.
params

{'objective': 'reg:squarederror',
 'gamma': 115.68032306643916,
 'alpha': 1.6664879878980214e-05,
 'colsample_bytree': 0.27428290732370963,
 'eta': 0.031071029771405925,
 'lambda': 0.0004536206441424978,
 'min_child_weight': 59,
 'subsample': 0.3206356962808186,
 'max_depth': 8}

In [59]:
# Number of top-gain features to keep for the reduced model.
n = 5

# Start from the base parameters, drop the `gamma` entry, and use shallower
# trees with a fixed learning rate. `colsample_bytree` is deliberately kept.
pparams = dict(params)
del pparams['gamma']
pparams.update(max_depth=5, eta=1e-2)

# Rebuild the distributed matrices on the n highest-gain columns only.
# Note: this rebinds `trset` / `tsset`, replacing the full-feature matrices.
top_features = inds[:n]
trset = xgb.dask.DaskDMatrix(client, Xtr_arr[:, top_features], ytr_arr)
tsset = xgb.dask.DaskDMatrix(client, Xts_arr[:, top_features], yts_arr)

output_sel = xgb.dask.train(
    client,
    pparams,
    trset,
    evals=((trset, 'train'), (tsset, 'test')),
    num_boost_round=1000,
)

[0]	train-rmse:12.27928	test-rmse:11.15682
[1]	train-rmse:12.27924	test-rmse:11.15681
[2]	train-rmse:12.27923	test-rmse:11.15681


[16:28:59] task [xgboost.dask-0]:tcp://127.0.0.1:52380 got new rank 0
[16:28:59] task [xgboost.dask-1]:tcp://127.0.0.1:52382 got new rank 1
[16:28:59] task [xgboost.dask-2]:tcp://127.0.0.1:52381 got new rank 2
[16:28:59] task [xgboost.dask-3]:tcp://127.0.0.1:52384 got new rank 3
[16:28:59] task [xgboost.dask-4]:tcp://127.0.0.1:52383 got new rank 4


[3]	train-rmse:12.27891	test-rmse:11.15679
[4]	train-rmse:12.27888	test-rmse:11.15680
[5]	train-rmse:12.27887	test-rmse:11.15681
[6]	train-rmse:12.27883	test-rmse:11.15680
[7]	train-rmse:12.27853	test-rmse:11.15647
[8]	train-rmse:12.27849	test-rmse:11.15647
[9]	train-rmse:12.27845	test-rmse:11.15645
[10]	train-rmse:12.27845	test-rmse:11.15646
[11]	train-rmse:12.27814	test-rmse:11.15609
[12]	train-rmse:12.27794	test-rmse:11.15610
[13]	train-rmse:12.27777	test-rmse:11.15613
[14]	train-rmse:12.27749	test-rmse:11.15610
[15]	train-rmse:12.27718	test-rmse:11.15609
[16]	train-rmse:12.27698	test-rmse:11.15610
[17]	train-rmse:12.27698	test-rmse:11.15611
[18]	train-rmse:12.27695	test-rmse:11.15613
[19]	train-rmse:12.27666	test-rmse:11.15612
[20]	train-rmse:12.27635	test-rmse:11.15574
[21]	train-rmse:12.27615	test-rmse:11.15577
[22]	train-rmse:12.27586	test-rmse:11.15541
[23]	train-rmse:12.27567	test-rmse:11.15544
[24]	train-rmse:12.27550	test-rmse:11.15547
[25]	train-rmse:12.27519	test-rmse:11.1

In [26]:
# Collect the importance values into a flat array and report its shape.
# NOTE(review): `feats` is not defined in any cell visible in this notebook;
# presumably a {feature name: score} mapping from an earlier run - confirm.
importance_values = list(feats.values())
feat_importance = np.array(importance_values)
print(feat_importance.shape)

(194,)
