In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15
import xgboost as xgb
import time
import datetime
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression
import gc
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.signal import hilbert, hann, convolve
from scipy import stats
from sklearn.kernel_ridge import KernelRidge
from itertools import product

from tsfresh.feature_extraction import feature_calculators


In [6]:
# Chunking constants: `rows` is the chunk length handed to read_csv, `nrows` the
# hard-coded row count of train.csv, `segments` the number of full chunks
# (used as the progress-bar total by the loop that consumes data_reader).
rows = 300_000
nrows = 629145481
segments = nrows // rows

# Lazy chunked reader over the raw training file; each iteration yields a
# DataFrame with int16 `acoustic_data` and float32 `time_to_failure`.
data_reader = pd.read_csv('D:/kaggle/earthquake/train.csv',
                          dtype={'acoustic_data': np.int16,
                                 'time_to_failure': np.float32},
                          chunksize=rows)

# Empty containers that the extraction loop fills row by row via .loc.
X_train = pd.DataFrame(dtype=np.float64)
y_train = pd.DataFrame(columns=['time_to_failure'])

In [7]:
# Display the label frame; the saved output shows only the column header (no rows yet).
y_train

Unnamed: 0,time_to_failure


In [4]:
def add_trend_feature(arr, abs_values=False):
    """Return the slope of a straight line fitted to ``arr`` against its positions.

    Parameters
    ----------
    arr : 1-D array-like
        Values to fit; position ``i`` (0 .. len(arr) - 1) is the only regressor.
    abs_values : bool, default False
        When True, the fit is done on ``np.abs(arr)`` instead of ``arr``.

    Returns
    -------
    float
        First (and only) coefficient of the fitted ``LinearRegression``.
    """
    values = np.abs(arr) if abs_values else arr
    # LinearRegression expects a 2-D design matrix, hence the column reshape.
    positions = np.arange(len(values)).reshape(-1, 1)
    model = LinearRegression().fit(positions, values)
    return model.coef_[0]

In [5]:
def classic_sta_lta(x, length_sta, length_lta):
    """Compute the ratio of a short-window to a long-window mean of ``x**2``.

    Parameters
    ----------
    x : 1-D array-like (ndarray or pandas Series)
        Input signal. It is converted to float64 before squaring.
    length_sta : int
        Number of samples in the short averaging window.
    length_lta : int
        Number of samples in the long averaging window.

    Returns
    -------
    numpy.ndarray (float64)
        ``sta / lta`` per sample. The first ``length_lta - 1`` entries of the
        numerator are forced to 0, so the ratio is 0 there.
    """
    # Square in float64. The raw signal is loaded as int16 in this notebook, and
    # squaring it in that dtype wraps around, giving negative "energies"; that is
    # the likely source of the -inf values seen in the saved X.min() output.
    # This also replaces the `np.float` alias, which newer NumPy no longer has.
    energy = np.asarray(x, dtype=np.float64) ** 2

    sta = np.cumsum(energy)
    lta = sta.copy()

    # Turn the running sums into windowed sums, then into windowed means.
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Zero the numerator until a full long window is available.
    sta[:length_lta - 1] = 0

    # Clamp the denominator to the smallest positive normal float to avoid
    # dividing by zero.
    dtiny = np.finfo(0.0).tiny
    lta[lta < dtiny] = dtiny
    return sta / lta

In [9]:
%%time
from tqdm import tqdm
# Row labels for this pass start at 4195.
# NOTE(review): presumably chosen so these rows continue after the 4195 samples
# whose shapes are shown later in the notebook — confirm.
idx = 4195
# One iteration per chunk yielded by data_reader; `segments` is only the progress-bar total.
# NOTE(review): assumes data_reader is the 300_000-row chunk reader created earlier — confirm,
# since a later cell rebinds data_reader with a different chunk size.
for segment in tqdm(data_reader, total=segments):
    
    # Feature window used by the (currently disabled) feature code: rows 75000..224999 of the chunk.
#     xc = pd.Series(segment.acoustic_data.values[75000:225000])
    
    # Label = time_to_failure at row 225000 of the chunk, i.e. right after the window above.
    # A chunk with fewer rows (the saved traceback shows one of size 45480) falls back to
    # its last row instead.
    try:
        y_train.loc[idx, 'time_to_failure'] = segment.time_to_failure.values[225000]
    except IndexError:
        y_train.loc[idx, 'time_to_failure'] = segment.time_to_failure.values[-1]
    # Everything below, down to the idx increment, is commented-out feature extraction;
    # as written the loop only fills y_train.
#     zc = np.fft.fft(xc)
    
#     X_train.loc[idx, 'mean'] = xc.mean()
#     X_train.loc[idx, 'std'] =xc.std()
#     X_train.loc[idx, 'max'] = xc.max()
#     X_train.loc[idx, 'min'] = xc.min()   
#     X_train.loc[idx, 'mean_change_abs'] = np.mean(np.diff(xc))
#     X_train.loc[idx, 'mean_change_rate'] = np.mean(np.nonzero((np.diff(xc) / xc[:-1]))[0])
#     X_train.loc[idx, 'abs_max'] = np.abs(xc).max()
    
#     X_train.loc[idx, 'mean_first_50000'] = xc[:50000].mean()
#     X_train.loc[idx, 'mean_last_50000'] = xc[-50000:].mean()
#     X_train.loc[idx, 'mean_first_10000'] = xc[:10000].mean()
#     X_train.loc[idx, 'mean_last_10000'] = xc[-10000:].mean()
#     X_train.loc[idx, 'std_first_50000'] = xc[:50000].std()
#     X_train.loc[idx, 'std_last_50000'] = xc[-50000:].std()
#     X_train.loc[idx, 'std_first_10000'] = xc[:10000].std()
#     X_train.loc[idx, 'std_last_10000'] = xc[-10000:].std()
#     X_train.loc[idx, 'min_first_50000'] = xc[:50000].min()
#     X_train.loc[idx, 'min_last_50000'] = xc[-50000:].min()
#     X_train.loc[idx, 'min_first_10000'] = xc[:10000].min()
#     X_train.loc[idx, 'min_last_10000'] = xc[-10000:].min()
#     X_train.loc[idx, 'max_first_50000'] = xc[:50000].max()
#     X_train.loc[idx, 'max_last_50000'] = xc[-50000:].max()
#     X_train.loc[idx, 'max_first_10000'] = xc[:10000].max()
#     X_train.loc[idx, 'max_last_10000'] = xc[-10000:].max()
    
#     X_train.loc[idx, 'max_to_min'] = xc.max() / np.abs(xc.min())
#     X_train.loc[idx, 'max_to_min_diff'] = xc.max() - np.abs(xc.min())
#     X_train.loc[idx, 'count_big'] = len(xc[np.abs(xc) > 500])
#     X_train.loc[idx, 'sum'] = xc.sum()
    
#     X_train.loc[idx, 'mean_change_rate_first_50000'] = np.mean(np.nonzero((np.diff(xc[:50000]) / xc[:50000][:-1]))[0])
#     X_train.loc[idx, 'mean_change_rate_last_50000'] = np.mean(np.nonzero((np.diff(xc[-50000:]) / xc[-50000:][:-1]))[0])
#     X_train.loc[idx, 'mean_change_rate_first_10000'] = np.mean(np.nonzero((np.diff(xc[:10000]) / xc[:10000][:-1]))[0])
#     X_train.loc[idx, 'mean_change_rate_last_10000'] = np.mean(np.nonzero((np.diff(xc[-10000:]) / xc[-10000:][:-1]))[0])
    
#     X_train.loc[idx, 'q95'] = np.quantile(xc, 0.95)
#     X_train.loc[idx, 'q99'] = np.quantile(xc, 0.99)
#     X_train.loc[idx, 'q05'] = np.quantile(xc, 0.05)
#     X_train.loc[idx, 'q01'] = np.quantile(xc, 0.01)
#     X_train.loc[idx, 'abs_q95'] = np.quantile(np.abs(xc), 0.95)
#     X_train.loc[idx, 'abs_q99'] = np.quantile(np.abs(xc), 0.99)
#     X_train.loc[idx, 'abs_q05'] = np.quantile(np.abs(xc), 0.05)
#     X_train.loc[idx, 'abs_q01'] = np.quantile(np.abs(xc), 0.01)
    
#     X_train.loc[idx, 'trend'] = add_trend_feature(xc)
#     X_train.loc[idx, 'abs_trend'] = add_trend_feature(xc, abs_values=True)
#     X_train.loc[idx, 'abs_mean'] = np.abs(xc).mean()
#     X_train.loc[idx, 'abs_std'] = np.abs(xc).std()

#     X_train.loc[idx, 'mad'] = xc.mad()
#     X_train.loc[idx, 'kurt'] = xc.kurtosis()
#     X_train.loc[idx, 'skew'] = xc.skew()
#     X_train.loc[idx, 'median'] = xc.median()
    
#     X_train.loc[idx, 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()
#     X_train.loc[idx, 'Hann_window_mean_50'] = (convolve(xc, hann(50), mode='same')/sum(hann(50))).mean()
#     X_train.loc[idx, 'Hann_window_mean_150'] = (convolve(xc, hann(150), mode='same')/sum(hann(150))).mean()
#     X_train.loc[idx, 'Hann_window_mean_1500'] = (convolve(xc, hann(1500), mode='same')/sum(hann(1500))).mean()
#     X_train.loc[idx, 'Hann_window_mean_15000'] = (convolve(xc, hann(15000), mode='same')/sum(hann(15000))).mean()
#     X_train.loc[idx, 'classic_sta_lta1_mean'] = classic_sta_lta(xc, 500, 10000).mean()
#     X_train.loc[idx, 'classic_sta_lta2_mean'] = classic_sta_lta(xc, 5000, 100000).mean()
#     X_train.loc[idx, 'classic_sta_lta3_mean'] = classic_sta_lta(xc, 3333, 6666).mean()
#     X_train.loc[idx, 'classic_sta_lta4_mean'] = classic_sta_lta(xc, 10000, 25000).mean()
#     X_train.loc[idx, 'classic_sta_lta5_mean'] = classic_sta_lta(xc, 50, 1000).mean()
#     X_train.loc[idx, 'classic_sta_lta6_mean'] = classic_sta_lta(xc, 100, 5000).mean()
#     X_train.loc[idx, 'classic_sta_lta7_mean'] = classic_sta_lta(xc, 333, 666).mean()
#     X_train.loc[idx, 'classic_sta_lta8_mean'] = classic_sta_lta(xc, 4000, 10000).mean()
    
#     X_train.loc[idx, 'Moving_average_700_mean'] = xc.rolling(window=700).mean().mean(skipna=True)
#     X_train.loc[idx, 'Moving_average_1500_mean'] = xc.rolling(window=1500).mean().mean(skipna=True)
#     X_train.loc[idx, 'Moving_average_3000_mean'] = xc.rolling(window=3000).mean().mean(skipna=True)
#     X_train.loc[idx, 'Moving_average_6000_mean'] = xc.rolling(window=6000).mean().mean(skipna=True)
#     ewma = pd.Series.ewm
#     X_train.loc[idx, 'exp_moving_average_300_mean'] = (ewma(xc, span=300).mean()).mean(skipna=True)
#     X_train.loc[idx, 'exp_moving_average_3000_mean'] = (ewma(xc, span=3000).mean()).mean(skipna=True)
#     X_train.loc[idx, 'exp_moving_average_30000_mean'] = (ewma(xc, span=30000).mean()).mean(skipna=True)
#     X_train.loc[idx, 'exp_moving_average_50000_mean'] = (ewma(xc, span=50000).mean()).mean(skipna=True)
    
#     X_train.loc[idx, 'MA_700MA_std_mean'] = xc.rolling(window=700).std().mean()
#     X_train.loc[idx,'MA_700MA_BB_high_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] + 2 * X_train.loc[idx, 'MA_700MA_std_mean']).mean()
#     X_train.loc[idx,'MA_700MA_BB_low_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] - 2 * X_train.loc[idx, 'MA_700MA_std_mean']).mean()
#     X_train.loc[idx, 'MA_400MA_std_mean'] = xc.rolling(window=400).std().mean()
#     X_train.loc[idx,'MA_400MA_BB_high_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] + 2 * X_train.loc[idx, 'MA_400MA_std_mean']).mean()
#     X_train.loc[idx,'MA_400MA_BB_low_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] - 2 * X_train.loc[idx, 'MA_400MA_std_mean']).mean()
#     X_train.loc[idx, 'MA_1000MA_std_mean'] = xc.rolling(window=1000).std().mean()
    
#     X_train.loc[idx, 'iqr'] = np.subtract(*np.percentile(xc, [25, 75]))
#     X_train.loc[idx, 'q999'] = np.quantile(xc, 0.999)
#     X_train.loc[idx, 'q001'] = np.quantile(xc, 0.001)
#     X_train.loc[idx, 'ave10'] = stats.trim_mean(xc, 0.1)
    
#     X_train.loc[idx, 'number_peaks_50p'] = feature_calculators.number_peaks(xc.values, 50)
#     X_train.loc[idx, 'number_peaks_100p'] = feature_calculators.number_peaks(xc.values, 100)
#     X_train.loc[idx, 'number_peaks_500p'] = feature_calculators.number_peaks(xc.values, 500)
#     X_train.loc[idx, 'number_peaks_1000p'] = feature_calculators.number_peaks(xc.values, 1000)
#     X_train.loc[idx, 'number_peaks_10000p'] = feature_calculators.number_peaks(xc.values, 10000)
#     X_train.loc[idx, 'autocorrelaion_10'] = feature_calculators.autocorrelation(xc.values, 10)
#     X_train.loc[idx, 'autocorrelaion_50'] = feature_calculators.autocorrelation(xc.values, 50)
#     X_train.loc[idx, 'autocorrelaion_100'] = feature_calculators.autocorrelation(xc.values, 100)
#     X_train.loc[idx, 'autocorrelaion_1000'] = feature_calculators.autocorrelation(xc.values, 1000)
#     X_train.loc[idx, 'c3_5'] = feature_calculators.c3(xc.values, 5)
#     X_train.loc[idx, 'c3_10'] = feature_calculators.c3(xc.values, 10)
#     X_train.loc[idx, 'c3_100'] = feature_calculators.c3(xc.values, 100)
#     X_train.loc[idx, 'binned_entropy_50'] = feature_calculators.binned_entropy(xc.values, 50)
#     X_train.loc[idx, 'binned_entropy_80'] = feature_calculators.binned_entropy(xc.values, 80)
#     X_train.loc[idx, 'binned_entropy_100'] = feature_calculators.binned_entropy(xc.values, 100)
#     X_train.loc[idx, 'binned_entropy_500'] = feature_calculators.binned_entropy(xc.values, 500)
#     X_train.loc[idx, 'mean_abs_change'] = feature_calculators.mean_abs_change(xc.values)
 
#     # FFT transform values
#     realFFT = np.real(zc)
#     imagFFT = np.imag(zc)
#     X_train.loc[idx, 'Rmean'] = realFFT.mean()
#     X_train.loc[idx, 'Rstd'] = realFFT.std()
#     X_train.loc[idx, 'Rmax'] = realFFT.max()
#     X_train.loc[idx, 'Rmin'] = realFFT.min()
#     X_train.loc[idx, 'Imean'] = imagFFT.mean()
#     X_train.loc[idx, 'Istd'] = imagFFT.std()
#     X_train.loc[idx, 'Imax'] = imagFFT.max()
#     X_train.loc[idx, 'Imin'] = imagFFT.min()
    
#     X_train.loc[idx, 'Rmean_last_5000'] = realFFT[-5000:].mean()
#     X_train.loc[idx, 'Rstd_last_5000'] = realFFT[-5000:].std()
#     X_train.loc[idx, 'Rmax_last_5000'] = realFFT[-5000:].max()
#     X_train.loc[idx, 'Rmin_last_5000'] = realFFT[-5000:].min()
#     X_train.loc[idx, 'Rmean_last_15000'] = realFFT[-15000:].mean()
#     X_train.loc[idx, 'Rstd_last_15000'] = realFFT[-15000:].std()
#     X_train.loc[idx, 'Rmax_last_15000'] = realFFT[-15000:].max()
#     X_train.loc[idx, 'Rmin_last_15000'] = realFFT[-15000:].min() 
    
#     for windows in [10, 50, 100, 500, 1000, 5000, 10000]:
#         x_roll_std = xc.rolling(windows).std().dropna().values
#         x_roll_mean = xc.rolling(windows).mean().dropna().values

#         X_train.loc[idx, f'mean_roll_{windows}_mean'] = x_roll_mean.mean()
#         X_train.loc[idx, f'mean_roll_{windows}_std'] = x_roll_mean.std()
#         X_train.loc[idx, f'mean_roll_{windows}_max'] = x_roll_mean.max()
#         X_train.loc[idx, f'mean_roll_{windows}_min'] = x_roll_mean.min()
#         X_train.loc[idx, f'mean_roll_{windows}_q01'] = np.quantile(x_roll_mean, 0.01)
#         X_train.loc[idx, f'mean_roll_{windows}_q05'] = np.quantile(x_roll_mean, 0.05)
#         X_train.loc[idx, f'mean_roll_{windows}_median'] = np.median(x_roll_mean)
#         X_train.loc[idx, f'mean_roll_{windows}_q95'] = np.quantile(x_roll_mean, 0.95)
#         X_train.loc[idx, f'mean_roll_{windows}_q99'] = np.quantile(x_roll_mean, 0.99)
#         X_train.loc[idx, f'mean_roll_{windows}_av_change'] = np.mean(np.diff(x_roll_mean))
        
#         X_train.loc[idx, f'std_roll_{windows}_mean'] = x_roll_std.mean()
#         X_train.loc[idx, f'std_roll_{windows}_std'] = x_roll_std.std()
#         X_train.loc[idx, f'std_roll_{windows}_max'] = x_roll_std.max()
#         X_train.loc[idx, f'std_roll_{windows}_min'] = x_roll_std.min()
#         X_train.loc[idx, f'std_roll_{windows}_q01'] = np.quantile(x_roll_std, 0.01)
#         X_train.loc[idx, f'std_roll_{windows}_q05'] = np.quantile(x_roll_std, 0.05)
#         X_train.loc[idx, f'std_roll_{windows}_median'] = np.median(x_roll_std)
#         X_train.loc[idx, f'std_roll_{windows}_q95'] = np.quantile(x_roll_std, 0.95)
#         X_train.loc[idx, f'std_roll_{windows}_q99'] = np.quantile(x_roll_std, 0.99)
#         X_train.loc[idx, f'std_roll_{windows}_av_change'] = np.mean(np.diff(x_roll_std))       
        
    # Advance to the next output row label.
    idx += 1
    
# X_train.to_csv('./new_features/X_train_new_features2.csv', index=False, header=True)

100%|████████████████▉| 2096/2097 [03:02<00:00, 11.82it/s]

IndexError: index 225000 is out of bounds for axis 0 with size 45480

In [12]:
# Persist the labels as a single-column CSV with a header row; index=False means the
# row labels (which start at 4195 in the extraction loop) are not written.
y_train.to_csv('./new_features/y2.csv', index=False, header=True)

In [7]:
# Display the feature frame. In the saved output the rows are labelled from 4195 and
# the last row shown has NaN in the rolling-window columns.
X_train

Unnamed: 0,mean,std,max,min,mean_change_abs,mean_change_rate,abs_max,mean_first_50000,mean_last_50000,mean_first_10000,...,std_roll_10000_mean,std_roll_10000_std,std_roll_10000_max,std_roll_10000_min,std_roll_10000_q01,std_roll_10000_q05,std_roll_10000_median,std_roll_10000_q95,std_roll_10000_q99,std_roll_10000_av_change
4195,4.7468,6.173383499434666,181.0,-154.0,0.0,75152.33490894768,181.0,4.94148,4.7195,5.2181,...,5.527776848066935,3.055282024741998,13.235292824022196,2.972412569554961,3.06519715244522,3.231207139747924,4.446361754657987,13.065962372794136,13.161216131916031,5.079819678e-06
4196,4.943726666666667,8.011366166799597,197.0,-199.0,4.0000266668e-05,75076.20126061192,199.0,5.00486,4.84652,5.1,...,7.021638380175348,3.692234739090889,17.521969436718383,2.910945550916387,2.935780993365135,3.219385501038971,6.111061467308735,17.210349200755953,17.50239840807545,8.2461824344e-05
4197,4.990006666666667,7.334238343116213,145.0,-144.0,7.3333822225e-05,74929.34778858811,145.0,4.96452,5.09728,5.0668,...,6.142410762412317,3.324629695110395,16.551132860317935,2.961059022892747,2.976511224891995,3.046676217388093,4.56665494905654,11.45968430396034,16.510594711389786,-9.2188374248e-05
4198,4.678353333333333,5.702856030586919,120.0,-89.0,2.6666844446e-05,74858.44638115341,120.0,4.78652,4.61104,4.9747,...,5.440298817740418,1.943399526894086,9.076367227266916,2.971176113721699,2.986641256093498,3.073026180980461,4.89694202478648,8.77648328351869,8.819271912448718,-8.50158267e-06
4199,4.702746666666667,7.290551520296574,152.0,-126.0,-2.6666844446e-05,75019.1316207647,152.0,4.8557,4.72992,4.8368,...,6.684701456869537,3.127163219197416,17.163535029051253,3.422647536866338,3.5365927824462,3.656942679749837,6.403089238420709,15.86960562926132,17.134363607430043,-1.8254940485e-05
4200,4.36028,15.5736187800271,410.0,-478.0,-2.0000133334e-05,75152.22594764223,478.0,4.54868,4.2689,4.8709,...,11.367353108941735,11.378309552213468,44.02809352688288,2.995085214738243,3.16670598614389,3.521946514461048,6.184715824593924,43.849658503745225,43.96127044251424,8.322062231e-06
4201,4.54916,8.196706932325336,224.0,-169.0,-6.0000400003e-05,75044.31523333206,224.0,4.44192,4.61748,4.4427,...,7.380179854829029,3.922977054607707,20.36157188433548,3.214977674595096,3.346538964707316,3.652632190926477,5.760076874361984,16.635962618232348,20.30427150587133,3.9235631051e-05
4202,4.61106,5.884460030923175,98.0,-125.0,4.666697778e-05,75070.56240973329,125.0,4.54348,4.67962,4.4718,...,5.549874916299443,2.283641712317878,12.821223197746324,2.88365658453459,2.931301274641726,3.134264558302644,4.666843149549591,10.82930096386356,12.774066008371731,2.118318339e-06
4203,4.42646,6.644301281047547,145.0,-162.0,-6.6667111114e-05,75023.09683718476,162.0,4.3759,4.4128,4.5826,...,6.207740012368657,2.735245268480428,12.717834578718064,3.124131291996595,3.138486851967258,3.496843797148157,5.302832466070912,11.97908237852394,12.592302039224784,8.085581772e-06
4204,4.623806666666667,5.886002283053249,120.0,-117.0,-6.666711111e-06,75080.42088361304,120.0,4.78482,4.56666,4.6589,...,,,,,,,,,,


In [2]:
# Load the precomputed feature table.
# NOTE(review): the dtype mapping names the raw-data columns ('acoustic_data',
# 'time_to_failure'); it looks copied from the train.csv reader — confirm these
# columns exist in more_features.csv, otherwise the mapping has no effect.
X = pd.read_csv('more_features.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32})

In [2]:
%%time
from tqdm import tqdm

# Chunking constants for this pass: 150_000-row chunks over the hard-coded row count.
rows = 150_000
nrows = 629145481
segments = nrows // rows

data_reader = pd.read_csv('D:/kaggle/earthquake/train.csv',
                          dtype={'acoustic_data': np.int16,
                                 'time_to_failure': np.float32},
                          chunksize=rows)

# Pre-allocate one label row per full chunk. The saved progress output shows one
# more iteration than `segments`; that extra row is added by the .loc assignment.
y = pd.DataFrame(index=range(segments), dtype=np.float64, columns=['time_to_failure'])
idx = 0
for segment in tqdm(data_reader, total=segments):
    # Label of a chunk = time_to_failure at its last row.
    y.loc[idx, 'time_to_failure'] = segment['time_to_failure'].values[-1]
    idx += 1

4195it [03:16, 21.31it/s]                                 


Wall time: 3min 16s


In [13]:
# Per-column minima for columns 50..99. The saved output shows -inf for
# classic_sta_lta5_mean and classic_sta_lta7_mean, i.e. those features contain
# non-finite values.
X.min()[50:100]

Hann_window_mean_1500            3.590944675262227e+00
Hann_window_mean_15000           3.542118535941572e+00
classic_sta_lta1_mean            7.953713935194633e-01
classic_sta_lta2_mean            0.000000000000000e+00
classic_sta_lta3_mean            8.507818122703901e-01
classic_sta_lta4_mean            4.617799425607957e-01
classic_sta_lta5_mean                             -inf
classic_sta_lta6_mean            8.919247101659796e-01
classic_sta_lta7_mean                             -inf
classic_sta_lta8_mean            7.857312296412058e-01
Moving_average_700_mean          3.596103384629516e+00
Moving_average_1500_mean         3.596149377220804e+00
Moving_average_3000_mean         3.595131420874688e+00
Moving_average_6000_mean         3.594449535072674e+00
exp_moving_average_300_mean      3.596003969268151e+00
exp_moving_average_3000_mean     3.595433116671068e+00
exp_moving_average_30000_mean    3.602832889856244e+00
exp_moving_average_50000_mean    3.605735480246619e+00
MA_700MA_s

In [5]:
# List the feature column names (255 columns in the saved output, including 'Unnamed: 0').
X.columns

Index(['Unnamed: 0', 'mean', 'std', 'max', 'min', 'mean_change_abs',
       'mean_change_rate', 'abs_max', 'abs_min', 'mean_first_50000',
       ...
       'std_roll_10000_mean', 'std_roll_10000_std', 'std_roll_10000_max',
       'std_roll_10000_min', 'std_roll_10000_q01', 'std_roll_10000_q05',
       'std_roll_10000_median', 'std_roll_10000_q95', 'std_roll_10000_q99',
       'std_roll_10000_av_change'],
      dtype='object', length=255)

In [6]:
# Remove three columns from the feature frame ('Unnamed: 0' is presumably a saved
# index column — confirm). X is rebound instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises KeyError.
X = X.drop(columns=['abs_min', 'cid', 'Unnamed: 0'], errors='ignore')

In [7]:
# Shape check after dropping columns: (4195, 252) in the saved output.
X.shape

(4195, 252)

In [8]:
# Shape check for the labels: (4195, 1) in the saved output, the same row count as X.
y.shape

(4195, 1)

In [18]:
# Settings consumed by the hyperparameter-search cell.
XGB_MAX_DEPTH = 50       # upper bound of the max_depth range in the search space
EVAL_METRIC_XGB = 'mae'  # metric name passed to xgb.cv(metrics=[...])
NUM_EVALS = 5000         # number of search evaluations (fmin max_evals)
N_FOLDS = 5              # nfold for xgb.cv inside the objective

In [158]:
print('Running {} rounds of XGBoost parameter optimization:'.format(NUM_EVALS))
# Collect garbage before the long-running search.
gc.collect()
# Parameters that objective() casts to int before handing them to XGBoost.
integer_params = ['max_depth']


def objective(space_params):
    """Score one sampled parameter set with cross-validated XGBoost.

    Parameters
    ----------
    space_params : dict
        Parameter values for one evaluation of the search. The dict is copied,
        so the caller's object is not modified.

    Returns
    -------
    dict
        ``{'loss': <mean test metric of the last row of the cv results>,
        'status': STATUS_OK}``.
    """
    params = dict(space_params)

    # Parameters listed in integer_params are cast to int before use.
    for param in integer_params:
        params[param] = int(params[param])

    params['tree_method'] = 'exact'
    # XGBoost's parameter is called 'booster'; the former 'boosting' key is not
    # an XGBoost parameter name.
    params['booster'] = 'gbtree'
    # Use the configured metric everywhere instead of repeating the literal.
    params['eval_metric'] = EVAL_METRIC_XGB

    # NOTE(review): num_boost_round is not set, so xgb.cv uses its default number
    # of rounds; confirm early_stopping_rounds=100 can take effect at that setting.
    cv_results = xgb.cv(params, train, nfold=N_FOLDS, metrics=[EVAL_METRIC_XGB],
                        early_stopping_rounds=100, stratified=False, seed=0)

    best_loss = cv_results['test-{}-mean'.format(EVAL_METRIC_XGB)].iloc[-1]
    return {'loss': best_loss, 'status': STATUS_OK}

# NOTE(review): hp, fmin, tpe and Trials (and STATUS_OK used by objective) are
# hyperopt names, but no hyperopt import is visible in this notebook's import
# cell — confirm they are imported before this cell runs.
# NOTE(review): the labels used here are `y_train`, while the later CV cell pairs
# X with `y`; confirm y_train is row-aligned with X at this point.
train = xgb.DMatrix(X, y_train)


# Search space: max_depth on an integer grid (cast to int in objective), the
# regularisation terms uniform on [0, 5], learning_rate log-uniform between 0.005
# and 0.2, and the sampling ratios on a quantised grid. nthread is a fixed value.
space = {'max_depth': hp.quniform('max_depth', 2, XGB_MAX_DEPTH, 1),
         'reg_alpha': hp.uniform('reg_alpha', 0, 5),
         'reg_lambda': hp.uniform('reg_lambda', 0, 5),
         'min_child_weight': hp.uniform('min_child_weight', 0, 5),
         'gamma': hp.uniform('gamma', 0, 5),
         'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
         'colsample_bytree': hp.quniform('colsample_bytree', 0.1, 1, 0.01),
         'colsample_bylevel': hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
         'colsample_bynode': hp.quniform('colsample_bynode', 0.1, 1, 0.01),
         'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
         'nthread': 3}

# Hand the Trials object to fmin so the per-evaluation history is recorded in
# `trials`; previously it was created but never passed, so it stayed empty.
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=NUM_EVALS,
            trials=trials)


Running 5000 rounds of XGBoost parameter optimization:
100%|█| 5000/5000 [1:59:00<00:00,  1.75s/it, best loss: 2.0472478]         


In [159]:
# Show the best parameter values returned by the search (max_depth comes back as a float, 4.0).
best.items()

dict_items([('colsample_bylevel', 0.66), ('colsample_bynode', 0.33), ('colsample_bytree', 0.67), ('gamma', 4.605768437487887), ('learning_rate', 0.19973803639042553), ('max_depth', 4.0), ('min_child_weight', 3.9024481042680503), ('reg_alpha', 2.3014070108436657), ('reg_lambda', 0.509742273012636), ('subsample', 1.0)])

In [10]:
# Parameters for the final model, copied by hand from the displayed search result,
# with max_depth written as an int.
xgb_params = {'colsample_bylevel': 0.66,
              'colsample_bynode': 0.33,
              'colsample_bytree': 0.67,
              'gamma': 4.605768437487887,
              'learning_rate': 0.19973803639042553,
              'max_depth': 4,
              'min_child_weight': 3.9024481042680503,
              'reg_alpha': 2.3014070108436657,
              'reg_lambda': 0.509742273012636,
              'subsample': 1.0}

In [12]:
from tqdm import tqdm

# Shuffled K-fold evaluation of xgb_params; MAE accumulates the mean of the
# per-fold mean absolute errors.
n_fold = 5
MAE = 0
folds = KFold(n_splits=n_fold, shuffle=True, random_state=0)

for train_idx, valid_idx in tqdm(folds.split(y), total=n_fold):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    # NOTE(review): eval_set is supplied but no early stopping is configured in
    # this cell, so every fold appears to fit all 10000 trees — confirm intended.
    model = xgb.XGBRegressor(n_estimators=10000, n_jobs=3, **xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              verbose=0)

    val_preds = model.predict(X_valid)
    MAE += mean_absolute_error(y_valid, val_preds) / n_fold

print('CV score: {}'.format(MAE))

100%|███████████████████████████████████████████| 5/5 [10:28<00:00, 125.22s/it]


CV score: 2.178242952092323
