In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15
import xgboost as xgb
import time
import datetime
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression
import gc
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.signal import hilbert, hann, convolve
from scipy import stats
from sklearn.kernel_ridge import KernelRidge
from itertools import product

from tsfresh.feature_extraction import feature_calculators
from joblib import Parallel, delayed
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
# Read the saved feature table into X, requesting compact dtypes for the two
# named columns.
X = pd.read_csv(
    'more_features.csv',
    dtype={
        'acoustic_data': np.int16,
        'time_to_failure': np.float32,
    },
)

In [3]:
# Discard the 'Unnamed: 0' column (presumably the row index written by an
# earlier to_csv call -- confirm). Rebinding with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError the way the in-place drop did.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# The sample submission supplies the seg_id values used as the row index.
submission = pd.read_csv(
    'D:/kaggle/earthquake/sample_submission.csv',
    index_col='seg_id',
)

# Empty float64 frame: one row per seg_id, one column per column of X.
# NOTE(review): despite the name, the rows are later filled from files under
# the 'test/' directory.
X_train = pd.DataFrame(
    index=submission.index,
    columns=X.columns,
    dtype=np.float64,
)

In [5]:
def add_trend_feature(arr, abs_values=False):
    """Return the least-squares slope of ``arr`` against its sample position.

    The samples are regressed on the positions ``0, 1, ..., len(arr) - 1``
    with an intercept, and the slope of that line is returned.

    Parameters
    ----------
    arr : array-like, 1-D
        Signal samples (list, ``numpy`` array or ``pandas`` Series).
    abs_values : bool, default False
        When true, the slope is computed on ``|arr|`` instead of ``arr``.

    Returns
    -------
    numpy.float64
        Slope of the fitted line. A single-sample input has no trend and
        yields ``0.0``. NaN values in ``arr`` propagate to the result.

    Raises
    ------
    ValueError
        If ``arr`` is empty.
    """
    # Convert to float64 *before* taking the absolute value: on narrow integer
    # dtypes such as int16, np.abs(-32768) wraps around and stays negative.
    values = np.asarray(arr, dtype=np.float64)
    if values.size == 0:
        raise ValueError("add_trend_feature requires at least one sample")
    if abs_values:
        values = np.abs(values)
    if values.size == 1:
        return np.float64(0.0)

    # Closed-form simple linear regression: cov(position, value) / var(position).
    # Centering both sides first keeps the sums well conditioned.
    positions = np.arange(values.size, dtype=np.float64)
    centered_positions = positions - positions.mean()
    centered_values = values - values.mean()
    return np.dot(centered_positions, centered_values) / np.dot(centered_positions, centered_positions)

def classic_sta_lta(x, length_sta, length_lta):
    """Ratio of a short-window to a long-window running mean of ``x**2``.

    Both running means are built from one cumulative sum of the squared
    samples. For positions earlier than a full window, the running sum so far
    is still divided by the full window length.

    Parameters
    ----------
    x : array-like, 1-D
        Signal samples; converted to float64 before squaring.
    length_sta : int
        Number of samples in the short window (must be >= 1).
    length_lta : int
        Number of samples in the long window (must be >= 1).

    Returns
    -------
    numpy.ndarray
        float64 array with the same length as ``x``. The first
        ``length_lta - 1`` entries are forced to 0.

    Raises
    ------
    ValueError
        If either window length is smaller than 1.
    """
    if length_sta < 1 or length_lta < 1:
        raise ValueError("length_sta and length_lta must both be >= 1")

    # Square in float64: squaring int16 samples in their own dtype overflows,
    # and the removed ``np.float`` alias is no longer needed for the cast.
    samples = np.asarray(x, dtype=np.float64)
    sta = np.cumsum(samples ** 2)
    lta = sta.copy()

    # Turn the cumulative sums into windowed sums, then into means.
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta

    # Zero the numerator until the long window has been filled once.
    sta[:length_lta - 1] = 0

    # Clamp the denominator to the smallest positive normal float so that a
    # silent stretch cannot cause a division by zero.
    dtiny = np.finfo(0.0).tiny
    lta[lta < dtiny] = dtiny
    return sta / lta

In [7]:
%%time
from tqdm import tqdm


for idx in tqdm(list(X_train.index)):
    file = 'D:/kaggle/earthquake/test/' + idx + '.csv'
    segment = pd.read_csv(file)    
    xc = pd.Series(segment.acoustic_data.values)
    zc = np.fft.fft(xc)
    
    X_train.loc[idx, 'mean'] = xc.mean()
    X_train.loc[idx, 'std'] =xc.std()
    X_train.loc[idx, 'max'] = xc.max()
    X_train.loc[idx, 'min'] = xc.min()   
    X_train.loc[idx, 'mean_change_abs'] = np.mean(np.diff(xc))
    X_train.loc[idx, 'mean_change_rate'] = np.mean(np.nonzero((np.diff(xc) / xc[:-1]))[0])
    X_train.loc[idx, 'abs_max'] = np.abs(xc).max()
    
    X_train.loc[idx, 'mean_first_50000'] = xc[:50000].mean()
    X_train.loc[idx, 'mean_last_50000'] = xc[-50000:].mean()
    X_train.loc[idx, 'mean_first_10000'] = xc[:10000].mean()
    X_train.loc[idx, 'mean_last_10000'] = xc[-10000:].mean()
    X_train.loc[idx, 'std_first_50000'] = xc[:50000].std()
    X_train.loc[idx, 'std_last_50000'] = xc[-50000:].std()
    X_train.loc[idx, 'std_first_10000'] = xc[:10000].std()
    X_train.loc[idx, 'std_last_10000'] = xc[-10000:].std()
    X_train.loc[idx, 'min_first_50000'] = xc[:50000].min()
    X_train.loc[idx, 'min_last_50000'] = xc[-50000:].min()
    X_train.loc[idx, 'min_first_10000'] = xc[:10000].min()
    X_train.loc[idx, 'min_last_10000'] = xc[-10000:].min()
    X_train.loc[idx, 'max_first_50000'] = xc[:50000].max()
    X_train.loc[idx, 'max_last_50000'] = xc[-50000:].max()
    X_train.loc[idx, 'max_first_10000'] = xc[:10000].max()
    X_train.loc[idx, 'max_last_10000'] = xc[-10000:].max()
    
    X_train.loc[idx, 'max_to_min'] = xc.max() / np.abs(xc.min())
    X_train.loc[idx, 'max_to_min_diff'] = xc.max() - np.abs(xc.min())
    X_train.loc[idx, 'count_big'] = len(xc[np.abs(xc) > 500])
    X_train.loc[idx, 'sum'] = xc.sum()
    
    X_train.loc[idx, 'mean_change_rate_first_50000'] = np.mean(np.nonzero((np.diff(xc[:50000]) / xc[:50000][:-1]))[0])
    X_train.loc[idx, 'mean_change_rate_last_50000'] = np.mean(np.nonzero((np.diff(xc[-50000:]) / xc[-50000:][:-1]))[0])
    X_train.loc[idx, 'mean_change_rate_first_10000'] = np.mean(np.nonzero((np.diff(xc[:10000]) / xc[:10000][:-1]))[0])
    X_train.loc[idx, 'mean_change_rate_last_10000'] = np.mean(np.nonzero((np.diff(xc[-10000:]) / xc[-10000:][:-1]))[0])
    
    X_train.loc[idx, 'q95'] = np.quantile(xc, 0.95)
    X_train.loc[idx, 'q99'] = np.quantile(xc, 0.99)
    X_train.loc[idx, 'q05'] = np.quantile(xc, 0.05)
    X_train.loc[idx, 'q01'] = np.quantile(xc, 0.01)
    X_train.loc[idx, 'abs_q95'] = np.quantile(np.abs(xc), 0.95)
    X_train.loc[idx, 'abs_q99'] = np.quantile(np.abs(xc), 0.99)
    X_train.loc[idx, 'abs_q05'] = np.quantile(np.abs(xc), 0.05)
    X_train.loc[idx, 'abs_q01'] = np.quantile(np.abs(xc), 0.01)
    
    X_train.loc[idx, 'trend'] = add_trend_feature(xc)
    X_train.loc[idx, 'abs_trend'] = add_trend_feature(xc, abs_values=True)
    X_train.loc[idx, 'abs_mean'] = np.abs(xc).mean()
    X_train.loc[idx, 'abs_std'] = np.abs(xc).std()

    X_train.loc[idx, 'mad'] = xc.mad()
    X_train.loc[idx, 'kurt'] = xc.kurtosis()
    X_train.loc[idx, 'skew'] = xc.skew()
    X_train.loc[idx, 'median'] = xc.median()
    
    X_train.loc[idx, 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()
    X_train.loc[idx, 'Hann_window_mean_50'] = (convolve(xc, hann(50), mode='same')/sum(hann(50))).mean()
    X_train.loc[idx, 'Hann_window_mean_150'] = (convolve(xc, hann(150), mode='same')/sum(hann(150))).mean()
    X_train.loc[idx, 'Hann_window_mean_1500'] = (convolve(xc, hann(1500), mode='same')/sum(hann(1500))).mean()
    X_train.loc[idx, 'Hann_window_mean_15000'] = (convolve(xc, hann(15000), mode='same')/sum(hann(15000))).mean()
    X_train.loc[idx, 'classic_sta_lta1_mean'] = classic_sta_lta(xc, 500, 10000).mean()
    X_train.loc[idx, 'classic_sta_lta2_mean'] = classic_sta_lta(xc, 5000, 100000).mean()
    X_train.loc[idx, 'classic_sta_lta3_mean'] = classic_sta_lta(xc, 3333, 6666).mean()
    X_train.loc[idx, 'classic_sta_lta4_mean'] = classic_sta_lta(xc, 10000, 25000).mean()
    X_train.loc[idx, 'classic_sta_lta5_mean'] = classic_sta_lta(xc, 50, 1000).mean()
    X_train.loc[idx, 'classic_sta_lta6_mean'] = classic_sta_lta(xc, 100, 5000).mean()
    X_train.loc[idx, 'classic_sta_lta7_mean'] = classic_sta_lta(xc, 333, 666).mean()
    X_train.loc[idx, 'classic_sta_lta8_mean'] = classic_sta_lta(xc, 4000, 10000).mean()
    
    X_train.loc[idx, 'Moving_average_700_mean'] = xc.rolling(window=700).mean().mean(skipna=True)
    X_train.loc[idx, 'Moving_average_1500_mean'] = xc.rolling(window=1500).mean().mean(skipna=True)
    X_train.loc[idx, 'Moving_average_3000_mean'] = xc.rolling(window=3000).mean().mean(skipna=True)
    X_train.loc[idx, 'Moving_average_6000_mean'] = xc.rolling(window=6000).mean().mean(skipna=True)
    ewma = pd.Series.ewm
    X_train.loc[idx, 'exp_moving_average_300_mean'] = (ewma(xc, span=300).mean()).mean(skipna=True)
    X_train.loc[idx, 'exp_moving_average_3000_mean'] = (ewma(xc, span=3000).mean()).mean(skipna=True)
    X_train.loc[idx, 'exp_moving_average_30000_mean'] = (ewma(xc, span=30000).mean()).mean(skipna=True)
    X_train.loc[idx, 'exp_moving_average_50000_mean'] = (ewma(xc, span=50000).mean()).mean(skipna=True)
    
    X_train.loc[idx, 'MA_700MA_std_mean'] = xc.rolling(window=700).std().mean()
    X_train.loc[idx,'MA_700MA_BB_high_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] + 2 * X_train.loc[idx, 'MA_700MA_std_mean']).mean()
    X_train.loc[idx,'MA_700MA_BB_low_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] - 2 * X_train.loc[idx, 'MA_700MA_std_mean']).mean()
    X_train.loc[idx, 'MA_400MA_std_mean'] = xc.rolling(window=400).std().mean()
    X_train.loc[idx,'MA_400MA_BB_high_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] + 2 * X_train.loc[idx, 'MA_400MA_std_mean']).mean()
    X_train.loc[idx,'MA_400MA_BB_low_mean'] = (X_train.loc[idx, 'Moving_average_700_mean'] - 2 * X_train.loc[idx, 'MA_400MA_std_mean']).mean()
    X_train.loc[idx, 'MA_1000MA_std_mean'] = xc.rolling(window=1000).std().mean()
    
    X_train.loc[idx, 'iqr'] = np.subtract(*np.percentile(xc, [25, 75]))
    X_train.loc[idx, 'q999'] = np.quantile(xc, 0.999)
    X_train.loc[idx, 'q001'] = np.quantile(xc, 0.001)
    X_train.loc[idx, 'ave10'] = stats.trim_mean(xc, 0.1)
    
    X_train.loc[idx, 'number_peaks_50p'] = feature_calculators.number_peaks(xc.values, 50)
    X_train.loc[idx, 'number_peaks_100p'] = feature_calculators.number_peaks(xc.values, 100)
    X_train.loc[idx, 'number_peaks_500p'] = feature_calculators.number_peaks(xc.values, 500)
    X_train.loc[idx, 'number_peaks_1000p'] = feature_calculators.number_peaks(xc.values, 1000)
    X_train.loc[idx, 'number_peaks_10000p'] = feature_calculators.number_peaks(xc.values, 10000)
    X_train.loc[idx, 'autocorrelaion_10'] = feature_calculators.autocorrelation(xc.values, 10)
    X_train.loc[idx, 'autocorrelaion_50'] = feature_calculators.autocorrelation(xc.values, 50)
    X_train.loc[idx, 'autocorrelaion_100'] = feature_calculators.autocorrelation(xc.values, 100)
    X_train.loc[idx, 'autocorrelaion_1000'] = feature_calculators.autocorrelation(xc.values, 1000)
    X_train.loc[idx, 'c3_5'] = feature_calculators.c3(xc.values, 5)
    X_train.loc[idx, 'c3_10'] = feature_calculators.c3(xc.values, 10)
    X_train.loc[idx, 'c3_100'] = feature_calculators.c3(xc.values, 100)
    X_train.loc[idx, 'binned_entropy_50'] = feature_calculators.binned_entropy(xc.values, 50)
    X_train.loc[idx, 'binned_entropy_80'] = feature_calculators.binned_entropy(xc.values, 80)
    X_train.loc[idx, 'binned_entropy_100'] = feature_calculators.binned_entropy(xc.values, 100)
    X_train.loc[idx, 'binned_entropy_500'] = feature_calculators.binned_entropy(xc.values, 500)
    X_train.loc[idx, 'mean_abs_change'] = feature_calculators.mean_abs_change(xc.values)
 
    # FFT transform values
    realFFT = np.real(zc)
    imagFFT = np.imag(zc)
    X_train.loc[idx, 'Rmean'] = realFFT.mean()
    X_train.loc[idx, 'Rstd'] = realFFT.std()
    X_train.loc[idx, 'Rmax'] = realFFT.max()
    X_train.loc[idx, 'Rmin'] = realFFT.min()
    X_train.loc[idx, 'Imean'] = imagFFT.mean()
    X_train.loc[idx, 'Istd'] = imagFFT.std()
    X_train.loc[idx, 'Imax'] = imagFFT.max()
    X_train.loc[idx, 'Imin'] = imagFFT.min()
    
    X_train.loc[idx, 'Rmean_last_5000'] = realFFT[-5000:].mean()
    X_train.loc[idx, 'Rstd_last_5000'] = realFFT[-5000:].std()
    X_train.loc[idx, 'Rmax_last_5000'] = realFFT[-5000:].max()
    X_train.loc[idx, 'Rmin_last_5000'] = realFFT[-5000:].min()
    X_train.loc[idx, 'Rmean_last_15000'] = realFFT[-15000:].mean()
    X_train.loc[idx, 'Rstd_last_15000'] = realFFT[-15000:].std()
    X_train.loc[idx, 'Rmax_last_15000'] = realFFT[-15000:].max()
    X_train.loc[idx, 'Rmin_last_15000'] = realFFT[-15000:].min() 
    
    for windows in [10, 50, 100, 500, 1000, 5000, 10000]:
        x_roll_std = xc.rolling(windows).std().dropna().values
        x_roll_mean = xc.rolling(windows).mean().dropna().values

        X_train.loc[idx, f'mean_roll_{windows}_mean'] = x_roll_mean.mean()
        X_train.loc[idx, f'mean_roll_{windows}_std'] = x_roll_mean.std()
        X_train.loc[idx, f'mean_roll_{windows}_max'] = x_roll_mean.max()
        X_train.loc[idx, f'mean_roll_{windows}_min'] = x_roll_mean.min()
        X_train.loc[idx, f'mean_roll_{windows}_q01'] = np.quantile(x_roll_mean, 0.01)
        X_train.loc[idx, f'mean_roll_{windows}_q05'] = np.quantile(x_roll_mean, 0.05)
        X_train.loc[idx, f'mean_roll_{windows}_median'] = np.median(x_roll_mean)
        X_train.loc[idx, f'mean_roll_{windows}_q95'] = np.quantile(x_roll_mean, 0.95)
        X_train.loc[idx, f'mean_roll_{windows}_q99'] = np.quantile(x_roll_mean, 0.99)
        X_train.loc[idx, f'mean_roll_{windows}_av_change'] = np.mean(np.diff(x_roll_mean))
        
        X_train.loc[idx, f'std_roll_{windows}_mean'] = x_roll_std.mean()
        X_train.loc[idx, f'std_roll_{windows}_std'] = x_roll_std.std()
        X_train.loc[idx, f'std_roll_{windows}_max'] = x_roll_std.max()
        X_train.loc[idx, f'std_roll_{windows}_min'] = x_roll_std.min()
        X_train.loc[idx, f'std_roll_{windows}_q01'] = np.quantile(x_roll_std, 0.01)
        X_train.loc[idx, f'std_roll_{windows}_q05'] = np.quantile(x_roll_std, 0.05)
        X_train.loc[idx, f'std_roll_{windows}_median'] = np.median(x_roll_std)
        X_train.loc[idx, f'std_roll_{windows}_q95'] = np.quantile(x_roll_std, 0.95)
        X_train.loc[idx, f'std_roll_{windows}_q99'] = np.quantile(x_roll_std, 0.99)
        X_train.loc[idx, f'std_roll_{windows}_av_change'] = np.mean(np.diff(x_roll_std))       
        
        
X_train.to_csv('more_features_test', index=True, header=True)

100%|████████████████████████████████████| 2624/2624 [8:04:15<00:00, 11.02s/it]


Wall time: 8h 4min 16s


In [8]:
# Persist the filled feature table, keeping both the row index and the header.
X_train.to_csv(
    'more_features_test.csv',
    header=True,
    index=True,
)