In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

In [2]:
from scipy.fftpack import fft

In [3]:
import gc

In [4]:
# Base directory for the data files; the train and test folders sit directly under it.
PATH = Path(".")
TRAIN_PATH = PATH.joinpath("train")
TEST_PATH = PATH.joinpath("test")

In [5]:
# Load the first 150000000 rows of the training CSV, parsing every column as 64-bit float.
# `np.float` was only an alias of the builtin `float` and has been removed from NumPy;
# `np.float64` names the same dtype explicitly and works on every NumPy version.
sample = pd.read_csv(TRAIN_PATH/"train.csv", dtype=np.float64, nrows=150000000)

In [6]:
# First 1500000 values of the `acoustic_data` column, used as a small working sample.
sample_15_x = sample['acoustic_data'].iloc[:1500000]

In [7]:
def segmentation(data, skipping=150000, window_size=150000):
    """Return the start offsets of the windows to cut out of ``data``.

    The length of ``data`` is first rounded down to a multiple of
    ``window_size``; starts are then generated from 0 in steps of
    ``skipping`` for as long as a full window still fits inside that
    rounded-down length.

    Parameters
    ----------
    data : sized
        Anything supporting ``len()``.
    skipping : int
        Distance between consecutive window starts.
    window_size : int
        Number of samples per window.

    Returns
    -------
    list of int
        Window start offsets, in increasing order (empty if no window fits).
    """
    total = len(data)
    usable = total - total % window_size
    # Exclusive upper bound for a start offset: the last window must end within `usable`.
    start_limit = usable - window_size + 1
    return [offset for offset in range(0, start_limit, skipping)]

In [8]:
# (column label, quantile level) pairs shared by every statistics group.
_QUANTILE_LEVELS = (("95", 0.95), ("50", 0.50), ("10", 0.10), ("5", 0.05))


def _chunk_stats(values, mean_name, std_name, quantile_suffix):
    """Return mean, std and the 95/50/10/5 % quantiles of ``values`` as a dict."""
    stats = {mean_name: values.mean(), std_name: values.std()}
    for label, level in _QUANTILE_LEVELS:
        stats[label + "_quantile" + quantile_suffix] = np.quantile(values, level)
    return stats


def _rolling_stats(window, span):
    """Summarise the rolling mean/std of ``window`` (and their first differences) for one span."""
    prefix = "rw{}_".format(span)
    roller = window.rolling(span)
    # dropna() removes the leading positions where the rolling window is not yet full.
    roll_mean = roller.mean().dropna()
    roll_std = roller.std().dropna()
    mean_diff = roll_mean.diff().dropna()
    std_diff = roll_std.diff().dropna()
    return {
        prefix + "mean_mean": roll_mean.mean(),
        prefix + "mean_std": roll_mean.std(),
        prefix + "std_mean": roll_std.mean(),
        prefix + "std_std": roll_std.std(),
        prefix + "mean_diff_mean": mean_diff.mean(),
        prefix + "mean_diff_std": mean_diff.std(),
        prefix + "std_diff_mean": std_diff.mean(),
        prefix + "std_diff_std": std_diff.std(),
    }


def basic_feature(data, skipping=150000, window_size=150000):
    """Build one row of summary statistics per window of ``data``.

    Window starts come from ``segmentation(data, skipping, window_size)``.
    All slicing is positional (``.iloc``), matching the offsets that
    ``segmentation`` derives from ``len(data)``.

    Parameters
    ----------
    data : pandas.Series
        The signal to summarise (``.rolling`` is called on its slices).
    skipping : int
        Distance between consecutive window starts.
    window_size : int
        Number of samples per window.

    Returns
    -------
    pandas.DataFrame
        One float64 row per window, indexed 0..n-1, with these columns:

        * ``mean``, ``std``, ``{95,50,10,5}_quantile`` - the whole window;
        * ``first_5000_mean``, ``first_5000_std``, ``*_quantile_f5000`` -
          the 5000 samples beginning at the window start;
        * ``mid_5000_mean``, ``mid_5000_std``, ``*_quantile_m5000`` -
          the 5000 samples that follow those (offsets 5000..9999 from the
          window start);
        * ``last_5000_mean``, ``last_5000_std``, ``*_quantile_l5000`` -
          the final 5000 samples of the window itself;
        * ``rw{100,1000,5000}_*`` - mean/std of the rolling mean and rolling
          std for each span, plus mean/std of their first differences. Each
          span has its own column prefix.

        An empty frame is returned when no full window fits.
    """
    chunk = 5000  # sub-chunk length; it is baked into the column names above
    rows = []
    for start in segmentation(data, skipping, window_size):
        stop = start + window_size
        window = data.iloc[start:stop]

        row = _chunk_stats(window, "mean", "std", "")
        row.update(_chunk_stats(
            data.iloc[start:start + chunk],
            "first_5000_mean", "first_5000_std", "_f5000"))
        row.update(_chunk_stats(
            data.iloc[start + chunk:start + 2 * chunk],
            "mid_5000_mean", "mid_5000_std", "_m5000"))
        # Tail of *this* window, not of the whole series; clamp at `start` so a
        # window shorter than `chunk` never reaches back into the previous one.
        row.update(_chunk_stats(
            data.iloc[max(start, stop - chunk):stop],
            "last_5000_mean", "last_5000_std", "_l5000"))

        for span in (100, 1000, 5000):
            row.update(_rolling_stats(window, span))
        rows.append(row)

    if not rows:
        return pd.DataFrame(index=range(0), dtype=np.float64)
    # Pass the column order explicitly so it never depends on how pandas orders dict keys.
    return pd.DataFrame(rows, columns=list(rows[0]), dtype=np.float64)

In [9]:
%%time
test = basic_feature(sample_15_x, 15000)

CPU times: user 7.25 s, sys: 90.6 ms, total: 7.34 s
Wall time: 7.34 s


In [10]:
# Display the feature table assigned to `test` in the previous cell.
test

NameError: name 'test' is not defined

In [None]:
# Rolling view with a 100-sample span over the first 150000 samples of `sample_15_x`.
rolling_window_100 = sample_15_x.iloc[:150000].rolling(window=100)

In [None]:
# Mean of the rolling standard deviation, ignoring the positions where the window is not yet full.
(
    rolling_window_100
    .std()
    .dropna()
    .mean()
)

In [None]:
# Force a full garbage-collection pass; the cell displays the count returned by gc.collect().
gc.collect()

In [None]:
# Overlay the `acoustic_data` and `time_to_failure` columns of `sample_500` on a shared
# x-axis with independent y-axes, keeping every 10th row to lighten the plot.
# Drawing goes through the Axes objects directly rather than pyplot's implicit "current axes".
# NOTE(review): `sample_500` is not defined in any cell shown here - presumably it was
# created in a removed cell; define it before running.
fig, ax1 = plt.subplots(figsize=(20, 8))
ax1.plot(sample_500["acoustic_data"].values[::10], color='b')
ax1.set_xlabel("row (every 10th)")
ax1.set_ylabel("acoustic_data", color='b')
ax2 = ax1.twinx()
ax2.plot(sample_500["time_to_failure"].values[::10], color='r')
ax2.set_ylabel("time_to_failure", color='r');

In [None]:
# Discrete Fourier transform of the `[:150000]` slice of the `acoustic_data` column.
# NOTE(review): `sample_500` is not defined in any cell shown here - confirm where it comes from.
fft_sample_500 = fft(sample_500['acoustic_data'][:150000])

In [None]:
# Plot the first 150000-row block of `sample_500`. The slice stops at 150000 so it does
# not overlap the 150000:300000 block plotted in the next cell.
# NOTE(review): `sample_500` is not defined in any cell shown here - confirm where it comes from.
plt.plot(sample_500[:150000])

In [None]:
# Plot the 150000:300000 slice of `sample_500` (the block after the one plotted in the previous cell).
plt.plot(sample_500[150000:300000])

In [None]:
# Load a single segment file from the test directory.
test_set = pd.read_csv(TEST_PATH.joinpath("seg_00184e.csv"))