In [74]:
import os, sys, glob
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt

import tqdm


%config InlineBackend.figure_format = 'retina'
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
# Base directory used to build every data path in this notebook
# (inputs and outputs all live under f'{ROOT_PATH}/data/').
ROOT_PATH = '..'

In [146]:
def is_quake_in_chunk(chunk_df):
    """Report whether `time_to_failure` jumps inside the chunk.

    Args:
        chunk_df: Object exposing a `time_to_failure` attribute holding a
            1-D numeric sequence (e.g. a DataFrame column).

    Returns:
        A truthy NumPy boolean when at least one pair of consecutive
        values differs by more than 0.1 in absolute value, falsy otherwise
        (including when the chunk has fewer than two rows).
    """
    step_sizes = np.abs(np.diff(chunk_df.time_to_failure))
    return np.any(step_sizes > 0.1)

def progress_bar(current_value, max_value, size=50):
    """Print a single-line text progress bar that overwrites itself.

    Args:
        current_value: Zero-based index of the step just completed; the
            completed fraction is ``(current_value + 1) / max_value``.
        max_value: Total number of steps.
        size: Width of the bar in characters.

    Raises:
        ZeroDivisionError: If ``max_value`` is 0.
    """
    # Clamp to [0, 1]: without this, a counter that overshoots max_value
    # draws more than `size` characters and reports more than 100%.
    prog = min(max((current_value + 1) / max_value, 0.0), 1.0)
    left = '#' * int(prog * size + 0.5)  # +0.5 rounds to nearest when casting to int
    right = '-' * (size - len(left))
    # '\r' with end='' rewinds to the start of the line so the next call
    # redraws the bar in place instead of printing a new line.
    print('\r[{}{}] {:.1f}%'.format(left, right, prog * 100), end='')

In [3]:
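# NOTE: one-off code kept for reference only (not executed). It is what wrote
# data/event_index.csv, the file loaded in the next cell.
# from scipy.signal import argrelextrema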

# df_generator = pd.read_csv(f'{ROOT_PATH}/data/train.csv', chunksize=100000)
# index_minima = []
# for df in df_generator:
#     index_minima.append(df.iloc[argrelextrema(df.time_to_failure.values, np.less_equal, order=2)[0]].index.tolist()[:-1])


# from scipy.signal import argrelextrema
# index_minima = df.iloc[argrelextrema(df.time_to_failure.values, np.less_equal, order=2)[0]].index.tolist()[:-1]
# event_index = np.array([x[0] for x in index_minima if len(x)>0])
# np.savetxt(f"{ROOT_PATH}/data/event_index.csv", event_index, delimiter=",")

In [110]:
# Load the saved event indices as integers.
# NOTE(review): presumably these are row positions in train.csv where
# time_to_failure hits a local minimum (per the commented-out code that wrote
# this file) -- confirm.
event_index = np.genfromtxt(f"{ROOT_PATH}/data/event_index.csv", dtype='int')

```python
event_index = array([  5656573,  50085877, 104677355, 138772452, 187641819, 218652629,
       245829584, 307838916, 338276286, 375377847, 419368879, 461811622,
       495800224, 528777114, 585568143, 621985672])
```

```python
test_df.shape = (150000, 1)
```

## Split features from target

In [149]:
resample_window_size = 15  # consecutive samples averaged into one feature
chunk_len = 150000  ## equal to test len
quake_counts = 16  # NOTE(review): presumably the number of entries in event_index -- confirm
# Expected number of usable chunks (last chunk and quake chunks are discarded).
# It is only used to scale the progress bar: the output arrays are sized from
# the chunks actually kept, so a wrong estimate can neither raise an
# IndexError nor leave all-zero rows in the training data.
chunks_count = 4195 - quake_counts - 1
chunk_index = 0
number_of_feature = chunk_len // resample_window_size

# Window id of every row position in a full chunk (0,...,0,1,...,1,2,...).
# Built once and reused as a positional group key, which replaces the per-row
# lambda and the deprecated `axis` argument of groupby.
window_ids = np.arange(chunk_len) // resample_window_size

feature_rows = []
target_values = []

df_generator = pd.read_csv(f'{ROOT_PATH}/data/train.csv', chunksize=chunk_len)
for df_chunk in df_generator:
    # Skip chunks that are shorter than chunk_len (they cannot fill every
    # window) and chunks in which is_quake_in_chunk detects a jump.
    if len(df_chunk) != chunk_len or is_quake_in_chunk(df_chunk):
        continue

    # Mean of every column over each window of resample_window_size rows.
    df_chunk_resampled = df_chunk.groupby(window_ids).mean()

    feature_rows.append(df_chunk_resampled.acoustic_data.values)
    target_values.append(np.mean(df_chunk_resampled.time_to_failure.values))

    # progress_bar expects a zero-based index, so report before incrementing.
    progress_bar(chunk_index, chunks_count)
    chunk_index = chunk_index + 1

# reshape(-1, ...) keeps the 2-D shape even when no chunk was kept.
X_train = np.array(feature_rows, dtype=float).reshape(-1, number_of_feature)
y_train = np.array(target_values, dtype=float).reshape(-1, 1)
del feature_rows, target_values  # release the per-chunk copies

if chunk_index != chunks_count:
    print(f'\nKept {chunk_index} chunks, but {chunks_count} were expected.')

[##################################################] 100.0%

In [153]:
# Shape check: one row per kept chunk, one column per resampled window
# (number_of_feature columns).
X_train.shape

(4178, 10000)

In [154]:
# Shape check: one target value (mean time_to_failure) per kept chunk.
y_train.shape

(4178, 1)

## Save the training dataset

In [157]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the chunks; the fixed random_state makes the split
# reproducible across runs.
(
    X_train_tofile,
    X_test_tofile,
    y_train_tofile,
    y_test_tofile,
) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [170]:
import h5py

# Write the training split to HDF5 as datasets "X_train" and "y_train".
# The declared data_type is now passed to create_dataset; it was previously
# assigned but never used.
data_type = 'float64'
with h5py.File(f'{ROOT_PATH}/data/train_data.h5', "w") as out:  # "w" overwrites an existing file
    out.create_dataset("X_train", data=X_train_tofile, dtype=data_type)
    out.create_dataset("y_train", data=y_train_tofile, dtype=data_type)

In [171]:
# Write the held-out split to HDF5 as datasets "X_test" and "y_test".
# The declared data_type is now passed to create_dataset; it was previously
# assigned but never used.
data_type = 'float64'
with h5py.File(f'{ROOT_PATH}/data/test_data.h5', "w") as out:  # "w" overwrites an existing file
    out.create_dataset("X_test", data=X_test_tofile, dtype=data_type)
    out.create_dataset("y_test", data=y_test_tofile, dtype=data_type)