In [1]:
from data_utils import load_data
import sklearn.preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
from scipy.signal import butter, lfilter, resample

In [8]:
def slidingWindow(sequence, window , step=1):
    for i in range(0,sequence.shape[1]-window+1,step):
        yield sequence[:,i:i+window]

In [9]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a


def butter_bandpass_filter(data, lowcut=0.1, highcut=180.0, fs=400.0, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

In [10]:
def upper_right_triangle(matrix):
    accum = []
    for i in range(matrix.shape[0]):
        for j in range(i+1, matrix.shape[1]):
            accum.append(matrix[i, j])

    return np.array(accum)

In [11]:
def fft(time_data):
    return np.log10(np.absolute(np.fft.rfft(time_data, axis=1)[:, 1:48]))

In [12]:
def transform(data):
    fft_out = fft(data)
    freq_corr_out = freq_corr(fft_out)
    time_corr_out = time_corr(data)
    return np.concatenate((fft_out.ravel(), freq_corr_out, time_corr_out))


In [13]:
def time_corr(time_data):
    resampled = resample(time_data, 400, axis=1) if time_data.shape[-1] > 400 else time_data
    scaled = sklearn.preprocessing.scale(resampled, axis=0)
    corr_matrix = np.corrcoef(scaled)
    eigenvalues = np.absolute(np.linalg.eig(corr_matrix)[0])
    corr_coefficients = upper_right_triangle(corr_matrix) # custom func
    return np.concatenate((corr_coefficients, eigenvalues))


In [14]:
def freq_corr(fft_data):
    scaled = sklearn.preprocessing.scale(fft_data, axis=0)
    corr_matrix = np.corrcoef(scaled)
    eigenvalues = np.absolute(np.linalg.eig(corr_matrix)[0])
    eigenvalues.sort()
    corr_coefficients = upper_right_triangle(corr_matrix)
    return np.concatenate((corr_coefficients, eigenvalues))

In [4]:
%time X, y, files = load_data('train_1', sample=True)

loading 20 files with 4 processes.
Wall time: 21 s


In [5]:
X.shape

(20L, 16L, 240000L)

In [15]:
filtered = butter_bandpass_filter(X[0])

In [16]:
second_clips = np.array(list(slidingWindow(filtered,400,400)))

In [17]:
%time fft_features = np.array(map(fft, second_clips))
%time time_corr_features = np.array(map(time_corr, second_clips))
%time freq_corr_features = np.array([freq_corr(fft(clip)) for clip in second_clips])

Wall time: 47 ms
Wall time: 361 ms
Wall time: 306 ms


In [18]:
fft_features.shape, time_corr_features.shape, freq_corr_features.shape

((600L, 16L, 47L), (600L, 136L), (600L, 136L))

In [19]:
%time features = np.array(map(transform, second_clips))

Wall time: 585 ms


In [20]:
%time features_listcomp = np.array([transform(clip) for clip in second_clips])

Wall time: 625 ms


In [None]:
%time X, y, files = load_data('train_1')

loading 1297 files with 4 processes.


In [27]:
from scipy import signal

In [28]:
def firwin(data, f=400):
    nyq = f / 2.0
    cutoff = min(f, nyq-1)
    h = signal.firwin(numtaps=101, cutoff=cutoff, nyq=nyq)
    
def firwin_filter(data, f):
    h = firwin(data, f)
    return lfilter(h, 1.0, data)

In [24]:
def extract_features(data):
#     filtered = butter_bandpass_filter(data)
    second_clips = np.array(list(slidingWindow(data,400,400)))
    return np.array([transform(clip) for clip in second_clips])

In [25]:
check = [extract_features(data) for data in X]

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').