In [1]:
!pip install pandas==1.0.3

Collecting pandas==1.0.3
  Downloading pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0 MB)
[K     |████████████████████████████████| 10.0 MB 4.9 MB/s 
[31mERROR: pandas-profiling 2.5.0 has requirement pandas==0.25.3, but you'll have pandas 1.0.3 which is incompatible.[0m
[31mERROR: hypertools 0.6.2 has requirement scikit-learn<0.22,>=0.19.1, but you'll have scikit-learn 0.22.2.post1 which is incompatible.[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 0.25.3
    Uninstalling pandas-0.25.3:
      Successfully uninstalled pandas-0.25.3
Successfully installed pandas-1.0.3


In [2]:
import numpy as np 
import pandas as pd 
from pykalman import KalmanFilter
from tqdm.notebook import tqdm
from scipy import signal
import gc

# Number of consecutive samples per window; segmenter() and df_process() use it
# as the middle dimension when reshaping to (-1, LEN_, n_features).
LEN_ = 4000
# Module-level class count. NOTE(review): df_process() assigns its own local
# `n_classes` from the training data, so this value is not read by the
# functions visible in this notebook excerpt - confirm before relying on it.
n_classes = 11
# Parameters passed to scipy.signal.iirnotch(f0, Q, fs) in df_process().
fs = 10000.0  # Sample frequency (Hz)
f0 = 50.0  # Frequency to be removed from signal (Hz)
Q = 100  # Quality factor

In [3]:
%%time
def Kalman1D(observations,damping=1):
    """Smooth a 1-D series with a scalar Kalman filter.

    The filter uses a transition matrix of 1 and a transition covariance of
    0.1. ``damping`` is used both as the observation covariance and as the
    initial state covariance; the first observation seeds the initial state
    mean.

    Parameters
    ----------
    observations : indexable sequence of measurements (``observations[0]`` is read).
    damping : covariance value described above (default 1).

    Returns
    -------
    tuple
        ``(smoothed_state_means, smoothed_state_covariances)``, each flattened
        with ``reshape(-1)``.
    """
    start_value = observations[0]

    smoother = KalmanFilter(
        transition_matrices=1,
        transition_covariance=0.1,
        observation_covariance=damping,
        initial_state_mean=start_value,
        initial_state_covariance=damping,
    )

    means, covariances = smoother.smooth(observations)
    return means.reshape(-1), covariances.reshape(-1)

def segmenter(shape_train, shape_test):
    """Build a standardised per-sample segment-label channel for train and test.

    Each sample receives a numeric code (1, 2, 3, 5 or 10) according to the
    fixed block it falls in; samples outside every listed block keep 0. Both
    label vectors are then standardised with the mean and std of the TRAIN
    labels and reshaped to ``(-1, LEN_, 1)``.

    Parameters
    ----------
    shape_train, shape_test : only element ``[0]`` (the row count) is read.

    Returns
    -------
    tuple
        ``(train_labels, test_labels)`` as float arrays shaped ``(-1, LEN_, 1)``.
    """
    train_block, test_block = 500000, 100000

    # 1-based train batch number -> code. Per the original annotations:
    # 1 = "1 slow", 2 = "1 fast", and 3 / 5 / 10 are used as-is.
    train_codes = {
        1: 1, 2: 1,
        3: 2, 7: 2,
        4: 3, 8: 3,
        6: 5, 9: 5,
        5: 10, 10: 10,
    }
    train_labels = np.zeros((shape_train[0]))
    for batch_no, code in train_codes.items():
        train_labels[train_block*(batch_no-1):train_block*batch_no] = code

    # 0-based test sub-batch (SUB A .. SUB J in the original annotations) -> code.
    test_codes = [1, 3, 5, 1, 2, 10, 5, 10, 1, 3]
    test_labels = np.zeros((shape_test[0]))
    for sub_no, code in enumerate(test_codes):
        test_labels[test_block*sub_no:test_block*(sub_no+1)] = code
    # Samples 1,000,000 .. 2,000,000 of test all get code 1
    # (annotated "BATCHES 3/4 --> 1S" in the original).
    test_labels[1000000:2000000] = 1

    # Standardise both vectors with statistics from the train labels only.
    center = train_labels.mean(axis = 0)
    scale = train_labels.std(axis = 0)
    train_scaled = (train_labels - center)/scale
    test_scaled = (test_labels - center)/scale

    return train_scaled.reshape(-1, LEN_, 1), test_scaled.reshape(-1, LEN_, 1)

def roller(train, test, period = (15, 25, 50)):
    """Add per-group centred rolling means of ``signal`` to both frames.

    For each frame a ``group`` column is computed as
    ``ceil(time * 10000 / 500000)``, and for every window size ``num`` in
    ``period`` a column ``signal_{num}_mean`` is added holding the centred
    rolling mean of ``signal`` within each group. Positions where the centred
    window is incomplete are NaN.

    Both frames are modified IN PLACE and also returned.

    Parameters
    ----------
    train, test : DataFrames with ``time`` and ``signal`` columns.
    period : iterable of int window sizes. It is materialised once, so a
        one-shot iterable (e.g. a generator) is handled correctly.

    Returns
    -------
    tuple
        ``(train, test, name)`` where ``name`` is the list of added
        rolling-mean column names, in the order of ``period``.
    """
    # Materialise once: `period` is walked for every frame and again for the names.
    windows = list(period)

    for frame in (train, test):
        # Vectorised equivalent of the row-wise ceil(x*10000/500000).
        frame['group'] = np.ceil(frame['time']*10000/500000)

        for num in windows:
            rolled = frame.groupby('group')['signal'].rolling(num, center = True).mean()
            # Drop the group level so the result aligns on the frame's own index.
            frame[f'signal_{num}_mean'] = rolled.reset_index(level=0, drop=True)

    name = [f'signal_{num}_mean' for num in windows]

    return(train, test, name)

def df_process():
    """Load the drift-free CSVs and assemble the windowed model arrays.

    Steps, in order:
      1. read the train / test CSVs;
      2. build the segment-label channel with ``segmenter``;
      3. add rolling-mean columns with ``roller``;
      4. add ``signalQ`` (notch filter applied forward-backward via
         ``filtfilt``), ``signal_kalman`` (``Kalman1D`` smoothing) and
         ``signal2`` (squared signal);
      5. standardise all feature columns with TRAIN mean / std, then replace
         NaNs with 0;
      6. reshape into windows of ``LEN_`` samples and append the label channel
         as the last feature.

    Returns
    -------
    tuple
        ``(train_signal, train_target, test_signal, group)`` where
        ``train_signal`` / ``test_signal`` are shaped
        ``(-1, LEN_, n_features + 1)``, ``train_target`` is the
        ``pd.get_dummies`` encoding of ``open_channels`` shaped
        ``(-1, LEN_, n_classes)``, and ``group`` is a 1-D array of 1250
        integer ids in 0..4 (each id repeated 25 times, tiled 10 times).
        NOTE(review): ``group`` presumably serves as a grouping key for the
        windows downstream - confirm against the consumer.
    """
    kalman_damping = .0015

    train_df = pd.read_csv('/kaggle/input/data-without-drift/train_clean.csv')
    test_df = pd.read_csv('/kaggle/input/data-without-drift/test_clean.csv')

    # Segment-label channel; segmenter already standardises and windows it.
    label_train, label_test = segmenter(train_df.shape, test_df.shape)

    # Rolling-mean features; rolling_cols lists the added column names.
    train_df, test_df, rolling_cols = roller(train_df, test_df)

    print('Notch Filter...\n')
    notch_b, notch_a = signal.iirnotch(f0, Q, fs)
    for frame in (train_df, test_df):
        frame['signalQ'] = signal.filtfilt(notch_b, notch_a, frame['signal'])

    print('Kalman-train estimate...\n')
    train_df['signal_kalman'] = Kalman1D(train_df['signal'].values, kalman_damping)[0]

    print('Kalman-test estimate...\n')
    test_df['signal_kalman'] = Kalman1D(test_df['signal'].values, kalman_damping)[0]

    # Class count is taken from the training targets.
    class_count = train_df['open_channels'].unique().shape[0]

    gc.collect()

    for frame in (train_df, test_df):
        frame['signal2'] = frame['signal']**2

    feature_cols = ['signal_kalman', 'signalQ', 'signal', 'signal2'] + rolling_cols
    n_features = len(feature_cols)

    # Statistics come from train only and are applied to both frames.
    train_features = train_df[feature_cols]
    mu = train_features.mean(axis = 0)
    sigma = train_features.std(axis = 0)

    for frame in (train_df, test_df):
        # NaNs (e.g. incomplete rolling windows) become 0 after scaling.
        frame[feature_cols] = ((frame[feature_cols] - mu)/sigma).fillna(0)

    def windowed(frame, labels):
        # (rows, features) -> (windows, LEN_, features), label channel appended last.
        feats = frame[feature_cols].values.reshape(-1, LEN_, n_features)
        return np.concatenate((feats, labels), axis = 2)

    train_signal = windowed(train_df, label_train)
    test_signal = windowed(test_df, label_test)

    one_hot = pd.get_dummies(train_df['open_channels']).values
    train_target = one_hot.reshape(-1, LEN_, class_count)

    group = np.tile(np.repeat(np.arange(5), 25), 10)

    return(train_signal, train_target, test_signal, group)

# Build every array in a single pass. The recorded run of this cell reports
# a wall time of about 48 minutes, so avoid re-running it needlessly.
train_signal, train_target, test_signal, group = df_process()


Notch Filter...

Kalman-train estimate...

Kalman-test estimate...

CPU times: user 48min 17s, sys: 7.91 s, total: 48min 25s
Wall time: 48min 24s


In [4]:
# Persist each prepared array as "<variable name>.npy" in the working directory.
for npy_path, payload in (
    ("train_signal.npy", train_signal),
    ("train_target.npy", train_target),
    ("test_signal.npy", test_signal),
    ("group.npy", group),
):
    np.save(npy_path, payload, allow_pickle = True)