In [1]:
!pip install pandas==1.0.3

Collecting pandas==1.0.3
  Downloading pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0 MB)
[K     |████████████████████████████████| 10.0 MB 2.6 MB/s 
[31mERROR: pandas-profiling 2.5.0 has requirement pandas==0.25.3, but you'll have pandas 1.0.3 which is incompatible.[0m
[31mERROR: hypertools 0.6.2 has requirement scikit-learn<0.22,>=0.19.1, but you'll have scikit-learn 0.22.2.post1 which is incompatible.[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 0.25.3
    Uninstalling pandas-0.25.3:
      Successfully uninstalled pandas-0.25.3
Successfully installed pandas-1.0.3


In [2]:
import numpy as np 
import pandas as pd 
from pykalman import KalmanFilter
from tqdm.notebook import tqdm
from scipy import signal
import gc
from scipy import signal

# Notebook-wide configuration. The fs / f0 / Q values feed signal.iirnotch later on.
gc.collect()
LEN_ = 4000  # NOTE(review): not referenced by any cell visible in this notebook
n_classes = 11  # initial value only; overwritten from the training labels once the data is loaded
fs = 10000.0  # Sample frequency (Hz)
f0 = 50.0  # Frequency to be removed from signal (Hz)
Q = 100  # Quality factor

In [3]:
def sd(col, max_loss_limit=0.001, avg_loss_limit=0.001, na_loss_limit=0, n_uniq_loss_limit=0, fillna=0):
    """
    Downcast a numeric Series to the first candidate dtype that stays within the loss limits.

    Candidates are tried in order ('int8', 'int16', 'float16', 'int32', 'float32'); the
    integer candidates are only considered when the number of missing values does not
    exceed na_loss_limit. The first candidate whose four loss measures are all within
    their limits is returned; if none qualifies the input Series is returned as is.

    max_loss_limit - largest absolute difference allowed for any single value.
                     See https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations_on_decimal_values_in_[0,_1]
    avg_loss_limit - largest mean absolute difference allowed over the whole series.
    na_loss_limit - largest allowed change in the number of missing values.
    n_uniq_loss_limit - largest allowed change in the number of distinct values (NaN counted).
                        For high-cardinality float fields this can be relaxed, e.g. to
                        n_records * 0.01.
    fillna - value substituted for missing entries before a float -> int cast.
    """
    source_is_float = str(col.dtypes).startswith('float')
    original_na = col.isna().sum()
    original_uniq = col.nunique(dropna=False)

    if original_na <= na_loss_limit:
        candidates = ['int8', 'int16', 'float16', 'int32', 'float32']
    else:
        candidates = ['float16', 'float32']

    for target in candidates:
        if source_is_float and target.startswith('int'):
            # float -> int: fill the gaps and round first to minimise the casting error
            converted = col.fillna(fillna).round().astype(target)
        else:
            converted = col.astype(target)

        abs_error = (converted - col).abs()
        within_limits = (
            abs_error.max() <= max_loss_limit,
            abs_error.mean() <= avg_loss_limit,
            np.abs(original_na - converted.isna().sum()) <= na_loss_limit,
            np.abs(original_uniq - converted.nunique(dropna=False)) <= n_uniq_loss_limit,
        )
        # Kept as "<=" checks on purpose: a NaN loss compares False and rejects the candidate.
        if all(within_limits):
            return converted

    # no candidate dtype was acceptable
    return col


def reduce_mem_usage_sd(df, deep=True, verbose=False, obj_to_cat=False):
    """
    Downcast the columns of df in place and report the memory saved.

    Numeric columns (dtypes listed in `numerics`) are passed through sd(); object
    columns are converted to 'category' only when obj_to_cat is true. After each
    column a warning is printed if its NaN count or its number of distinct values
    changed. The overall reduction is always printed.

    df - DataFrame to shrink; it is modified and also returned.
    deep - forwarded to DataFrame.memory_usage for the before/after measurement.
    verbose - print a per-column dtype summary.
    obj_to_cat - convert object columns to categoricals.
    """
    numerics = ['int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64', 'float16', 'float32', 'float64']
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage(deep=deep).sum() / bytes_per_mb

    for col in tqdm(df.columns):
        # snapshot of the column before any conversion
        col_type = df[col].dtypes
        na_count = df[col].isna().sum()
        n_uniq = df[col].nunique(dropna=False)

        if col_type in numerics:
            df[col] = sd(df[col])
        elif obj_to_cat and (col_type == 'object'):
            df[col] = df[col].astype('category')

        if verbose:
            print(f'Column {col}: {col_type} -> {df[col].dtypes}, na_count={na_count}, n_uniq={n_uniq}')

        # sanity checks: the conversion must not have altered NaNs or cardinality
        new_na_count = df[col].isna().sum()
        if (na_count != new_na_count):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost na values. Before: {na_count}, after: {new_na_count}')
        new_n_uniq = df[col].nunique(dropna=False)
        if (n_uniq != new_n_uniq):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost unique values. Before: {n_uniq}, after: {new_n_uniq}')

    end_mem = df.memory_usage(deep=deep).sum() / bytes_per_mb
    percent = 100 * (start_mem - end_mem) / start_mem
    print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, percent))
    return df


In [4]:
%%time

def calc_gradients(s, n_grads=2, name = 'signal'):
    '''
    Calculate successive gradients of a 1-D signal. Returns the same number of samples.

    Column `<name>_grad_1` is np.gradient of the signal, `<name>_grad_2` is the
    gradient of that result, and so on, so column k is the k-th numerical derivative.

    The previous implementation differentiated the input again on every pass and
    only varied `edge_order`, which made every column a first derivative and
    raised for n_grads > 2 (np.gradient accepts edge_order 1 or 2 only).

    s - pandas Series or array-like of samples.
    n_grads - number of derivative orders to compute.
    name - prefix used for the output column names.

    Returns a DataFrame with n_grads columns and a fresh 0..n-1 index.
    '''
    columns = {}

    # Plain ndarray so every pass works on positions, independent of the Series index.
    g = np.asarray(s)
    for order in range(1, n_grads + 1):
        g = np.gradient(g)
        columns[name + '_grad_' + str(order)] = g

    return pd.DataFrame(columns)

def calc_low_pass(s, n_filts=3):
    '''
    Applies first-order Butterworth low pass filters to the signal.

    For each of n_filts cut-offs (log-spaced normalised frequencies from 10**-2
    to 10**-0.3) two columns are produced:
      lowpass_lf_<i> - one-pass signal.lfilter output, initial state scaled by the first sample
      lowpass_ff_<i> - forward-backward signal.filtfilt output

    s - pandas Series or array-like of samples.
    n_filts - number of cut-off frequencies.

    Returns a DataFrame with 2 * n_filts columns and a fresh 0..n-1 index.
    '''
    wns = np.logspace(-2, -0.3, n_filts)

    # Work on a positional ndarray: `s[0]` on a Series is a label lookup and fails
    # when the index does not contain the label 0.
    x = np.asarray(s)

    low_pass = pd.DataFrame()
    for lab, wn in enumerate(wns):
        b, a = signal.butter(1, Wn=wn, btype='low')
        zi = signal.lfilter_zi(b, a)
        # lfilter returns (filtered, final_state) when zi is given; keep the filtered samples.
        low_pass['lowpass_lf_' + str(lab)] = signal.lfilter(b, a, x, zi=zi*x[0])[0]
        low_pass['lowpass_ff_' + str(lab)] = signal.filtfilt(b, a, x)

    return low_pass

def Kalman1D(observations,damping=1):
    """
    Smooth a 1-D series with a scalar Kalman filter and return the smoothed data.

    observations - sequence of measurements; its first element seeds the state mean.
    damping - used both as the observation covariance and as the initial state covariance.

    Returns (smoothed_means, smoothed_covariances), each flattened to 1-D.
    """
    smoother = KalmanFilter(
        initial_state_mean=observations[0],
        initial_state_covariance=damping,
        observation_covariance=damping,
        transition_covariance=0.1,
        transition_matrices=1,
    )
    means, covariances = smoother.smooth(observations)
    return means.reshape((-1)), covariances.reshape((-1))

def segmenter(train, test):
    """
    Attach a hard-coded 'segment' label column to the train and test frames.

    Rows are labelled purely by position: the train frame in blocks of 500000
    rows, the test frame in blocks of 100000 rows for the first ten blocks and
    then one block covering rows 1000000-1999999. Rows outside every block keep
    the label 0. Both frames are modified in place and returned.
    """
    train_block = 500000
    test_block = 100000

    # segment label -> 1-based train batch numbers carrying that label
    train_layout = {
        1: (1, 2),     # 1 slow
        2: (3, 7),     # 1 fast
        3: (4, 8),     # 3
        5: (6, 9),     # 5
        10: (5, 10),   # 10
    }
    # segment label of test sub-batches A..J, in positional order
    test_layout = (1, 3, 5, 1, 2, 10, 5, 10, 1, 3)

    train_segments = np.zeros((train.shape[0]))
    for label, batches in train_layout.items():
        for batch in batches:
            train_segments[train_block * (batch - 1):train_block * batch] = label

    test_segments = np.zeros((test.shape[0]))
    for position, label in enumerate(test_layout):
        test_segments[test_block * position:test_block * (position + 1)] = label
    # batches 3/4 of the test set --> 1 slow
    test_segments[1000000:2000000] = 1

    train['segment'], test['segment'] = train_segments, test_segments
    return train, test

def roller(train, test, period = [25, 50, 100]):
    """
    Add per-group centred rolling-mean features, and signal/mean ratios, to both frames.

    A 'group' column is first derived from 'time' as ceil(time * 10000 / 500000).
    Then, for every window size in `period` and for each of the source columns
    'signal' and 'signal_kalman', two columns are added to each frame:
      <source>_<num>_mean_gauss - rolling mean of the source within each group
      percent_<num> / percent_kalman_<num> - source divided by that mean, with +/-inf replaced by NaN

    train, test - DataFrames holding 'time', 'signal' and 'signal_kalman'; modified in place.
    period - iterable of window sizes.

    Returns the tuple (train, test).
    """
    for frame in (train, test):
        # Same arithmetic as the former row-wise apply, done on the whole column at once.
        frame['group'] = np.ceil(frame['time'] * 10000 / 500000)

    # (source column, prefix of the ratio column)
    sources = (('signal', 'percent'), ('signal_kalman', 'percent_kalman'))

    for num in tqdm(period):
        for frame in (train, test):
            for source, ratio_prefix in sources:
                mean_col = f'{source}_{num}_mean_gauss'
                # NOTE(review): win_type='gaussian' is passed without a `std`; whether a
                # grouped rolling window honours win_type appears to depend on the pandas
                # version - confirm the weighting actually applied.
                frame[mean_col] = (
                    frame[['group', source]]
                    .groupby('group')[source]
                    .rolling(num, center = True, win_type='gaussian')
                    .mean()
                    # drop the group level so the result aligns on the frame's own index
                    .reset_index(0, drop=True)
                )
                frame[f'{ratio_prefix}_{num}'] = (frame[source] / frame[mean_col]).replace({np.inf: np.nan, -np.inf: np.nan})

    gc.collect()
    return(train, test)

def lagger(train, test, period = 5):
    """
    Add lagged and leading copies of 'signal', shifted within each 'group', to both frames.

    For num = 1..period each frame gains 'signal_<num>_lag' (shift by +num) and
    'signal_<num>_lead' (shift by -num). Shifts never cross a group boundary, so
    the first/last rows of every group are NaN.

    train, test - DataFrames with 'group' and 'signal' columns ('group' is created
                  by roller() in this notebook); modified in place.
    period - largest shift distance.

    Returns the tuple (train, test).
    """
    for num in tqdm(range(1, period + 1)):
        for frame in (train, test):
            by_group = frame[['group', 'signal']].groupby('group')['signal']
            # groupby.shift keeps the frame's own index, so the result is assigned
            # directly; resetting the index here would misalign rows whenever the
            # frame does not carry a default 0..n-1 index.
            frame[f'signal_{num}_lag'] = by_group.shift(num)
            frame[f'signal_{num}_lead'] = by_group.shift(-num)

    gc.collect()
    return(train, test)

CPU times: user 7 µs, sys: 2 µs, total: 9 µs
Wall time: 12.9 µs


In [5]:
# Feature-building pipeline: load the cleaned signals, derive filtered / smoothed /
# gradient / rolling / lag features for train and test, downcast, and split off the target.
gc.collect()
# Passed to Kalman1D as its `damping` argument below.
observation_covariance = .0015

train_clean = pd.read_csv('/kaggle/input/data-without-drift/train_clean.csv')
test_clean = pd.read_csv('/kaggle/input/data-without-drift/test_clean.csv')

print('Notch Filter...\n')
# Notch filter coefficients for frequency f0 at sample rate fs with quality factor Q.
b, a = signal.iirnotch(f0, Q, fs)

# Forward-backward filtering of the raw signal with the notch filter.
train_clean['signalQ'] = signal.filtfilt(b, a, train_clean.signal)
test_clean['signalQ'] = signal.filtfilt(b, a, test_clean.signal)

# Replaces the n_classes value set in the configuration cell with the observed label count.
n_classes = train_clean.open_channels.unique().shape[0]

print('Kalman-train estimate...\n')
pred_state, _ = Kalman1D(train_clean.signal.values, observation_covariance)

train_clean['signal_kalman'] = pred_state

# Ratio of raw signal to its Kalman-smoothed value; division by zero (+/-inf) becomes NaN.
train_clean['signal_percent_kalman'] = (train_clean['signal']/pred_state).replace({np.inf: np.nan, -np.inf: np.nan})

print('Kalman-test estimate...\n')
pred_state, _ = Kalman1D(test_clean.signal.values, observation_covariance)

test_clean['signal_kalman'] = pred_state

test_clean['signal_percent_kalman'] = (test_clean['signal']/pred_state).replace({np.inf: np.nan, -np.inf: np.nan})

print('Gradient and ll filter..\n')

# Column-wise concat: relies on the helper outputs sharing the frame's row index.
train_clean = pd.concat([train_clean, calc_gradients(train_clean.signal)], axis = 1)
train_clean = pd.concat([train_clean, calc_low_pass(train_clean.signal)], axis = 1)

test_clean = pd.concat([test_clean, calc_gradients(test_clean.signal)], axis = 1)
test_clean = pd.concat([test_clean, calc_low_pass(test_clean.signal)], axis = 1)

# Same gradient features, computed on the Kalman-smoothed signal.
train_clean = pd.concat([train_clean, calc_gradients(train_clean.signal_kalman, name = 'signal_kalman')], axis = 1)
test_clean = pd.concat([test_clean, calc_gradients(test_clean.signal_kalman, name = 'signal_kalman')], axis = 1)

# Add the position-based 'segment' label column.
print('Segment estimate...\n')
train_clean, test_clean = segmenter(train_clean, test_clean)
# NOTE(review): `categorical` is not used by any cell visible in this notebook.
categorical = ['segment']

# Add per-group rolling means and ratios (also creates the 'group' column used by lagger).
print('Rolling estimate...\n')
train_clean, test_clean = roller(train_clean, test_clean)

# Add per-group lag / lead copies of the signal.
print('Lag estimate...\n')
train_clean, test_clean = lagger(train_clean, test_clean)

gc.collect()

print('Signal2 estimate...\n')

# Squared raw signal.
train_clean['signal2'] = train_clean.signal**2
test_clean['signal2'] = test_clean.signal**2

print('Reduce memory')
train_clean = reduce_mem_usage_sd(train_clean)
test_clean = reduce_mem_usage_sd(test_clean)

print('Get label and group')
# One-hot encode the target before it is dropped from the feature frame.
train_target = pd.get_dummies(train_clean['open_channels']).values

# Remove non-feature columns so train and test end up with the same feature set.
train_clean = train_clean.drop(['time', 'open_channels', 'group'], axis = 1)
test_clean = test_clean.drop(['time', 'group'], axis = 1)

# Ids 0-4, each repeated for 100000 consecutive samples, with that pattern tiled
# 10 times (5,000,000 labels in total).
group = np.tile(np.repeat(np.array(range(5)), 100000), 10)



Notch Filter...

Kalman-train estimate...

Kalman-test estimate...

Gradient and ll filter..

Segment estimate...

Rolling estimate...



HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Lag estimate...



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Signal2 estimate...

Reduce memory


HBox(children=(FloatProgress(value=0.0, max=41.0), HTML(value='')))


Mem. usage decreased from 1564.03 Mb to 1444.82 Mb (7.6% reduction)


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Mem. usage decreased from 610.35 Mb to 576.02 Mb (5.6% reduction)
Get label and group


In [6]:
# Train and test must expose the same columns in the same order before they are exported.
assert list(train_clean.columns) == list(test_clean.columns), 'Columns error'

In [7]:
# Persist the engineered features (CSV, without the index column) plus the one-hot
# target and the group ids (NumPy .npy files) to the working directory.
train_clean.to_csv('train_clean.csv', index = False)
test_clean.to_csv('test_clean.csv', index = False)
np.save("train_target.npy", train_target, allow_pickle = True)
np.save("group.npy", group, allow_pickle = True)