**References**:
* `JOSEPH JOSIA`'s notebook: <a href="https://www.kaggle.com/code/takanashihumbert/gait-single-models-inference/notebook" style="text-decoration:none">Gait Single Models [inference]</a>
* `JOSEPH JOSIA`'s post: <a href="https://www.kaggle.com/competitions/tlvmc-parkinsons-freezing-gait-prediction/discussion/415975" style="text-decoration:none">21st place solution: Conv1d with denoising</a>

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import gc
import glob
import pickle
import joblib

from tqdm.auto import tqdm
from time import perf_counter
from collections import defaultdict as dd
# from os.path import basename, dirname, join, exists

import math
import numpy as np
import pandas as pd

import pywt
import scipy

from scipy import signal
from scipy.signal import butter
from scipy.special import expit
from statsmodels.robust import mad

from functools import partial
from numpy.random import default_rng

from colorama import Fore, Back, Style

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import average_precision_score
from sklearn.cluster import KMeans

import catboost as ctb
import xgboost as xgb
import lightgbm as lgb

import tensorflow as tf
print(f"TF version: {tf.__version__}")
AUTO = tf.data.experimental.AUTOTUNE

TF version: 2.12.0


In [2]:
def madev(d, axis=None):
    """Mean absolute deviation of a signal.

    Parameters
    ----------
    d : array_like
        Input values.
    axis : int or None, optional
        Axis along which the deviation is computed. ``None`` (default)
        reduces over the flattened input and returns a scalar.

    Returns
    -------
    numpy.floating or numpy.ndarray
        Mean of ``|d - mean(d)|`` along ``axis``.
    """
    d = np.asarray(d)
    # keepdims=True keeps the reduced axis as size 1 so the centre broadcasts
    # back against `d` for any axis, not only when the shapes happen to align.
    center = np.mean(d, axis=axis, keepdims=True)
    return np.mean(np.absolute(d - center), axis=axis)

In [3]:
def wavelet_denoising_1(x, wavelet='db4', level=1):
    """Denoise a signal by hard-thresholding its wavelet detail coefficients.

    The signal is decomposed with ``pywt.wavedec`` in periodization mode.
    A noise scale is estimated as ``madev(coeffs[-level]) / 0.6745`` and the
    threshold is ``sigma * sqrt(2 * ln(len(x)))``. Every detail band (all
    coefficient arrays except the first) is hard-thresholded with that value
    before the signal is reconstructed.

    Parameters
    ----------
    x : sequence
        Input signal.
    wavelet : str
        Wavelet name passed to PyWavelets.
    level : int
        Selects, counting from the end of the coefficient list, the band
        used for the noise estimate.

    Returns
    -------
    The reconstructed signal, with the last sample dropped when ``len(x)``
    is odd (presumably to restore the input length after periodization --
    verify).
    """
    n_samples = len(x)
    bands = pywt.wavedec(x, wavelet, mode="per")

    noise_sigma = (1/0.6745) * madev(bands[-level])
    cutoff = noise_sigma * np.sqrt(2 * np.log(n_samples))

    # Approximation band is kept untouched; only detail bands are thresholded.
    denoised_bands = [bands[0]]
    for band in bands[1:]:
        denoised_bands.append(pywt.threshold(band, value=cutoff, mode='hard'))

    restored = pywt.waverec(denoised_bands, wavelet, mode='per')
    return restored[:-1] if n_samples % 2 == 1 else restored




def wavelet_denoising_2(x, wavelet='db4'):
    """Denoise a signal by zeroing the last two wavelet coefficient bands.

    The signal is decomposed with ``pywt.wavedec`` (periodization mode), the
    last two arrays of the coefficient list are multiplied by zero, and the
    signal is reconstructed from what remains.

    Parameters
    ----------
    x : sequence
        Input signal.
    wavelet : str
        Wavelet name passed to PyWavelets.

    Returns
    -------
    The reconstructed signal, with the last sample dropped when ``len(x)``
    is odd.
    """
    bands = pywt.wavedec(x, wavelet, mode="per")
    n_bands = len(bands)

    # Blank the two bands at the end of the decomposition list.
    for offset in (1, 2):
        bands[n_bands - offset] *= 0

    restored = pywt.waverec(bands, wavelet, mode='per')
    if len(x) % 2 == 1:
        return restored[:-1]
    return restored

In [4]:
def sgn(num):
    """Return the sign of ``num`` as a float: 1.0, 0.0 or -1.0.

    Anything that is neither greater than nor equal to zero (negative
    numbers, and also NaN) yields -1.0.
    """
    if num > 0.0:
        return 1.0
    return 0.0 if num == 0.0 else -1.0

In [5]:
def wavelet_denoising_3(x, wavelet='dB10'):
    """Denoise a signal with level-dependent soft thresholding (3 levels).

    The signal is decomposed into one approximation band and three detail
    bands. A base threshold ``lamda = sigma * sqrt(2 * ln(len(x)))`` is
    derived from the median absolute value of the finest detail band
    (``sigma = median(|cd1|) / 0.6745``). Each detail band is then
    soft-thresholded with ``lamda / log2(j + 1)`` for ``j = 1, 2, 3``
    (finest to coarsest), and the signal is reconstructed.

    Parameters
    ----------
    x : sequence
        Input signal.
    wavelet : str
        Wavelet name passed to PyWavelets.

    Returns
    -------
    The reconstructed signal, with the last sample dropped when ``len(x)``
    is odd.
    """
    # 3-level wavelet decomposition
    ca3, cd3, cd2, cd1 = pywt.wavedec(x, wavelet, level=3, mode="per")

    sigma = (1.0 / 0.6745) * np.median(np.abs(np.array(cd1)))
    lamda = sigma * math.sqrt(2.0 * math.log(float(len(x)), math.e))

    def _soft_threshold(band, thresh):
        # Shrink magnitudes at or above `thresh` towards zero by `thresh`;
        # everything else (including NaN, whose comparison is False) becomes 0.
        magnitude = np.abs(band)
        return np.where(magnitude >= thresh,
                        np.sign(band) * (magnitude - thresh),
                        0.0)

    # Vectorised replacement for the former per-coefficient Python loops.
    usecoeffs = [
        ca3,
        _soft_threshold(cd3, lamda / np.log2(4)),
        _soft_threshold(cd2, lamda / np.log2(3)),
        _soft_threshold(cd1, lamda / np.log2(2)),
    ]

    # Signal reconstruction
    result = pywt.waverec(usecoeffs, wavelet, mode="per")

    if len(x) % 2 == 1:
        result = result[:-1]
    return result

<br>

In [6]:
# Constants
# Root directory of the competition data (Kaggle input path) and its
# train/test sub-directories.
BASE_DIR = "/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction"
TRAIN_DIR = os.path.join(BASE_DIR, "train")
TEST_DIR = os.path.join(BASE_DIR, "test")

# True when exactly two CSV files exist one directory level below TEST_DIR.
# NOTE(review): presumably this detects the small public test set rather than
# the hidden one used at scoring time -- confirm.
IS_PUBLIC = len(glob.glob(os.path.join(TEST_DIR, "*/*.csv")))==2

In [7]:
IS_PUBLIC

True

In [8]:
class Config:
    """Static configuration shared by the data sequence and the model builders."""

    # Training directories, one per recording module (defog, tdcsfog).
    train_sub_dirs = [os.path.join(TRAIN_DIR, "defog"),
                      os.path.join(TRAIN_DIR, "tdcsfog")
                     ]
    
    # Metadata CSV files, one per recording module.
    metadata_paths = [os.path.join(BASE_DIR, "defog_metadata.csv"),
                      os.path.join(BASE_DIR, "tdcsfog_metadata.csv")
                     ]
    
    # Not referenced by the code visible in this notebook.
    # NOTE(review): presumably the number of cross-validation folds -- confirm.
    splits = 5
    # Number of windows per batch produced by FOGSequence.
    batch_size = 1024
    
    # Window geometry per module: total number of samples per window, how many
    # of them lie after the anchor row, and (derived) how many lie at or
    # before it.
    defog_window_size = 200
    defog_window_future = 50
    defog_window_past = defog_window_size - defog_window_future
    tdcsfog_window_size = 256
    tdcsfog_window_future = 64
    tdcsfog_window_past = tdcsfog_window_size - tdcsfog_window_future
    
    # Stride (in rows) between consecutive samples of a window; FOGSequence
    # slices with this step and scales its past/future padding by it.
    wx = 3
    
    # Model hyperparameters read by get_model.
    model_dropout = 0.2
    model_hidden = 128
    model_nblocks = 2
    
    # Adam learning rate used when compiling the model.
    lr = 0.0002
    # Not referenced by the code visible in this notebook.
    num_epochs = 5
    
    # Column order of the feature block and of the label block in the arrays
    # built by FOGSequence.
    feature_list = ['Time_frac', 'AccV', 'AccML', 'AccAP', 'V_ML', 'V_AP', 'ML_AP']
    label_list = ['StartHesitation', 'Turn', 'Walking', 'Normal']
    
    n_features = len(feature_list)
    n_labels = len(label_list)    
    
# Shared instance; also used as the default `cfg` argument of FOGSequence.
cfg = Config()

In [9]:
class FOGSequence(tf.keras.utils.Sequence):
    """Keras ``Sequence`` serving strided sensor windows (and labels) per batch.

    Every CSV in ``df_paths`` is read, feature-engineered and edge-padded so
    that each original row can anchor a complete window. The padded arrays
    are stacked into ``self.values``; ``self.mapping`` lists the row indices
    of ``self.values`` that may be used as anchors. A window for anchor
    ``idx`` is ``values[idx - past_pad : idx + future_pad + 1 : cfg.wx]``.

    Parameters
    ----------
    df_paths : iterable of str
        Paths of the recording CSV files.
    module : str
        ``'defog'`` selects the defog window settings of ``cfg``; any other
        value selects the tdcsfog settings.
    cfg : Config
        Configuration object (window sizes, stride, batch size, columns).
    split : str
        ``"train"`` draws random batches and returns ``(X, y)``;
        ``"test"`` returns features only and records ``self.Ids`` and
        ``self.Time_frac``; any other value (e.g. ``"valid"``) iterates
        sequentially and returns ``(X, y)``.
    """

    def __init__(self, df_paths, module, cfg=cfg, split="train"):

        _time = perf_counter()

        self.rng = default_rng(42)
        self.cfg = cfg
        self.split = split
        self.module = module

        # Number of raw rows a window reaches before / after its anchor row.
        if self.module == 'defog':
            self.past_pad = self.cfg.wx * (self.cfg.defog_window_past - 1)
            self.future_pad = self.cfg.wx * self.cfg.defog_window_future
        else:
            self.past_pad = self.cfg.wx * (self.cfg.tdcsfog_window_past - 1)
            self.future_pad = self.cfg.wx * self.cfg.tdcsfog_window_future

        if self.split == "test":
            # Filled by _read, one entry per row of every file, in file order.
            self.Ids = []
            self.Time_frac = []

        _values = [self._read(f) for f in df_paths]

        # Anchor indices: the non-padding rows of each file, expressed as
        # offsets into the concatenated array.
        self.mapping = []
        _length = 0
        for _value in _values:
            _shape = _value.shape[0]
            self.mapping.extend(range(_length + self.past_pad, _length + _shape - self.future_pad))
            _length += _shape

        self.values = np.concatenate(_values, axis=0)
        self.mapping = np.array(self.mapping)
        if self.split != "test":
            # Keep only valid and task rows
            _valid_pos = self.values[self.mapping, self.valid_position] > 0
            _task_pos = self.values[self.mapping, self.task_position] > 0
            self.mapping = self.mapping[_valid_pos & _task_pos]
        self.length = self.mapping.shape[0]

        if split == "train":
            print(f"Train Dataset of size {self.length:,} initialized in {perf_counter() - _time:.3f} secs!")

        if split == "valid":
            print(f"Valid Dataset of size {self.length:,} initialized in {perf_counter() - _time:.3f} secs!")
        gc.collect()

    def _read(self, path):
        """Load one recording CSV and return its padded float32 array.

        For the test split only the feature columns are returned and the
        row ids / time fractions are appended to ``self.Ids`` /
        ``self.Time_frac``. For other splits the array additionally holds
        the label columns followed by ``Valid`` and ``Task``; the CSV is
        expected to provide every column of ``cfg.label_list`` (and, for
        non-tdcsfog files, ``Valid`` and ``Task``).
        """
        _is_tdcs = os.path.basename(os.path.dirname(path)).startswith('tdcs')
        df = pd.read_csv(path)
        # Relative position of each row within its recording, in [0, 1].
        df['Time_frac'] = (df.index / df.index.max()).values

        if self.split == "test":
            # `os.path.basename` is used explicitly: the bare `basename`
            # import is commented out at the top of the notebook.
            _ids = os.path.basename(path).split('.')[0] + '_' + df.Time.astype(str)
            self.Ids.extend(_ids.tolist())
            self.Time_frac.extend(df.Time_frac.tolist())
            return self._df_to_array(df, self.cfg.feature_list)

        _cols = [*self.cfg.feature_list, *self.cfg.label_list, 'Valid', 'Task']
        self.valid_position = self.cfg.n_features + self.cfg.n_labels
        self.task_position = self.valid_position + 1

        if _is_tdcs:
            # Fill Valid and Task columns for tdcsfog
            df['Valid'] = 1
            df['Task'] = 1

        return self._df_to_array(df, _cols)

    def _df_to_array(self, df, cols):
        """Denoise, derive and standardise features, then edge-pad the rows.

        Mutates ``df`` in place. Returns ``df[cols]`` as float32 with
        ``past_pad`` rows prepended and ``future_pad`` rows appended (edge
        values repeated) so that windows can be sliced without bounds checks.
        """
        for _axis in ('AccV', 'AccML', 'AccAP'):
            df[_axis] = wavelet_denoising_2(df[_axis], wavelet='db4')

        # Pairwise differences of the denoised axes.
        df['V_ML'] = df['AccV'] - df['AccML']
        df['V_AP'] = df['AccV'] - df['AccAP']
        df['ML_AP'] = df['AccML'] - df['AccAP']

        # Per-recording standardisation (pandas sample std).
        for _col in ('AccV', 'AccML', 'AccAP', 'V_ML', 'V_AP', 'ML_AP'):
            df[_col] = (df[_col] - df[_col].mean()) / df[_col].std()

        _values = df[cols].values.astype(np.float32)

        return np.pad(_values, ((self.past_pad, self.future_pad), (0, 0)), 'edge')

    def __len__(self):
        """Number of batches needed to cover all anchor rows once."""
        return int(np.ceil(self.length / self.cfg.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx``: ``X`` for the test split, ``(X, y)`` otherwise."""
        if self.split == "train":
            # Only the train set has randomly selected batches. The size is
            # capped so sampling without replacement cannot fail when fewer
            # anchors than `batch_size` are available.
            _size = min(self.cfg.batch_size, self.length)
            _idxs = self.rng.choice(self.mapping, size=_size, replace=False)
        else:
            _idxs = self._get_indices(idx)

        # For test return only features
        if self.split == "test":
            return self._get_X(_idxs)

        # For train and val splits return y also
        return self._get_X_y(_idxs)

    def _get_indices(self, idx):
        """Anchor indices of the ``idx``-th sequential batch."""
        _low = idx * self.cfg.batch_size
        # Cap high at self.length so overflow does not occur
        _high = min(_low + self.cfg.batch_size, self.length)

        return self.mapping[_low:_high]

    def _window_size(self):
        """Number of samples per window for the configured module."""
        if self.module == 'defog':
            return self.cfg.defog_window_size
        return self.cfg.tdcsfog_window_size

    def _get_X(self, indices):
        """Stack the feature windows anchored at ``indices``.

        Returns a float32 array of shape
        ``(len(indices), window_size, cfg.n_features)``.
        """
        _X = np.empty((len(indices), self._window_size(), self.cfg.n_features), dtype=np.float32)
        for i, idx in enumerate(indices):
            _X[i] = self.values[idx - self.past_pad:idx + self.future_pad + 1:self.cfg.wx, :self.cfg.n_features]

        return _X

    def _get_X_y(self, indices):
        """Feature windows plus the label columns of the anchor rows."""
        _label_start = self.cfg.n_features
        _label_stop = _label_start + self.cfg.n_labels

        return self._get_X(indices), self.values[indices, _label_start:_label_stop]

```python
def get_model(module, checkpoint_path = None):
    """Build and compile a sequential Conv1D classifier for one module.

    This definition sits inside a markdown code fence; a later notebook cell
    defines ``get_model`` again.

    Parameters
    ----------
    module : str
        ``'defog'`` selects ``cfg.defog_window_size``; any other value
        selects ``cfg.tdcsfog_window_size``.
    checkpoint_path : str or None
        When given, weights are loaded from this path before compiling.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam (``cfg.lr``) and binary cross-entropy.
    """
    if module=='defog':
        window_size = cfg.defog_window_size
    else:
        window_size = cfg.tdcsfog_window_size
        
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(window_size, cfg.n_features), dtype='float32'))
    # Block i uses stride i+1 and kernel size 16-6*i, so the kernel shrinks
    # and the stride grows with depth.
    for i in range(cfg.model_nblocks):
        model.add(tf.keras.layers.Conv1D(filters=cfg.model_hidden, strides=i+1, kernel_size=16-6*i, padding="same"))
        model.add(tf.keras.layers.BatchNormalization())
        model.add(tf.keras.layers.ReLU())
        model.add(tf.keras.layers.Dropout(cfg.model_dropout))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    # One independent sigmoid output per entry of cfg.label_list.
    model.add(tf.keras.layers.Dense(cfg.n_labels, activation='sigmoid'))
    
    if checkpoint_path is not None:
        model.load_weights(checkpoint_path)
    
    model.compile(tf.keras.optimizers.Adam(learning_rate=cfg.lr), 
                  loss = tf.keras.losses.BinaryCrossentropy()
                 )
    
    return model



tf.keras.backend.clear_session()
get_model('defog').summary()

```

In [10]:
# Model adapted from https://keras.io/examples/timeseries/timeseries_classification_from_scratch/

def get_model(module, checkpoint_path = None):
    """Build and compile the three-branch Conv1D classifier for one module.

    Three Conv1D branches read the same input with kernel sizes 4, 8 and 16
    and strides 1, 3 and 5. Each is batch-normalised, and the branch outputs
    are concatenated along axis 1. ReLU, flatten, dropout and a sigmoid
    dense layer with ``cfg.n_labels`` units follow.

    Parameters
    ----------
    module : str
        ``'defog'`` selects ``cfg.defog_window_size``; any other value
        selects ``cfg.tdcsfog_window_size``.
    checkpoint_path : str or None
        When given, weights are loaded from this path before compiling.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam (``cfg.lr``) and binary cross-entropy.
    """
    if module == 'defog':
        window_size = cfg.defog_window_size
    else:
        window_size = cfg.tdcsfog_window_size
    
    
    inputs = tf.keras.Input(shape=(window_size, cfg.n_features), dtype='float32')
    
    
    # NOTE(review): the layers below are created in the order left, right, mid.
    # Creation order determines the auto-generated layer names and presumably
    # how saved weights are matched on load -- keep it unless verified.
    # Branch 1: short kernel, no temporal down-sampling.
    left = tf.keras.layers.Conv1D(filters=cfg.model_hidden, strides=1, kernel_size=4, padding="same", 
                                  kernel_regularizer=tf.keras.regularizers.l2(0.01))(inputs)
    left = tf.keras.layers.BatchNormalization()(left)
    
    
    # Branch 2: medium kernel, stride 3.
    right = tf.keras.layers.Conv1D(filters=cfg.model_hidden, strides=3, kernel_size=8, padding="same", 
                                   kernel_regularizer=tf.keras.regularizers.l2(0.01))(inputs)
    right = tf.keras.layers.BatchNormalization()(right)
    
    
    # Branch 3: long kernel, stride 5.
    mid = tf.keras.layers.Conv1D(filters=cfg.model_hidden, strides=5, kernel_size=16, padding="same", 
                                 kernel_regularizer=tf.keras.regularizers.l2(0.01))(inputs)
    mid = tf.keras.layers.BatchNormalization()(mid)
    
    
    # The branches differ in length along axis 1 and are concatenated on that axis.
    conb = tf.keras.layers.Concatenate(axis=1)([left, mid, right])
    conb = tf.keras.layers.ReLU()(conb)
    #conb = tf.keras.layers.GlobalAveragePooling1D()(conb)
    conb = tf.keras.layers.Flatten()(conb)
    # Hard-coded rate; currently the same value as cfg.model_dropout.
    conb = tf.keras.layers.Dropout(0.2)(conb)
    # One independent sigmoid output per entry of cfg.label_list.
    outputs = tf.keras.layers.Dense(cfg.n_labels, activation='sigmoid')(conb)
    
    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    if checkpoint_path is not None:
        model.load_weights(checkpoint_path)
        
    model.compile(tf.keras.optimizers.Adam(learning_rate=cfg.lr), 
                  loss = tf.keras.losses.BinaryCrossentropy(),
                 )
    
    return model

In [11]:
get_model('defog').summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 200, 7)]     0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 200, 128)     3712        ['input_1[0][0]']                
                                                                                                  
 conv1d_2 (Conv1D)              (None, 40, 128)      14464       ['input_1[0][0]']                
                                                                                                  
 conv1d_1 (Conv1D)              (None, 67, 128)      7296        ['input_1[0][0]']                
                                                                                              

<br>
<br>


The author used `StratifiedGroupKFold` for cross-validation, stratified on the labels and grouped by `Subject`. The CV score is around 0.32-0.34.


## conv1d model

In [None]:
# defog_ver23/
#    fold0_model_04.h5,  fold1_model_03.h5,  fold2_model_05.h5,  fold3_model_05.h5,  fold4_model_03.h5
# tdcsfog_ver23/
#    fold0_model_01.h5,  fold1_model_02.h5,  fold2_model_05.h5,  fold3_model_05.h5,  fold4_model_05.h5


# Checkpoint files per module. `glob.glob` returns paths in arbitrary
# filesystem order, so they are sorted to keep the ensemble order (and the
# displayed listing) reproducible between runs.
model_paths = {'defog': sorted(glob.glob("/kaggle/input/gait-cov1d-models/defog_ver23/*.h5")),
               'tdcsfog': sorted(glob.glob("/kaggle/input/gait-cov1d-models/tdcsfog_ver23/*.h5")),
              }
display(model_paths)

In [None]:
%%time

# `os.path.join` is spelled out: the bare `join` import is commented out in
# the imports cell, so calling `join(...)` raises NameError.
test_defog_paths = glob.glob(os.path.join(TEST_DIR, "defog/*.csv"))
test_tdcsfog_paths = glob.glob(os.path.join(TEST_DIR, "tdcsfog/*.csv"))

test_ds_dict = {'defog': FOGSequence(test_defog_paths, module='defog', split="test"), 
                'tdcsfog': FOGSequence(test_tdcsfog_paths, module='tdcsfog', split="test")
               }

# Get test predictions: average the predictions of all fold models per module
df_list = []
for module, test_ds in test_ds_dict.items():
    y_pred_list = []
    for model_path in tqdm(model_paths[module]):
        model = get_model(module, model_path)
        # No `batch_size` here: a keras Sequence yields its own batches
        # (cfg.batch_size windows each), so the argument does not apply.
        y_pred_list.append(model.predict(test_ds, verbose=0))

    if not y_pred_list:
        # Fail with a clear message instead of an obscure error on the mean below.
        raise FileNotFoundError(f"No model checkpoints found for module '{module}'")

    y_pred = np.mean(y_pred_list, axis=0)
    df_list.append(pd.DataFrame({'Id': test_ds.Ids, 
                                 'module': [module]*len(y_pred), 
                                 'Time_frac': test_ds.Time_frac, 
                                 'StartHesitation': y_pred[:,0], 
                                 'Turn': y_pred[:,1], 
                                 'Walking': y_pred[:,2]}
                               ))

In [None]:
# Concatenate the per-module prediction DataFrames
submission = pd.concat(df_list)

#submission.loc[((submission.Time_frac<0.01)|(submission.Time_frac>0.99))&(submission.module=='tdcsfog'), 'Walking'] = 0
#submission.loc[(submission.Time_frac<0.01)&(submission.module=='tdcsfog'), 'Turn'] = 0
#submission.loc[(submission.Time_frac<0.01)&(submission.module=='defog'), 'Turn'] = 0

# Only keep Ids in sample_submission; Ids without a prediction are filled with 0.0.
# `os.path.join` is spelled out because the bare `join` import is commented
# out in the imports cell.
sample_submission = pd.read_csv(os.path.join(BASE_DIR, "sample_submission.csv"))
submission = pd.merge(sample_submission[['Id']], submission, how='left', on='Id').fillna(0.0)
submission[['Id','StartHesitation','Turn','Walking']].to_csv("submission.csv", index=False, float_format='%.5f') # round to 5 decimal places while keeping point notation

display(submission.head())
display(submission.tail())

In [None]:
del test_ds_dict
gc.collect()