In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import os

import pickle

from IPython.core.debugger import set_trace
from tqdm import tqdm

from IPython.core.debugger import set_trace
import gc
from sklearn.model_selection import StratifiedGroupKFold
# Install tsflex and seglearn
!pip install tsflex --no-index --find-links=file:///kaggle/input/tsflex
!pip install seglearn --no-index --find-links=file:///kaggle/input/segalearn


from seglearn.feature_functions import base_features, emg_features

from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
from tsflex.features.integrations import seglearn_feature_dict_wrapper


Looking in links: file:///kaggle/input/tsflex
Processing /kaggle/input/tsflex/tsflex-0.3.0-py3-none-any.whl
Installing collected packages: tsflex
Successfully installed tsflex-0.3.0
[0mLooking in links: file:///kaggle/input/segalearn
Processing /kaggle/input/segalearn/seglearn-1.2.5-py3-none-any.whl
Installing collected packages: seglearn
Successfully installed seglearn-1.2.5
[0m

In [2]:
# Names loaded here are later passed to make_df(), which drops the matching columns.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only keep
# this if the attached dataset is trusted.
with open("/kaggle/input/black-list-v8-1/black_list.pkl", "rb") as black_list_file:
    black_list = pickle.load(black_list_file)

In [3]:
# Raw competition tables.
defog_metadata = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv")
subjects = pd.read_csv("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/subjects.csv")

# Subjects: a missing Visit becomes 1, keep one row per (Subject, Visit) pair,
# and replace missing UPDRS-III scores with 0.
subjects["Visit"] = subjects["Visit"].fillna(1)
subjects = subjects.drop_duplicates(subset=["Subject", "Visit"])
for score_col in ("UPDRSIII_On", "UPDRSIII_Off"):
    subjects[score_col] = subjects[score_col].fillna(0)

# Recording metadata: Medication is encoded as 1 for "on" and 0 for any other value.
defog_metadata["Medication"] = np.where(defog_metadata["Medication"] == "on", 1, 0)

# Lookup tables keyed by recording Id (the recording file name without extension
# is used as the key in make_df).
recording_ids = defog_metadata["Id"]
defog_subject_dict = dict(zip(recording_ids, defog_metadata["Subject"]))
defog_medication_dict = dict(zip(recording_ids, defog_metadata["Medication"]))
defog_Id_Visit = dict(zip(recording_ids, defog_metadata["Visit"]))
# Every defog recording gets a constant 0.0 here.
defog_Id_Test = dict(zip(recording_ids, np.zeros(defog_metadata.shape[0])))

In [4]:
def fix_invalid_events(df):
    """Zero the event labels on rows that are flagged as unusable.

    A row is unusable when its ``Valid`` or its ``Task`` value equals False.
    On those rows ``StartHesitation``, ``Turn`` and ``Walking`` are set to 0.

    The frame is modified in place and the same object is returned.
    """
    # The mask only depends on Valid/Task, so it is computed once up front.
    unusable_rows = (df["Valid"] == False) | (df["Task"] == False)
    for label_col in ("StartHesitation", "Turn", "Walking"):
        df.loc[unusable_rows, label_col] = 0
    return df

def min_max_feature(df, feature):
    """Add a min-max scaled copy of ``feature`` to ``df``.

    The new column is named ``precent_prograss_<feature>`` (spelling kept as-is
    because the column name is part of the produced dataset) and holds
    ``(x - min) / (max - min)`` computed over the whole column. A constant
    column therefore yields NaN (0 / 0).

    The frame is modified in place and returned.
    """
    column = df[feature]
    lowest = column.min()
    highest = column.max()
    df[f"precent_prograss_{feature}"] = (column - lowest) / (highest - lowest)
    return df


In [5]:
def FE(df, is_td, isTest=False ):
    """Add scaled columns and windowed tsflex/seglearn features to one recording.

    Steps:
      1. Unless ``isTest`` is set, and only if the frame has a ``Valid`` column,
         event labels on invalid rows are zeroed via ``fix_invalid_events``.
      2. ``Time``, ``AccV``, ``AccML`` and ``AccAP`` each get a min-max scaled
         companion column via ``min_max_feature``.
      3. seglearn ``base_features`` and ``emg_features`` (minus
         'simple square integral') are computed on the three acceleration
         signals over windows of 5_000 and 10_000 with equal strides, then
         left-joined onto the frame by index and forward-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording. Window sizes are expressed in units of the frame's
        index (make_df indexes the frame by ``Time`` before calling this).
    is_td : bool
        True for tdcsfog recordings (128 samples per second), False for defog
        (100 samples per second). Currently not used by the computation; kept
        so existing callers do not break.
    isTest : bool, default False
        When True the label clean-up in step 1 is skipped.

    Returns
    -------
    pandas.DataFrame
        The input columns plus the scaled and windowed feature columns.
    """
    if not isTest and "Valid" in df.columns:
        df = fix_invalid_events(df)

    for col in ["Time", "AccV", "AccML", "AccAP"]:
        df = min_max_feature(df, col)

    # TODO: tsflex features
    signal_names = ['AccV', 'AccML', 'AccAP']
    window_sizes = [5_000, 10_000]
    # Stride equals window size for both window lengths.
    stride_sizes = [5_000, 10_000]

    basic_feats = MultipleFeatureDescriptors(
        functions=seglearn_feature_dict_wrapper(base_features()),
        series_names=signal_names,
        windows=window_sizes,
        strides=stride_sizes,
    )

    emg_feats = emg_features()
    del emg_feats['simple square integral'] # is same as abs_energy (which is in base_features)

    emg_feats = MultipleFeatureDescriptors(
        functions=seglearn_feature_dict_wrapper(emg_feats),
        series_names=signal_names,
        windows=window_sizes,
        strides=stride_sizes,
    )

    fc = FeatureCollection([basic_feats, emg_feats])
    df_feats = fc.calculate(df, return_df=True, include_final_window=True, approve_sparsity=True, window_idx="begin")

    # The feature frame has far fewer rows than df; the forward fill spreads
    # each computed row over the following rows. It applies to every column of
    # the merged frame, not only the new ones.
    # `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
    df = df.merge(df_feats, how="left", left_index=True, right_index=True).ffill()

    return df
    

In [6]:
# Target
def create_target(df):
    """Collapse the three event columns into a single integer ``target``.

    Class ids: 0 = StartHesitation, 1 = Turn, 2 = Walking, 3 = None.
    When several event columns equal 1 on the same row the later one wins
    (Walking over Turn over StartHesitation).

    ``target`` is written onto the passed frame; the returned frame is a copy
    without the three event columns.
    """
    event_cols = ("StartHesitation", "Turn", "Walking")

    df["target"] = 3
    for class_id, event_col in enumerate(event_cols):
        df["target"] = np.where(df[event_col] == 1, class_id, df["target"])

    return df.drop(columns=list(event_cols))

In [7]:
def down_sample_stf(df, n_splits):
    """Split ``df`` into train/valid using the first StratifiedGroupKFold fold.

    Rows are stratified on ``target`` and grouped by ``Subject``. Only the
    first of the ``n_splits`` folds is used: its test part becomes ``valid``
    and the remainder becomes ``train``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target`` and ``Subject`` columns.
    n_splits : int
        Number of folds passed to StratifiedGroupKFold.

    Returns
    -------
    (train, valid) : tuple of pandas.DataFrame
    """
    print(f"before split df has: {df.shape[0]} rows, {df.Subject.nunique()} people")

    sgkf = StratifiedGroupKFold(n_splits=n_splits)
    fold_iter = sgkf.split(df.drop("target", axis = 1), df["target"], df["Subject"])
    train_positions, valid_positions = next(fold_iter)

    # split() yields positional indices, so select with iloc. Label-based loc
    # only gave the same rows when the frame had a fresh 0..n-1 index.
    train = df.iloc[train_positions]
    valid = df.iloc[valid_positions]

    print(f"after split train has: {train.shape[0]} rows and {train.Subject.nunique()} people")
    print(f"valid has:{valid.shape[0]} rows and {valid.Subject.nunique()} people")

    return train, valid
    

In [8]:
def fix_dtypes(df):
    """Cast the three event columns to ``bool``.

    ``StartHesitation``, ``Turn`` and ``Walking`` are converted in place and
    the same frame is returned.
    """
    # The Turn cast previously called a misspelled method and raised AttributeError.
    for event_col in ("StartHesitation", "Turn", "Walking"):
        df[event_col] = df[event_col].astype("bool")

    return df


In [9]:
def flat_outliers(df):
    """Clip each acceleration column to its own 1st-99th percentile range.

    For ``AccV``, ``AccML`` and ``AccAP`` the 0.01 and 0.99 quantiles of the
    column are computed and values outside that range are replaced by the
    nearest bound. The frame is modified in place and returned.
    """
    # The list previously repeated 'AccV', which left 'AccAP' unclipped and
    # clipped 'AccV' twice.
    for col in ['AccV', 'AccML', 'AccAP']:
        max_value = df[col].quantile(q=0.99)
        min_value = df[col].quantile(q=0.01)
        df[col] = df[col].clip(lower=min_value, upper=max_value)
    return df

In [10]:
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Per column, based on the dtype name:
      * ``datetime64[ns]`` and ``category`` are left untouched;
      * ``object`` is converted to ``category``;
      * names starting with ``int`` are cast to the first of int8/int16/int32/
        int64 whose range strictly contains the column's min and max;
      * every other dtype is cast to the first of float16/float32 whose range
        strictly contains the column's min and max, or left alone if neither
        fits.

    Memory usage in MB is printed before and after.

    NOTE(review): the last rule also catches bool and unsigned-integer columns,
    which end up as floats, and float16 keeps only about 3 significant digits.
    Confirm both are intended.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, smallest first, paired with the numpy function that
    # reports their representable range.
    int_plan = (np.iinfo, (np.int8, np.int16, np.int32, np.int64))
    float_plan = (np.finfo, (np.float16, np.float32))

    for col in df.columns:
        dtype_name = df[col].dtype.name

        if dtype_name in ('datetime64[ns]', 'category'):
            continue

        if dtype_name == 'object':
            df[col] = df[col].astype('category')
            continue

        lowest = df[col].min()
        highest = df[col].max()

        limits_of, candidates = int_plan if dtype_name.startswith('int') else float_plan
        for candidate in candidates:
            limits = limits_of(candidate)
            # Strict comparisons: a value equal to a bound moves to the next wider type.
            if lowest > limits.min and highest < limits.max:
                df[col] = df[col].astype(candidate)
                break

    mem_usg = df.memory_usage().sum() / bytes_per_mb
    print("Memory usage became: ",mem_usg," MB")

    return df
#reference: https://www

In [11]:
def make_df(base,isTest=False, black_list = None):
    """Load every recording under ``base`` into one feature frame.

    For each CSV file in ``base`` the recording is indexed by ``Time``, its
    acceleration outliers are clipped (``flat_outliers``), features are added
    (``FE``), the target is built unless ``isTest`` is set (``create_target``),
    and file/subject metadata columns are attached. Columns named in
    ``black_list`` are dropped, all recordings are concatenated, the result is
    downcast with ``reduce_memory_usage`` and given a fresh 0..n-1 index.

    Parameters
    ----------
    base : str
        Directory holding the recording CSV files. If the path contains
        "tdcsfog" the tdcsfog lookup tables are used, otherwise the defog ones.
    isTest : bool, default False
        When True no target column is created and ``FE`` skips its label
        clean-up.
    black_list : list of str, optional
        Column names to drop from each recording. Names that a recording does
        not have are ignored. Defaults to no columns.

    Returns
    -------
    pandas.DataFrame
        An empty frame if ``base`` contains no files.
    """
    if black_list is None:
        black_list = []

    is_td = "tdcsfog" in base
    frames = []

    for train_path in tqdm(os.listdir(base)):
        df = pd.read_csv(os.path.join(base, train_path))

        # Index by Time but also keep it as a regular (last) column. Taking the
        # values from the index avoids the index-aligned assignment that was
        # only correct when Time equalled the row number.
        df = df.set_index("Time")
        df["Time"] = df.index.to_numpy()

        df = flat_outliers(df)
        df = FE(df, is_td, isTest)
        if not isTest:
            df = create_target(df)

        # File name up to the first dot; also the key into the metadata lookups.
        file_id = train_path.split(".")[0]
        df["file"] = file_id
        df["id"] = df["file"].astype("str") + "_" + df["Time"].astype("str")

        if is_td:
            # NOTE(review): the tdcsfog_* lookup tables are not defined in the
            # visible part of this notebook; they must exist before this
            # branch is used.
            df["Subject"] = tdcsfog_subject_dict[file_id]
            df["Medication"] =  tdcsfog_medication_dict[file_id]
            df["Visit"] = tdcsfog_Id_Visit[file_id]
            df["Test_level"] =tdcsfog_Id_Test[file_id]
        else:
            df["Subject"] = defog_subject_dict[file_id]
            df["Medication"] = defog_medication_dict[file_id]
            df["Visit"] = defog_Id_Visit[file_id]
            df["Test_level"] =defog_Id_Test[file_id]

        # Evaluated per file so a recording with a different column set cannot
        # make drop() fail.
        cur_black_list = [c for c in black_list if c in df.columns]
        frames.append(df.drop(cur_black_list, axis = 1))
        del df

    # One concat instead of DataFrame.append in the loop: append re-copied the
    # accumulated frame on every file and no longer exists in pandas 2.x.
    train = pd.concat(frames) if frames else pd.DataFrame()
    del frames
    gc.collect()

    train = reduce_memory_usage(train)
    train = train.reset_index(drop=True)

    return train

In [12]:
# Build the defog training frame from every recording in the defog train folder,
# dropping the columns listed in black_list.
defog_train = make_df("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/train/defog", black_list=black_list)

100%|██████████| 91/91 [06:45<00:00,  4.45s/it]


Memory usage of dataframe is 8900.39 MB
Memory usage became:  2941.03759765625  MB


In [13]:
# Hold out one of 4 subject-grouped, stratified folds as the validation set.
train, valid = down_sample_stf(defog_train, 4)
del defog_train
gc.collect()

print(train.shape)
print(valid.shape)

# Attach the subject attributes; the join uses both Subject and Visit.
subject_keys = ["Subject", "Visit"]
train = train.merge(subjects, how="left", on=subject_keys)
valid = valid.merge(subjects, how="left", on=subject_keys)

# Sex becomes 1 for "M" and 0 for every other value.
train['Sex'] = np.where(train['Sex'] == "M", 1, 0)
valid['Sex'] = np.where(valid['Sex'] == "M", 1, 0)

del subjects

print(train.shape)
print(valid.shape)

# Per-column NaN counts; only the columns that contain NaNs are displayed.
isn = train.isna().sum()
print(f"train nans: {isn.sum()}")
isn[isn > 0]

before split df has: 13525702 rows, 38 people
after split train has: 10136077 rows and 28 people
valid has:3389625 rows and 10 people
(10136077, 87)
(3389625, 87)
(10136077, 93)
(3389625, 93)
train nans: 0


Series([], dtype: int64)

In [14]:
# Per-column NaN counts for the validation split; only columns with NaNs are displayed.
isn = valid.isna().sum()
print(f"valid nans: {isn.sum()}")
isn[isn > 0]

valid nans: 0


Series([], dtype: int64)

In [15]:
# The merge with subjects added new columns, so run the downcast again on both
# splits before saving them.
train = reduce_memory_usage(train)
valid = reduce_memory_usage(valid)
gc.collect()


Memory usage of dataframe is 2890.83 MB
Memory usage became:  2446.1684732437134  MB
Memory usage of dataframe is 1378.86 MB
Memory usage became:  1230.1570978164673  MB


0

In [16]:
# Persist both splits with a fresh 0..n-1 index. The context managers make sure
# each file is flushed and closed once its dump finishes.
with open("train_de.pkl", "wb") as train_file:
    pickle.dump(train.reset_index(drop=True), train_file)
with open("valid_de.pkl", "wb") as valid_file:
    pickle.dump(valid.reset_index(drop=True), valid_file)