In [1]:
import os, gc
import numpy as np
import pandas as pd
from collections import deque
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import average_precision_score

In [2]:
train_defog = os.listdir("../tlvmc-parkinsons-freezing-gait-prediction/train/defog")
test_defog = os.listdir("../tlvmc-parkinsons-freezing-gait-prediction/test/defog")
train_tdcsfog = os.listdir("../tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog")
test_tdcsfog = os.listdir("../tlvmc-parkinsons-freezing-gait-prediction/test/tdcsfog")

In [3]:
set(train_defog) & set(train_tdcsfog)

set()

In [4]:
if len(set(test_defog) & set(test_tdcsfog)) > 0: # is there?
    test_tdcsfog = list(set(test_tdcsfog) - (set(test_defog) & set(test_tdcsfog)))

In [5]:
task = pd.read_csv("../tlvmc-parkinsons-freezing-gait-prediction/tasks.csv")
task_map = {t:i+1 for i,t in enumerate(sorted(list(set(task.Task.values))))}
print(task_map)
task["TaskId"] = task.Task.apply(lambda x:task_map[x])
events = pd.read_csv("../tlvmc-parkinsons-freezing-gait-prediction/events.csv")
print(events)
events = events[events.Kinetic==1]
subjects = pd.read_csv("../tlvmc-parkinsons-freezing-gait-prediction/subjects.csv")
subjects["SexId"] = (subjects.Sex=="M").values.astype(np.uint8)
subjects = subjects.fillna(0)
subjects = subjects.drop(["Sex"], axis=1)
len(set(task_map.values()))

{'4MW': 1, '4MW-C': 2, 'Hotspot1': 3, 'Hotspot1-C': 4, 'Hotspot2': 5, 'Hotspot2-C': 6, 'MB1': 7, 'MB10': 8, 'MB11': 9, 'MB12': 10, 'MB13': 11, 'MB2a': 12, 'MB2b': 13, 'MB3-L': 14, 'MB3-R': 15, 'MB4': 16, 'MB5': 17, 'MB6': 18, 'MB6-L': 19, 'MB6-R': 20, 'MB7': 21, 'MB8': 22, 'MB9': 23, 'Rest1': 24, 'Rest2': 25, 'TUG-C': 26, 'TUG-DT': 27, 'TUG-ST': 28, 'Turning-C': 29, 'Turning-DT': 30, 'Turning-ST': 31}
              Id        Init  Completion     Type  Kinetic
0     003f117e14     8.61312     14.7731     Turn      1.0
1     009ee11563    11.38470     41.1847     Turn      1.0
2     009ee11563    54.66470     58.7847     Turn      1.0
3     011322847a    28.09660     30.2966     Turn      1.0
4     01d0fe7266    30.31840     31.8784     Turn      1.0
...          ...         ...         ...      ...      ...
3707  f9fc61ce85   628.56000    631.6650  Walking      0.0
3708  f9fc61ce85   782.49800    782.6530  Walking      1.0
3709  f9fc61ce85   931.93900    933.4470     Turn      1.0
3710 

31

In [6]:
metadata = [pd.read_csv("../tlvmc-parkinsons-freezing-gait-prediction/defog_metadata.csv"),
            pd.read_csv("../tlvmc-parkinsons-freezing-gait-prediction/tdcsfog_metadata.csv")]
metadata[0]["MedicationId"] = (metadata[0].Medication=="on").values.astype(np.uint8)
metadata[0] = metadata[0].drop(["Medication"], axis=1)
metadata[1]["MedicationId"] = (metadata[1].Medication=="on").values.astype(np.uint8)
metadata[1] = metadata[1].drop(["Medication"], axis=1)
metadata

[             Id Subject  Visit  MedicationId
 0    02ab235146  ab54e1      2             1
 1    02ea782681  bf608b      2             1
 2    06414383cf  c0b71e      2             0
 3    092b4c1819  b6a627      1             0
 4    0a900ed8a2  b7bd52      2             1
 ..          ...     ...    ...           ...
 132  f3a921edee  ce8b0b      1             0
 133  f40e8c6ebe  d9529b      1             0
 134  f8ddbdd98d  fc1e1b      1             1
 135  f9efef91fb  fe5d84      2             0
 136  f9fc61ce85  a7d8c0      1             1
 
 [137 rows x 4 columns],
              Id Subject  Visit  Test  MedicationId
 0    003f117e14  13abfd      3     2             1
 1    009ee11563  d81e3a      4     2             1
 2    011322847a  203e85      2     2             1
 3    01d0fe7266  203e85      2     1             0
 4    024418ba39  cecfb8     19     3             1
 ..          ...     ...    ...   ...           ...
 828  feba449e1a  47860d     19     1             1
 829 

In [7]:
ext_columns = list(subjects.columns)[1:]
ext_columns

['Visit',
 'Age',
 'YearsSinceDx',
 'UPDRSIII_On',
 'UPDRSIII_Off',
 'NFOGQ',
 'SexId']

In [8]:
def read_csv_with_task(csv):
    global task, events, defog_metadata, tdcsfog_metadata
    fn = csv.split("/")[-1]
    idf = fn.split(".")[0]
    tdf = task[task.Id==idf]
    edf = events[events.Id==idf]
    df = pd.read_csv(csv)
    taskids = np.zeros(len(df), dtype=np.uint8)
    for b,e,t in zip(tdf.Begin,tdf.End,tdf.TaskId):
        taskids[int(b):int(e)] = t
    for b,e,t in zip(edf.Init,edf.Completion,edf.Type):
        if t=="Turn":
            taskids[int(b):int(e)] = taskids[int(b):int(e)] + 32
        else:
            taskids[int(b):int(e)] = taskids[int(b):int(e)] + 64
    df["TaskId"] = taskids
    met = metadata[0 if "defog" in csv else 1]
    subId = met[met.Id==idf].values.flatten()[1]
    sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
    for i,c in enumerate(ext_columns):
        df[c] = sub[i]
    return df

In [9]:
target_cols = ["StartHesitation","Turn","Walking"]
train_cols = ["Time","AccV","AccML","AccAP","TaskId"]+ext_columns

In [10]:
def feature_engineering(val, clfs, target=None):
    # Cluster and Dimensional mapping analysis for each data
    if clfs[0] is None:
        clfs[0] = MiniBatchKMeans(n_clusters=10, random_state=0, init="random").fit(val[:,1:4])
    km = clfs[0].predict(val[:,1:4])
    km_oh = np.zeros((val.shape[0],10), dtype=np.uint8) # discrete value change to One-hot
    for i in range(10):
        idx = np.where(km==0)[0]
        km_oh[idx,i] = 1
    if clfs[1] is None:
        clfs[1] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(val[:,1:4])
    svd = clfs[1].transform(val[:,1:4])
    # Per-user statistics
    print("Per-user statistics")
    cp = 0
    sp = 0
    usrm = np.zeros((val.shape[0], 5*val.shape[1]-10), dtype=np.float16)
    for i in range(val.shape[0]):
        if cp > val[i,0]:
            for t in range(val.shape[1]-2):
                usrm[sp:i,5*t] = np.mean(val[sp:i,t+1])
                usrm[sp:i,5*t+1] = np.std(val[sp:i,t+1])
                usrm[sp:i,5*t+2] = np.max(val[sp:i,t+1])
                usrm[sp:i,5*t+3] = np.min(val[sp:i,t+1])
                usrm[sp:i,5*t+4] = (i-sp)/val.shape[0]
            sp = i
        cp = val[i,0]
    for t in range(val.shape[1]-2):
        usrm[sp:,5*t] = np.mean(val[sp:,t+1])
        usrm[sp:,5*t+1] = np.std(val[sp:,t+1])
        usrm[sp:,5*t+2] = np.max(val[sp:,t+1])
        usrm[sp:,5*t+3] = np.min(val[sp:,t+1])
        usrm[sp:,5*t+4] = (val.shape[0]-sp)/val.shape[0]
    iskinetic = np.stack([(val[:,4]>=32).astype(np.uint8), (val[:,4]>=64).astype(np.uint8)]).transpose((1,0))
    # Cluster and Dimensional mapping analysis for each user/task
    print("Cluster and Dimensional mapping analysis for each user/task")
    if clfs[2] is None:
        clfs[2] = MiniBatchKMeans(n_clusters=10, random_state=0, init="random").fit(usrm)
    kmu = clfs[2].predict(usrm)
    kmu_oh = np.zeros((val.shape[0],10), dtype=np.uint8) # discrete value change to One-hot
    for i in range(10):
        idx = np.where(kmu==0)[0]
        kmu_oh[idx,i] = 1
    del kmu
    gc.collect()
    if clfs[3] is None:
        clfs[3] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(usrm)
    svdu = clfs[3].transform(usrm)
    gc.collect()
    # Merge waypoints
    marged = np.hstack([val[:,1:4],km_oh,svd])
    # Moving average and variance within the same user
    print("Moving average and variance within the same user/task")
    wnd = np.zeros((val.shape[0],60), dtype=np.float16)
    cp = val[0,0]
    window = deque([marged[0,:15]] * 5)
    for i in range(val.shape[0]):
        if cp > val[i,0]:
            window = deque([marged[i,:15]] * 5)
        else:
            window.popleft()
            window.append(marged[i,:15])
        cp = val[i,0]
        wnd[i] = np.hstack([np.mean(window, axis=0),np.std(window, axis=0),np.min(window, axis=0),np.max(window, axis=0)])
    # Analyze the entire merge data
    print("Analyze the entire merge data")
    usrv = np.hstack([svd,svdu])
    if clfs[4] is None:
        clfs[4] = [LinearRegression().fit(usrv, target[:,i]) for i in range(3)]
    reg = np.stack([clfs[4][i].predict(usrv) for i in range(3)]).transpose((1,0))
    del usrv
    gc.collect()
    if clfs[5] is None:
        clfs[5] = TruncatedSVD(n_components=2, n_iter=10, random_state=0).fit(marged)
    svdm = clfs[5].transform(marged)
    # ext columns
    ext = val[:,5:].astype(np.uint8)
    # Marge all
    return np.hstack([marged,wnd,reg,svdm,usrm,kmu_oh,svdu,iskinetic,ext])

In [11]:
#from sklearn.tree import DecisionTreeRegressor
def get_regressor(totest=False):
    return Ridge(max_iter=1000,random_state=0) #DecisionTreeRegressor(max_leaf_nodes=32, random_state=0)

def training(val, target, tsk):
    task_grp = tsk % 32
    clfs = [get_regressor().fit(val, target)]
    for i in range(1,32,1):
        index = (task_grp==i)
        X, y = val[index], target[index]
        if len(X) == 0:
            clfs.append(0.0)
        elif len(set(y)) <= 1:
            clfs.append(float(y[0]))
        else:
            clfs.append(get_regressor().fit(X, y))
    return clfs

def predict(clfs, val, tsk):
    task_grp = tsk % 32
    result = clfs[0].predict(val)
    for i,c in enumerate(clfs[1:]):
        index = (task_grp==(i+1))
        if type(c) is float or type(c) is int:
            result[index] = c
        else:
            X = val[index]
            if len(X) > 0:
                result[index] = c.predict(X)
    return result

In [12]:
train_dfs = [read_csv_with_task("../tlvmc-parkinsons-freezing-gait-prediction/train/defog/"+i)[train_cols+target_cols] for i in train_defog]
train_val = [i[train_cols].values for i in train_dfs]
train_tgt = [i[target_cols].values for i in train_dfs]
del train_dfs
gc.collect()

  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().va

  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().va

  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().va

  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().va

  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)
  sub = subjects[subjects.Subject==subId].mean().values.astype(np.uint8)


0

In [13]:
train_val = np.vstack(train_val)
train_tsk = train_val[:,4].astype(np.uint8)
train_tgt = np.vstack(train_tgt)
gc.collect()

0

In [1]:
defog_trans = [None, None, None, None, None ,None]
train_val = feature_engineering(train_val, defog_trans, target=train_tgt)

NameError: name 'feature_engineering' is not defined

In [None]:
train_val = train_val.astype(np.float16) # reduce memory
gc.collect()
defog_clf = [training(train_val,train_tgt[:,i],train_tsk) for i in range(len(target_cols))]

In [None]:
del train_val, train_tgt, train_defog, train_tsk
gc.collect()

In [1]:
train_dfs = [read_csv_with_task("../tlvmc-parkinsons-freezing-gait-prediction/train/tdcsfog/"+i)[train_cols+target_cols] for i in train_tdcsfog]
train_val = [i[train_cols].values for i in train_dfs]
train_tgt = [i[target_cols].values for i in train_dfs]
del train_dfs
gc.collect()

NameError: name 'train_tdcsfog' is not defined

In [None]:
train_val = np.vstack(train_val)
train_tsk = train_val[:,4].astype(np.uint8)
train_tgt = np.vstack(train_tgt)
gc.collect()

In [None]:
tdcsfog_trans = [None, None, None, None, None, None]
train_val = feature_engineering(train_val, tdcsfog_trans, target=train_tgt)

In [None]:
train_val = train_val.astype(np.float16) # reduce memory
gc.collect()
tdcsfog_clf = [training(train_val,train_tgt[:,i],train_tsk) for i in range(len(target_cols))]

In [None]:
del train_val, train_tgt, train_tdcsfog, train_tsk
gc.collect()

In [None]:
test_dfs = [read_csv_with_task("/kaggle/input/tlvmc-parkinsons-freezing-gait-prediction/test/defog/"+i)[train_cols] for i in test_defog]
test_val = [i.values for i in test_dfs]
gc.collect()