In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, optimizers, metrics
import pandas as pd
import os
import typing as tp
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

In [3]:
df: pd.DataFrame = pd.read_pickle(os.path.join('..', 'data', 'SemesterProject', 'KiMoRe', 'processed_data.tar.gz'))
df['is_healthy'] = np.logical_or(df['is_healthy_experienced'], df['is_healthy_inexperienced']).astype(np.float32)

df[['clinical_ts', 'clinical_po', 'clinical_cf']] = df[['clinical_ts', 'clinical_po', 'clinical_cf']] / df[['clinical_ts', 'clinical_po', 'clinical_cf']].max(axis=0)

df

Unnamed: 0,pos_spinebase_x,pos_spinebase_y,pos_spinebase_z,pos_spinemid_x,pos_spinemid_y,pos_spinemid_z,pos_neck_x,pos_neck_y,pos_neck_z,pos_head_x,...,group,age_,gender,frame_num,is_healthy_experienced,is_healthy_inexperienced,is_low_back_problems,is_parkinsons,is_stroke,is_healthy
0,0.148025,-1.146570,2.60229,2.0,0.147799,-0.870216,2.64721,2.0,0.146912,-0.595615,...,B,52,F,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.148034,-1.146580,2.60229,2.0,0.147791,-0.870219,2.64721,2.0,0.146843,-0.595615,...,B,52,F,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.147904,-1.146670,2.60234,2.0,0.147691,-0.870267,2.64724,2.0,0.146768,-0.595635,...,B,52,F,2.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.147692,-1.146840,2.60271,2.0,0.147566,-0.870364,2.64729,2.0,0.146635,-0.595729,...,B,52,F,3.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.146500,-1.147980,2.60381,2.0,0.147056,-0.870540,2.64741,2.0,0.146204,-0.595432,...,B,52,F,4.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279084,-0.333194,-0.215199,2.58387,2.0,-0.345806,0.088729,2.48869,2.0,-0.355289,0.384876,...,E,22,M,550.0,1.0,0.0,0.0,0.0,0.0,1.0
279085,-0.329756,-0.215215,2.58713,2.0,-0.342719,0.090435,2.49272,2.0,-0.352767,0.388246,...,E,22,M,551.0,1.0,0.0,0.0,0.0,0.0,1.0
279086,-0.324673,-0.215416,2.59172,2.0,-0.334720,0.093020,2.50138,2.0,-0.341581,0.394412,...,E,22,M,552.0,1.0,0.0,0.0,0.0,0.0,1.0
279087,-0.320218,-0.215014,2.59377,2.0,-0.330699,0.094413,2.50459,2.0,-0.338477,0.396178,...,E,22,M,553.0,1.0,0.0,0.0,0.0,0.0,1.0


In [4]:
x_columns: tp.List[str] = [col for col in df.columns if col[-1] in ['x', 'y', 'z', 'w'] and col != 'is_healthy']
y_columns: tp.List[str] = ['exercise_num', 'clinical_ts', 'clinical_po', 'clinical_cf', 'is_healthy']
len(x_columns)

175

In [5]:
df[y_columns].describe()

Unnamed: 0,exercise_num,clinical_ts,clinical_po,clinical_cf,is_healthy
count,279089.0,276074.0,276074.0,276074.0,279089.0
mean,2.924992,0.766515,0.752423,0.772465,0.588572
std,1.340889,0.201062,0.258369,0.197371,0.491449
min,1.0,0.2,0.000343,0.2,0.0
25%,2.0,0.630899,0.6,0.657143,0.0
50%,3.0,0.82,0.844444,0.828571,1.0
75%,4.0,0.94,1.0,0.933333,1.0
max,5.0,1.0,1.0,1.0,1.0


In [40]:
df.groupby(['exercise_num', 'patient_id']).aggregate({'frame_num': 'max'}).min()

frame_num    69.0
dtype: float64

In [41]:
# per_patient_df: pd.DataFrame = df.groupby(['patient_id', 'exercise_num']).aggregate({'age_': 'max'}, axis=1)

# Taken the max_frames value from https://ieeexplore.ieee.org/document/9564240
max_frames: int = 60 # int(df['frame_num'].max())
num_patients: int = df['patient_id'].drop_duplicates().shape[0]
num_exercises: int = int(df['exercise_num'].max())
num_joint_data: int = len(x_columns)

patient_ids: tp.List[str] = df['patient_id'].drop_duplicates().tolist()
exercise_nums: tp.List[int] = list(range(num_exercises))

exercise_data: np.ndarray = np.zeros((num_patients, num_exercises, max_frames, num_joint_data)) * np.nan
y_data: np.ndarray = np.zeros((num_patients, num_exercises, len(y_columns))) * np.nan

y_sizes: tp.List[int] = []

for patient_index, patient_id in enumerate(patient_ids):
    for exercise_index, exercise_num in enumerate(exercise_nums):
        print(f'Trying out patient {patient_id} for exercise #{exercise_num}')

        c_df: pd.DataFrame = df.loc[np.logical_and(df['exercise_num'] == exercise_num, df['patient_id'] == patient_id), x_columns + y_columns]

        if c_df.size == 0:
            print('Skipping...')
            continue

        curr_values: np.ndarray = c_df[x_columns].values

        print(curr_values.shape)

        curr_y: np.ndarray = c_df[y_columns].drop_duplicates().values.flatten()

        y_sizes.append(curr_y.size)

        num_frames: int = n_frames if (n_frames := curr_values.shape[0]) < max_frames else max_frames

        exercise_data[patient_index, exercise_index, :num_frames, :] = curr_values[:num_frames]

        y_data[patient_index, exercise_index, :] = curr_y

exercise_data: np.ndarray = exercise_data.reshape((-1, max_frames, num_joint_data))
y_data: np.ndarray = y_data.reshape((-1, len(y_columns)))

exercise_data.shape, y_data.shape

Trying out patient B_ID4 for exercise #0
Skipping...
Trying out patient B_ID4 for exercise #1
(552, 175)
Trying out patient B_ID4 for exercise #2
(627, 175)
Trying out patient B_ID4 for exercise #3
(727, 175)
Trying out patient B_ID4 for exercise #4
(756, 175)
Trying out patient B_ID7 for exercise #0
Skipping...
Trying out patient B_ID7 for exercise #1
(561, 175)
Trying out patient B_ID7 for exercise #2
(603, 175)
Trying out patient B_ID7 for exercise #3
(696, 175)
Trying out patient B_ID7 for exercise #4
(572, 175)
Trying out patient B_ID5 for exercise #0
Skipping...
Trying out patient B_ID5 for exercise #1
(996, 175)
Trying out patient B_ID5 for exercise #2
(579, 175)
Trying out patient B_ID5 for exercise #3
(1033, 175)
Trying out patient B_ID5 for exercise #4
(758, 175)
Trying out patient B_ID1 for exercise #0
Skipping...
Trying out patient B_ID1 for exercise #1
(505, 175)
Trying out patient B_ID1 for exercise #2
(587, 175)
Trying out patient B_ID1 for exercise #3
(1031, 175)
Trying

((390, 60, 175), (390, 5))

In [34]:
np.logical_or(np.any(np.isnan(y_data), axis=1), np.any(np.isnan(exercise_data))).sum()

390

In [27]:
valid_indices.sum()

300

In [8]:
# There's too many nans.
# I think you need to process it yourself.

valid_indices: np.ndarray = ~np.any(np.isnan(y_data), axis=1)
first_y: np.ndarray = y_data[valid_indices, -1]
X: np.ndarray = exercise_data[valid_indices, :, :]

y: np.ndarray = np.zeros((first_y.shape[0], np.unique(first_y).size))
y[first_y == 0, 0] = 1
y[first_y == 1, 1] = 1

X.shape, y.shape

((300, 720, 175), (300, 2))

In [9]:
y[:, 0].sum() / y.shape[0], y[:, 1].sum() / y.shape[0]

(0.44, 0.56)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, shuffle=True, random_state=42)

train_dataset: tf.data.Dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset: tf.data.Dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

train_dataset, test_dataset

2022-11-23 18:51:29.265073: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-23 18:51:29.265360: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-23 18:51:29.265416: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-11-23 18:51:29.265460: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-11-23 18:51:29.283315: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

(<TensorSliceDataset element_spec=(TensorSpec(shape=(720, 175), dtype=tf.float64, name=None), TensorSpec(shape=(2,), dtype=tf.float64, name=None))>,
 <TensorSliceDataset element_spec=(TensorSpec(shape=(720, 175), dtype=tf.float64, name=None), TensorSpec(shape=(2,), dtype=tf.float64, name=None))>)

In [12]:
BATCH_SIZE: int = 32
SHUFFLE_BATCH_SIZE: int = 50

train_dataset = train_dataset.shuffle(SHUFFLE_BATCH_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [23]:
tf.keras.backend.clear_session()

norm_layer = tf.keras.layers.Normalization(axis=1)

norm_layer.adapt(X_train)

is_healthy_model: tf.keras.Model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=X.shape[1:]),
    norm_layer,
    # tf.keras.layers.LSTM(256, return_sequences=True),
    # tf.keras.layers.LSTM(32, return_sequences=True, activation='tanh'),
    # tf.keras.layers.Dense(4, activation='sigmoid'),
    tf.keras.layers.Dense(16),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation='relu'),
    # tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(16, activation='tanh'),
    # tf.keras.layers.LSTM(32, return_sequences=False, activation='relu'),
    # tf.keras.layers.Dense(16, activation='sigmoid'),
    tf.keras.layers.Dense(2, activation='softmax')
])

is_healthy_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  (None, 720, 175)         1441      
 n)                                                              
                                                                 
 dense (Dense)               (None, 720, 16)           2816      
                                                                 
 flatten (Flatten)           (None, 11520)             0         
                                                                 
 dense_1 (Dense)             (None, 32)                368672    
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 2)                 34        
                                                        

In [25]:
np.isnan(X_train).sum()

3268125

In [24]:
norm_layer(X_train[0])

<tf.Tensor: shape=(1, 720, 175), dtype=float32, numpy=
array([[[-0.49310583, -0.81009865,  2.6816156 , ..., -0.47383025,
         -0.47383025, -0.47383025],
        [-0.4939877 , -0.810871  ,  2.6809146 , ..., -0.47467902,
         -0.47467902, -0.47467902],
        [-0.49349165, -0.81034976,  2.6827376 , ..., -0.4743206 ,
         -0.4743206 , -0.4743206 ],
        ...,
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan],
        [        nan,         nan,         nan, ...,         nan,
                 nan,         nan]]], dtype=float32)>

In [18]:
n_epochs: int = 5

# is_healthy_model.adapt(X_train)

best_model_checkpoint_path: str = os.path.join('..', 'models', 'kimore', 'pt-is-healthy-best.mdl')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=best_model_checkpoint_path, save_best_only=True, save_weights_only=True, initial_value_threshold=.6, verbose=0, monitor='val_accuracy')

# fbeta_metric = tfa.metrics.FBetaScore(num_classes=np.unique(y_train).size, name='f_beta', average='macro')

is_healthy_model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False), metrics=['accuracy'])
history = is_healthy_model.fit(X_train, y_train, epochs=n_epochs, validation_data=(X_test, y_test), callbacks=[checkpoint_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [49]:
np.isnan(y_train).sum(), np.isnan(y_test).sum()

(0, 0)

In [None]:
y.sum() /