In [1]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.layers import CuDNNLSTM, CuDNNGRU, Dropout
from keras.optimizers import Adam, SGD
import math
import random
import numpy as np
import pandas as pd
import json as js
import os

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from keras.models import load_model
from keras.utils import to_categorical

# Import own methods

import data_generator_modified as dg
import ml_utils as mlu
import rnn_generator as rnng

from time import sleep

Using TensorFlow backend.


**Checking whether GPUs are available**

In [2]:
# Diagnostic cell: print the devices TensorFlow detects and the GPUs Keras reports.
from tensorflow.python.client import device_lib
from keras import backend as K

print("Process Units detected by tensorflow: ")
print(device_lib.list_local_devices())


print("GPUs used by keras:")
# NOTE(review): _get_available_gpus is a private helper of the Keras TensorFlow
# backend (leading underscore); it may not exist in other Keras versions.
K.tensorflow_backend._get_available_gpus()

Process Units detected by tensorflow: 
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6424708338209615732
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 2060943975155694549
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 4341595800605275982
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10227318784
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10174727839573092421
physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]
GPUs used by keras:


['/job:localhost/replica:0/task:0/device:GPU:0']

**Loading the 10-fold organization.** It lists which users belong to each cross-validation fold and which users are held out for the final test.

In [3]:
import json as js

# Read the fold layout from disk; the file handle is only needed for parsing,
# so the result is printed after the file has been closed.
with open("10_cross_validation_organization.json") as json_file:
    cross_10_val_organization = js.load(json_file)
print(cross_10_val_organization)

{'group1': [['SA19', 'SA04'], ['SE13']], 'group2': [['SA21', 'SA01'], ['SE08']], 'group3': [['SA10', 'SA12'], ['SE09']], 'group4': [['SA13', 'SE06'], ['SE10']], 'group5': [['SA18', 'SA08'], ['SE15']], 'group6': [['SA14', 'SA09'], ['SE05']], 'group7': [['SA17', 'SA05'], ['SE01']], 'group8': [['SA07', 'SA16'], ['SE12']], 'group9': [['SA22', 'SA11'], ['SE03']], 'group10': [['SA23', 'SA06'], ['SE02']], 'test': [['SA03', 'SA15', 'SA20', 'SA02'], ['SE04', 'SE14', 'SE07', 'SE11']]}


**Loading the data, with the train/validation and test sets kept separate**

In [4]:
import data_generator_modified as dg

# Load the windowed data set, split into a cross-validation part and a test part.
# The users handed over as usersForValidation are the 'test' entry of the fold
# organization loaded above.
# NOTE(review): windowSize/stride are presumably expressed in samples - confirm in dg.
(dataCVWinValues, dataCVWinLabel,
 dataTestWinValues, dataTestWinLabel,
 dataWinOrganization) = dg.loadDataSetInBlocks(
    '../10_cross_validation/',
    nTestUsers=8,
    windowSize=256,
    stride=128,
    randomOrder=False,
    usersForValidation=cross_10_val_organization['test'],
)

Loading data...

Divided (train - test) found. Loading...

Data loaded correctly

Estructuring data in blocks...

Data generated correctly


*Checking the organization*

In [5]:
# Distinct users that ended up in the test partition.
{entry[0] for entry in dataWinOrganization['test']}

{'SA02', 'SA03', 'SA15', 'SA20', 'SE04', 'SE07', 'SE11', 'SE14'}

In [6]:
# Distinct users that ended up in the train/validation partition.
{entry[0] for entry in dataWinOrganization['train']}

{'SA01',
 'SA04',
 'SA05',
 'SA06',
 'SA07',
 'SA08',
 'SA09',
 'SA10',
 'SA11',
 'SA12',
 'SA13',
 'SA14',
 'SA16',
 'SA17',
 'SA18',
 'SA19',
 'SA21',
 'SA22',
 'SA23',
 'SE01',
 'SE02',
 'SE03',
 'SE05',
 'SE06',
 'SE08',
 'SE09',
 'SE10',
 'SE12',
 'SE13',
 'SE15'}

**Get only the accelerometer data**

In [7]:
# Keep only the first three channels of the last axis, for both the CV and the test windows.
# NOTE(review): per the heading above these are the accelerometer axes - confirm the
# channel order produced by dg.
dataCVWinValues = dataCVWinValues[:,:,:3]
dataTestWinValues = dataTestWinValues[:,:,:3]

In [8]:
import numpy as np
# Shape of the CV windows after the channel selection.
dataCVWinValues.shape

(94667, 256, 3)

In [9]:
# Peek at the windows stored at positions 1, 2 and 3 of the CV data.
sample = dataCVWinValues[[1, 2, 3]]
sample

array([[[  19, -224,  -59],
        [  21, -237,  -53],
        [  22, -249,  -40],
        ...,
        [ -22, -292,  -52],
        [ -33, -278,  -49],
        [ -47, -269,  -47]],

       [[ -38, -367,  -30],
        [ -26, -358,  -33],
        [ -11, -341,  -44],
        ...,
        [  58, -228,  -77],
        [  63, -216,  -66],
        [  72, -215,  -56]],

       [[ -53, -261,  -35],
        [ -60, -251,  -33],
        [ -62, -241,  -17],
        ...,
        [ -49, -256,  -36],
        [ -52, -256,  -29],
        [ -51, -255,  -24]]], dtype=int32)

**Tools to split off one subset for validation and use the remaining k-1 subsets for training**

In [10]:
""" 
Return info about the samples for each fold
    1. The index of each sample corresponding to each fold (group)
    2. The number of samples contained in each fold
"""

def subsets_samples_info(data_organization, cv_organization, test_key="test"):
    """Map every sample to the cross-validation fold of its user.

    Args:
        data_organization: sequence with one entry per sample; element ``[0]``
            of each entry is the user the sample belongs to.
        cv_organization: dict mapping a group name to a pair of user lists
            (``cv_organization[group][0]`` and ``cv_organization[group][1]``).
        test_key: name of the held-out entry of ``cv_organization`` that is
            not a fold. It is skipped when present and ignored when absent.

    Returns:
        The tuple ``(dict_indices, n_of_samples)``:
        ``dict_indices[group]`` is the list of sample positions (ascending)
        that belong to ``group`` and ``n_of_samples[group]`` is the length of
        that list. Every fold gets an entry, even when it has no samples.

    Raises:
        KeyError: if a sample belongs to a user that is not part of any fold.
    """
    list_of_groups = [gr for gr in cv_organization if gr != test_key]

    # user -> fold lookup, so each sample is classified with one dict access.
    reverse_cv_org = dict()
    dict_indices = dict()
    for gr in list_of_groups:
        for user in cv_organization[gr][0] + cv_organization[gr][1]:
            reverse_cv_org[user] = gr
        dict_indices[gr] = list()

    for i, sample_info in enumerate(data_organization):
        user = sample_info[0]
        if user not in reverse_cv_org:
            raise KeyError(
                "Sample {} belongs to user {!r}, which is not assigned to any fold".format(i, user)
            )
        dict_indices[reverse_cv_org[user]].append(i)

    n_of_samples = {gr: len(indices) for gr, indices in dict_indices.items()}

    return (dict_indices, n_of_samples)

In [11]:
# Per-fold sample positions and sample counts for the train/validation partition.
indicesCV, nSamples = subsets_samples_info(dataWinOrganization['train'],cross_10_val_organization)

In [12]:
def generate_fold(data, data_indices, fold):
    """Split ``data`` into the training and validation parts of one fold.

    ``data_indices`` maps each fold name to the positions (along axis 0) of its
    samples. The samples of ``fold`` form the validation subset; the samples of
    every other fold, concatenated in the dictionary's iteration order, form
    the training subset.

    Returns the tuple ``(tr_subset, val_subset)``.
    """
    validation_part = np.take(data, data_indices[fold], axis=0)

    # Positions of every sample that does not belong to the validation fold.
    training_positions = [
        position
        for group_name, positions in data_indices.items()
        if group_name != fold
        for position in positions
    ]
    training_part = np.take(data, training_positions, axis=0)

    return training_part, validation_part
    

In [13]:
# Try the split with 'group10' as the validation fold.
dataTrWinValues, dataValWinValues = generate_fold(dataCVWinValues, indicesCV, 'group10')

In [14]:
# Shape of the validation subset produced for 'group10'.
dataValWinValues.shape

(9093, 256, 3)

In [15]:
# Sanity check: the sample count recorded for 'group10' should equal the
# first dimension of the validation subset shown in the previous cell.
nSamples['group10']

9093

In [16]:
""" """
def train_cv(dataTrValValues, dataTrValLabels, data_organization, cv_organization, arch_params, path):
    """Run k-fold cross-validation for one architecture and summarise the results.

    Each fold returned by ``subsets_samples_info`` gets its own directory
    ``<path>/<fold>``. If that directory already contains ``model_info.npy``
    the fold is treated as done and its stored results are loaded. Otherwise
    the model is trained with ``rnng.trainRNNModel`` (the fold as validation
    data, all other folds as training data) and ``model_info.npy`` plus
    ``model_last_epoch.hdf5`` are written there.

    Args:
        dataTrValValues: samples for training/validation, indexed along axis 0.
        dataTrValLabels: labels, indexed along axis 0 like ``dataTrValValues``.
        data_organization: one entry per sample; element ``[0]`` is the user.
        cv_organization: fold layout, as expected by ``subsets_samples_info``.
        arch_params: dict with the keys ``epochs``, ``learning_rate``,
            ``window_length``, ``stride``, ``batch_size``, ``rnn_type``,
            ``second_rnn_layer``, ``first_dropout``, ``second_dropout`` and
            ``first_dense_layer``; they are forwarded to ``rnng.trainRNNModel``.
        path: directory that receives the per-fold directories and
            ``cv_results.npy``. It is created when missing.

    Returns:
        dict with ``'model_params'`` (``arch_params``) and
        ``'results_per_best_metric'``. The latter maps each key of the training
        result to the metrics ``'f1-score_macro'``, ``'f1-score_micro'`` and
        ``'balanced_accuracy'`` (in that order), each holding
        ``'fold_results'`` (fold -> ``(score, 1 - score)``), ``'mean'``,
        ``'standard_deviation'``, ``'error'`` and ``'std_error'``.
        The same dict is saved to ``<path>/cv_results.npy``.
    """
    info_file_name = "model_info.npy"
    model_last_epoch_file_name = "model_last_epoch.hdf5"
    cv_results_file_name = "cv_results.npy"

    # (name in the summary, averaging scheme, metric name) read from the metrics
    # computed by mlu. Balanced accuracy is taken from the macro-averaged recall.
    # The order below is the insertion order of the summary dict.
    tracked_metrics = (
        ('f1-score_macro', 'macro', 'f1-score'),
        ('f1-score_micro', 'micro', 'f1-score'),
        ('balanced_accuracy', 'macro', 'recall'),
    )

    indicesCV, _ = subsets_samples_info(data_organization, cv_organization)

    # makedirs also creates missing parents, so a fresh `path` works out of the box.
    os.makedirs(path, exist_ok=True)

    cv_results = dict()

    for fold in indicesCV.keys():

        print("Fold # {}:".format(fold))

        fullpath = os.path.join(path, fold)
        os.makedirs(fullpath, exist_ok=True)

        info_file_path = os.path.join(fullpath, info_file_name)
        model_last_epoch_file_path = os.path.join(fullpath, model_last_epoch_file_name)

        if not os.path.isfile(info_file_path):  # The model training hasn't been done yet

            dataTrValues, dataValValues = generate_fold(dataTrValValues, indicesCV, fold)
            dataTrlabels, dataValLabels = generate_fold(dataTrValLabels, indicesCV, fold)

            results = rnng.trainRNNModel(
                dataTrValues, dataTrlabels, dataValValues, dataValLabels,
                epochs=arch_params['epochs'],
                lr=arch_params['learning_rate'],
                w=arch_params['window_length'],
                stride=arch_params['stride'],
                batch_size=arch_params['batch_size'],
                rnn_type=arch_params['rnn_type'],
                two_rnn_layers=arch_params['second_rnn_layer'],
                drop_coeff_rnn=arch_params['first_dropout'],
                drop_coeff_dense=arch_params['second_dropout'],
                first_dense=arch_params['first_dense_layer'],
                best_model=True, best_model_path=fullpath)

            model_performance_info = dict()
            # Reset per fold so a model from a previous fold can never be saved here.
            last_model = None

            for best_model_per_metric in results.keys():

                model, model_train_history, model_confusion_matrix = results[best_model_per_metric]

                metrics = mlu.macro_and_micro_metrics_per_class(model_confusion_matrix, ['BKG', 'ALERT', 'FALL'])
                model_performance_info[best_model_per_metric] = {
                    'train_history': model_train_history.history,
                    'eval_results': model_confusion_matrix,
                    'metrics': metrics,
                    'optional_data': fold
                }
                last_model = model

            np.save(info_file_path, model_performance_info)

            # TODO: verify that the model of the last result entry is really the
            # last-epoch model this file name promises.
            if last_model is not None:
                last_model.save(model_last_epoch_file_path)

        else:
            print('{} fold was already validated. Loading results'.format(fold))
            # The file holds a pickled dict, so allow_pickle must be enabled
            # (NumPy >= 1.16.3 defaults to False). This is only acceptable because
            # the file was written by this function; never point it at untrusted data.
            model_performance_info = np.load(info_file_path, allow_pickle=True).item()
        print("\n")
        ## TODO: consider improve the implementation with the option to select the metric (not fix f1-score micro)

        for best_model_per_metric, info in model_performance_info.items():
            # setdefault (instead of initialising on the first fold only) keeps this
            # working when a later fold reports a key the first fold did not have.
            per_metric = cv_results.setdefault(
                best_model_per_metric,
                {name: dict() for name, _, _ in tracked_metrics})

            for name, averaging, metric in tracked_metrics:
                score = info['metrics'][averaging][metric]
                per_metric[name][fold] = (score, 1 - score)

    dict_results = {
        'model_params': arch_params,
        'results_per_best_metric': dict()
    }

    for best_model_per_metric, per_metric in cv_results.items():

        summary = dict()

        for metric, fold_results in per_metric.items():

            scores = [r[0] for r in fold_results.values()]
            errors = [r[1] for r in fold_results.values()]

            summary[metric] = {
                'fold_results': fold_results,
                'mean': np.mean(scores),
                'standard_deviation': np.std(scores),
                'error': np.mean(errors),
                'std_error': np.std(errors)
            }

        dict_results['results_per_best_metric'][best_model_per_metric] = summary

    np.save(os.path.join(path, cv_results_file_name), dict_results)

    return dict_results


In [17]:
# Single configuration for a quick manual train_cv run (see the commented call below).
params = dict(
    epochs=2,
    learning_rate=0.001,
    window_length=256,
    stride=128,
    batch_size=64,
    rnn_type='gru',
    second_rnn_layer=False,
    first_dropout=0.2,
    second_dropout=0.2,
    first_dense_layer=False,
)

# train_cv(dataCVWinValues, dataCVWinLabel, dataWinOrganization['train'],cross_10_val_organization, params, './cv_test')

In [18]:
def model_analysis_with_cv(study_name, path, data_cv_values, data_cv_labels, data_test_values, data_test_labels, data_organization, folds_organization, model_params_list):
    """Cross-validate several model configurations and tabulate their scores.

    Every configuration in ``model_params_list`` is evaluated with ``train_cv``
    inside ``<path>/<study_name>/model<N>`` (N starts at 1, following the list
    order). Missing directories, including parents, are created.

    Args:
        study_name: name of the directory created under ``path`` for this study.
        path: base directory of the study.
        data_cv_values: samples used for cross-validation.
        data_cv_labels: labels of ``data_cv_values``.
        data_test_values: held-out test samples. Not used yet (see TODOs below).
        data_test_labels: held-out test labels. Not used yet (see TODOs below).
        data_organization: dict whose ``'train'`` entry describes the samples in
            ``data_cv_values``; it is forwarded to ``train_cv``.
        folds_organization: fold layout forwarded to ``train_cv``.
        model_params_list: list of parameter dicts accepted by ``train_cv``.
            The optional key ``'frequency_reduction'`` (default 0) is the number
            of times ``dg.reduce_frequency_of_window_samples`` is applied to the
            cross-validation samples before training.

    Returns:
        pandas.DataFrame with one row per (model, selection metric) pair and the
        mean, standard deviation, error and error standard deviation of the
        macro F1-score, micro F1-score and balanced accuracy.
    """
    # Summary metric key produced by train_cv -> column prefix in the table.
    metric_columns = (
        ('f1-score_macro', 'f1-score_m'),
        ('f1-score_micro', 'f1-score_u'),
        ('balanced_accuracy', 'balanced-accuracy'),
    )
    # Statistic key produced by train_cv -> column suffix in the table.
    stat_columns = (
        ('mean', 'mean'),
        ('standard_deviation', 'std'),
        ('error', 'error'),
        ('std_error', 'error std'),
    )

    study_route = os.path.join(path, study_name)
    os.makedirs(study_route, exist_ok=True)

    model_metric_list = list()
    for models_count, params in enumerate(model_params_list, start=1):

        model_id = "model{}".format(models_count)
        model_route = os.path.join(study_route, model_id)

        print("###### Model id: {} ######".format(model_id))

        os.makedirs(model_route, exist_ok=True)

        # Only the cross-validation data is reduced: the test data is not consumed
        # anywhere yet, so reducing it would be wasted work.
        data_cv = data_cv_values
        for _ in range(params.get("frequency_reduction", 0)):
            data_cv = dg.reduce_frequency_of_window_samples(data_cv)

        train_cv_info = train_cv(data_cv, data_cv_labels, data_organization['train'], folds_organization, params, model_route)

        print(train_cv_info)
        for best_metric, metric_summaries in train_cv_info['results_per_best_metric'].items():
            row = [model_id, best_metric]
            # Look the metrics up by name so values and column headers cannot get
            # out of step if the dict order ever changes.
            for metric_key, _ in metric_columns:
                metrics_data = metric_summaries[metric_key]
                row.extend(metrics_data[stat_key] for stat_key, _ in stat_columns)

            model_metric_list.append(row)

    columns = ['model_id', 'best_metric'] + [
        '{} {}'.format(prefix, suffix)
        for _, prefix in metric_columns
        for _, suffix in stat_columns
    ]
    results = pd.DataFrame(data=model_metric_list, columns=columns)

    ## TODO: decide which model is the best and train the model with all cv folds as train data
    ## TODO: extract the metrics of the trained model (on the test data) and save the results in a file
    return results
        

In [19]:
def _short_gru_params(frequency_reduction, window_length, stride):
    """Build one short (3-epoch) GRU configuration; only the arguments differ between runs."""
    return {
        'epochs' : 3,
        'learning_rate' : 0.001,
        'frequency_reduction' : frequency_reduction,
        'window_length' : window_length,
        'stride' : stride,
        'batch_size' : 64,
        'rnn_type' : 'gru',
        'second_rnn_layer' : False,
        'first_dropout' : 0.2,
        'second_dropout' : 0.2,
        'first_dense_layer' : False
    }


# Two quick configurations: full-rate windows, and windows reduced once
# (half the window length and stride).
short_params_ls = [
    _short_gru_params(frequency_reduction=0, window_length=256, stride=128),
    _short_gru_params(frequency_reduction=1, window_length=128, stride=64),
]

In [20]:
# TODO(review): this assignment was left unfinished (a SyntaxError that stops
# "Restart & Run All"). Fill in the full list of configurations to study;
# until then it is an empty placeholder.
model_params_ls = []

In [21]:
# Cross-validate the two short GRU configurations. Per-model and per-fold files
# are written under ../10_cross_validation/short_test_gru_different_frequency,
# and folds that already have stored results there are loaded instead of retrained.
results = model_analysis_with_cv("short_test_gru_different_frequency", "../10_cross_validation", 
                                 dataCVWinValues, dataCVWinLabel, dataTestWinValues, dataTestWinLabel, 
                                 dataWinOrganization, cross_10_val_organization, short_params_ls)

###### Model id: model1 ######
Fold # group1:
group1 fold was already validated. Loading results


Fold # group2:
group2 fold was already validated. Loading results


Fold # group3:
group3 fold was already validated. Loading results


Fold # group4:
group4 fold was already validated. Loading results


Fold # group5:
group5 fold was already validated. Loading results


Fold # group6:
group6 fold was already validated. Loading results


Fold # group7:
group7 fold was already validated. Loading results


Fold # group8:
group8 fold was already validated. Loading results


Fold # group9:
group9 fold was already validated. Loading results


Fold # group10:
group10 fold was already validated. Loading results


{'model_params': {'epochs': 3, 'learning_rate': 0.001, 'frequency_reduction': 0, 'window_length': 256, 'stride': 128, 'batch_size': 64, 'rnn_type': 'gru', 'second_rnn_layer': False, 'first_dropout': 0.2, 'second_dropout': 0.2, 'first_dense_layer': False}, 'results_per_best_metric': {'f1

Train on 85182 samples, validate on 9485 samples
Epoch 1/3
[[5919  975 2116]
 [  81   37   33]
 [  13    7  304]]
 — val_micro_f1: 0.659989 — val_micro_precision: 0.659989 — val_micro_recall 0.659989
 — val_macro_f1: 0.356727 — val_macro_precision: 0.381536 — val_macro_recall 0.613414
Epoch 2/3
[[7502 1159  349]
 [  48  100    3]
 [  58    9  257]]
 — val_micro_f1: 0.828571 — val_micro_precision: 0.828571 — val_micro_recall 0.828571
 — val_macro_f1: 0.531577 — val_macro_precision: 0.495645 — val_macro_recall 0.762697
Epoch 3/3
[[7538 1433   39]
 [  34  115    2]
 [  66   17  241]]
 — val_micro_f1: 0.832261 — val_micro_precision: 0.832261 — val_micro_recall 0.832261
 — val_macro_f1: 0.611662 — val_macro_precision: 0.638333 — val_macro_recall 0.780681
TP: 7538, FP: 100 and FN: 1472
precision: 0.9869075674260278, recall: 0.8366259711431743 and f1-score: 0.9055742431523307
TP: 115, FP: 1450 and FN: 36
precision: 0.07348242811501597, recall: 0.7615894039735099 and f1-score: 0.13403263403263

TP: 7045, FP: 41 and FN: 2029
precision: 0.9942139429861699, recall: 0.7763940930130042 and f1-score: 0.8719059405940595
TP: 70, FP: 2023 and FN: 22
precision: 0.033444816053511704, recall: 0.7608695652173914 and f1-score: 0.06407322654462243
TP: 282, FP: 24 and FN: 37
precision: 0.9215686274509803, recall: 0.8840125391849529 and f1-score: 0.9024000000000001
precision_u: 0.7798629414865578, recall_u: 0.7798629414865578 and f1-score_u: 0.7798629414865578
TP: 7045, FP: 41 and FN: 2029
precision: 0.9942139429861699, recall: 0.7763940930130042 and f1-score: 0.8719059405940595
TP: 70, FP: 2023 and FN: 22
precision: 0.033444816053511704, recall: 0.7608695652173914 and f1-score: 0.06407322654462243
TP: 282, FP: 24 and FN: 37
precision: 0.9215686274509803, recall: 0.8840125391849529 and f1-score: 0.9024000000000001
precision_u: 0.7798629414865578, recall_u: 0.7798629414865578 and f1-score_u: 0.7798629414865578
TP: 7045, FP: 41 and FN: 2029
precision: 0.9942139429861699, recall: 0.7763940930130



Fold # group6:
samples: {0: 81162, 1: 1078, 2: 2986} 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_6 (Batch (None, 128, 3)            12        
_________________________________________________________________
dropout_11 (Dropout)         (None, 128, 3)            0         
_________________________________________________________________
cu_dnngru_6 (CuDNNGRU)       (None, 32)                3552      
_________________________________________________________________
dropout_12 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 99        
Total params: 3,663
Trainable params: 3,657
Non-trainable params: 6
_________________________________________________________________
Train on 85226 samples, validate on 9441 samples
Epoch 1/3
[[2733 4092 2186]
 [   1 

[[ 682 6192 2170]
 [   6   75   12]
 [   4   32  312]]
 — val_micro_f1: 0.112704 — val_micro_precision: 0.112704 — val_micro_recall 0.112704
 — val_macro_f1: 0.127710 — val_macro_precision: 0.374185 — val_macro_recall 0.592804
Epoch 2/3
[[7307 1163  574]
 [  21   68    4]
 [  73   53  222]]
 — val_micro_f1: 0.800949 — val_micro_precision: 0.800949 — val_micro_recall 0.800949
 — val_macro_f1: 0.458061 — val_macro_precision: 0.439253 — val_macro_recall 0.725684
Epoch 3/3
[[7673 1352   19]
 [  14   77    2]
 [  27   28  293]]
 — val_micro_f1: 0.847970 — val_micro_precision: 0.847970 — val_micro_recall 0.847970
 — val_macro_f1: 0.633431 — val_macro_precision: 0.660218 — val_macro_recall 0.839440
TP: 7673, FP: 41 and FN: 1371
precision: 0.9946849883329012, recall: 0.8484077841662981 and f1-score: 0.9157417352906075
TP: 77, FP: 1380 and FN: 16
precision: 0.05284831846259437, recall: 0.8279569892473119 and f1-score: 0.09935483870967741
TP: 293, FP: 21 and FN: 55
precision: 0.9331210191082803,

TP: 7176, FP: 167 and FN: 1398
precision: 0.9772572518044396, recall: 0.8369489153254024 and f1-score: 0.9016774517811145
TP: 119, FP: 1444 and FN: 79
precision: 0.07613563659628919, recall: 0.601010101010101 and f1-score: 0.13515048268029528
TP: 178, FP: 9 and FN: 143
precision: 0.9518716577540107, recall: 0.5545171339563862 and f1-score: 0.7007874015748031
precision_u: 0.8218409765753877, recall_u: 0.8218409765753877 and f1-score_u: 0.8218409765753877
TP: 7176, FP: 167 and FN: 1398
precision: 0.9772572518044396, recall: 0.8369489153254024 and f1-score: 0.9016774517811145
TP: 119, FP: 1444 and FN: 79
precision: 0.07613563659628919, recall: 0.601010101010101 and f1-score: 0.13515048268029528
TP: 178, FP: 9 and FN: 143
precision: 0.9518716577540107, recall: 0.5545171339563862 and f1-score: 0.7007874015748031
precision_u: 0.8218409765753877, recall_u: 0.8218409765753877 and f1-score_u: 0.8218409765753877
TP: 7176, FP: 167 and FN: 1398
precision: 0.9772572518044396, recall: 0.836948915325

In [22]:
# Summary table: one row per (model, selection metric) pair.
results

Unnamed: 0,model_id,best_metric,f1-score_m mean,f1-score_m std,f1-score_m error,f1-score_m error std,f1-score_u mean,f1-score_u std,f1-score_u error,f1-score_u error std,balanced-accuracy mean,balanced-accuracy std,balanced-accuracy error,balanced-accuracy error std
0,model1,f1-macro,0.452409,0.081636,0.547591,0.081636,0.745133,0.069759,0.254867,0.069759,0.687216,0.061701,0.312784,0.061701
1,model1,f1-micro,0.446965,0.082076,0.553035,0.082076,0.759057,0.067495,0.240943,0.067495,0.648809,0.072213,0.351191,0.072213
2,model1,balanced-acc,0.44314,0.087229,0.55686,0.087229,0.715401,0.08326,0.284599,0.08326,0.725261,0.049533,0.274739,0.049533
3,model2,f1-macro,0.60434,0.034713,0.39566,0.034713,0.827066,0.04098,0.172934,0.04098,0.759968,0.066293,0.240032,0.066293
4,model2,f1-micro,0.59972,0.039555,0.40028,0.039555,0.83019,0.037484,0.16981,0.037484,0.759728,0.066244,0.240272,0.066244
5,model2,balanced-acc,0.587122,0.077137,0.412878,0.077137,0.808226,0.065969,0.191774,0.065969,0.766637,0.059436,0.233363,0.059436
