In [None]:
import sys

In [None]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt

from pathlib import Path
import pathlib
import shutil

from matplotlib import pyplot as plt
from IPython.display import clear_output
import numpy as np
import tensorflow as tf
import re

import os
from sklearn.model_selection import train_test_split
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

In [None]:
import sys
import os
import importlib

# Make the parent directory importable (presumably where the local `ansim`
# package lives). Guarded so that re-running this cell does not keep
# prepending the same entry to sys.path.
_project_root = os.path.abspath('../')
if _project_root not in sys.path:
    sys.path.insert(0, _project_root)

import ansim.loader
import ansim.preprocessor
import ansim.window_dataset
import ansim.dnn

# Reload the local modules, in the same order as they are imported, so that
# edits made to them on disk are picked up without restarting the kernel.
for _ansim_module in (
    ansim.loader,
    ansim.preprocessor,
    ansim.window_dataset,
    ansim.dnn,
):
    importlib.reload(_ansim_module)



## Set Experiment dataset settings and generate training and test sets

In [None]:
# Configure the preprocessor used by the dummy baseline and the DNN experiments.
data_preprocessor = ansim.preprocessor.Preprocessor()


# Input (X) and target (y) columns.
# NOTE(review): 'noininvasible_blood_pressure' looks misspelled ("noninvasive"?).
# It presumably has to match the column name in the data, so confirm before renaming.
data_preprocessor.x_columns= [ 'min', 'electrocardiogram_signal', 'noininvasible_blood_pressure',
                              'G-force_z_axis', 'blood_pressure_systolic_value']
data_preprocessor.y_columns= [ 'max_gz' ] # 'tolerance' is the commented-out alternative target

data_preprocessor.omit_baseline = True # when True, a run that has a max g-force <= 0.75 is dismissed
data_preprocessor.train_split= 0.7 # presumably the fraction of the data assigned to training — confirm
data_preprocessor.split_by= ansim.preprocessor.Preprocessor.SPLIT_BY_SUBJECT # this can be SPLIT_BY_SUBJECT or SPLIT_BY_RUN

# Windowed dataset settings (labelled "A.b" by the author); attached to the preprocessor below.
windowed = ansim.window_dataset.WindowDataset()
windowed.window_size = 3000 # rows per window (the recorded sanity check reports an x length of 3000)
windowed.shift = 1000 # presumably the offset between the starts of consecutive windows — confirm
windowed.batch_size = 512 # windows per batch (the recorded sanity check reports 512 per batch)
windowed.shuffle_buffer = 1000 # presumably the buffer size used when shuffling windows — confirm

data_preprocessor.windowDataset= windowed


## Run dummy baseline (average)

In [None]:
# Get the training and test data used by the dummy baseline.
# This is where the data is prepared:
## the max g-force and the tolerance are calculated
## runs with g-force < 0.65 are dismissed
## baseline runs (max g-force between 0.65 and 0.75) are kept only if "omit_baseline" is set to False
## each run's filename, max gz, tolerance and validity are saved here: data/experiment_runid_maxgz_tolerance.csv

# NOTE: the first run can take some time because the tolerance and max gz are being calculated.
# After that it still takes some time to concatenate all the data; the concatenated data cannot be saved and reused because it varies between experiments and is very large.
X_training, y_training, X_test, y_test = data_preprocessor.prepare_baseline_data()

In [None]:
# Build the dummy baseline from the prepared splits and print the metrics it
# reports for the "mean" strategy (mse and mae in the recorded output).
baseline = ansim.dnn.Baseline(
    X_training,
    y_training,
    X_test,
    y_test,
)
print(baseline.dummy_train_test(strategy="mean"))



{'mse': 0.1962, 'mae': 0.422}


In [None]:
# Drop the notebook-level references to the baseline arrays.
# NOTE(review): `baseline` was built from these arrays and may still hold
# references to them — confirm whether it should be cleared as well.
X_training = y_training = X_test = y_test = None

## Get the windowed training and test sets. These will be used in the DNN experiments

In [None]:
# Get the windowed training and test datasets.
# This is where the data is prepared:
## the max g-force and the tolerance are calculated
## runs with g-force < 0.65 are dismissed
## baseline runs (max g-force between 0.65 and 0.75) are kept only if "omit_baseline" is set to False
## the reasons for all invalid runs are saved in a CSV file
# The data is then windowed and ready to be used in the experiments.
dataset_train, dataset_test = data_preprocessor.get_windowed_data()

In [None]:
# Sanity check: report the batch, window and per-instance dimensions of the
# windowed data so they can be compared with the configured batch size,
# window size and number of x / y columns.
data_preprocessor.windowDataset.get_window_data_batch_shape()

512   per batch ( 512 )
512   per batch ( 512 )
3000  x length of 1 array in batch ( 3000 )
1  y length of 1 array in batch (1)
5  x values per instance  (should be equal to the # of x columns)
1  y values per instance  (should be equal to the # of y columns)


## Sequential 3-layer model

### Set Experiment sequential settings

In [None]:
# The model input shape is described by the window length and the number of x columns.
n_window_rows = data_preprocessor.windowDataset.window_size
n_features = len(data_preprocessor.x_columns)

sequential_model = ansim.dnn.SequentialModel(
    dataset_train,
    dataset_test,
    input_shape_instances=n_window_rows,
    input_shape_features=n_features,
)

# Training settings for the sequential experiment.
sequential_model.lr = 1e-3
sequential_model.epochs = 100
sequential_model.loss = "mse"
sequential_model.metrics = ["mse", "mae"]

### Train then evaluate

In [None]:
# Path prefix (no file extension) passed to train() as best_model_name; the
# saved model is later evaluated from this path with '.h5' appended.
SEQUENTIAL_BEST_MODEL = f"{data_preprocessor.data_root_path}models/best_maxgz_sequential_a"


In [None]:
# Train the sequential model with early stopping requested (patience of 20),
# saving the best model under SEQUENTIAL_BEST_MODEL and plotting the loss.
sequential_model.train(
    stop_early=True,
    best_model_name=SEQUENTIAL_BEST_MODEL,
    plot_loss=True,
    verbose=1,
    patience=20,
)


In [None]:
# Evaluate the in-memory ("overall") model — presumably the weights as they are
# when training stops, as opposed to the saved best model evaluated further down.
sequential_model.evaluate() # evaluate the overall model




In [None]:
# Print the layers and parameter counts of the sequential model.
# NOTE(review): the recorded summary shows a final Dense layer with 2 units while
# y_columns holds a single target ('max_gz') — confirm the output size in ansim.dnn.
sequential_model.model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 15000)             0         
_________________________________________________________________
dense (Dense)                (None, 100)               1500100   
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1010      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 22        
Total params: 1,501,132
Trainable params: 1,501,132
Non-trainable params: 0
_________________________________________________________________


In [None]:

# Save the in-memory ("overall") sequential model under the models/ directory.
sequential_model.model.save(
    f"{data_preprocessor.data_root_path}models/overall_maxgz_sequential_a"
)

In [None]:
# Evaluate the best sequential model saved during training (loaded from the '.h5' file).
# NOTE(review): the recorded test mse (~0.1955) is almost identical to the dummy-baseline
# mse (0.1962) — worth checking whether the model learns more than predicting the mean.
sequential_model.evaluate_saved_model(SEQUENTIAL_BEST_MODEL+'.h5', verbose=1)
# Disabled earlier call, kept together with the result noted next to it:
#saved_model.evaluate(dataset_test, verbose=1) #[0.595668613910675, 0.5941470861434937, 0.6601144075393677]

Train: loss, mse mae -->  [0.2257266342639923, 0.2257266342639923, 0.45023342967033386]
Test: loss, mse mae -->  [0.19550250470638275, 0.19550250470638275, 0.42000943422317505]


In [None]:
# Evaluate the saved "overall" sequential model, for comparison with the best model.
sequential_model.evaluate_saved_model(
    f"{data_preprocessor.data_root_path}models/overall_maxgz_sequential_a",
    verbose=1,
)


In [None]:
# Drop the notebook-level reference to the sequential model before the LSTM experiment.
sequential_model = None

## LSTM model

### Set Experiment dnn settings for tuning lr

In [None]:
# Clear any model left over from an earlier run of this cell before building a new one.
lstm_model = None

# The model input shape is described by the window length and the number of x columns.
n_window_rows = data_preprocessor.windowDataset.window_size
n_features = len(data_preprocessor.x_columns)

lstm_model = ansim.dnn.Lstm(
    dataset_train,
    dataset_test,
    input_shape_instances=n_window_rows,
    input_shape_features=n_features,
)

# The learning rate is tuned first, so it is deliberately not fixed here.
#lstm_model.lr=1e-8
lstm_model.epochs = 100
lstm_model.metrics = ["mse", "mae"]

### Running 100 epochs each with different lr

In [None]:
# Run the learning-rate search; the returned history is inspected in the next cell.
history_lr = lstm_model.tune_lr()

In [None]:
# Print the best entry found by the learning-rate search (0.0001 in the recorded run).
# NOTE(review): the helper is named get_best_epoch but the value noted here is a
# learning rate — confirm what it actually returns.
print(lstm_model.get_best_epoch(history_lr)) # 0.0001


# Disabled plot of loss against learning rate, kept for reference:
#plt.semilogx(history_lr.history["lr"], history_lr.history["loss"])
#plt.axis([1e-8, 1e-1, 0, max(history_lr.history["loss"])+1])
#plt.xlabel('learning rate')
#plt.ylabel('loss (Huber)')


In [None]:
# Set the best learning rate based on the search result printed above.
# NOTE(review): the value is hard-coded, so it must be updated by hand if the search result changes.
best_lr =  0.0001


### Set Experiment lstm with the best lr

In [None]:
# Final LSTM training settings, using the selected learning rate.
lstm_model.lr=best_lr
lstm_model.epochs=500
lstm_model.metrics = ["mse", "mae"] # same metrics as configured for the learning-rate search
lstm_model.loss = "mse"

### Running 500 epochs using the best lr

In [None]:
# Path prefix (no file extension) passed to train() as best_model_name; the
# saved model is later evaluated from this path with '.h5' appended.
LSTM_BEST_MODEL = f"{data_preprocessor.data_root_path}models/best_maxgz_lstm_a"


In [None]:
# Train the LSTM with early stopping requested (patience of 100), saving the
# best model under LSTM_BEST_MODEL and plotting the loss.
history = lstm_model.train(
    stop_early=True,
    best_model_name=LSTM_BEST_MODEL,
    plot_loss=True,
    verbose=1,
    patience=100,
)

In [None]:

# Evaluate the in-memory ("overall") model — presumably the weights as they are
# when training stops, as opposed to the saved best model evaluated further down.
lstm_model.evaluate() # evaluate the overall model





In [None]:
# Print the layers and parameter counts of the LSTM model.
# NOTE(review): the recorded summary shows a final Dense layer with 2 units while
# y_columns holds a single target ('max_gz') — confirm the output size in ansim.dnn.
lstm_model.model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 3000, 64)          9728      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 2)                 130       
Total params: 34,690
Trainable params: 34,690
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Evaluate the best LSTM model saved during training (loaded from the '.h5' file).
# NOTE(review): in the recorded run the test mse (~0.339) is higher than both the
# train mse (~0.168) and the dummy-baseline mse (0.1962) — this looks like
# overfitting and is worth investigating.
lstm_model.evaluate_saved_model(LSTM_BEST_MODEL+'.h5', verbose=1)

Train: loss, mse mae -->  [0.16775135695934296, 0.16775135695934296, 0.3502233326435089]
Test: loss, mse mae -->  [0.33850833773612976, 0.33850833773612976, 0.5270208120346069]
