In [None]:
!pip install tsgm python-dotenv

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import zipfile
import os
import shutil
import random
import re
import seaborn as sns
from scipy import stats, signal
from collections import Counter
import matplotlib.pyplot as plt
import tsgm
import csv
import dotenv
import gspread


import sys
from google.auth import default
from google.colab import auth, drive
from tqdm import tqdm
from tqdm.notebook import tqdm

# Register tqdm's progress-aware helpers on pandas objects.
tqdm.pandas()

# Mount Google Drive, forcing a remount on every run of this cell.
drive.mount(mountpoint='/content/drive', force_remount=True)

# Load environment variables from the .env file kept on Drive; MAIN_EXP_PATH and
# UTIL_SCRIPT_PATH are read from the environment later in the notebook.
dotenv.load_dotenv(dotenv_path='/content/drive/MyDrive/.env')

# When True, augmented CSV files that already exist are removed and rewritten;
# when False, existing augmented files are left untouched.
DELETE_AUG_SAMPLE = False
# Dataset procedures whose processed folders are scanned and augmented.
DATASET_PROCEDURES = ["DP_1", "DP_2"]
# Number of augmented samples generated for each augmentation procedure.
# NOTE(review): this literal previously listed "AP_6" twice (10, then 11). Python
# keeps the last value, so 11 is retained to preserve the effective configuration.
# Confirm whether 10 was meant for a different procedure.
AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE = {
  "AP_2": 1,
  "AP_3": 1,
  "AP_4": 2,
  "AP_5": 9,
  "AP_6": 11,
  "AP_7": 11,
  "AP_8": 22,
  "AP_9": 22,
}
AUGMENTATION_PROCEDURES = AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE.keys()
# Root of the experiment data. If MAIN_EXP_PATH is not set this is None and the
# processed path below becomes the literal string "None(Processed)".
MAIN_PATH = os.environ.get('MAIN_EXP_PATH')
MAIN_PROCESSED_PATH = f"{MAIN_PATH}(Processed)"



In [None]:
# Make the project's utility scripts importable. The folder is taken from the
# UTIL_SCRIPT_PATH environment variable; when the variable is unset nothing is
# added, so that `None` never ends up inside sys.path.
util_script_path = os.environ.get('UTIL_SCRIPT_PATH')
if util_script_path:
  sys.path.insert(0, util_script_path)
from analyze_and_transform_datasets import list_files_scandir, \
                                          get_formatted_values, \
                                          format_time_str

In [None]:

# Metric CSV files looked up for every sample.
metric_file_list = ["ST.csv", "EDA.csv", "EDA_Tonic.csv", "EDA_Phasic.csv", "BVP.csv"]
# Entries gathered for each dataset procedure, keyed by procedure name.
all_data_infos = {}

for DATASET_PROCEDURE in DATASET_PROCEDURES:
  procedure_root = f'{MAIN_PROCESSED_PATH} {DATASET_PROCEDURE}'
  # A fresh list per procedure; presumably `list_files_scandir` fills it in
  # place, since its length is reported right after the call — TODO confirm.
  all_data_list = []
  list_files_scandir(metric_file_list, all_data_list, procedure_root, MAIN_PATH)
  all_data_infos[DATASET_PROCEDURE] = all_data_list
  print(f'Number of files will be used in procedure {DATASET_PROCEDURE} : {len(all_data_list)}')


### Physiological Metric Collection

Load one randomly chosen sample and show its metrics.

In [None]:
# Pick one dataset procedure at random, then draw up to three of its entries.
# The draw is capped at the number of available entries so that a procedure with
# fewer than three samples does not make `random.sample` raise ValueError.
random_procedure = random.choice(DATASET_PROCEDURES)
procedure_entries = all_data_infos[random_procedure]
random_samples = random.sample(procedure_entries, min(3, len(procedure_entries)))

In [None]:


# Folder of the first randomly drawn sample:
# "<processed root> <procedure>/<diagnose_result>/<sample_name>/<game_name>".
first_sample = random_samples[0]
sample_path = '/'.join([
  MAIN_PROCESSED_PATH + ' ' + random_procedure,
  first_sample['diagnose_result'],
  first_sample['sample_name'],
  first_sample['game_name'],
])

# Load the ST metric, rename its value column and index the rows by time.
# NOTE(review): this cell drops an 'ST' column while the other metric cells drop
# 'Unnamed: 0' — confirm which column is actually present in ST.csv.
df = (
  pd.read_csv(f'{sample_path}/ST.csv')
  .drop(columns=['ST'], errors='ignore')
  .rename(columns={"values": "ST_values"})
)
df['time'] = pd.to_datetime(df['time'].apply(format_time_str), unit='s')
df = df.sort_values(by='time', ascending=True).set_index('time')
df.head(10)

In [None]:
# Keep the ST readings from the frame loaded in the previous cell as a float64
# array before `df` is rebound to the next metric below.
st_series = df['ST_values'].apply(get_formatted_values)
st_values = st_series.astype(np.float64).values

# Load the EDA metric, rename its value column and index the rows by time.
df = (
  pd.read_csv(f'{sample_path}/EDA.csv')
  .drop(columns=['Unnamed: 0'], errors='ignore')
  .rename(columns={"values": "EDA_values"})
)
df['time'] = pd.to_datetime(df['time'].apply(format_time_str), unit='s')
df = df.sort_values(by='time', ascending=True).set_index('time')
df.head(10)

In [None]:
# Keep the EDA readings from the frame loaded in the previous cell. They are cast
# to float64 so the array has the same dtype as `st_values` and as the values
# prepared in `augment_sample`.
eda_values = df['EDA_values'].apply(get_formatted_values).astype(np.float64).values

# Load the tonic EDA component, rename its value column and index the rows by time.
df = (
  pd.read_csv(f'{sample_path}/EDA_Tonic.csv')
  .drop(columns=['Unnamed: 0'], errors='ignore')
  .rename(columns={"values": "EDA_Tonic_values"})
)
df['time'] = pd.to_datetime(df['time'].apply(format_time_str), unit='s')
df = df.sort_values(by='time', ascending=True).set_index('time')
df.head(10)

In [None]:
# Keep the tonic EDA readings from the frame loaded in the previous cell. They are
# cast to float64 so the array has the same dtype as `st_values` and as the
# values prepared in `augment_sample`.
eda_tonic_values = df['EDA_Tonic_values'].apply(get_formatted_values).astype(np.float64).values

# Load the phasic EDA component, rename its value column and index the rows by time.
df = (
  pd.read_csv(f'{sample_path}/EDA_Phasic.csv')
  .drop(columns=['Unnamed: 0'], errors='ignore')
  .rename(columns={"values": "EDA_Phasic_values"})
)
df['time'] = pd.to_datetime(df['time'].apply(format_time_str), unit='s')
df = df.sort_values(by='time', ascending=True).set_index('time')
df.head(10)

In [None]:
# Keep the phasic EDA readings from the frame loaded in the previous cell. They
# are cast to float64 so the array has the same dtype as `st_values` and as the
# values prepared in `augment_sample`.
eda_phasic_values = df['EDA_Phasic_values'].apply(get_formatted_values).astype(np.float64).values

# Load the BVP metric, rename its value column and index the rows by time.
df = (
  pd.read_csv(f'{sample_path}/BVP.csv')
  .drop(columns=['Unnamed: 0'], errors='ignore')
  .rename(columns={"values": "BVP_values"})
)
df['time'] = pd.to_datetime(df['time'].apply(format_time_str), unit='s')
df = df.sort_values(by='time', ascending=True).set_index('time')
# BVP is the last metric, so its values are extracted in this same cell.
bvp_values = df['BVP_values'].apply(get_formatted_values).astype(np.float64).values
df.head(10)

#### Data Augmentation

In [None]:
# Augmentation approach adapted from: https://towardsdatascience.com/time-series-augmentations-16237134b29b


 Jittering / Gaussian Noise


In [None]:
# Jittering demo: add Gaussian noise to each feature of `X` separately and plot
# the result next to the originals.
# NOTE(review): `X` and `plot_samples_aug` are not defined in any visible cell;
# they must already exist in the kernel for this cell to run.
aug_model = tsgm.models.augmentations.GaussianNoise()
n_series, n_steps, n_features = X.shape[0], X.shape[1], X.shape[2]
n_augmented = n_series * 10
samples = np.zeros((n_augmented, n_steps, n_features))
for feat in range(n_features):
  # The augmenter is fed one feature at a time as a (series, steps, 1) array.
  single_feature = np.reshape(X[:, :, feat], (n_series, n_steps, 1))
  jittered = aug_model.generate(X=single_feature, n_samples=n_augmented, variance=0.2)
  samples[:, :, feat] = np.reshape(jittered, (jittered.shape[0], jittered.shape[1]))
plot_samples_aug(X, samples)

In [None]:

# Jitter the five metrics of the randomly chosen sample and plot the augmented
# series next to the original ones.
# NOTE(review): `plot_samples_aug` is not defined in any visible cell; it must
# already exist in the kernel for the final call to run.
aug_model = tsgm.models.augmentations.GaussianNoise()

# Metric order matches `metric_file_list`: ST, EDA, EDA tonic, EDA phasic, BVP.
metric_values = [
  np.asarray(values, dtype=np.float64)
  for values in (st_values, eda_values, eda_tonic_values, eda_phasic_values, bvp_values)
]

# The metrics can have different lengths, so the original sample is stored in a
# NaN-padded (1, max_length, n_metrics) matrix.
max_length = max(len(values) for values in metric_values)
sample_matrix = np.full((1, max_length, len(metric_values)), np.nan)
for metric_idx, values in enumerate(metric_values):
  sample_matrix[0, :len(values), metric_idx] = values

# Ten augmented series per original sample, NaN-padded in the same way.
n_augmented = sample_matrix.shape[0] * 10
samples = np.full((n_augmented, max_length, len(metric_values)), np.nan)
for metric_idx, values in enumerate(metric_values):
  resahpe_X = np.reshape(values, (1, len(values), 1))
  aug_result = aug_model.generate(X=resahpe_X, n_samples=n_augmented, variance=0.3)
  samples[:, :len(values), metric_idx] = np.reshape(aug_result, (aug_result.shape[0], aug_result.shape[1]))

plot_samples_aug(sample_matrix, samples, n_samples=1, n_samples_aug=5, name_features=True)

Magnitude Warping

In [None]:
# Magnitude-warping demo: warp each feature of `X` separately and plot the
# result next to the originals.
# NOTE(review): `X` and `plot_samples_aug` are not defined in any visible cell;
# they must already exist in the kernel for this cell to run.
aug_model = tsgm.models.augmentations.MagnitudeWarping()
n_series, n_steps, n_features = X.shape[0], X.shape[1], X.shape[2]
n_augmented = n_series * 10
samples = np.zeros((n_augmented, n_steps, n_features))
for feat in range(n_features):
  # The augmenter is fed one feature at a time as a (series, steps, 1) array.
  single_feature = np.reshape(X[:, :, feat], (n_series, n_steps, 1))
  warped = aug_model.generate(X=single_feature, n_samples=n_augmented, sigma=1)
  samples[:, :, feat] = np.reshape(warped, (warped.shape[0], warped.shape[1]))
plot_samples_aug(X, samples)

In [None]:

# Apply magnitude warping to the five metrics of the randomly chosen sample and
# plot the augmented series next to the original ones.
# NOTE(review): `plot_samples_aug` is not defined in any visible cell; it must
# already exist in the kernel for the final call to run.
aug_model = tsgm.models.augmentations.MagnitudeWarping()

# Metric order matches `metric_file_list`: ST, EDA, EDA tonic, EDA phasic, BVP.
metric_values = [
  np.asarray(values, dtype=np.float64)
  for values in (st_values, eda_values, eda_tonic_values, eda_phasic_values, bvp_values)
]

# The metrics can have different lengths, so the original sample is stored in a
# NaN-padded (1, max_length, n_metrics) matrix.
max_length = max(len(values) for values in metric_values)
sample_matrix = np.full((1, max_length, len(metric_values)), np.nan)
for metric_idx, values in enumerate(metric_values):
  sample_matrix[0, :len(values), metric_idx] = values

# Ten augmented series per original sample, NaN-padded in the same way.
n_augmented = sample_matrix.shape[0] * 10
samples = np.full((n_augmented, max_length, len(metric_values)), np.nan)
for metric_idx, values in enumerate(metric_values):
  resahpe_X = np.reshape(values, (1, len(values), 1))
  aug_result = aug_model.generate(X=resahpe_X, n_samples=n_augmented, sigma=0.2)
  samples[:, :len(values), metric_idx] = np.reshape(aug_result, (aug_result.shape[0], aug_result.shape[1]))

plot_samples_aug(sample_matrix, samples, n_samples=1, n_samples_aug=5, name_features=True)

Both Jittering and Warping Applied

In [None]:
def get_constrained_augmented_smaples(original_value_list):
  """Create augmented copies of one multi-metric sample for every augmentation procedure.

  Each metric is augmented on its own. The procedure name selects the transform:

  - 'AP_2': Gaussian noise with variance=0.3.
  - 'AP_3': magnitude warping with sigma=0.15.
  - 'AP_5': Gaussian noise (variance=0.3) on a third of the requested amount,
    followed by magnitude warping (sigma=0.15) up to the requested amount.
  - 'AP_6' / 'AP_8': magnitude warping with sigma=0.2.
  - 'AP_7' / 'AP_9': Gaussian noise with variance=0.5.
  - any other procedure (e.g. 'AP_4'): the unmodified series is repeated.

  NOTE(review): despite the function name, no value constraint (such as clipping
  to the original min/max) is applied — confirm whether one was intended.

  Args:
    original_value_list: sequence of 1-D numeric sequences (lists, arrays or
      pandas Series), one per metric. The lengths may differ.

  Returns:
    dict mapping every key of AUGMENTATION_PROCEDURES to a float array shaped
    (n_samples, max_length, n_metrics), where n_samples comes from
    AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE and shorter metrics are NaN-padded.

  Raises:
    ValueError: if `original_value_list` is empty.
  """
  aug_model_jittering = tsgm.models.augmentations.GaussianNoise()
  aug_model_warping = tsgm.models.augmentations.MagnitudeWarping()

  # Plain float arrays keep np.reshape independent of how pandas objects are
  # handled by the installed numpy/pandas versions.
  metric_arrays = [np.asarray(values, dtype=np.float64) for values in original_value_list]
  n_metrics = len(metric_arrays)
  max_length = max(len(values) for values in metric_arrays)

  aug_datas = {}

  for AUGMENTATION_PROCEDURE in AUGMENTATION_PROCEDURES:
    n_samples = AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE[AUGMENTATION_PROCEDURE]
    samples = np.full((n_samples, max_length, n_metrics), np.nan)
    for metric_idx, value_list in enumerate(metric_arrays):
      # The augmenters are fed a single series shaped (1, length, 1).
      resahpe_X = np.reshape(value_list, (1, len(value_list), 1))
      if AUGMENTATION_PROCEDURE == 'AP_2':
        aug_result = aug_model_jittering.generate(X=resahpe_X, n_samples=n_samples, variance=0.3)
      elif AUGMENTATION_PROCEDURE == 'AP_3':
        aug_result = aug_model_warping.generate(X=resahpe_X, n_samples=n_samples, sigma=0.15)
      elif AUGMENTATION_PROCEDURE == 'AP_5':
        # At least one jittered series is needed as input for the warping step.
        n_jittered = max(1, n_samples // 3)
        aug_result = aug_model_jittering.generate(X=resahpe_X, n_samples=n_jittered, variance=0.3)
        aug_result = aug_model_warping.generate(X=aug_result, n_samples=n_samples, sigma=0.15)
      elif AUGMENTATION_PROCEDURE in ('AP_6', 'AP_8'):
        aug_result = aug_model_warping.generate(X=resahpe_X, n_samples=n_samples, sigma=0.2)
      elif AUGMENTATION_PROCEDURE in ('AP_7', 'AP_9'):
        aug_result = aug_model_jittering.generate(X=resahpe_X, n_samples=n_samples, variance=0.5)
      else:
        # No transform is defined for this procedure: the single original row is
        # broadcast over all n_samples rows by the assignment below.
        aug_result = resahpe_X.copy()
      samples[:, :len(value_list), metric_idx] = np.reshape(aug_result, (aug_result.shape[0], aug_result.shape[1]))

    aug_datas[AUGMENTATION_PROCEDURE] = samples

  return aug_datas



In [None]:

# Run every augmentation procedure on the randomly chosen sample and plot each
# procedure's output against the original series.
# NOTE(review): `sample_matrix` and `plot_samples_aug` are not defined in this
# cell; they must already exist in the kernel.
metric_value_arrays = [st_values, eda_values, eda_tonic_values, eda_phasic_values, bvp_values]
agg_sample_info = get_constrained_augmented_smaples(metric_value_arrays)
for agg_sample_key, augmented in agg_sample_info.items():
  print(f'======================={agg_sample_key}======================')
  plot_samples_aug(
    sample_matrix,
    augmented,
    n_samples=1,
    n_samples_aug=augmented.shape[0],
    name_features=True,
  )

Augment every sample in the dataset and write the results to augmented files

In [None]:

def augment_sample(data_infos, dt_procedure):
  """Augment every metric file of one sample and write the results as CSV files.

  The metric files listed in `metric_file_list` are read from
  "<MAIN_PROCESSED_PATH> <dt_procedure>/<diagnose_result>/<sample_name>/<game_name>",
  augmented with `get_constrained_augmented_smaples`, and each augmented series
  is written next to its source as "Aug-<procedure>-<n>_<metric file>".
  A file that already exists is only rewritten when DELETE_AUG_SAMPLE is True.

  Args:
    data_infos: mapping with the keys 'diagnose_result', 'sample_name' and
      'game_name' that identify the sample folder.
    dt_procedure: name of the dataset procedure the sample belongs to.

  Returns:
    None. The results are written to disk and progress is printed.
  """
  sample_path = data_infos['diagnose_result']
  sample_path += '/' + data_infos['sample_name']
  sample_path += '/' + data_infos['game_name']
  sample_dir = f'{MAIN_PROCESSED_PATH} {dt_procedure}/{sample_path}'
  value_list = []
  df_list = []
  print(f'Reading values in file {sample_path} ...')
  for metric_file in metric_file_list:
    df = pd.read_csv(f'{sample_dir}/{metric_file}')
    df['datetime'] = pd.to_datetime(df['time'].apply(format_time_str), unit='s')
    df = df.sort_values(by='datetime', ascending=True).set_index('datetime')
    df_list.append(df)
    # Hand plain float arrays (not pandas Series) to the augmentation step.
    value_list.append(df['values'].apply(get_formatted_values).astype(np.float64).to_numpy())

  agg_sample_info = get_constrained_augmented_smaples(value_list)

  for agg_sample_key, samples in agg_sample_info.items():
    for aug_sample_idx in range(samples.shape[0]):
      for metric_idx, metric_file in enumerate(metric_file_list):
        augmented_file_name = f'{sample_dir}/Aug-{agg_sample_key}-{(aug_sample_idx + 1)}_{metric_file}'
        already_exists = os.path.exists(augmented_file_name)
        if already_exists and not DELETE_AUG_SAMPLE:
          # Checked before building the frame so skipped files cost nothing.
          print(f'{augmented_file_name} file already exist')
          continue

        n_values = len(value_list[metric_idx])
        aug_df_copy = df_list[metric_idx].copy()
        # Drop the NaN padding: only the first n_values entries belong to this metric.
        aug_df_copy['values'] = samples[aug_sample_idx, :n_values, metric_idx]
        # Replace the datetime index with a plain 0..n-1 index for the output file.
        aug_df_copy.index = np.arange(n_values)
        aug_df_copy = aug_df_copy.drop(columns=['Unnamed: 0'], errors='ignore')

        if already_exists:
          os.remove(augmented_file_name)
          aug_df_copy.to_csv(augmented_file_name, index=True)
          print(f'{augmented_file_name} file recreated')
        else:
          aug_df_copy.to_csv(augmented_file_name, index=True)
          print(f'{augmented_file_name} file created')



In [None]:

# Augment every collected sample of every dataset procedure, with one progress
# bar per procedure. The bar is labelled as augmentation because that is the
# work done here; no training happens in this loop.
for DATASET_PROCEDURE in DATASET_PROCEDURES:
  procedure_entries = all_data_infos[DATASET_PROCEDURE]
  for data_infos in tqdm(procedure_entries, desc=f'Augmentation process for {DATASET_PROCEDURE} dataset procedure'):
    augment_sample(data_infos, DATASET_PROCEDURE)

print("Augmentation Completed!")