In [None]:
# %reset -f

In [None]:
!pip install python-dotenv imbalanced-learn

In [None]:
!pip install --upgrade tensorflow

In [None]:

%load_ext autoreload
%autoreload 2

import dotenv
import gspread
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile
import os
import random
import re
import datetime
import math
import copy
import seaborn as sns
from scipy import stats, signal
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.impute import KNNImputer
from imblearn.under_sampling import RandomUnderSampler


from tensorflow import keras
from IPython.display import SVG, display, Image

from google.auth import default
from google.colab import auth, drive


# Mount Google Drive inside the Colab runtime; force_remount=True re-mounts even if a mount already exists.
drive.mount('/content/drive', force_remount=True)


# Load environment variables (MAIN_EXP_PATH, EXPR_PROC_FILE_CODE, ... read below) from the .env file on Drive.
dotenv.load_dotenv('/content/drive/MyDrive/.env')

# Register tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

DELETE_FEATURE_SAMPLE = False

# ---- Default experiment tables -------------------------------------------------
# Every table below is a fallback: the worksheet loop at the end of this cell
# overwrites it when the matching worksheet exists in the spreadsheet.
DATASET_PROCEDURES = ["DP_1", "DP_2"]
# Augmentation procedure code -> number of augmented file variants per sample.
AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE = {'AP_1': 0, 'AP_2': 1, 'AP_3': 1, 'AP_5': 9, 'AP_6': 11, 'AP_7': 11, 'AP_8': 22, 'AP_9': 22}
# NOTE: this is a dict keys view (not a list), so it tracks later changes to that dict object.
AUGMENTATION_PROCEDURES = AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE.keys()
# Batch procedure code -> list of the single augmentation procedures it combines.
AUGMENTATION_BATCH_AMOUNT_BY_PROCEDURE = {'AP_4': ['AP_1', 'AP_2', 'AP_3'], 'AP_ALL': ['AP_1', 'AP_2', 'AP_3', 'AP_5', 'AP_6', 'AP_7'], 'AP_ALL_2': ['AP_1', 'AP_2', 'AP_3', 'AP_5', 'AP_8', 'AP_9'], 'AP_ALL_3': ['AP_1', 'AP_2', 'AP_3', 'AP_5', 'AP_6', 'AP_7', 'AP_8', 'AP_9'], 'AP_ALL_4': ['AP_1', 'AP_2', 'AP_3', 'AP_5']}
FEATURE_PROCEDURES = ['FP_1', 'FP_2', 'FP_3', 'FP_4', 'FP_5', 'FP_6', 'FP_7', 'FP_8', 'FP_9', 'FP_10', 'FP_11', 'FP_12', 'FP_13', 'FP_14']
# Feature procedure code -> expert label column to use; None means no single expert is selected.
EXPERT_SELECTION_PROCEDURE = {'FP_1': None, 'FP_2': None, 'FP_3': None, 'FP_4': None, 'FP_5': None, 'FP_6': 'Expert1', 'FP_7': 'Expert2', 'FP_8': 'Expert3', 'FP_9': None, 'FP_10': None, 'FP_11': None, 'FP_12': 'Expert1', 'FP_13': 'Expert2', 'FP_14': 'Expert3'}
# Feature procedure code -> list of diagnose groups to keep; None means no group filter.
DIAGNOSE_GROUP_SELECTION_PROCEDURE = {'FP_1': None, 'FP_2': None, 'FP_3': None, 'FP_4': None, 'FP_5': ['Typically Developed'], 'FP_6': None, 'FP_7': None, 'FP_8': None, 'FP_9': ['Intellectual Disabilities'], 'FP_10': ['Dyslexia', 'Obstetric Brachial Plexus Injuries'], 'FP_11': None, 'FP_12': None, 'FP_13': None, 'FP_14': None}
EXPERT_NAME_LIST = ['Expert1', 'Expert2', 'Expert3']
# Stays empty in this cell; nothing visible here populates it.
DIAGNOSE_GROUP_NAME_LIST = []

# os.environ.get returns None when MAIN_EXP_PATH is unset, in which case
# MAIN_PROCESSED_PATH becomes the literal string "None(Processed)".
MAIN_PATH = os.environ.get('MAIN_EXP_PATH')
MAIN_PROCESSED_PATH = f"{MAIN_PATH}(Processed)"

# Authenticate the Colab user and open the experiment-procedure spreadsheet
# whose key is stored in the EXPR_PROC_FILE_CODE environment variable.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)
spreadsheet = gc.open_by_key(os.environ.get('EXPR_PROC_FILE_CODE'))

# Override the default tables with the rows marked active in the spreadsheet.
# 'Is Active' is compared against the string 'TRUE' (presumably sheet cells arrive as text - TODO confirm).
for worksheet in  spreadsheet.worksheets():
  if worksheet.title == 'DatasetProcedures':
    df = pd.DataFrame(worksheet.get())
    # Promote the first sheet row to column headers, then drop that row from the data.
    df.columns = df.iloc[0]
    df = df.drop(0)
    DATASET_PROCEDURES = list(df[df['Is Active'] == 'TRUE']['Procedure Code'].values)

  if worksheet.title == 'AugmentationProcedures':
    df = pd.DataFrame(worksheet.get())
    # Promote the first sheet row to column headers, then drop that row from the data.
    df.columns = df.iloc[0]
    df = df.drop(0)
    augmentation_procedure_list = list(df[df['Is Active'] == 'TRUE'][['Procedure Code', 'Number Of Augmentation', 'Augmentation Steps']].values)
    # Rows with a non-empty 'Number Of Augmentation' are single procedures (code -> int amount).
    AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE = {sheet_data[0]: int(sheet_data[1]) for sheet_data in augmentation_procedure_list if sheet_data[1] != ''}
    AUGMENTATION_PROCEDURES = AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE.keys()
    # Rows with an empty amount are batch procedures: their comma-separated 'Augmentation Steps' list the members.
    AUGMENTATION_BATCH_AMOUNT_BY_PROCEDURE = {sheet_data[0]: [aug_proc.strip() for aug_proc in sheet_data[2].split(',')] for sheet_data in augmentation_procedure_list if sheet_data[1] == ''}


  if worksheet.title == 'FeatureSelections':
    df = pd.DataFrame(worksheet.get())
    # Promote the first sheet row to column headers, then drop that row from the data.
    df.columns = df.iloc[0]
    df = df.drop(0)
    FEATURE_PROCEDURES = list(df[df['Is Active'] == 'TRUE']['Procedure Code'].values)
    # 'All' in the sheet is mapped to None (no single expert selected).
    EXPERT_SELECTION_PROCEDURE = {sheet_data[0]: sheet_data[1] if sheet_data[1] != 'All' else None  for sheet_data in df[df['Is Active'] == 'TRUE'][['Procedure Code', 'Selected Expert']].values}
    # NOTE(review): unlike the default above, this list keeps one entry per active procedure,
    # so it can contain None and repeated expert names - confirm that consumers expect that.
    EXPERT_NAME_LIST = list(EXPERT_SELECTION_PROCEDURE.values())
    # 'All' is mapped to None (no filter); otherwise the comma-separated group names become a list.
    DIAGNOSE_GROUP_SELECTION_PROCEDURE = {sheet_data[0]: [specific_diagnosed_group.strip() for specific_diagnosed_group in sheet_data[1].split(',')] if sheet_data[1] != 'All' else None  for sheet_data in df[df['Is Active'] == 'TRUE'][['Procedure Code', 'Diagnose Group']].values}



In [None]:
# Show the diagnose-group selection mapping (feature procedure code -> group list or None) as the cell output.
DIAGNOSE_GROUP_SELECTION_PROCEDURE

In [None]:
# Make the project's helper scripts importable: the folder named by the
# UTIL_SCRIPT_PATH environment variable is placed at the front of sys.path.
util_script_path = os.environ.get('UTIL_SCRIPT_PATH')
if util_script_path is None:
  print("Warning: UTIL_SCRIPT_PATH environment variable is not set.")
  # Consider providing a default path or handling the error differently
else:
  sys.path.insert(0, util_script_path)

In [None]:
from analyze_and_transform_datasets import list_files_scandir, get_formatted_values

In [None]:

# Build, for every dataset procedure, the list of sample descriptors whose
# processed folder contains the feature files named in `allowed_files`.
# Each descriptor is read below as a mapping with the keys 'diagnose_result',
# 'sample_name' and 'game_name' (list_files_scandir is a project helper that is
# handed the list to fill - presumably it appends one descriptor per sample/game
# folder; TODO confirm against its implementation).
all_data_infos = {}
all_data_list = []
allowed_files = ["ST_Features_10s.csv","EDA_Phasic_Features_10s.csv","EDA_Tonic_Features_10s.csv","EDA_Features_10s.csv","BVP_Features_10s.csv"]

for DATASET_PROCEDURE in DATASET_PROCEDURES:
  # A fresh list per procedure so the entries stored in all_data_infos never alias each other.
  all_data_list = []
  list_files_scandir(allowed_files, all_data_list, f'{MAIN_PROCESSED_PATH} {DATASET_PROCEDURE}', MAIN_PATH)
  if DATASET_PROCEDURE == 'DP_2':
    # One specific recording is excluded from DP_2. Look it up defensively:
    # indexing `[...][0]` would raise an opaque IndexError when it is absent.
    removed_idx = next(
      (
        idx for idx, data_obj in enumerate(all_data_list)
        if data_obj['diagnose_result'] == 'Intellectual Disabilities' and data_obj['sample_name'] == 'C8' and data_obj['game_name'] == 'CatchAPet'
      ),
      None,
    )
    if removed_idx is None:
      print('Sample Intellectual Disabilities/C8/CatchAPet not found in DP_2; nothing was excluded.')
    else:
      all_data_list.pop(removed_idx)

  print(f'Number of files will be used in procedure {DATASET_PROCEDURE}: {len(all_data_list)}')
  all_data_infos[DATASET_PROCEDURE] = all_data_list


#### Util Functions

In [None]:


def get_metric_dataframes(data_infos, dt_procedure, file_name, referenced_main_path = './'):
  """Load one feature-metric CSV of a sample, or return None when the file is missing.

  The file is looked up at
  ``{referenced_main_path} {dt_procedure}/{diagnose_result}/{sample_name}/{game_name}/{file_name}``.

  Args:
    data_infos: mapping with the keys 'diagnose_result', 'sample_name' and 'game_name'.
    dt_procedure: dataset procedure code, appended to the main path after a space.
    file_name: name of the CSV file inside the sample folder.
    referenced_main_path: path prefix of the processed dataset folders.

  Returns:
    The CSV content as a DataFrame without the 'Unnamed: 0' and 'Seconds'
    columns and with a fresh 0..n-1 index, or None (after printing a message)
    when the file does not exist.
  """
  sample_path = '/'.join([data_infos['diagnose_result'], data_infos['sample_name'], data_infos['game_name']])
  df_path = f'{referenced_main_path} {dt_procedure}/{sample_path}/{file_name}'

  # Return early on a missing file: callers test the result against None, and
  # touching `.index` on None used to raise AttributeError here.
  if not os.path.exists(df_path):
    print(f'File {df_path} does not exist!')
    return None

  df = pd.read_csv(df_path)

  # Drop the saved-index and time-stamp helper columns when they are present.
  helper_columns = [column_name for column_name in ('Unnamed: 0', 'Seconds') if column_name in df.columns]
  if helper_columns:
    df = df.drop(columns=helper_columns)

  return df.reset_index(drop=True)

In [None]:
def get_labels_df(data_infos, referenced_main_path, expert_column=None):
  """Read a sample's ExpertLabels.csv and return its labels as a one-column frame.

  The file is read from
  ``{referenced_main_path}/{diagnose_result}/{sample_name}/{game_name}/ExpertLabels.csv``.
  Rows containing NaN are dropped, the first remaining row is skipped, and the
  rows from position 1 up to (but excluding) position ``number_of_labels`` are
  used, where ``number_of_labels`` counts the rows after the first that hold at
  least one string label.
  NOTE(review): that slice leaves out the last counted row - kept as in the
  original; confirm whether it is intentional.

  Args:
    data_infos: mapping with the keys 'diagnose_result', 'sample_name' and 'game_name'.
    referenced_main_path: root folder of the (unprocessed) dataset.
    expert_column: 'Expert1', 'Expert2' or 'Expert3' to take that expert's
      labels; None to take the most frequent label of the three experts per row
      (on ties the label that appears first in expert order wins).

  Returns:
    DataFrame with a single 'Diagnose' column, NaN-free, indexed 0..n-1.
  """
  expert_columns = ['Expert1', 'Expert2', 'Expert3']
  sample_path = '/'.join([data_infos['diagnose_result'], data_infos['sample_name'], data_infos['game_name']])
  expert_label_df = pd.read_csv(f'{referenced_main_path}/{sample_path}/ExpertLabels.csv').dropna()

  # Count the rows (after the first) in which at least one expert gave a string label.
  number_of_labels = sum(
    1
    for label_values in expert_label_df[expert_columns].values[1:]
    if any(isinstance(label_value, str) for label_value in label_values)
  )
  expert_label_df = expert_label_df[expert_columns][1: number_of_labels]

  if expert_column is None:
    majority_labels = []
    for label_values in expert_label_df.values:
      vote_counts = Counter(label_values)
      majority_label = max(vote_counts, key=vote_counts.get)
      # Only string labels are kept.
      if isinstance(majority_label, str):
        majority_labels.append(majority_label)
    label_result_df = pd.DataFrame({'Diagnose': majority_labels})
  else:
    label_result_df = expert_label_df[[expert_column]].rename(columns={expert_column: 'Diagnose'})

  # The result only ever holds 'Diagnose'; the former drop of the three expert
  # columns raised KeyError on every call and has been removed.
  return label_result_df.dropna().reset_index(drop=True)



In [None]:


def get_stress_sample_rate(param_all_data_infos, referenced_main_path = './', dataset_procedures = [], expert_name=None, print_result=False):
  """Compute, per dataset procedure, the share of 'Stress' labels of every sample.

  Args:
    param_all_data_infos: mapping dataset procedure code -> list of sample
      descriptors (each accepted by get_labels_df).
    referenced_main_path: root folder handed to get_labels_df.
    dataset_procedures: dataset procedure codes to process (read only, never mutated).
    expert_name: expert column handed to get_labels_df; None selects the majority label.
    print_result: when True, print every sample's rate and, per procedure, the
      mean rate together with the running total of label rows.

  Returns:
    dict mapping each dataset procedure code to the list of stress rates
    (floats in [0, 1]), in the order of the procedure's sample list.

  Raises:
    ZeroDivisionError: when a sample has no label rows.
  """
  stress_rate_infos = {}
  total_label_count = 0
  for dataset_procedure in dataset_procedures:
    stress_rates = []
    # Read the samples from the argument; the previous code ignored it and
    # silently used the notebook-global `all_data_infos` instead.
    for data_infos in param_all_data_infos[dataset_procedure]:
      labels_df = get_labels_df(data_infos, referenced_main_path, expert_name)
      label_count = len(labels_df)
      stress_rate = int((labels_df['Diagnose'] == 'Stress').sum()) / label_count

      if print_result:
        print(data_infos, stress_rate, label_count)

      total_label_count += label_count
      stress_rates.append(stress_rate)

    stress_rate_infos[dataset_procedure] = stress_rates

    if print_result:
      print(np.mean(stress_rates), total_label_count)

  return stress_rate_infos


In [None]:

def get_random_generated_index_for_augmented_samples_infos(param_all_data_infos, referenced_main_path = './', dataset_procedures = [], augmentation_procedures = [], augmentation_sample_amount_by_procedure = {}, expert_name=None):
  """Draw the random index arrays that steer the use of augmented samples.

  For every requested augmentation procedure and every dataset procedure an
  array is stored under the key ``'{dataset_procedure}--{augmentation_procedure}'``:

  * 'AP_2', 'AP_3': sample positions drawn without replacement. The number of
    draws is ``int(sample_count * mean stress rate)`` and a sample's
    probability is proportional to ``1 - stress rate``.
  * 'AP_5', 'AP_6', 'AP_7': one random integer in ``[0, amount)`` per sample (1-D).
  * 'AP_8', 'AP_9': two random integers in ``[0, amount)`` per sample (2-D).

  Uses the global NumPy random state, so results are only reproducible when
  the caller seeds ``np.random`` beforehand.

  NOTE(review): the dataset-gathering functions test ``data_idx in array`` for
  every 1-D array, which treats the AP_5..AP_7 values as sample positions -
  confirm that this is the intended meaning.

  Args:
    param_all_data_infos: mapping dataset procedure code -> list of sample descriptors.
    referenced_main_path: root folder handed on to get_stress_sample_rate.
    dataset_procedures: dataset procedure codes to process (read only).
    augmentation_procedures: augmentation procedure codes in use (read only).
    augmentation_sample_amount_by_procedure: augmentation code -> number of
      augmented variants; read for AP_5..AP_9 only.
    expert_name: expert column handed on to get_stress_sample_rate.

  Returns:
    dict mapping ``'{dataset_procedure}--{augmentation_procedure}'`` to a NumPy integer array.
  """
  probability_weight_infos = get_stress_sample_rate(param_all_data_infos, referenced_main_path, dataset_procedures, expert_name)

  index_infos = {}

  # The loop nesting (procedure first, dataset second) fixes the order of the
  # random draws and is kept so a seeded run yields the same arrays as before.
  for aug_procedure in ['AP_2', 'AP_3']:
    if aug_procedure not in augmentation_procedures:
      continue
    for dataset_procedure in dataset_procedures:
      stress_rates = np.asarray(probability_weight_infos[dataset_procedure], dtype=float)
      # Sample count comes from the argument (the global `all_data_infos` was used before).
      sample_count = len(param_all_data_infos[dataset_procedure])
      size_of_list = int(sample_count * np.mean(stress_rates))
      # Samples with a low stress rate are the more likely to be picked.
      pick_weights = 1 - stress_rates
      pick_weights = pick_weights / np.sum(pick_weights)
      index_infos[f'{dataset_procedure}--{aug_procedure}'] = np.random.choice(sample_count, size=size_of_list, p=pick_weights, replace=False)

  for aug_procedure in ['AP_5', 'AP_6', 'AP_7']:
    if aug_procedure not in augmentation_procedures:
      continue
    for dataset_procedure in dataset_procedures:
      sample_count = len(param_all_data_infos[dataset_procedure])
      index_infos[f'{dataset_procedure}--{aug_procedure}'] = np.random.randint(augmentation_sample_amount_by_procedure[aug_procedure], size=sample_count)

  for aug_procedure in ['AP_8', 'AP_9']:
    if aug_procedure not in augmentation_procedures:
      continue
    for dataset_procedure in dataset_procedures:
      sample_count = len(param_all_data_infos[dataset_procedure])
      index_infos[f'{dataset_procedure}--{aug_procedure}'] = np.random.randint(augmentation_sample_amount_by_procedure[aug_procedure], size=(sample_count, 2))

  return index_infos



In [None]:

def get_datasets_by_experment_procedures(param_all_data_infos, augmented_samples_index_info, augmentation_sample_amount_by_procedure, augmentation_procedures, dataset_procedures, referenced_main_path, referenced_main_processed_path, feature_selection_expert_procedure_info, metric_file_list = []):
  """Assemble one labelled feature table per (dataset, augmentation, feature-selection) procedure.

  For every sample the metric CSV files are loaded (the original files for
  index 0, the ``Aug-{procedure}-{idx}_`` prefixed files otherwise), cut to
  the common length of labels and metrics, joined column-wise onto the labels
  and flagged with an 'is_augmented' column. Augmented variants keep only
  their 'Stress' rows unless the random index info selects them to be used
  for both classes.

  Changes compared to the previous version:
    * samples are read from ``param_all_data_infos`` (the argument used to be
      ignored in favour of the notebook-global ``all_data_infos``);
    * frames are cut positionally with ``iloc``; the former label slice
      ``loc[0:n]`` is inclusive and kept ``n + 1`` rows, which could leave a
      trailing row whose features are NaN;
    * label files are read once per sample and expert instead of once per
      augmentation index, and the result is concatenated once per procedure.

  Args:
    param_all_data_infos: mapping dataset procedure code -> list of sample descriptors.
    augmented_samples_index_info: mapping ``'{dataset}--{augmentation}'`` ->
      NumPy index array. For a 2-D array, row ``data_idx`` lists the variant
      indices (0-based) used for both classes; otherwise the array is searched
      for ``data_idx``.
    augmentation_sample_amount_by_procedure: augmentation code -> number of augmented variants.
    augmentation_procedures: augmentation procedure codes to gather.
    dataset_procedures: dataset procedure codes to gather.
    referenced_main_path: root folder of the label files.
    referenced_main_processed_path: path prefix of the processed metric folders.
    feature_selection_expert_procedure_info: feature procedure code -> expert
      column name, or None for the majority label.
    metric_file_list: metric CSV file names that must all exist for a sample variant (read only).

  Returns:
    dict mapping ``'{dataset}--{augmentation}--{feature procedure}'`` to a
    DataFrame indexed 0..n-1 with 'Diagnose', the metric columns and 'is_augmented'.
  """
  result_all_metric_dataframe_infos = {}

  for augmentation_procedure in augmentation_procedures:
    # Index 0 stands for the original (non-augmented) files; only 'AP_1' reads them.
    start_idx = 0 if augmentation_procedure == 'AP_1' else 1

    for dataset_procedure in dataset_procedures:
      # Frames are collected per feature-selection procedure and concatenated once below.
      collected_frames = {feature_selection_procedure: [] for feature_selection_procedure in feature_selection_expert_procedure_info.keys()}

      for data_idx, data_infos in enumerate(tqdm(param_all_data_infos[dataset_procedure])):
        # Labels depend on sample and expert only, so they are cached across augmentation indices.
        labels_by_expert = {}

        for idx in range(start_idx, (augmentation_sample_amount_by_procedure[augmentation_procedure] + 1)):
          metric_dataframe_list = []
          not_found_metric_file_list = []
          for metric_file in metric_file_list:
            file_name = metric_file if idx == 0 else f'Aug-{augmentation_procedure}-{idx}_{metric_file}'
            metric_dataframe = get_metric_dataframes(data_infos, dataset_procedure, file_name, referenced_main_processed_path)
            if metric_dataframe is not None:
              metric_dataframe_list.append(metric_dataframe)
            else:
              not_found_metric_file_list.append(metric_file)

          # A variant is only usable when every requested metric file exists.
          if len(metric_dataframe_list) != len(metric_file_list):
            print('Requires number of feature metric not found in this sample: ', data_infos, idx, not_found_metric_file_list)
            continue

          for feature_selection_procedure, expert_column in feature_selection_expert_procedure_info.items():
            if expert_column not in labels_by_expert:
              labels_by_expert[expert_column] = get_labels_df(data_infos, referenced_main_path, expert_column)
            labels_df = labels_by_expert[expert_column]

            # Common number of rows shared by the labels and every metric frame.
            sample_metric_length = min([labels_df.shape[0]] + [metric_dataframe.shape[0] for metric_dataframe in metric_dataframe_list])

            sample_metric_dataframe = labels_df.iloc[:sample_metric_length]
            for metric_dataframe in metric_dataframe_list:
              sample_metric_dataframe = sample_metric_dataframe.join(metric_dataframe.iloc[:sample_metric_length])
              # Assigned inside the loop so the column keeps its place right after the first metric's columns.
              sample_metric_dataframe['is_augmented'] = np.full(sample_metric_dataframe.shape[0], idx > 0)

            if idx > 0:
              random_sample_index_list = augmented_samples_index_info[f'{dataset_procedure}--{augmentation_procedure}']
              if 2 == len(random_sample_index_list.shape):
                use_for_both_strest_and_nonstress_labeld_samples = ((idx - 1) in random_sample_index_list[data_idx])
              else:
                use_for_both_strest_and_nonstress_labeld_samples = (data_idx in random_sample_index_list)
              if not use_for_both_strest_and_nonstress_labeld_samples:
                sample_metric_dataframe = sample_metric_dataframe[sample_metric_dataframe['Diagnose'] == 'Stress']

            collected_frames[feature_selection_procedure].append(sample_metric_dataframe)

      for feature_selection_procedure in feature_selection_expert_procedure_info.keys():
        frames = collected_frames[feature_selection_procedure]
        all_metric_dataframe = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        print(f'Number of sample : {all_metric_dataframe.shape[0]}')

        result_all_metric_dataframe_infos[f'{dataset_procedure}--{augmentation_procedure}--{feature_selection_procedure}'] = all_metric_dataframe
        print(f"Gathering copmleted for {dataset_procedure} and {augmentation_procedure} procedures with output of expoer {feature_selection_procedure}!")


  print("Gathering copmleted for all procedures!")

  return result_all_metric_dataframe_infos



In [None]:

def get_datasets_by_diagnose_group_procedures(param_all_data_infos, augmented_samples_index_info, augmentation_sample_amount_by_procedure, augmentation_procedures, dataset_procedures, referenced_main_path, referenced_main_processed_path, feature_selection_diagnosed_group_procedure_info, metric_file_list = []):
  """Assemble one labelled feature table per (dataset, augmentation, diagnose-group) procedure.

  Works like the expert variant, but every feature-selection procedure
  restricts the samples to the listed diagnose groups (None keeps all
  samples) and labels are always the majority label of the experts.

  Changes compared to the previous version:
    * samples are read from ``param_all_data_infos`` (the argument used to be
      ignored in favour of the notebook-global ``all_data_infos``);
    * the random index info is looked up with the sample's position in the
      unfiltered sample list; the position inside the group-filtered sub-list
      was used before, although the index keys carry no feature-selection
      code and therefore refer to the whole list;
    * frames are cut positionally with ``iloc``; the former label slice
      ``loc[0:n]`` is inclusive and kept ``n + 1`` rows;
    * labels are read once per sample and the result is concatenated once per procedure.

  Args:
    param_all_data_infos: mapping dataset procedure code -> list of sample descriptors.
    augmented_samples_index_info: mapping ``'{dataset}--{augmentation}'`` ->
      NumPy index array. For a 2-D array, the sample's row lists the variant
      indices (0-based) used for both classes; otherwise the array is searched
      for the sample's position.
    augmentation_sample_amount_by_procedure: augmentation code -> number of augmented variants.
    augmentation_procedures: augmentation procedure codes to gather.
    dataset_procedures: dataset procedure codes to gather.
    referenced_main_path: root folder of the label files.
    referenced_main_processed_path: path prefix of the processed metric folders.
    feature_selection_diagnosed_group_procedure_info: feature procedure code ->
      list of diagnose group names, or None for no filter.
    metric_file_list: metric CSV file names that must all exist for a sample variant (read only).

  Returns:
    dict mapping ``'{dataset}--{augmentation}--{feature procedure}'`` to a
    DataFrame indexed 0..n-1 with 'Diagnose', the metric columns and 'is_augmented'.
  """
  result_all_metric_dataframe_infos = {}

  for augmentation_procedure in augmentation_procedures:
    # Index 0 stands for the original (non-augmented) files; only 'AP_1' reads them.
    start_idx = 0 if augmentation_procedure == 'AP_1' else 1

    for dataset_procedure in dataset_procedures:
      dataset_samples = param_all_data_infos[dataset_procedure]

      for feature_selection_procedure, diagnore_group_names in feature_selection_diagnosed_group_procedure_info.items():

        # Keep each sample's position in the full list next to it: the random
        # index arrays were generated for the unfiltered list.
        sub_data_infos = [
          (data_idx, data_infos)
          for data_idx, data_infos in enumerate(dataset_samples)
          if diagnore_group_names is None or data_infos['diagnose_result'] in diagnore_group_names
        ]
        print(diagnore_group_names, len(sub_data_infos))

        procedure_code_sequence = f'{dataset_procedure}--{augmentation_procedure}--{feature_selection_procedure}'
        collected_frames = []

        for data_idx, data_infos in tqdm(sub_data_infos, desc=f'Gathering dataste with procedure code sequence {procedure_code_sequence}'):
          # Labels depend on the sample only; they are read lazily once and reused.
          labels_df = None

          for idx in range(start_idx, (augmentation_sample_amount_by_procedure[augmentation_procedure] + 1)):
            metric_dataframe_list = []
            not_found_metric_file_list = []
            for metric_file in metric_file_list:
              file_name = metric_file if idx == 0 else f'Aug-{augmentation_procedure}-{idx}_{metric_file}'
              metric_dataframe = get_metric_dataframes(data_infos, dataset_procedure, file_name, referenced_main_processed_path)
              if metric_dataframe is not None:
                metric_dataframe_list.append(metric_dataframe)
              else:
                not_found_metric_file_list.append(metric_file)

            # A variant is only usable when every requested metric file exists.
            if len(metric_dataframe_list) != len(metric_file_list):
              print('Requires number of feature metric not found in this sample: ', data_infos, idx, not_found_metric_file_list)
              continue

            if labels_df is None:
              labels_df = get_labels_df(data_infos, referenced_main_path)

            # Common number of rows shared by the labels and every metric frame.
            sample_metric_length = min([labels_df.shape[0]] + [metric_dataframe.shape[0] for metric_dataframe in metric_dataframe_list])

            sample_metric_dataframe = labels_df.iloc[:sample_metric_length]
            for metric_dataframe in metric_dataframe_list:
              sample_metric_dataframe = sample_metric_dataframe.join(metric_dataframe.iloc[:sample_metric_length])
              # Assigned inside the loop so the column keeps its place right after the first metric's columns.
              sample_metric_dataframe['is_augmented'] = np.full(sample_metric_dataframe.shape[0], idx > 0)

            if idx > 0:
              random_sample_index_list = augmented_samples_index_info[f'{dataset_procedure}--{augmentation_procedure}']
              if 2 == len(random_sample_index_list.shape):
                use_for_both_strest_and_nonstress_labeld_samples = ((idx - 1) in random_sample_index_list[data_idx])
              else:
                use_for_both_strest_and_nonstress_labeld_samples = (data_idx in random_sample_index_list)
              if not use_for_both_strest_and_nonstress_labeld_samples:
                sample_metric_dataframe = sample_metric_dataframe[sample_metric_dataframe['Diagnose'] == 'Stress']

            collected_frames.append(sample_metric_dataframe)

        all_metric_dataframe = pd.concat(collected_frames, ignore_index=True) if collected_frames else pd.DataFrame()
        print(f'Number of sample : {all_metric_dataframe.shape[0]}')

        result_all_metric_dataframe_infos[procedure_code_sequence] = all_metric_dataframe
        print(f"Gathering copmleted for {dataset_procedure} and {augmentation_procedure} procedures with dianose group {feature_selection_procedure}!")


  print("Gathering copmleted for all procedures!")

  return result_all_metric_dataframe_infos



In [None]:

def get_all_metric_dataframe_as_batch(dt_procedure, fs_procedure, augmentation_procedure_batch_list, all_metric_dataframe_infos):
  """Stack the tables of several augmentation procedures into one batch table.

  Keys of ``all_metric_dataframe_infos`` have the form
  ``'{dataset}--{augmentation}--{feature procedure}'``. A table is included
  when its dataset and feature procedure codes equal ``dt_procedure`` and
  ``fs_procedure`` and its augmentation code is listed in
  ``augmentation_procedure_batch_list``.

  The codes are compared exactly. The previous substring test (``'FP_1' in key``)
  also matched 'FP_10'..'FP_14' (and 'DP_1' matched 'DP_10'), mixing the data
  of unrelated procedures into the batch.

  Args:
    dt_procedure: dataset procedure code, e.g. 'DP_1'.
    fs_procedure: feature-selection procedure code, e.g. 'FP_1'.
    augmentation_procedure_batch_list: augmentation procedure codes to combine.
    all_metric_dataframe_infos: mapping procedure key -> DataFrame.

  Returns:
    The matching tables concatenated in dictionary order and re-indexed
    0..n-1; an empty DataFrame when no key matches.
  """
  matching_frames = []
  for procedure_key, metric_dataframe in all_metric_dataframe_infos.items():
    key_parts = procedure_key.split('--')
    if len(key_parts) != 3:
      # Not a '{dataset}--{augmentation}--{feature}' key; it cannot belong to the batch.
      continue
    key_dataset, key_augmentation, key_feature = key_parts
    if key_dataset == dt_procedure and key_feature == fs_procedure and key_augmentation in augmentation_procedure_batch_list:
      matching_frames.append(metric_dataframe)

  if not matching_frames:
    return pd.DataFrame()
  return pd.concat(matching_frames, ignore_index=True)



In [None]:


def get_all_metric_with_na_droped_infos(result_all_metric_dataframe_infos, column_drop_na_ratio=0.40, row_drop_na_ratio=0.01):
  """Remove missing values from every procedure table.

  Per table, in this order:
    1. drop rows that are entirely NaN and the 'Unnamed: 0' column if present;
    2. drop rows with NaN in any column whose lower-cased name contains
       'eda_', 'scr_' or 'scl_';
    3. drop rows with NaN in any column whose lower-cased name contains 'st_';
    4. drop columns whose NaN count exceeds ``column_drop_na_ratio`` of the
       remaining rows;
    5. drop rows with NaN in any column whose NaN count exceeds
       ``row_drop_na_ratio`` of the remaining rows.
  Columns below the second threshold may still hold NaN afterwards. No
  imputation happens here; the old log message mentioning KNN was misleading
  and has been corrected.

  Args:
    result_all_metric_dataframe_infos: mapping procedure key -> DataFrame with
      string column names and a 'Diagnose' column. The input frames are not modified.
    column_drop_na_ratio: NaN share above which a column is removed (default 0.40, as before).
    row_drop_na_ratio: NaN share above which a column's NaN rows are removed (default 0.01, as before).

  Returns:
    dict mapping the same procedure keys to the cleaned DataFrames (original
    row labels are kept).
  """
  cleaned_infos = {}

  for procedure_key, metric_dataframe in result_all_metric_dataframe_infos.items():

    print(f'Drop na columns and rows if exist for {procedure_key} procedure')
    cleaned = metric_dataframe.dropna(how='all')
    if 'Unnamed: 0' in cleaned.columns:
      cleaned = cleaned.drop(columns=['Unnamed: 0'])

    # Rows without EDA-derived or ST-derived features are unusable and removed outright.
    eda_columns = [column_name for column_name in cleaned.columns if any(tag in column_name.lower() for tag in ('eda_', 'scr_', 'scl_'))]
    cleaned = cleaned.dropna(subset=eda_columns)
    st_columns = [column_name for column_name in cleaned.columns if 'st_' in column_name.lower()]
    cleaned = cleaned.dropna(subset=st_columns)

    # Columns that are mostly missing are removed as a whole.
    na_counts = cleaned.isna().sum().values
    sparse_column_names = [column_name for column_name, na_count in zip(cleaned.columns, na_counts) if na_count > len(cleaned) * column_drop_na_ratio]
    print(f"These columns have na values in more than {column_drop_na_ratio:.0%} of the rows and will be dropped : ", sparse_column_names)
    cleaned = cleaned.drop(columns=sparse_column_names)

    # For the remaining columns with a noticeable share of gaps, the affected rows are removed.
    na_counts = cleaned.isna().sum().values
    gap_column_names = [column_name for column_name, na_count in zip(cleaned.columns, na_counts) if na_count > len(cleaned) * row_drop_na_ratio]
    print(f"These columns have na values in more than {row_drop_na_ratio:.0%} of the rows and their na rows will be dropped : ", gap_column_names)
    cleaned = cleaned.dropna(subset=gap_column_names)

    # DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
    cleaned.info()
    cleaned_infos[procedure_key] = cleaned.copy()
    print(cleaned['Diagnose'].describe())

  return cleaned_infos



In [None]:
def balance_diagnose_classification_lables(result_all_metric_dataframe_with_na_droped_infos):
  """Undersample each table so 'Stress' and 'No Stress' rows are equally frequent.

  Augmented rows (``is_augmented`` true) and original rows are balanced
  independently: within each subset the larger class is randomly reduced
  (``random_state=42``) to the size of the smaller one. The output lists the
  balanced augmented rows first, followed by the balanced original rows; the
  original row labels are kept.

  Args:
    result_all_metric_dataframe_with_na_droped_infos: mapping procedure key ->
      DataFrame with the columns 'Diagnose' and 'is_augmented'.

  Returns:
    dict mapping the same procedure keys to the balanced DataFrames.
  """
  def class_parts(subset, stress_message, no_stress_message):
    # Returns the frames to stack for one subset: the reduced majority class
    # first, then the minority class (stress first when both are equal).
    stress_rows = subset[subset['Diagnose'] == 'Stress']
    stress_count = len(stress_rows)
    print(stress_message, stress_count)
    no_stress_rows = subset[subset['Diagnose'] == 'No Stress']
    no_stress_count = len(no_stress_rows)
    print(no_stress_message, no_stress_count)

    if stress_count > no_stress_count:
      return [stress_rows.sample(n=no_stress_count, random_state=42), no_stress_rows]
    if stress_count < no_stress_count:
      return [no_stress_rows.sample(n=stress_count, random_state=42), stress_rows]
    return [stress_rows, no_stress_rows]

  balanced_infos = {}

  for procedure_key, df_reference in result_all_metric_dataframe_with_na_droped_infos.items():

    print(f'Balance columns for Diagnose values for {procedure_key} procedure')

    augmented_rows = df_reference[df_reference['is_augmented']].copy()
    balanced_dataframe = pd.concat(class_parts(
      augmented_rows,
      'Number of "Stress" in augmented samples :',
      'Number of "No Stress" in augmented samples :',
    ))

    original_rows = df_reference[df_reference['is_augmented'] == False].copy()
    original_parts = class_parts(
      original_rows,
      'Number of "Stress" in original samples :',
      'Number of "No Stress" in original samples :',
    )
    balanced_dataframe = pd.concat([balanced_dataframe, *original_parts])

    balanced_infos[procedure_key] = balanced_dataframe.copy()

  return balanced_infos


In [None]:


def get_significant_columns_in_dataset_by_t_test(result_all_metric_dataframe_with_balanced_diagnose, p_treashold=0.05, only_orignial_data=False):
  """Find the feature columns that differ between 'Stress' and 'No Stress' (Welch's t-test).

  Every column except 'source_file', 'Seconds', 'Diagnose' and 'is_augmented'
  is converted to float64 (values pass through get_formatted_values and
  pd.to_numeric; unparsable values become NaN) and then tested with
  ``scipy.stats.ttest_ind(..., equal_var=False)``.

  Side effect (kept from the previous version): the conversion is written
  back into the DataFrames of the input dict.

  Changes compared to the previous version:
    * with ``only_orignial_data=True`` the whole column is converted and only
      the tested rows are filtered afterwards; before, the index-aligned
      assignment overwrote the augmented rows of the caller's frame with NaN
      and the test ran on unconverted values;
    * the log message uses ``p_treashold`` instead of a hard-coded 0.05;
    * a column is skipped unless both groups hold at least two values (the
      test yields NaN otherwise, which was never counted as significant).

  NOTE(review): a later notebook cell defines a function with this same name;
  whichever cell ran last is the one in effect.

  Args:
    result_all_metric_dataframe_with_balanced_diagnose: mapping procedure key
      -> DataFrame with a 'Diagnose' column (and 'is_augmented' when
      ``only_orignial_data`` is True).
    p_treashold: p-value below which a column counts as significant.
    only_orignial_data: test the non-augmented rows only.

  Returns:
    dict mapping each procedure key to the list of significant column names.
  """
  non_feature_columns = ('source_file', 'Seconds', 'Diagnose', 'is_augmented')
  result_significant_columns_infos = {}

  for all_metric_dataframe_procedure, metric_dataframe in result_all_metric_dataframe_with_balanced_diagnose.items():

    print(f'Find significant features in {all_metric_dataframe_procedure} procedure')

    # Positional boolean mask of the rows taking part in the test.
    if only_orignial_data:
      tested_rows = (metric_dataframe['is_augmented'] == False).to_numpy()
    else:
      tested_rows = np.ones(len(metric_dataframe), dtype=bool)

    significant_columns = []
    for all_feature_column in list(metric_dataframe.columns):
      if all_feature_column in non_feature_columns:
        continue

      # Convert the full column in place so no row of the caller's frame is lost.
      metric_dataframe[all_feature_column] = pd.to_numeric(
          metric_dataframe[all_feature_column].apply(lambda x: get_formatted_values(x)),
          errors='coerce'  # invalid values become NaN instead of raising
      ).astype(np.float64)

      tested_column = metric_dataframe.loc[tested_rows, all_feature_column]
      sub_metric_dataframe = metric_dataframe.loc[tested_rows, [all_feature_column, 'Diagnose']].dropna(subset=[all_feature_column])

      # https://www.statology.org/pandas-t-test/
      group1 = sub_metric_dataframe[sub_metric_dataframe['Diagnose'] == 'Stress'][all_feature_column]
      group2 = sub_metric_dataframe[sub_metric_dataframe['Diagnose'] == 'No Stress'][all_feature_column]

      # Welch's t-test needs at least two observations in each group.
      if len(group1) < 2 or len(group2) < 2:
        print(f"Skipping t-test for column '{all_feature_column}' due to insufficient data points after dropping NaNs.")
        continue

      # Welch's t-test (unequal variances).
      ttest_result = stats.ttest_ind(group1, group2, equal_var=False)
      is_significant = ttest_result.pvalue < p_treashold
      if is_significant:
        significant_columns.append(all_feature_column)

      print(f'Value has t-test with {ttest_result} and is {"" if is_significant else "not"} significatant ')
      print(f'Number of unique values in {all_feature_column} : {len(tested_column.unique())}' )
      print(tested_column.describe())

    result_significant_columns_infos[all_metric_dataframe_procedure] = significant_columns

  return result_significant_columns_infos



In [None]:
def get_significant_columns_in_dataset_by_t_test(result_all_metric_dataframe_with_balanced_diagnose, p_treashold=0.05, only_orignial_data=False):
  """Find, per procedure, the feature columns that separate 'Stress' from 'No Stress'.

  For every dataframe in the input mapping, each feature column is compared
  between the rows labelled 'Stress' and the rows labelled 'No Stress' with
  Welch's t-test (``equal_var=False``). A column is kept when its p-value is
  strictly below ``p_treashold``.

  Args:
    result_all_metric_dataframe_with_balanced_diagnose: dict mapping a procedure
      code to a DataFrame holding a 'Diagnose' column (and an 'is_augmented'
      column when ``only_orignial_data`` is True).
    p_treashold: significance level used for the selection and for the report.
    only_orignial_data: when True, only rows with ``is_augmented == False`` take
      part in the test and in the printed statistics.

  Returns:
    dict mapping every procedure code to the list of significant column names,
    in dataframe column order.
  """
  # Bookkeeping columns that are never tested.
  non_feature_columns = ('source_file', 'Seconds', 'Diagnose', 'is_augmented')

  result_significant_columns_infos = {}

  for all_metric_dataframe_procedure, metric_dataframe in result_all_metric_dataframe_with_balanced_diagnose.items():

    print(f'Find significant features in {all_metric_dataframe_procedure} procedure')

    if only_orignial_data:
      # The filtered frame is the one used for the t-test below; previously the
      # test silently fell back to the unfiltered frame.
      metric_dataframe = metric_dataframe[metric_dataframe['is_augmented'] == False]

    significant_columns = []

    for all_feature_column in metric_dataframe.columns:
      if all_feature_column in non_feature_columns:
        continue

      # Drop rows with NaN values in the current column
      sub_metric_dataframe = metric_dataframe[[all_feature_column, 'Diagnose']].dropna(subset=[all_feature_column])

      group1 = sub_metric_dataframe[sub_metric_dataframe['Diagnose'] == 'Stress']
      group2 = sub_metric_dataframe[sub_metric_dataframe['Diagnose'] == 'No Stress']

      # https://www.statology.org/pandas-t-test/
      # Welch's t-test needs a variance estimate per group, i.e. at least 2 points in each.
      if len(group1) < 2 or len(group2) < 2:
        print(f"Skipping t-test for column '{all_feature_column}' due to insufficient data points after dropping NaNs.")
        continue

      # perform Welch's t-test
      ttest_result = stats.ttest_ind(group1[all_feature_column], group2[all_feature_column], equal_var=False)

      # A NaN p-value (e.g. constant columns) compares False and is reported as not significant.
      is_significant = ttest_result.pvalue < p_treashold
      if is_significant:
        significant_columns.append(all_feature_column)

      print(f'Value has t-test with {ttest_result} and is {"" if is_significant else "not"} significatant ')
      print(f'Number of unique values in {all_feature_column} : {len(metric_dataframe[all_feature_column].unique())}' )
      print(metric_dataframe[all_feature_column].describe())

    result_significant_columns_infos[all_metric_dataframe_procedure] = significant_columns

  return result_significant_columns_infos

In [None]:

def get_dataset_with_significant_columns_by_t_test(result_significant_columns_infos, result_all_metric_dataframe_with_balanced_diagnose):
  """Reduce every procedure dataframe to its significant columns plus the label columns.

  Args:
    result_significant_columns_infos: dict mapping a procedure code to the list
      of column names to keep for that procedure.
    result_all_metric_dataframe_with_balanced_diagnose: dict mapping a procedure
      code to its DataFrame; each frame must contain 'Diagnose' and 'is_augmented'.

  Returns:
    dict with the same procedure codes, each holding an independent DataFrame
    made of the selected columns followed by 'Diagnose' and 'is_augmented'.
  """
  reduced_frames_by_procedure = {}

  for procedure_code in result_all_metric_dataframe_with_balanced_diagnose.keys():
    source_frame = result_all_metric_dataframe_with_balanced_diagnose[procedure_code]

    # Work on a copy so the caller's frame is never touched by the column assignments below.
    reduced_frame = source_frame.copy()[result_significant_columns_infos[procedure_code]]
    for label_column in ('Diagnose', 'is_augmented'):
      reduced_frame[label_column] = source_frame[label_column]

    reduced_frames_by_procedure[procedure_code] = reduced_frame.copy()

  return reduced_frames_by_procedure


In [None]:

def display_correlation_coefficient_matrix(df_procedure_infos):
  """Draw one correlation-coefficient heatmap per procedure dataframe.

  Args:
    df_procedure_infos: dict mapping a procedure code to a DataFrame. Only the
      numeric columns of each frame take part in the correlation.

  Returns:
    None. Every figure is shown and then closed.
  """
  # from cite https://stackoverflow.com/questions/38913965/make-the-size-of-a-heatmap-bigger-with-seaborn , https://stackoverflow.com/questions/67151854/multiple-seaborn-heatmaps-from-pandas-dataframe
  for df_procedure, df in df_procedure_infos.items():
    # Label columns such as 'Diagnose' are not numeric and cannot be correlated.
    correlation_matrix = df.select_dtypes(include='number').corr()

    if correlation_matrix.empty:
      print(f'No numeric columns to correlate in {df_procedure}')
      continue

    # Grow the canvas with the number of features so the cells stay readable.
    figure_side = max(8, 0.5 * len(correlation_matrix.columns))
    fig, ax = plt.subplots(figsize=(figure_side, figure_side))
    sns.heatmap(correlation_matrix, vmin=-1, vmax=1, cmap='coolwarm', square=True, ax=ax)
    ax.set_title(f'Correlation coefficient matrix - {df_procedure}')
    plt.show()
    # Release the figure; many procedures would otherwise pile up in memory.
    plt.close(fig)
  



In [None]:

def export_dataset_as_csv(df_procedure_infos, prefix_keyword='-'):
  """Write every procedure dataframe to a CSV file under MAIN_PATH.

  The target is '<MAIN_PATH>/all_feature_metrics_dataset<prefix_keyword>-<procedure>.csv'.
  A file already present at that path is first renamed with a timestamp suffix,
  so earlier exports are kept.

  Args:
    df_procedure_infos: dict mapping a procedure code to the DataFrame to export.
    prefix_keyword: text inserted between the base file name and the procedure code.
  """
  for procedure_code, frame in df_procedure_infos.items():
    target_path = f'{MAIN_PATH}/all_feature_metrics_dataset{prefix_keyword}-{procedure_code}.csv'

    previous_export_kept = os.path.exists(target_path)
    if previous_export_kept:
      # Move the old export aside instead of overwriting it.
      timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H-%M-%S")
      backup_path = target_path.replace('.csv', f'-{timestamp}.csv')
      os.rename(target_path, backup_path)
      print('Renamed file as ', backup_path)

    frame.to_csv(target_path, index=True)

    if previous_export_kept:
      print('Recreated file as ', target_path)
    else:
      print('Created file as ', target_path)



### Physiological Metric Collecting

### Use all and separate diagnose groups and all augmented values

In [None]:
# Start this section with an empty registry; later cells fill it with one dataframe per
# '<dataset>--<augmentation>--<feature>' procedure code.
all_metric_dataframe_infos = {}

In [None]:

# Build the random index info for the augmented samples and report the shape of every entry.
# NOTE(review): the expert section further down calls the same helper without the
# AUGMENTATION_PROCEDURES argument - confirm which signature is current.
random_generated_index_for_augmented_samples_info = get_random_generated_index_for_augmented_samples_infos(
    all_data_infos,
    MAIN_PATH,
    DATASET_PROCEDURES,
    AUGMENTATION_PROCEDURES,
    AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE,
)

for index_info_key, index_info in random_generated_index_for_augmented_samples_info.items():
  print(index_info_key, index_info.shape)


In [None]:
# Temporary single-combination run: DP_2 dataset, AP_1 augmentation, FP_5 diagnose group.
# The group selection is read from DIAGNOSE_GROUP_SELECTION_PROCEDURE so it has the same
# list-of-group-names form that the full run passes to this helper (a bare string was
# passed here before).
tmp_all_metric_dataframe_infos = get_datasets_by_diagnose_group_procedures(
    all_data_infos,
    random_generated_index_for_augmented_samples_info,
    AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE,
    ['AP_1'],
    ['DP_2'],
    MAIN_PATH,
    MAIN_PROCESSED_PATH,
    {'FP_5': DIAGNOSE_GROUP_SELECTION_PROCEDURE['FP_5']},
    allowed_files,
)

# Summary of the class labels of the generated dataset.
tmp_all_metric_dataframe_infos['DP_2--AP_1--FP_5']['Diagnose'].describe()

In [None]:
# Run the temporary datasets through get_all_metric_with_na_droped_infos, then show the
# summary of the 'Diagnose' labels that remain for DP_2--AP_1--FP_5.
tmp_all_metric_dataframe_with_na_droped_infos = get_all_metric_with_na_droped_infos(tmp_all_metric_dataframe_infos)
tmp_all_metric_dataframe_with_na_droped_infos['DP_2--AP_1--FP_5']['Diagnose'].describe()

In [None]:
# Pass every feature column of the temporary frames through get_formatted_values and store it
# as float64. The bookkeeping columns are left untouched. The frames are modified in place.
tmp_all_metric_dataframe_with_na_droped_infos_copy_keys = list(tmp_all_metric_dataframe_with_na_droped_infos.keys())

for tmp_procedure_key in tmp_all_metric_dataframe_with_na_droped_infos_copy_keys:
  tmp_frame = tmp_all_metric_dataframe_with_na_droped_infos[tmp_procedure_key]

  # Snapshot of the column labels taken before any column is reassigned.
  df_columns = list(tmp_frame.columns)

  for df_column in df_columns:
    if df_column in ['source_file', 'Seconds', 'Diagnose', 'is_augmented']:
      continue
    tmp_frame[df_column] = tmp_frame[df_column].apply(get_formatted_values).astype(np.float64)


In [None]:
# Store a copy of the temporary FP_5 frame under its procedure code in the NA-dropped registry.
# NOTE(review): all_metric_dataframe_with_na_droped_infos is not assigned earlier in the visible
# part of this section and is rebuilt by a later cell, so on a top-to-bottom run this line may
# fail or have its effect overwritten - confirm the intended execution order.
all_metric_dataframe_with_na_droped_infos['DP_2--AP_1--FP_5'] = tmp_all_metric_dataframe_with_na_droped_infos['DP_2--AP_1--FP_5'].copy()

In [None]:

# Every diagnose-group selection except FP_1 (FP_1 is derived from FP_2 in a later cell).
sub_DIAGNOSE_GROUP_SELECTION_PROCEDURE = {
    feature_procedure: diagnose_groups
    for feature_procedure, diagnose_groups in DIAGNOSE_GROUP_SELECTION_PROCEDURE.items()
    if feature_procedure != 'FP_1'
}

# Generate the datasets one (dataset, augmentation) pair at a time and collect them in the registry.
for DATASET_PROCEDURE in DATASET_PROCEDURES:
  for AUGMENTATION_PROCEDURE in ['AP_1', 'AP_2', 'AP_3', 'AP_5']:  # list(AUGMENTATION_PROCEDURES):
    tmp_all_metric_dataframe_infos = get_datasets_by_diagnose_group_procedures(
        all_data_infos,
        random_generated_index_for_augmented_samples_info,
        AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE,
        [AUGMENTATION_PROCEDURE],
        [DATASET_PROCEDURE],
        MAIN_PATH,
        MAIN_PROCESSED_PATH,
        sub_DIAGNOSE_GROUP_SELECTION_PROCEDURE,
        allowed_files,
    )

    all_metric_dataframe_infos.update(tmp_all_metric_dataframe_infos)


In [None]:

# FP_1 reuses the FP_2 dataset of the same dataset/augmentation combination.
# AUGMENTATION_PROCEDURES also lists codes whose datasets are only loaded from CSV by a
# later cell, so combinations that are not in the registry yet are skipped instead of
# raising a KeyError.
for DATASET_PROCEDURE in DATASET_PROCEDURES:

  for AUGMENTATION_PROCEDURE in AUGMENTATION_PROCEDURES:

    fp_2_procedure_code = f'{DATASET_PROCEDURE}--{AUGMENTATION_PROCEDURE}--FP_2'
    if fp_2_procedure_code not in all_metric_dataframe_infos:
      print(f'Skipping FP_1 copy: {fp_2_procedure_code} is not loaded')
      continue

    all_metric_dataframe_infos[f'{DATASET_PROCEDURE}--{AUGMENTATION_PROCEDURE}--FP_1'] = all_metric_dataframe_infos[fp_2_procedure_code].copy()



In [None]:
# Report the shape of every dataset collected so far.
for procedure_code, metric_frame in all_metric_dataframe_infos.items():
  print(procedure_code, metric_frame.shape)

In [None]:
# Write every collected dataset to CSV using the default file-name prefix.
export_dataset_as_csv(all_metric_dataframe_infos)


In [None]:
# List the dataset directory, hiding lines that contain '2025-01-' and lines that match
# 'all_feature_metrics_dataset-' as a whole word.
!ls -al '/content/drive/MyDrive/BitirmeProje/datasets2/AKTIVES Veri Seti/AKTIVES Veri Seti/' | grep -ve '2025-01-' | grep -vw 'all_feature_metrics_dataset-'

In [None]:

# Load the previously exported datasets of the AP_6..AP_9 augmentations from CSV
# and add them to the registry.
for DATASET_PROCEDURE in DATASET_PROCEDURES:
  for AUGMENTATION_PROCEDURE in ['AP_6', 'AP_7', 'AP_8', 'AP_9']:
    for FEATURE_PROCEDURE in DIAGNOSE_GROUP_SELECTION_PROCEDURE.keys():
      procedure_code_composution = '--'.join([DATASET_PROCEDURE, AUGMENTATION_PROCEDURE, FEATURE_PROCEDURE])
      pd_read_path = f'{MAIN_PATH}/all_feature_metrics_dataset-{procedure_code_composution}.csv'
      print('Read csv', pd_read_path)

      df_sample = pd.read_csv(pd_read_path)
      all_metric_dataframe_infos[procedure_code_composution] = df_sample



---------------------------------------------------------------------

In [None]:
# Report the shape of every dataset in the registry, including the ones loaded from CSV.
for procedure_code, metric_frame in all_metric_dataframe_infos.items():
  print(procedure_code, metric_frame.shape)

In [None]:
# For every augmentation batch, sum the row counts of its member datasets per
# '<dataset>--<feature>' combination and print the totals.
for AUGMENTATION_BATCH_CODE, aug_code_list in AUGMENTATION_BATCH_AMOUNT_BY_PROCEDURE.items():
  # Every combination starts at zero so that combinations without data are still listed.
  total_dataset_info = {}
  for DIAGNOSE_GROUP_PROCEDURE in DIAGNOSE_GROUP_SELECTION_PROCEDURE.keys():
    for DATASET_PROCEDURE in DATASET_PROCEDURES:
      total_dataset_info[f'{DATASET_PROCEDURE}--{DIAGNOSE_GROUP_PROCEDURE}'] = 0

  print(AUGMENTATION_BATCH_CODE)

  for procedure_code, metric_frame in all_metric_dataframe_infos.items():
    # Procedure codes are '<dataset>--<augmentation>--<feature>'.
    code_parts = procedure_code.split('--')
    if code_parts[1] not in aug_code_list:
      continue
    print(procedure_code, metric_frame.shape)
    total_dataset_info[f'{code_parts[0]}--{code_parts[2]}'] += metric_frame.shape[0]

  print(total_dataset_info)


In [None]:

# Build one batch dataset per (dataset, augmentation batch, feature procedure) combination.
all_metric_dataframe_as_batch_info = {}

for DATASET_PROCEDURE in DATASET_PROCEDURES:
  for AUGMENTATION_BATCH_CODE, batch_augmentation_codes in AUGMENTATION_BATCH_AMOUNT_BY_PROCEDURE.items():
    for FEATURE_PROCEDURE in DIAGNOSE_GROUP_SELECTION_PROCEDURE.keys():
      all_metric_dataframe_procedure = f'{DATASET_PROCEDURE}--{AUGMENTATION_BATCH_CODE}--{FEATURE_PROCEDURE}'

      batch_frame = get_all_metric_dataframe_as_batch(
          DATASET_PROCEDURE,
          FEATURE_PROCEDURE,
          batch_augmentation_codes,
          all_metric_dataframe_infos,
      )
      all_metric_dataframe_as_batch_info[all_metric_dataframe_procedure] = batch_frame
      print(all_metric_dataframe_procedure, batch_frame.shape)




In [None]:

# Run the per-procedure datasets through get_all_metric_with_na_droped_infos and report the resulting shapes.
all_metric_dataframe_with_na_droped_infos = get_all_metric_with_na_droped_infos(all_metric_dataframe_infos)

for procedure_code, na_droped_frame in all_metric_dataframe_with_na_droped_infos.items():
  print(procedure_code, na_droped_frame.shape)

In [None]:

# Same step for the batch datasets.
all_metric_dataframe_as_batch_with_na_droped_infos = get_all_metric_with_na_droped_infos(all_metric_dataframe_as_batch_info)

for procedure_code, na_droped_batch_frame in all_metric_dataframe_as_batch_with_na_droped_infos.items():
  print(procedure_code, na_droped_batch_frame.shape)

In [None]:
# Pass every feature column of the NA-dropped frames through get_formatted_values and store it
# as float64. The bookkeeping columns are left untouched. The frames are modified in place.
all_metric_dataframe_with_na_droped_infos_copy_keys = list(all_metric_dataframe_with_na_droped_infos.keys())

for procedure_code in all_metric_dataframe_with_na_droped_infos_copy_keys:
  na_droped_frame = all_metric_dataframe_with_na_droped_infos[procedure_code]

  # Snapshot of the column labels taken before any column is reassigned.
  df_columns = list(na_droped_frame.columns)

  for df_column in df_columns:
    if df_column in ['source_file', 'Seconds', 'Diagnose', 'is_augmented']:
      continue
    na_droped_frame[df_column] = na_droped_frame[df_column].apply(get_formatted_values).astype(np.float64)


In [None]:
# Same float64 conversion for the batch frames; bookkeeping columns are left untouched.
all_metric_dataframe_as_batch_with_na_droped_infos_copy_keys = list(all_metric_dataframe_as_batch_with_na_droped_infos.keys())

for procedure_code in all_metric_dataframe_as_batch_with_na_droped_infos_copy_keys:
  na_droped_batch_frame = all_metric_dataframe_as_batch_with_na_droped_infos[procedure_code]

  # Snapshot of the column labels taken before any column is reassigned.
  df_columns = list(na_droped_batch_frame.columns)

  for df_column in df_columns:
    if df_column in ['source_file', 'Seconds', 'Diagnose', 'is_augmented']:
      continue
    na_droped_batch_frame[df_column] = na_droped_batch_frame[df_column].apply(get_formatted_values).astype(np.float64)


In [None]:

# Export the FP_1 datasets (single-procedure ones first, then the batch ones) before balancing.
exported_dataset_infos = {}

for na_droped_infos in (all_metric_dataframe_with_na_droped_infos, all_metric_dataframe_as_batch_with_na_droped_infos):
  for procedure_code, na_droped_frame in na_droped_infos.items():
    if procedure_code.endswith('-FP_1'):
      exported_dataset_infos[procedure_code] = na_droped_frame

export_dataset_as_csv(exported_dataset_infos, '-non_balanced')


In [None]:
# Split the non-FP_1 datasets into the AP_1 ones and the rest, copying each frame,
# then run the t-test based column selection on both groups.
non_F1_only_original_all_metric_dataframe_with_na_droped_infos = {}
non_F1_all_metric_dataframe_with_na_droped_infos = {}

for procedure_code, na_droped_frame in all_metric_dataframe_with_na_droped_infos.items():
  if procedure_code.endswith('-FP_1'):
    continue
  if 'AP_1' in procedure_code:
    target_infos = non_F1_only_original_all_metric_dataframe_with_na_droped_infos
  else:
    target_infos = non_F1_all_metric_dataframe_with_na_droped_infos
  target_infos[procedure_code] = na_droped_frame.copy()

significant_columns_only_original_infos = get_significant_columns_in_dataset_by_t_test(
    non_F1_only_original_all_metric_dataframe_with_na_droped_infos,
    only_orignial_data=True,
)

for procedure_code, significant_columns in significant_columns_only_original_infos.items():
  print(procedure_code, len(significant_columns))

significant_columns_infos = get_significant_columns_in_dataset_by_t_test(non_F1_all_metric_dataframe_with_na_droped_infos)

for procedure_code, significant_columns in significant_columns_infos.items():
  print(procedure_code, len(significant_columns))


In [None]:
# Non-FP_1 batch datasets (same frame objects, not copies) and their significant columns,
# computed from the non-augmented rows only.
non_F1_all_metric_dataframe_as_batch_with_na_droped_infos = {}

for procedure_code, na_droped_batch_frame in all_metric_dataframe_as_batch_with_na_droped_infos.items():
  if procedure_code.endswith('-FP_1'):
    continue
  non_F1_all_metric_dataframe_as_batch_with_na_droped_infos[procedure_code] = na_droped_batch_frame

significant_columns_as_batch_infos = get_significant_columns_in_dataset_by_t_test(
    non_F1_all_metric_dataframe_as_batch_with_na_droped_infos,
    only_orignial_data=True,
)

for procedure_code, significant_columns in significant_columns_as_batch_infos.items():
  print(procedure_code, len(significant_columns))


In [None]:

# Keep only the significant columns (plus labels) of the non-AP_1 datasets and report the shapes.
all_metric_dataframe_with_na_droped_and_significant_features_infos = get_dataset_with_significant_columns_by_t_test(
    significant_columns_infos,
    non_F1_all_metric_dataframe_with_na_droped_infos,
)

for procedure_code, significant_frame in all_metric_dataframe_with_na_droped_and_significant_features_infos.items():
  print(procedure_code, significant_frame.shape)


In [None]:

# Keep only the significant columns (plus labels) of the AP_1 datasets and report the shapes.
all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos = get_dataset_with_significant_columns_by_t_test(
    significant_columns_only_original_infos,
    non_F1_only_original_all_metric_dataframe_with_na_droped_infos,
)

for procedure_code, significant_frame in all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos.items():
  print(procedure_code, significant_frame.shape)


In [None]:

# Keep only the significant columns (plus labels) of the batch datasets and report the shapes.
all_metric_dataframe_as_batch_with_na_droped_and_significant_features_infos = get_dataset_with_significant_columns_by_t_test(
    significant_columns_as_batch_infos,
    non_F1_all_metric_dataframe_as_batch_with_na_droped_infos,
)

for procedure_code, significant_frame in all_metric_dataframe_as_batch_with_na_droped_and_significant_features_infos.items():
  print(procedure_code, significant_frame.shape)


In [None]:

# Export the three significant-feature dataset families with the '-non_balanced' file prefix.
for significant_feature_infos in (
    all_metric_dataframe_as_batch_with_na_droped_and_significant_features_infos,
    all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos,
    all_metric_dataframe_with_na_droped_and_significant_features_infos,
):
  export_dataset_as_csv(significant_feature_infos, '-non_balanced')


In [None]:

# Run the per-procedure datasets through balance_diagnose_classification_lables and report the shapes.
all_metric_dataframe_with_balanced_diagnose = balance_diagnose_classification_lables(all_metric_dataframe_with_na_droped_infos)

for procedure_code, balanced_frame in all_metric_dataframe_with_balanced_diagnose.items():
  print(procedure_code, balanced_frame.shape)



In [None]:

# Run the batch datasets through balance_diagnose_classification_lables and report the shapes.
all_metric_dataframe_as_batch_with_balanced_diagnose = balance_diagnose_classification_lables(all_metric_dataframe_as_batch_with_na_droped_infos)

for procedure_code, balanced_batch_frame in all_metric_dataframe_as_batch_with_balanced_diagnose.items():
  print(procedure_code, balanced_batch_frame.shape)



In [None]:

# Print the number of missing values per column for every balanced dataset.
for procedure_code, balanced_frame in all_metric_dataframe_with_balanced_diagnose.items():
  df_columns = balanced_frame.columns
  na_count_by_column = dict(zip(df_columns, balanced_frame.isna().sum()))
  print(procedure_code, na_count_by_column)


In [None]:

# Print the number of missing values per column for every balanced batch dataset.
for procedure_code, balanced_batch_frame in all_metric_dataframe_as_batch_with_balanced_diagnose.items():
  df_columns = balanced_batch_frame.columns
  na_count_by_column = dict(zip(df_columns, balanced_batch_frame.isna().sum()))
  print(procedure_code, na_count_by_column)


In [None]:

# Export the balanced FP_1 datasets (single-procedure ones first, then the batch ones).
exported_dataset_infos = {}

for balanced_infos in (all_metric_dataframe_with_balanced_diagnose, all_metric_dataframe_as_batch_with_balanced_diagnose):
  for procedure_code, balanced_frame in balanced_infos.items():
    if procedure_code.endswith('-FP_1'):
      exported_dataset_infos[procedure_code] = balanced_frame

export_dataset_as_csv(exported_dataset_infos, '-balanced')


In [None]:
# Split the balanced non-FP_1 datasets into the AP_1 ones and the rest, copying each frame,
# then run the t-test based column selection on both groups.
non_F1_only_original_all_metric_dataframe_with_balanced_diagnose = {}
non_F1_all_metric_dataframe_with_balanced_diagnose = {}

for procedure_code, balanced_frame in all_metric_dataframe_with_balanced_diagnose.items():
  if procedure_code.endswith('-FP_1'):
    continue
  if 'AP_1' in procedure_code:
    target_infos = non_F1_only_original_all_metric_dataframe_with_balanced_diagnose
  else:
    target_infos = non_F1_all_metric_dataframe_with_balanced_diagnose
  target_infos[procedure_code] = balanced_frame.copy()

significant_columns_only_original_infos = get_significant_columns_in_dataset_by_t_test(
    non_F1_only_original_all_metric_dataframe_with_balanced_diagnose,
    only_orignial_data=True,
)

for procedure_code, significant_columns in significant_columns_only_original_infos.items():
  print(procedure_code, len(significant_columns))

significant_columns_infos = get_significant_columns_in_dataset_by_t_test(non_F1_all_metric_dataframe_with_balanced_diagnose)

for procedure_code, significant_columns in significant_columns_infos.items():
  print(procedure_code, len(significant_columns))


In [None]:
# Balanced non-FP_1 batch datasets (same frame objects, not copies) and their significant
# columns, computed from the non-augmented rows only.
non_F1_all_metric_dataframe_as_batch_with_balanced_diagnose = {}

for procedure_code, balanced_batch_frame in all_metric_dataframe_as_batch_with_balanced_diagnose.items():
  if procedure_code.endswith('-FP_1'):
    continue
  non_F1_all_metric_dataframe_as_batch_with_balanced_diagnose[procedure_code] = balanced_batch_frame

significant_columns_as_batch_infos = get_significant_columns_in_dataset_by_t_test(
    non_F1_all_metric_dataframe_as_batch_with_balanced_diagnose,
    only_orignial_data=True,
)

for procedure_code, significant_columns in significant_columns_as_batch_infos.items():
  print(procedure_code, len(significant_columns))


In [None]:

# Keep only the significant columns (plus labels) of the balanced non-AP_1 datasets and report the shapes.
all_metric_dataframe_with_na_droped_and_significant_features_infos = get_dataset_with_significant_columns_by_t_test(
    significant_columns_infos,
    non_F1_all_metric_dataframe_with_balanced_diagnose,
)

for procedure_code, significant_frame in all_metric_dataframe_with_na_droped_and_significant_features_infos.items():
  print(procedure_code, significant_frame.shape)


In [None]:

# Keep only the significant columns (plus labels) of the balanced AP_1 datasets and report the shapes.
all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos = get_dataset_with_significant_columns_by_t_test(
    significant_columns_only_original_infos,
    non_F1_only_original_all_metric_dataframe_with_balanced_diagnose,
)

for procedure_code, significant_frame in all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos.items():
  print(procedure_code, significant_frame.shape)


In [None]:

# Keep only the significant columns (plus labels) of the balanced batch datasets and report the shapes.
all_metric_dataframe_as_batch_with_na_droped_and_significant_features_infos = get_dataset_with_significant_columns_by_t_test(
    significant_columns_as_batch_infos,
    non_F1_all_metric_dataframe_as_batch_with_balanced_diagnose,
)

for procedure_code, significant_frame in all_metric_dataframe_as_batch_with_na_droped_and_significant_features_infos.items():
  print(procedure_code, significant_frame.shape)


In [None]:

# Print the number of missing values per column for every significant-feature dataset.
for procedure_code, significant_frame in all_metric_dataframe_with_na_droped_and_significant_features_infos.items():
  df_columns = significant_frame.columns
  na_count_by_column = dict(zip(df_columns, significant_frame.isna().sum()))
  print(procedure_code, na_count_by_column)


In [None]:

# Print the number of missing values per column for every AP_1 significant-feature dataset.
for procedure_code, significant_frame in all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos.items():
  df_columns = significant_frame.columns
  na_count_by_column = dict(zip(df_columns, significant_frame.isna().sum()))
  print(procedure_code, na_count_by_column)


In [None]:

# Correlation view of the significant-feature datasets built from the non-AP_1 procedures.
display_correlation_coefficient_matrix(all_metric_dataframe_with_na_droped_and_significant_features_infos)


In [None]:

# Correlation view of the significant-feature datasets built from the AP_1 procedures.
display_correlation_coefficient_matrix(all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos)


In [None]:

# Correlation view of the significant-feature datasets built from the augmentation batches.
display_correlation_coefficient_matrix(all_metric_dataframe_as_batch_with_na_droped_and_significant_features_infos)


In [None]:

# Export the three balanced significant-feature dataset families with the '-balanced' file prefix.
for significant_feature_infos in (
    all_metric_dataframe_with_na_droped_and_significant_features_infos,
    all_metric_dataframe_only_original_with_na_droped_and_significant_features_infos,
    all_metric_dataframe_as_batch_with_na_droped_and_significant_features_infos,
):
  export_dataset_as_csv(significant_feature_infos, '-balanced')


### Use all experts or separate experts and all augmented values with weighted probabilities (AP_ALL, AP_ALL_2, AP_ALL_3)


In [None]:
# Reset the registry: the expert-based section rebuilds every dataset from scratch.
all_metric_dataframe_infos = {}

By All Experts

In [None]:
# No expert is selected (None); the return value is discarded.
# NOTE(review): the trailing True is presumably a verbosity/print switch - confirm against get_stress_sample_rate.
_ = get_stress_sample_rate(all_data_infos, MAIN_PATH, DATASET_PROCEDURES, None, True)

By Expert1

In [None]:
# Same call restricted to 'Expert1'; the return value is discarded.
_ = get_stress_sample_rate(all_data_infos, MAIN_PATH, DATASET_PROCEDURES, 'Expert1', True)

By Expert2

In [None]:
# Same call restricted to 'Expert2'; the return value is discarded.
_ = get_stress_sample_rate(all_data_infos, MAIN_PATH, DATASET_PROCEDURES, 'Expert2', True)

By Expert3

In [None]:
# Same call restricted to 'Expert3'; the return value is discarded.
_ = get_stress_sample_rate(all_data_infos, MAIN_PATH, DATASET_PROCEDURES, 'Expert3', True)

In [None]:

# Rebuild the random index info for the augmented samples and report the shape of every entry.
# NOTE(review): the diagnose-group section passes AUGMENTATION_PROCEDURES as an extra fourth
# argument to this helper - confirm which signature is current.
random_generated_index_for_augmented_samples_info = get_random_generated_index_for_augmented_samples_infos(
    all_data_infos,
    MAIN_PATH,
    DATASET_PROCEDURES,
    AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE,
)

for index_info_key, index_info in random_generated_index_for_augmented_samples_info.items():
  print(index_info_key, index_info.shape)


In [None]:

# Generate the expert-based datasets one (dataset, augmentation) pair at a time
# and collect them in the registry.
for DATASET_PROCEDURE in DATASET_PROCEDURES:
  for AUGMENTATION_PROCEDURE in list(AUGMENTATION_PROCEDURES):
    tmp_all_metric_dataframe_infos = get_datasets_by_experment_procedures(
        all_data_infos,
        random_generated_index_for_augmented_samples_info,
        AUGMENTATION_SAMPLE_AMOUNT_BY_PROCEDURE,
        [AUGMENTATION_PROCEDURE],
        [DATASET_PROCEDURE],
        MAIN_PATH,
        MAIN_PROCESSED_PATH,
        EXPERT_SELECTION_PROCEDURE,
        allowed_files,
    )

    all_metric_dataframe_infos.update(tmp_all_metric_dataframe_infos)


In [None]:

# One batch result per (dataset procedure, augmentation batch, feature procedure)
# combination, keyed as '<dataset>--<batch code>--<feature>'.
all_metric_dataframe_as_batch_info = {}

for DATASET_PROCEDURE in DATASET_PROCEDURES:
  for AUGMENTATION_BATCH_CODE, batch_augmentation_procedures in AUGMENTATION_BATCH_AMOUNT_BY_PROCEDURE.items():
    for FEATURE_PROCEDURE in FEATURE_PROCEDURES:
      all_metric_dataframe_procedure = f'{DATASET_PROCEDURE}--{AUGMENTATION_BATCH_CODE}--{FEATURE_PROCEDURE}'

      batch_result = get_all_metric_dataframe_as_batch(
          DATASET_PROCEDURE,
          FEATURE_PROCEDURE,
          batch_augmentation_procedures,
          all_metric_dataframe_infos,
      )
      all_metric_dataframe_as_batch_info[all_metric_dataframe_procedure] = batch_result

      print(all_metric_dataframe_procedure, batch_result.shape)




In [None]:

# Run the batch results through get_all_metric_with_na_droped_infos and report
# the shape of each returned entry.
all_metric_dataframe_with_na_droped_infos = get_all_metric_with_na_droped_infos(
    all_metric_dataframe_as_batch_info
)

for na_droped_key, na_droped_frame in all_metric_dataframe_with_na_droped_infos.items():
  print(na_droped_key, na_droped_frame.shape)

In [None]:

# Run the NA-dropped results through balance_diagnose_classification_lables and
# report the shape of each returned entry.
all_metric_dataframe_with_balanced_diagnose = balance_diagnose_classification_lables(
    all_metric_dataframe_with_na_droped_infos
)

for balanced_key, balanced_frame in all_metric_dataframe_with_balanced_diagnose.items():
  print(balanced_key, balanced_frame.shape)



In [None]:

# Print the isna().sum() result of every balanced entry to check for remaining NaNs.
for balanced_key, balanced_frame in all_metric_dataframe_with_balanced_diagnose.items():
  print(balanced_key, balanced_frame.isna().sum())


In [None]:
# Columns that are skipped by the float64 conversion below.
excluded_from_conversion = ['source_file' , 'Seconds' , 'Diagnose' , 'is_augmented']

# Iterate over snapshots of the keys and column labels so the in-place column
# assignments never interfere with the iteration.
for balanced_key in list(all_metric_dataframe_with_balanced_diagnose.keys()):
  balanced_frame = all_metric_dataframe_with_balanced_diagnose[balanced_key]

  columns_to_convert = [
      column_name
      for column_name in list(balanced_frame.columns)
      if column_name not in excluded_from_conversion
  ]

  for column_name in columns_to_convert:
    # Pass every value through get_formatted_values, then cast the column to float64.
    formatted_column = balanced_frame[column_name].apply(lambda raw_value: get_formatted_values(raw_value))
    balanced_frame[column_name] = formatted_column.astype(np.float64)


In [None]:
# Ask the t-test helper for the significant columns of every balanced entry
# (called with only_orignial_data=True), then report how many columns each entry got.
significant_columns_infos = get_significant_columns_in_dataset_by_t_test(
    all_metric_dataframe_with_balanced_diagnose,
    only_orignial_data=True,
)

for significant_key, significant_columns in significant_columns_infos.items():
  print(significant_key, len(significant_columns))


In [None]:

# Combine the significant-column lookup with the balanced entries via
# get_dataset_with_significant_columns_by_t_test and report each result's shape.
all_metric_dataframe_with_na_droped_and_significant_features_infos = get_dataset_with_significant_columns_by_t_test(
    significant_columns_infos,
    all_metric_dataframe_with_balanced_diagnose,
)

for significant_key, significant_frame in all_metric_dataframe_with_na_droped_and_significant_features_infos.items():
  print(significant_key, significant_frame.shape)


In [None]:

# Print the isna().sum() result of every significant-feature entry to check for remaining NaNs.
for significant_key, significant_frame in all_metric_dataframe_with_na_droped_and_significant_features_infos.items():
  print(significant_key, significant_frame.isna().sum())


In [None]:

# Pass the significant-feature datasets to display_correlation_coefficient_matrix.
# NOTE(review): presumably renders one correlation matrix per entry — confirm against the helper.
display_correlation_coefficient_matrix(all_metric_dataframe_with_na_droped_and_significant_features_infos)


In [None]:

# Pass the significant-feature datasets to export_dataset_as_csv.
# NOTE(review): output location and file naming are decided inside the helper and are not visible here — confirm.
export_dataset_as_csv(all_metric_dataframe_with_na_droped_and_significant_features_infos)
