## Download the Original Data

* If you do not have the dataset yet, uncomment the command in the cell below and execute it.

In [None]:
#!./init

## init

In [25]:
from typing import List
import os

import numpy as np
import pandas as pd

import csv
import shutil

## Set global variable

In [26]:
# Name of the dataset's root directory.
dir_name = "KEMDy20"
# Path prefix used to build every dataset path; currently just the root name.
global_data_path = dir_name

## Make Functions

In [27]:
def get_directories(
    dir_path: str
) -> List[str]:
    """Return the sorted names of all entries directly inside ``dir_path``.

    Despite the name, both files and sub-directories are returned, because
    the listing comes straight from ``os.listdir``.

    Args:
        dir_path: Directory to list.

    Returns:
        Sorted entry names, or an empty list when ``dir_path`` is not an
        existing directory (a missing path or a regular file).
    """
    # Check isdir rather than exists: a path that points at a regular file
    # "exists", but os.listdir would raise NotADirectoryError on it.
    if not os.path.isdir(dir_path):
        return []

    return sorted(os.listdir(dir_path))

def get_files(
    dir_path: str,
    extension: str
) -> List[str]:
    """Return the sorted entry names in ``dir_path`` ending with ``.extension``.

    Args:
        dir_path: Directory whose entries are inspected.
        extension: File extension without the leading dot (e.g. ``"csv"``).

    Returns:
        Sorted list of matching entry names (names only, not full paths).
        The list is empty when ``get_directories`` yields no entries.
    """
    suffix = f".{extension}"
    matches = [name for name in get_directories(dir_path) if name.endswith(suffix)]
    matches.sort()
    return matches

def read_csv_info(
    file_path: str=""
) -> pd.DataFrame:
    """Load a CSV file into a DataFrame of raw string cells.

    The file is parsed with ``csv.reader`` and no row is treated as a header:
    every row (including any header line) becomes a DataFrame row and the
    columns are labelled positionally by pandas. Rows shorter than the
    longest row end up with missing values in their trailing columns.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with one row per CSV row; empty when the file has no rows.

    Raises:
        OSError: If ``file_path`` cannot be opened (e.g. it does not exist).
    """
    # The csv module requires files to be opened with newline="" so that
    # line endings embedded in quoted fields are passed through unmodified.
    with open(file_path, newline="") as f:
        rows = list(csv.reader(f))

    return pd.DataFrame(rows)

# Column order of every merged frame produced by make_merged_data.
_MERGED_COLUMNS = ["timestamp", "sid", "acc", "temp", "ibi"]


def _load_sensor_frame(
    root_dir: str,
    sensor: str,
    session_name: str,
    file: str
) -> pd.DataFrame:
    """Read one sensor CSV of a session and strip its non-data rows."""
    # Directory path form is like "KEMDy20/EDA/Session01/Sess01_script01_User001F.csv"
    frame = read_csv_info(f"./{root_dir}/{sensor}/{session_name}/{file}")

    if frame.empty:
        if sensor == "IBI":
            # Placeholder row so the later column labelling and merge still work.
            frame = pd.DataFrame({"ibi": [None], "timestamp": [None], "sid": [None]})
        return frame

    # EDA and TEMP lose their first two rows, IBI only its first row
    # (presumably header/metadata lines — confirm against the dataset files).
    if sensor == "EDA" or sensor == "TEMP":
        frame = frame.drop(1)
    frame = frame.drop(0)

    if sensor == "IBI":
        # Discard the first IBI column, then renumber the column labels from 0.
        frame = frame.drop(columns=frame.columns[0])
        frame = frame.T.reset_index(drop=True).T

    # Drop rows that contain missing cells.
    frame = frame.dropna()
    return frame.reset_index(drop=True)


def _label_sensor_columns(
    frame: pd.DataFrame,
    value_name: str
) -> pd.DataFrame:
    """Name the columns ``value_name, timestamp, sid``, adding a NaN ``sid`` if absent."""
    if len(frame.columns) < 3:
        frame = frame.set_axis([value_name, "timestamp"], axis=1)
        frame["sid"] = np.nan
        return frame

    return frame.set_axis([value_name, "timestamp", "sid"], axis=1)


def _merge_sensor_frames(
    eda: pd.DataFrame,
    temp: pd.DataFrame,
    ibi: pd.DataFrame
) -> pd.DataFrame:
    """Outer-join the three labelled sensor frames on ``timestamp``."""
    # Merge the "TEMP" table into "EDA".
    # NOTE(review): the EDA "sid" is discarded here and only the TEMP "sid" is
    # kept, unlike the IBI merge below which back-fills — confirm this is intended.
    merged = pd.merge(
        eda, temp,
        left_on='timestamp', right_on='timestamp', how='outer')
    del merged["sid_x"]
    merged = merged.rename(columns={"sid_y": "sid"})

    # Merge the "IBI" table into the merged table ("EDA" and "TEMP");
    # where the existing "sid" is missing, take the one from IBI.
    merged = pd.merge(
        merged, ibi,
        left_on='timestamp', right_on='timestamp', how='outer')
    merged["sid_x"] = merged["sid_x"].fillna(merged["sid_y"])
    del merged["sid_y"]
    merged = merged.rename(columns={"sid_x": "sid"})

    merged = merged[_MERGED_COLUMNS]
    merged = merged.sort_values("timestamp")
    return merged.reset_index(drop=True)


def make_merged_data(
    target_dir_list: List[str],
    file_list: List[str],
    session_name: str
) -> pd.DataFrame:
    """Merge the EDA, IBI and TEMP recordings of one session into one table.

    For every file name in ``file_list`` the CSV of that name is read from
    each directory in ``target_dir_list``, cleaned, labelled and outer-joined
    on ``timestamp``. The per-file tables are then concatenated and sorted by
    ``timestamp``. Cell values are kept as read from the CSV files.

    Args:
        target_dir_list: Paths of the form ``"<root>/<sensor>"``; the list
            must provide the sensors ``"EDA"``, ``"IBI"`` and ``"TEMP"``
            whenever ``file_list`` is non-empty.
        file_list: CSV file names expected under every
            ``<root>/<sensor>/<session_name>/`` directory.
        session_name: Name of the session sub-directory.

    Returns:
        A DataFrame with the columns ``timestamp, sid, acc, temp, ibi``
        (``acc`` holds the EDA value column). It is empty when ``file_list``
        is empty.

    Raises:
        ValueError: If ``session_name`` is ``None``.
        KeyError: If one of the three sensors is missing from
            ``target_dir_list``.
    """
    if session_name is None:
        raise ValueError("session_name must not be None")

    merged_per_file = []

    for file in file_list:
        file_datas = {}

        for target_dir_full in target_dir_list:
            target_dir = target_dir_full.split("/")
            file_datas[target_dir[1]] = _load_sensor_frame(
                target_dir[0], target_dir[1], session_name, file)

        # Attach column names.
        eda = _label_sensor_columns(file_datas["EDA"], "acc")
        ibi = _label_sensor_columns(file_datas["IBI"], "ibi")
        temp = _label_sensor_columns(file_datas["TEMP"], "temp")

        merged_per_file.append(_merge_sensor_frames(eda, temp, ibi))

    if not merged_per_file:
        # Nothing to merge: return an empty table with the expected columns.
        return pd.DataFrame(columns=_MERGED_COLUMNS)

    # Concatenate once at the end instead of re-copying the growing result
    # on every iteration.
    result = pd.concat(merged_per_file, sort=True)
    result = result[_MERGED_COLUMNS]
    result = result.sort_values("timestamp")
    return result.reset_index(drop=True)

## Get organized dataset

In [28]:
# Sensor sub-directories of the dataset that are merged per session.
target_dir_list = ["EDA", "IBI", "TEMP"]

# Session name -> CSV file names of that session.
all_file_list = {}
# Session names in first-seen order.
all_session_list = []

for part_name in target_dir_list:
    t_part_path = f"{global_data_path}/{part_name}"

    for session in get_directories(f"./{t_part_path}"):
        t_session_dir_path = f"{t_part_path}/{session}"
        t_file_list = get_files(f"./{t_session_dir_path}", "csv")

        # NOTE(review): only the first sensor directory that contains a
        # session contributes its CSV listing; make_merged_data then opens
        # each of those names under every sensor directory — confirm the
        # sensor directories are expected to mirror each other.
        if session not in all_file_list:
            all_session_list.append(session)
            all_file_list[session] = t_file_list


org_dataset_path = f"org_{global_data_path}"

# Values that do not change between sessions are computed once.
t_ori_path = os.path.join(os.getcwd(), org_dataset_path)
t_ori_wav_path = f"./{global_data_path}/wav"
part_dir_list = [global_data_path + "/" + x for x in target_dir_list]

for session in all_session_list:
    session_merged_data = make_merged_data(part_dir_list, all_file_list[session], session)

    # Create "<cwd>/org_<dataset>/<session>" (and its parent) if missing.
    os.makedirs(f"{t_ori_path}/{session}", exist_ok=True)

    t_file_list = get_files(f"{t_ori_wav_path}/{session}", "wav")
    t_file_list += get_files(f"{t_ori_wav_path}/{session}", "txt")

    # The wav/txt files are moved (not copied) out of the source dataset.
    for file in t_file_list:
        shutil.move(f"{t_ori_wav_path}/{session}/{file}", f"{t_ori_path}/{session}")

    session_merged_data.to_csv(f"{t_ori_path}/{session}/{session}.csv", sep=",", na_rep="NaN")

src_ann_path = os.path.join(os.getcwd(), f"{global_data_path}/annotation")
dest_ann_path = os.path.join(os.getcwd(), f"{org_dataset_path}/annotation")

# copytree fails when the destination already exists, which made re-running
# this cell crash; an existing annotation copy is left untouched instead.
if not os.path.exists(dest_ann_path):
    shutil.copytree(src_ann_path, dest_ann_path)