# The Pupil Features Extracted in Batch Pipeline.
## Introduction
The features to be extracted include the blinking rate and chunks of wavelet coefficients.

## Reference
1. The previous code: https://github.com/BaiYunpeng1949/MobileEyeComputing/tree/master/ProcessDataNTestAlgorithm
2. TODO: add papers here.
3. My work: https://docs.google.com/document/d/1oLv3oJQLjst1_pYgd_UA3RRL1fRGSbZ6uvlMuxmZR2k/edit#heading=h.r01ccf7ox05g

## Implementation

In [1]:
import numpy as np
import pandas as pd
import streamlit as st
import plotly.express as px
from sklearn.metrics import r2_score
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from os import walk
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
import csv
import os
import pywt
from numba import jit
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
def plotter(ax, data1, data2, param_dict):
    """Plot ``data2`` against ``data1`` on ``ax`` and return what ``ax.plot`` returns.

    ``param_dict`` is unpacked into keyword arguments of ``ax.plot``.
    """
    return ax.plot(data1, data2, **param_dict)

## File-wise/Task-wise Data Pre-processing
### [Suspended] Left and Right Eye Synchronization and Calculate the Difference
Since I now implement ML/AI models to do a more inclusive analysis where more features could be analyzed, I will use the feature of the difference between left and right eye data.


1. Referenced from the previous work located as: https://github.com/BaiYunpeng1949/MobileEyeComputing/blob/master/ProcessDataNTestAlgorithm/LeftRightEyesSyncData.ipynb

2. The difference between the two eyes was also found to be correlated with cognitive workload estimation; see the related paper: Optimizing the usage of pupillary based indicators for cognitive workload. Reading link: https://docs.google.com/document/d/1jBezc9kqaziGlWk6sSgCvyjHTlpD5G6OoNZyh7K2HIo/edit#heading=h.qqfv1ot6zjd8

Besides, there is a considerable difference in pupil size variation for right and left eyes of the participants. See the paper: Exploring pupil size variation as a cognitive load indicator in visualization studies, link: https://drive.google.com/file/d/1z8O1NGNYA87La-CVMSr7cQkJ-VVOOUSe/view

In [3]:
def _pre_dtw_sync(left_eye_file_path, right_eye_file_path):
    """Load both eyes' CSV files and compute the DTW warping path between them.

    The sampling rate of each recording is parsed from its file name: the
    integer right before ``Hz`` in the last ``_``-separated chunk. The eye
    with at least as many samples becomes the *reference* (first ``fastdtw``
    argument); the other eye is the *alignment* series. The warping path is
    computed on the ``Timestamp`` columns.

    Returns a 7-tuple: ``(path, df_reference, df_alignment, diameter_label,
    confidence_label, df_origin, sampling_rate_of_reference)`` where
    ``df_origin`` is an independent copy of the reference eye's data.
    """
    # Column labels handed back to the caller for the later merge step.
    timestamp_label = 'Timestamp'
    diameter_label = 'Diameter'
    confidence_label = 'Confidence'
    kept_columns = ['Timestamp', 'Confidence', 'Diameter', 'Event']

    def _rate_from_file_name(file_path):
        # e.g. ".../xxx_120Hz.csv" -> 120
        return int((file_path.split('_')[-1]).split('Hz')[0])

    rate_left = _rate_from_file_name(left_eye_file_path)
    raw_left = pd.read_csv(left_eye_file_path)
    rate_right = _rate_from_file_name(right_eye_file_path)
    raw_right = pd.read_csv(right_eye_file_path)

    eye_left = raw_left[kept_columns].copy()
    eye_right = raw_right[kept_columns].copy()

    # The longer recording goes first so the shorter one is warped onto it.
    if len(eye_left) >= len(eye_right):
        longer_eye, shorter_eye, sync_rate = eye_left, eye_right, rate_left
    else:
        longer_eye, shorter_eye, sync_rate = eye_right, eye_left, rate_right

    df_reference = longer_eye.copy()
    df_alignment = shorter_eye.copy()
    df_origin = longer_eye.copy()

    # Only the path is needed; the accumulated DTW distance is discarded.
    _, warping_path = fastdtw(df_reference[timestamp_label],
                              df_alignment[timestamp_label],
                              dist=euclidean)

    return (warping_path, df_reference, df_alignment,
            diameter_label, confidence_label, df_origin, sync_rate)

In [4]:
# Define a function scnchronize and align/merge 2 eyes' numerical values, including diamter and confidence values.
# The method of getting the average/mean value of 2 eyes' data as computing target is referenced from the mention in LHIPA.
def _dtw_synchronize_merge_data(path_dwt, df_reference, df_alignment, entity_dwt_apply):
    # Synchronize
    data_sync = []
    for i in range(0, len(path_dwt)):
        data_sync.append([path_dwt[i][0],  # The index column is for dropping out duplicates.
                         df_reference[entity_dwt_apply].iloc[path_dwt[i][0]],
                         df_alignment[entity_dwt_apply].iloc[path_dwt[i][1]]])
    df_sync = pd.DataFrame(data=data_sync,
                           columns=['Index', 
                                    'Reference '+entity_dwt_apply, 
                                    'Alignment '+entity_dwt_apply]).dropna()
    df_sync = df_sync.drop_duplicates(subset=['Index']) # Drop the duplicates according to the index of the reference.
    df_sync = df_sync.reset_index(drop=True)
    # Merge/Align
    df_sync['Avg'+entity_dwt_apply] = df_sync.loc[:, ['Reference '+entity_dwt_apply, 'Alignment '+entity_dwt_apply]].mean(axis = 1)
    # Calculate the difference
    df_sync['Diff'+entity_dwt_apply] = df_sync['Reference '+entity_dwt_apply] - df_sync['Alignment '+entity_dwt_apply]
    
    return df_sync

In [5]:
# Synchronize the two eyes' recordings with DTW and export the merged result.
def dtw_sync(left_eye_file_path, right_eye_file_path):
    """Merge the left/right eye CSV files via DTW and save the result as a CSV file.

    The merged dataframe holds the reference eye's ``Timestamp`` and ``Event``
    columns plus the two-eye average ``Confidence`` and ``Diameter`` and the
    ``DiffDiameter`` column (reference minus alignment). It is written to
    ``../Data/PreprocessedData/<parent folder of the left file>/synchronized_<rate>Hz.csv``
    and also returned.
    """
    (path, df_reference, df_alignment,
     entity_dwt_apply_dia, entity_dwt_apply_conf,
     df_origin, SR_SYNC) = _pre_dtw_sync(left_eye_file_path=left_eye_file_path,
                                         right_eye_file_path=right_eye_file_path)

    def _merge(entity_label):
        # Same path and dataframes for both quantities; only the column differs.
        return _dtw_synchronize_merge_data(path_dwt=path,
                                           df_reference=df_reference,
                                           df_alignment=df_alignment,
                                           entity_dwt_apply=entity_label)

    df_sync_dia = _merge(entity_dwt_apply_dia)
    df_sync_conf = _merge(entity_dwt_apply_conf)

    # Columns are added one by one so that every later column is aligned on
    # the index established by the reference eye's timestamps.
    df_sync = pd.DataFrame()
    df_sync['Timestamp'] = df_origin['Timestamp']
    df_sync['Confidence'] = df_sync_conf['AvgConfidence']
    df_sync['Diameter'] = df_sync_dia['AvgDiameter']
    df_sync['Event'] = df_origin['Event']
    df_sync['DiffDiameter'] = df_sync_dia['DiffDiameter']

    # The output folder is named after the folder that contains the left-eye file.
    parent_folder_name = left_eye_file_path.split('/')[-2:][0]
    folder_path = '../Data/PreprocessedData/' + parent_folder_name + '/'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    write_file_path = folder_path + 'synchronized_{}Hz.csv'.format(SR_SYNC)
    df_sync.copy().to_csv(write_file_path)

    return df_sync

### Merge Two Eyes' Data using Timestamps, Interpolate, and Calculate the Difference
Reference: a blog introducing 2 sensors' data fusion - https://stackoverflow.com/questions/14079766/synchronize-dataset-multiple-users-multiple-timestamps

#### Configuration Part 

In [6]:
# Column names read from the raw per-eye CSV files.
TS = 'Timestamp'
CF = 'Confidence'
DM = 'Diameter'
# Column names of the synchronized two-eye dataframe.
LDM = 'Left Diameter'
RDM = 'Right Diameter'
LCF = 'Left Confidence'
RCF = 'Right Confidence'
# Names reserved for the averaged / differenced columns.
AVEDM = 'Average Diameter'
AVECF = 'Average Confidence'
DIFFDM = 'Difference Diameter'  # By my default: Left - Right - unit in pixels.
# Column that stores the averaged sampling rate of the synchronized trial.
SR = 'Averaged Sampling Rate'
EVENT = 'Event'
# Value of the EVENT column that selects the samples to be analysed.
TARGET_TASK = 'sitting'

#### Implementation
Note that when interpolating we need to avoid overshooting, which spline curves easily cause; otherwise we end up with many negative pupil diameters and large outliers. I use the 'linear' method to interpolate, as suggested by Marshall. Alternatively, one might try [monotonic](https://stackoverflow.com/questions/40072420/interpolate-without-having-negative-values-in-python) interpolators, which also avoid overshooting.

In [7]:
# Settings forwarded to pandas ``interpolate`` when the two eyes are synchronized.
INTERPOLATE_TYPE = 'linear'  # passed as ``method``
LIMIT_DIRECTION = 'both'  # passed as ``limit_direction``

In [8]:
def upsample_timestamps_sync(left_eye_file_path, right_eye_file_path):
    """Merge both eyes' recordings onto the union of their timestamps.

    Only rows whose ``EVENT`` equals ``TARGET_TASK`` are kept. The returned
    dataframe is indexed by timestamp and holds the left/right diameters
    (``LDM``/``RDM``), the left/right confidences (``LCF``/``RCF``) and a
    constant ``SR`` column with the averaged sampling rate, computed as the
    number of merged rows divided by the span of the timestamps.
    Gaps created by the merge are filled with ``INTERPOLATE_TYPE``
    interpolation in the ``LIMIT_DIRECTION`` direction(s).

    NOTE(review): pandas documents ``method='linear'`` as ignoring the index
    and treating samples as equally spaced; confirm that this is intended for
    timestamps that are not evenly spaced after the merge.
    """
    wanted_columns = [TS, CF, DM, EVENT]

    raw_left = pd.read_csv(left_eye_file_path)
    raw_right = pd.read_csv(right_eye_file_path)
    eye_left = raw_left[wanted_columns].copy()
    eye_right = raw_right[wanted_columns].copy()

    # Keep the samples of the targeted event only.
    eye_left = eye_left.loc[eye_left[EVENT] == TARGET_TASK]
    eye_right = eye_right.loc[eye_right[EVENT] == TARGET_TASK]

    # Diameters of each eye, indexed by that eye's own timestamps.
    diameters_left = pd.Series(eye_left[DM].to_numpy(), index=eye_left[TS])
    diameters_right = pd.Series(eye_right[DM].to_numpy(), index=eye_right[TS])

    # Listing all timestamps of both eyes is effectively an up-sampling:
    # each eye gets NaN wherever only the other eye has a sample.
    df_sync = pd.DataFrame([diameters_left, diameters_right]).T.sort_index()
    df_sync = df_sync.rename(columns={0: LDM, 1: RDM})
    df_sync = df_sync.interpolate(method=INTERPOLATE_TYPE, limit_direction=LIMIT_DIRECTION, axis=0)

    # Look up each eye's confidence by timestamp, then fill the gaps the same way.
    # Reference: https://stackoverflow.com/questions/45636105/adding-values-in-new-column-based-on-indexes-with-pandas-in-python
    merged_timestamps = df_sync.index.to_series()
    for confidence_label, eye_frame in ((LCF, eye_left), (RCF, eye_right)):
        df_sync[confidence_label] = merged_timestamps.map(eye_frame.set_index(TS)[CF])
    for confidence_label in (LCF, RCF):
        df_sync[confidence_label] = df_sync[confidence_label].interpolate(
            method=INTERPOLATE_TYPE, limit_direction=LIMIT_DIRECTION, axis=0)

    # Averaged sampling rate of the up-sampled trial, stored on every row.
    recording_span = df_sync.index[-1] - df_sync.index[0]
    df_sync[SR] = len(df_sync) / recording_span

    return df_sync.copy()

### Deblinks and Blinking Rate Extraction

Reference: check how David Linderbaure's group deals with blinks. They use the confidence values to identify blinks and remove the data within 200 ms, but they do not interpolate the eliminated samples. Here I clean the data within 200 ms before and after each blink. The input is the NumPy array of the "confidence" column; the output is a list that marks which indices are blinks.


#### Configuration Part

In [9]:
# Deblink.
BLINK_EXTENSION_TIMEWINDOW = 0.2 # 200ms; margin blanked before and after every detected blink.
MIN_CF = 0.25  # Samples with a confidence below this value are treated as blink candidates.
MIN_NUM_SAMPLE_BLINKS = 2  # Minimum run length (in samples) for a candidate to count as a blink.

# Smooth. (Only referenced by the commented-out smoothing step in deblinks.)
WIN_TYPE = 'hann'
HANN_WINDOW_SIZE = 5

# Interpolate. (Passed as ``method`` and ``order`` when the blanked diameters are re-filled.)
CURVE_TYPE = 'linear'
CURVE_ORDER = 3

# Column configuration: names of the blink-flag columns.
ISBLINK = 'isBlink'
ISBLINK_LEFT = 'isBlink-Left'
ISBLINK_RIGHT = 'isBlink-Right'

#### Implementation

In [10]:
def deblinks(df_input, confidence_column_label, diameter_column_label, isblink_column_label):
    """Flag blinks by low confidence, blank the affected diameters and re-fill them.

    A blink is a run of at least ``MIN_NUM_SAMPLE_BLINKS`` consecutive samples
    whose confidence is below ``MIN_CF``. Every blink is extended backwards
    (from its first sample) and forwards (from the first sample after it)
    while the timestamp distance stays below ``BLINK_EXTENSION_TIMEWINDOW``.
    All flagged samples get ``1`` in ``isblink_column_label`` and NaN in
    ``diameter_column_label``; the NaNs are then interpolated with
    ``CURVE_TYPE`` in both directions.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data of one trial, indexed by timestamp. It is not modified.
    confidence_column_label, diameter_column_label : str
        Confidence and diameter columns of the same eye.
    isblink_column_label : str
        Name of the blink-flag column to create (0 = no blink, 1 = blink).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_input`` with the flag column added and the diameter
        column cleaned.
    """
    df = df_input.copy()
    df[isblink_column_label] = 0

    num_samples = len(df)
    # Work on plain arrays: positional scalar access is much cheaper than
    # repeated label lookups, and it stays correct with duplicated timestamps.
    confidences = df[confidence_column_label].to_numpy()
    timestamps = df.index.to_numpy()
    blink_mask = np.zeros(num_samples, dtype=bool)

    start = 0
    # A blink needs at least one following sample, so the last sample alone
    # can never start one.
    while start < num_samples - 1:
        if not confidences[start] < MIN_CF:
            start += 1
            continue

        # Find the end of the low-confidence run; ``stop`` is exclusive and is
        # bounded by the recording length, so a blink that lasts until the
        # very end of the recording no longer indexes past the last sample.
        stop = start + 1
        while stop < num_samples and confidences[stop] < MIN_CF:
            stop += 1

        if stop - start >= MIN_NUM_SAMPLE_BLINKS:
            blink_mask[start:stop] = True

            # Extend backwards from the first blink sample.
            probe = start
            while probe >= 0:
                if timestamps[start] - timestamps[probe] >= BLINK_EXTENSION_TIMEWINDOW:
                    break
                blink_mask[probe] = True
                probe -= 1

            # Extend forwards, measured from the first sample after the blink.
            probe = stop
            while probe < num_samples:
                if timestamps[probe] - timestamps[stop] >= BLINK_EXTENSION_TIMEWINDOW:
                    break
                blink_mask[probe] = True
                probe += 1

        start = stop

    # A single boolean-mask ``.loc`` write replaces the per-sample chained
    # assignments, which do not update ``df`` when Copy-on-Write is active.
    df.loc[blink_mask, diameter_column_label] = np.nan
    df.loc[blink_mask, isblink_column_label] = 1

    # Smoothing is deliberately not applied: the frequency-based analysis looks
    # for singularities, and Marshall's patent
    # (https://patentimages.storage.googleapis.com/91/2f/5f/236d6711dcf6b6/US6090051.pdf)
    # does not smooth either. Interpolation of the blanked samples is included there.
    df[diameter_column_label] = df[diameter_column_label].interpolate(
        method=CURVE_TYPE, order=CURVE_ORDER, limit_direction='both', axis=0)

    return df

### Artefact Rejection

#### Configuration Part

In [11]:
# Window size handed to the Hampel filter by rej_artifact; the filter uses it as a
# number of samples on each side of the inspected sample.
# NOTE(review): the name is presumably a typo of "HAMPEL"; it is kept because other cells reference it.
HAMPLE_WIN_SIZE = 10

#### Implementation

In [12]:
## This part is directly cited from Sam's work.

# Filtering outliers 
# Lan et al. 2020 - median filter with sliding window of 10s
# Testing with numba optimised for-loop implementation of a Hampel Filter
# Note to self: I think this filter is also commonly used for pupil diameter filtering
# NOTE(review): ``window_size`` below is used as a sample count for slicing, not as seconds.
@jit(nopython=True)
def hampel_filter_forloop_numba(input_series, window_size, n_sigmas=3):
    """Replace outliers by the local median (Hampel filter).

    For every position ``i`` in ``[window_size, n - window_size)`` the median
    ``x0`` and the scaled median absolute deviation ``S0`` are computed over
    ``input_series[i - window_size : i + window_size]`` (NaNs ignored). If the
    sample deviates from ``x0`` by more than ``n_sigmas * S0`` it is replaced
    by ``x0``. The first and last ``window_size`` samples are never changed.

    Returns ``(new_series, indices)``: a filtered copy of the input and the
    list of positions that were replaced. ``input_series`` is not modified.
    """
    
    n = len(input_series)
    new_series = input_series.copy()
    k = 1.4826 # scale factor for Gaussian distribution
    indices = []
    
    # NOTE(review): the slice end ``i + window_size`` is exclusive, so the window
    # holds ``window_size`` samples before ``i`` but only ``window_size - 1`` after it.
    for i in range((window_size),(n - window_size)):
        x0 = np.nanmedian(input_series[(i - window_size):(i + window_size)])
        S0 = k * np.nanmedian(np.abs(input_series[(i - window_size):(i + window_size)] - x0))
        # A NaN sample never satisfies this comparison, so it is left untouched.
        if (np.abs(input_series[i] - x0) > n_sigmas * S0):
            new_series[i] = x0
            indices.append(i)
    
    return new_series, indices

In [13]:
def rej_artifact(df_input, diameter_column_label):
    """Return a copy of ``df_input`` whose diameter column went through the Hampel filter.

    The filter is run with a window of ``HAMPLE_WIN_SIZE``; the positions of
    the replaced outliers are not used.
    """
    df_output = df_input.copy()
    raw_diameters = df_output[diameter_column_label].to_numpy()
    filtered_diameters, _ = hampel_filter_forloop_numba(raw_diameters, HAMPLE_WIN_SIZE)
    df_output[diameter_column_label] = filtered_diameters.tolist()
    return df_output.copy()

## Time-series Data Visualization
In this part, I visualize pupil data for different cognitive activities.

In [14]:
def plot_pupil_diameters(df_input, title_label):
    """Draw the left and right pupil diameters of ``df_input`` on one new figure.

    ``title_label`` is appended on a second line below the fixed figure title.
    """
    _, axes = plt.subplots()
    figure_title = 'Clean and Preprocessed Pupil Diameters (in pixels)\n' + title_label
    # Left eye first, then right eye; the title is attached with the second plot call.
    df_input[LDM].plot(ax=axes)
    df_input[RDM].plot(ax=axes, title=figure_title)
    axes.legend(['Left Eye', 'Right Eye'])

## Segmenting Time Windows

In this part, I will use overlapped sliding windows to extract wavelet-related features.

The reference publications include:
1. [WiStress: Contactless Stress Monitoring Using Wireless Signals, 2021](https://dl.acm.org/doi/pdf/10.1145/3478121).
2. [Feature extraction for robust physical activity recognition, 2017](https://hcis-journal.springeropen.com/articles/10.1186/s13673-017-0097-2).
3. [Feature Engineering on Time-Series Data for Human Activity Recognition](https://towardsdatascience.com/feature-engineering-on-time-series-data-transforming-signal-data-of-a-smartphone-accelerometer-for-72cbe34b8a60).
4. [Indexing Cognitive Workload Based on Pupillary Response under Luminance and Emotional Changes, 2013](https://dl-acm-org.libproxy1.nus.edu.sg/doi/pdf/10.1145/2449396.2449428).

### Configuration Part

In [15]:
# Parameter configuration according to arguments from the functions we called.
# SAMPLING_RATE converts the window and overlap lengths from seconds to samples;
# the two lengths below are the default arguments of ``segment``.
SAMPLING_RATE = 120 # The unit is Hertz.
time_window_length = 5 # The unit is second.
overlapping_length = 3 # The unit is second.

### Implementation
The time-window segmentation is conducted file-wise, i.e., each trial's data produces its own windows, in line with the file-wise data pre-processing.

Slicing the data with overlapping sliding windows creates new instances. Previously each instance was a single sample point; now each instance is one sliding window, which will be transformed into multiple features by wavelet decomposition.

From here, the sample points are regarded as averaged ones.

In [16]:
# My own overlapping sliding-window segmentation.
def segment(df_input, window_length=time_window_length, overlap_length=overlapping_length):
    """Split a trial into overlapping, fixed-size windows of sample positions.

    The window and overlap lengths (seconds) are converted to sample counts
    with the nominal ``SAMPLING_RATE``, so every window holds the same number
    of samples regardless of the actual timestamps. (A timestamp-based
    variant was tried earlier and suspended in favour of equally sized
    windows, which give equally sized wavelet feature vectors.)

    Parameters
    ----------
    df_input : pandas.DataFrame
        Data of one trial; only its length is used.
    window_length : number
        Window length in seconds.
    overlap_length : number
        Overlap between consecutive windows in seconds; must be shorter than
        ``window_length``.

    Returns
    -------
    list of (int, int)
        ``(first_position, last_position)`` of every window, both inclusive.
        The list is empty when the trial is shorter than one window.

    Raises
    ------
    ValueError
        If the window is empty or the overlap is not shorter than the window.
    """
    num_samples = len(df_input)

    window_num = int(round(SAMPLING_RATE * window_length))
    overlap_num = int(round(SAMPLING_RATE * overlap_length))
    step_num = window_num - overlap_num

    if window_num <= 0:
        raise ValueError('window_length must cover at least one sample.')
    if step_num <= 0:
        raise ValueError('overlap_length must be shorter than window_length.')

    # Window starts are 0, step, 2*step, ... as long as a full window still fits.
    # ``range`` is empty when the trial is shorter than one window, which avoids
    # asking NumPy for a negatively sized strided view.
    last_start = num_samples - window_num
    return [(start, start + window_num - 1)
            for start in range(0, last_start + 1, step_num)]

## Wavelet Coefficient Extraction

This part generates frequency features using wavelet analysis.

It also generates new instances, each composed of the sample points within one time window.

### Configuration Part

In [17]:
# Wavelet settings: default decomposition level of ``freq_analysis`` and the wavelet name given to pywt.
wavelet_decomposition_level = 2
wavelet = 'sym16'

# Blink-flag column names (same values as in the deblink configuration cell).
ISBLINK_LEFT = 'isBlink-Left'
ISBLINK_RIGHT = 'isBlink-Right'

# Experimental conditions
# Luminance.
LUX = 'Luminance'
# Task difficulty - labels.
LABEL = 'Labels'
# Participant ID column.
PID = 'PID'

### Implementation

In [18]:
def freq_analysis(df_input, windows_indices, j=wavelet_decomposition_level):
    """Turn every time window into one feature row.

    For each window the left and right diameters are decomposed with
    ``pywt.wavedec`` (wavelet ``wavelet``, mode ``'per'``, level ``j``) and
    the approximation coefficients of both eyes are concatenated into the
    columns ``Left-0..`` and ``Right-0..``. The blinking rate of each eye
    (share of samples flagged as blink inside the window) and the trial's
    luminance, participant ID and task label (taken from the first row of
    ``df_input``) are appended to every row.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Pre-processed trial holding the ``LDM``, ``RDM``, ``ISBLINK_LEFT``,
        ``ISBLINK_RIGHT``, ``LUX``, ``PID`` and ``LABEL`` columns.
    windows_indices : sequence of (int, int)
        Inclusive ``(first, last)`` sample positions of every window.
    j : int
        Wavelet decomposition level.

    Returns
    -------
    pandas.DataFrame
        One row per window. When there are no windows, an empty dataframe
        holding only the blink-rate and condition columns is returned.
    """
    if len(windows_indices) == 0:
        # Nothing to decompose, so the number of coefficient columns is unknown.
        return pd.DataFrame(columns=[ISBLINK_LEFT, ISBLINK_RIGHT, LUX, PID, LABEL])

    # Convert the columns once instead of once per window.
    diameters_left = df_input[LDM].to_numpy()
    diameters_right = df_input[RDM].to_numpy()
    blink_flags_left = df_input[ISBLINK_LEFT].to_numpy()
    blink_flags_right = df_input[ISBLINK_RIGHT].to_numpy()

    # The conditions are constant for the whole trial.
    trial_lux = df_input[LUX].iat[0]
    trial_pid = df_input[PID].iat[0]
    trial_label = df_input[LABEL].iat[0]

    freq_features_two_eyes = []
    blinking_rates_left = []
    blinking_rates_right = []
    num_left_features = None
    num_right_features = None

    for window in windows_indices:
        starting_index = window[0]
        ending_index = window[1] + 1  # Window ends are inclusive.

        # ``wavedec`` returns [cA_j, cD_j, ..., cD_1]; taking element 0 keeps
        # the approximation coefficients for any level, not only for level 2.
        approx_left = pywt.wavedec(diameters_left[starting_index:ending_index],
                                   wavelet, mode='per', level=j)[0]
        approx_right = pywt.wavedec(diameters_right[starting_index:ending_index],
                                    wavelet, mode='per', level=j)[0]
        freq_features_two_eyes.append(np.concatenate((approx_left, approx_right), axis=0))

        if num_left_features is None:
            num_left_features = len(approx_left)
            num_right_features = len(approx_right)

        window_blinks_left = blink_flags_left[starting_index:ending_index]
        blinking_rates_left.append(np.sum(window_blinks_left) / len(window_blinks_left))
        window_blinks_right = blink_flags_right[starting_index:ending_index]
        blinking_rates_right.append(np.sum(window_blinks_right) / len(window_blinks_right))

    # From here on the instances are windows: fewer rows, but many more columns.
    feature_names = (['Left-' + str(k) for k in range(num_left_features)]
                     + ['Right-' + str(k) for k in range(num_right_features)])
    df_freq = pd.DataFrame(freq_features_two_eyes, columns=feature_names)

    num_windows = len(freq_features_two_eyes)
    df_freq[ISBLINK_LEFT] = blinking_rates_left
    df_freq[ISBLINK_RIGHT] = blinking_rates_right
    df_freq[LUX] = [trial_lux] * num_windows
    df_freq[PID] = [trial_pid] * num_windows
    df_freq[LABEL] = [trial_label] * num_windows

    return df_freq

## Run in Batch

In [19]:
# Argument configuration.
# Substrings used to pick the recording files inside each trial folder.
TWODMODE = '2D'
LEFT = 'left'
RIGHT = 'right'
ISBLINK = 'isBlink'
raw_data_path = '../Data/RawData4ML/VersionOctober/' # In this case, the dirpath is mypath, dirnames contains sub-folders's names I need, and no filenames since there is no files there.
dir_features = '../Data/Results/'

# Experimental conditions
# Luminance.
LUX = 'Luminance'
LOW = 'lowlux'
MID = 'middlelux'
HIGH = 'highlux'
# Values searched for in each trial folder name to find its luminance condition.
LUXS_SET = [LOW, MID, HIGH]
# Task difficulty - labels.
LABEL = 'Labels'
NOBACK = 'nothing'
ONEBACK = 'ONEBACK'
TWOBACK = 'TWOBACK'
THREEBACK = 'THREEBACK'
# Values searched for in each trial folder name to find its task label.
TASKDIFFS_SET = [NOBACK, ONEBACK, TWOBACK, THREEBACK]

# Participant ID column.
PID = 'PID'

# Create a dataframe to store results details. TODO: to be modified to align to short time windows. Columns - features; Rows - short segmentations.
FEATURES_SET = [LUX, LABEL, PID]
df_pre_features = pd.DataFrame(columns=FEATURES_SET)
# Collects one feature dataframe per trial; the batch loop appends to it.
frames_pre_features = []

In [20]:
# Find which member of a list occurs in a string; an alternative could be: https://thispointer.com/combine-for-loop-if-else-statement-in-python/
def check_string(input_string, target_lists):
    """Return the first entry of ``target_lists`` that is a substring of ``input_string``.

    Entries are tried in order. ``None`` is returned when no entry matches
    (including when ``target_lists`` is empty).
    """
    for candidate in target_lists:
        if candidate in input_string:
            return candidate
    return None

In [21]:
# List all directory names
# Only the first step of ``walk`` is consumed (note the ``break``), so ``dirs_list``
# holds the immediate sub-directories of ``raw_data_path`` and nothing deeper.
dirs_list = []
for (dir_path, dir_names, file_names) in walk(raw_data_path):
    dirs_list.extend(dir_names)
    break

In [22]:
%%time
# Run the whole per-trial pipeline for every trial folder.
# NOTE: ``frames_pre_features`` is created in the configuration cell, so re-running
# only this cell appends every trial a second time.
for dir_name in dirs_list:
    dir_path = raw_data_path + dir_name + '/'

    # Only files located directly in the trial folder are candidates, because
    # the paths below are built as ``dir_path + file_name``.
    _, _, trial_file_names = next(walk(dir_path), (dir_path, [], []))

    # Reset the paths for every trial: otherwise a folder without a left or
    # right recording would silently be processed with the previous trial's file.
    file_path_left = None
    file_path_right = None
    for file_name in trial_file_names:
        if TWODMODE not in file_name:
            continue
        if LEFT in file_name:
            file_path_left = dir_path + file_name
        elif RIGHT in file_name:
            file_path_right = dir_path + file_name

    if file_path_left is None or file_path_right is None:
        raise FileNotFoundError(
            'Missing the left or right ' + TWODMODE + ' recording in ' + dir_path)

    # The conditions are encoded in the folder name; fail with a clear message
    # when they cannot be recognized.
    lux_condition = next((lux for lux in LUXS_SET if lux in dir_name), None)
    task_diff_condition = next((task for task in TASKDIFFS_SET if task in dir_name), None)
    if lux_condition is None or task_diff_condition is None:
        raise ValueError(
            'Cannot read the luminance/task condition from the folder name: ' + dir_name)
    pid = dir_name.split('-')[-1]

    # Step 1: Synchronize 2 eyes' data.
    df_sync = upsample_timestamps_sync(left_eye_file_path=file_path_left, right_eye_file_path=file_path_right)

    # Step 2: Deblink and interpolate, left eye first, then right eye.
    df_deblink = deblinks(df_input=df_sync, confidence_column_label=LCF, diameter_column_label=LDM, isblink_column_label=ISBLINK_LEFT)
    df_deblink = deblinks(df_input=df_deblink, confidence_column_label=RCF, diameter_column_label=RDM, isblink_column_label=ISBLINK_RIGHT)

    # Step 3: Artifact rejection, left eye first, then right eye.
    df_rej = rej_artifact(df_input=df_deblink, diameter_column_label=LDM)
    df_rej = rej_artifact(df_input=df_rej, diameter_column_label=RDM)

    # Step 4: Segment data using an overlapping time window.
    windows = segment(df_input=df_rej)

    # Step 5: Attach the labels and experiment conditions to every sample.
    df_get_labels = df_rej.copy()
    df_get_labels[LUX] = lux_condition
    df_get_labels[LABEL] = task_diff_condition
    df_get_labels[PID] = pid

    # Step 6: Generate wavelet features. The instances become windows: fewer rows, more columns.
    df_freq_analyzed = freq_analysis(df_input=df_get_labels, windows_indices=windows)

    df_current_trial = df_freq_analyzed.copy()
    frames_pre_features.append(df_current_trial)

#     # Debug Area: Plot to pre-view pupil data. TODO: delete later, for debugging and preprocessing validation: is there any problems?
#     plot_pupil_diameters(df_input=df_sync, title_label=dir_name + ' synchronized')
#     plot_pupil_diameters(df_input=df_deblink, title_label=dir_name + ' delinked')
#     plot_pupil_diameters(df_input=df_rej, title_label=dir_name + ' rejected artifact')

# Merge dataframes/single trials' data into a large dataset.
df_pre_features = pd.concat(frames_pre_features)

CPU times: total: 2min 53s
Wall time: 2min 56s


In [23]:
df_pre_features

Unnamed: 0,Left-0,Left-1,Left-2,Left-3,Left-4,Left-5,Left-6,Left-7,Left-8,Left-9,...,Right-145,Right-146,Right-147,Right-148,Right-149,isBlink-Left,isBlink-Right,Luminance,PID,Labels
0,90.878101,90.000122,89.429937,90.488841,90.425234,90.336186,90.534690,90.501258,90.304912,90.299277,...,94.908427,95.267926,94.824154,95.681502,95.550234,0.325000,0.370000,lowlux,07,nothing
1,90.272990,90.943095,90.475785,90.534682,90.372235,90.283447,90.175733,90.071457,89.966849,89.862239,...,96.455060,95.353432,94.965808,95.486500,94.833888,0.168333,0.203333,lowlux,07,nothing
2,89.356322,88.406687,88.938648,89.101954,88.632723,89.213376,88.886822,88.681709,89.061931,88.684184,...,95.822918,96.589191,96.326834,96.544795,95.752655,0.000000,0.000000,lowlux,07,nothing
3,89.591427,90.381350,89.822829,90.178344,90.201855,90.039328,89.327095,90.271126,89.445756,89.488081,...,93.682537,93.863888,94.325592,93.692027,93.015056,0.135000,0.195000,lowlux,07,nothing
4,89.352206,89.354459,90.085756,90.154200,90.493916,88.939724,89.547730,90.279304,90.332414,91.536459,...,93.907159,92.912075,93.245723,93.272628,94.553329,0.135000,0.195000,lowlux,07,nothing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.685748,91.797745,91.581850,91.573155,91.780403,1.000000,0.431667,lowlux,02,THREEBACK
79,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.279524,91.298279,91.303723,91.315547,91.318091,1.000000,0.530000,lowlux,02,THREEBACK
80,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,91.266389,91.254105,91.241145,91.231029,91.214267,1.000000,0.663333,lowlux,02,THREEBACK
81,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,93.249352,...,89.824706,89.854385,89.802735,90.034177,89.511720,1.000000,0.816667,lowlux,02,THREEBACK


## Output to csv files

In [24]:
# Every batch run gets its own results folder, named after the current day-month-hour-minute.
dirpath_results = '../Data/Results/'
now = datetime.datetime.now()
datestamp = now.strftime("%d-%m-%H-%M")
results_folder_path = '{}{}/'.format(dirpath_results, datestamp)

if not os.path.exists(results_folder_path):
    os.makedirs(results_folder_path)

# Save the merged feature table, without the row index, as a csv file in that folder.
results_file_path = results_folder_path + 'results.csv'
df_pre_features.to_csv(results_file_path, encoding='utf-8', index=False, header=True)