# The Pupil Features Extracted in Batch Pipeline.
## Introduction
The features to be extracted include the blinking rate and chunks of wavelet coefficients.

## Reference
1. The previous code: https://github.com/BaiYunpeng1949/MobileEyeComputing/tree/master/ProcessDataNTestAlgorithm
2. TODO: add papers here.
3. My work: https://docs.google.com/document/d/1oLv3oJQLjst1_pYgd_UA3RRL1fRGSbZ6uvlMuxmZR2k/edit#heading=h.r01ccf7ox05g

## Implementation

In [1]:
import numpy as np
import pandas as pd
import streamlit as st
import plotly.express as px
from sklearn.metrics import r2_score
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from os import walk
import pandas as pd
import datetime
import csv
import os
from numba import jit

In [2]:
def plotter(ax, data1, data2, param_dict):
    """Plot data2 against data1 on the given axes-like object.

    The entries of param_dict are forwarded to ax.plot as keyword arguments,
    and whatever ax.plot returns is handed back unchanged.
    """
    return ax.plot(data1, data2, **param_dict)

## File-wise/Task-wise Data Pre-processing
### [Suspended] Left and Right Eye Synchronization and Calculate the Difference
Since I now implement ML/AI models to do a more inclusive analysis where more features could be analyzed, I will use the feature of the difference between left and right eye data.


1. Referenced from the previous work located at: https://github.com/BaiYunpeng1949/MobileEyeComputing/blob/master/ProcessDataNTestAlgorithm/LeftRightEyesSyncData.ipynb

2. The difference between the two eyes was also found to be correlated with cognitive workload; see the related paper: Optimizing the usage of pupillary based indicators for cognitive workload. Reading link: https://docs.google.com/document/d/1jBezc9kqaziGlWk6sSgCvyjHTlpD5G6OoNZyh7K2HIo/edit#heading=h.qqfv1ot6zjd8

Besides, there is a considerable difference in pupil size variation for right and left eyes of the participants. See the paper: Exploring pupil size variation as a cognitive load indicator in visualization studies, link: https://drive.google.com/file/d/1z8O1NGNYA87La-CVMSr7cQkJ-VVOOUSe/view

In [3]:
def _pre_dtw_sync(left_eye_file_path, right_eye_file_path):
    """Load both eyes' recordings and compute the DTW path between their timestamps.

    Each sampling rate is parsed from the file path as the integer between the
    last '_' and the following 'Hz' (e.g. 'left2D_89Hz.csv' gives 89).  The
    recording with more rows (the left one on a tie) becomes the reference;
    the other one is the series to be aligned to it.

    Returns a 7-tuple:
        path         -- warping path returned by fastdtw on the two
                        'Timestamp' columns (reference first),
        df_reference -- copy of the longer recording,
        df_alignment -- copy of the shorter recording,
        'Diameter'   -- label of the diameter column,
        'Confidence' -- label of the confidence column,
        df_origin    -- a second, independent copy of the longer recording,
        SR_SYNC      -- sampling rate parsed from the reference file path.
    """
    kept_columns = ['Timestamp', 'Confidence', 'Diameter', 'Event']
    align_label = 'Timestamp'
    diameter_label = 'Diameter'
    confidence_label = 'Confidence'

    # The sampling rates are encoded in the file paths.
    rate_left = int((left_eye_file_path.split('_')[-1]).split('Hz')[0])
    raw_left = pd.read_csv(left_eye_file_path)

    rate_right = int((right_eye_file_path.split('_')[-1]).split('Hz')[0])
    raw_right = pd.read_csv(right_eye_file_path)

    df_left = raw_left[kept_columns].copy()
    df_right = raw_right[kept_columns].copy()

    # The larger recording goes first (reference); this matters when the
    # alignment acts as an up-sampling of the smaller one.
    if len(df_left) >= len(df_right):
        longer, shorter, SR_SYNC = df_left, df_right, rate_left
    else:
        longer, shorter, SR_SYNC = df_right, df_left, rate_right

    df_reference = longer.copy()
    df_alignment = shorter.copy()
    df_origin = longer.copy()

    # Warp the two timestamp series onto each other; only the path is used.
    _, path = fastdtw(df_reference[align_label], df_alignment[align_label], dist=euclidean)

    return path, df_reference, df_alignment, diameter_label, confidence_label, df_origin, SR_SYNC

In [4]:
def _dtw_synchronize_merge_data(path_dwt, df_reference, df_alignment, entity_dwt_apply):
    """Pair two eyes' values along a DTW path, then average and difference them.

    Taking the mean of both eyes' data as the computing target follows what
    is mentioned in LHIPA.

    Parameters:
        path_dwt         -- sequence of (reference position, alignment
                            position) pairs.
        df_reference     -- frame providing the reference values.
        df_alignment     -- frame providing the values aligned to the reference.
        entity_dwt_apply -- name of the column to synchronize; it must exist
                            in both frames.

    Returns a frame with the columns 'Index', 'Reference <col>',
    'Alignment <col>', 'Avg<col>' and 'Diff<col>' (reference minus alignment),
    holding one row per distinct reference position whose pair has no missing
    value, renumbered from 0.
    """
    reference_label = 'Reference ' + entity_dwt_apply
    alignment_label = 'Alignment ' + entity_dwt_apply

    # Synchronize: one row per step of the warping path.  The reference
    # position is kept in 'Index' so repeated positions can be dropped.
    paired_rows = [
        [step[0],
         df_reference[entity_dwt_apply].iloc[step[0]],
         df_alignment[entity_dwt_apply].iloc[step[1]]]
        for step in path_dwt
    ]
    df_sync = pd.DataFrame(data=paired_rows,
                           columns=['Index', reference_label, alignment_label])
    df_sync = df_sync.dropna()
    # Keep only the first pairing of every reference position.
    df_sync = df_sync.drop_duplicates(subset=['Index'])
    df_sync = df_sync.reset_index(drop=True)

    # Merge/align: mean of both eyes, then their difference.
    both_eyes = df_sync.loc[:, [reference_label, alignment_label]]
    df_sync['Avg' + entity_dwt_apply] = both_eyes.mean(axis=1)
    df_sync['Diff' + entity_dwt_apply] = df_sync[reference_label] - df_sync[alignment_label]

    return df_sync

In [5]:
# Synchronize the given 2 eyes' data.
def dtw_sync(left_eye_file_path, right_eye_file_path):
    """Synchronize both eyes' recordings with DTW, merge them and save the result.

    Parameters:
        left_eye_file_path  -- path of the left eye CSV file; its parent
                               folder name (the second-to-last '/'-separated
                               component) names the output folder.
        right_eye_file_path -- path of the right eye CSV file.

    Returns a frame with the columns 'Timestamp' and 'Event' (taken from the
    reference recording), 'Confidence' and 'Diameter' (mean of both eyes) and
    'DiffDiameter' (reference minus aligned eye).  The same frame is written
    to '../Data/PreprocessedData/<folder>/synchronized_<rate>Hz.csv'.
    """
    # Prepare for the synchronization.
    (path, df_reference, df_alignment,
     entity_dwt_apply_dia, entity_dwt_apply_conf,
     df_origin, SR_SYNC) = _pre_dtw_sync(left_eye_file_path=left_eye_file_path,
                                         right_eye_file_path=right_eye_file_path)

    # Synchronize, merge, and label data.
    df_sync_dia = _dtw_synchronize_merge_data(path_dwt=path,
                                              df_reference=df_reference,
                                              df_alignment=df_alignment,
                                              entity_dwt_apply=entity_dwt_apply_dia)
    df_sync_conf = _dtw_synchronize_merge_data(path_dwt=path,
                                               df_reference=df_reference,
                                               df_alignment=df_alignment,
                                               entity_dwt_apply=entity_dwt_apply_conf)

    # The helper drops rows with missing values and then renumbers its rows,
    # so its positional index can drift away from df_origin's rows.  Its
    # 'Index' column still holds the reference row position, so align on that
    # column instead; dropped rows then show up as NaN at the right timestamp.
    df_sync_dia = df_sync_dia.set_index('Index')
    df_sync_conf = df_sync_conf.set_index('Index')

    # Integrate into one dataframe.
    df_sync = pd.DataFrame()
    df_sync['Timestamp'] = df_origin['Timestamp']
    df_sync['Confidence'] = df_sync_conf['AvgConfidence']
    df_sync['Diameter'] = df_sync_dia['AvgDiameter']
    df_sync['Event'] = df_origin['Event']
    df_sync['DiffDiameter'] = df_sync_dia['DiffDiameter']

    # Output and save into a csv file.
    path_tail = left_eye_file_path.split('/')[-2:]
    folder_path = '../Data/PreprocessedData/' + path_tail[0] + '/'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)

    write_file_name = 'synchronized_' + str(SR_SYNC) + 'Hz.csv'
    df_sync.to_csv(folder_path + write_file_name)

    return df_sync

### Merge Two Eyes' Data using Timestamps, Interpolate, and Calculate the Difference
Reference: a blog introducing 2 sensors' data fusion - https://stackoverflow.com/questions/14079766/synchronize-dataset-multiple-users-multiple-timestamps

#### Configuration Part 

In [6]:
# Column labels read from the per-eye CSV files.
TS, CF, DM = 'Timestamp', 'Confidence', 'Diameter'

# Per-eye column labels of the synchronized frame.
LDM, RDM = 'Left Diameter', 'Right Diameter'
LCF, RCF = 'Left Confidence', 'Right Confidence'

# Labels for the columns derived from both eyes.
AVEDM, AVECF = 'Average Diameter', 'Average Confidence'
DIFFDM = 'Difference Diameter'  # By default: Left - Right, unit in pixels.

#### Implementation

In [7]:
def upsample_timestamps_sync(left_eye_file_path, right_eye_file_path):
    """Merge both eyes' recordings onto the union of their timestamps.

    Both CSV files are read and reduced to their timestamp, confidence and
    diameter columns.  The returned frame is indexed by every timestamp seen
    in either file (sorted) and has the columns LDM, RDM, LCF and RCF.
    Diameters missing at a timestamp are filled by an order-3 spline, missing
    confidences linearly; both fills run in both directions so leading and
    trailing gaps are filled as well.
    """
    # Load the recordings and keep only the columns used below.
    frame_left = pd.read_csv(left_eye_file_path)
    frame_right = pd.read_csv(right_eye_file_path)
    frame_left = frame_left[[TS, CF, DM]].copy()
    frame_right = frame_right[[TS, CF, DM]].copy()

    # One diameter series per eye, indexed by that eye's own timestamps.
    diameters_left = pd.Series(frame_left[DM].to_numpy(), index=frame_left[TS])
    diameters_right = pd.Series(frame_right[DM].to_numpy(), index=frame_right[TS])

    # Listing all timestamps of both eyes side by side is effectively an
    # up-sampling: each eye gets NaN wherever only the other eye has a sample.
    df_sync = pd.DataFrame([diameters_left, diameters_right]).T.sort_index()
    df_sync = df_sync.rename(columns={0: LDM, 1: RDM})

    # Fill the NaN diameters with the 'Spline 3' method, in both directions
    # so that the first and the last NaN values are filled too.
    df_sync = df_sync.interpolate(method='spline', order=3, limit_direction='both', axis=0)

    # Look up each eye's confidence by timestamp.
    # Reference: Adding values in new column based on indexes with pandas in python. https://stackoverflow.com/questions/45636105/adding-values-in-new-column-based-on-indexes-with-pandas-in-python
    timestamps = df_sync.index.to_series()
    df_sync[LCF] = timestamps.map(frame_left.set_index(TS)[CF])
    df_sync[RCF] = timestamps.map(frame_right.set_index(TS)[CF])

    # Fill the NaN confidences with the normal linear method.
    for confidence_label in (LCF, RCF):
        df_sync[confidence_label] = df_sync[confidence_label].interpolate(
            method='linear', limit_direction='both', axis=0)

    # Disabled: difference and average of the two eyes' diameter and confidence.
#     df_sync[AVEDM] = (df_sync[LDM] + df_sync[RDM]) / 2
#     df_sync[DIFFDM] = df_sync[LDM] - df_sync[RDM]
#     df_sync[AVECF] = (df_sync[LCF] + df_sync[RCF]) / 2

    return df_sync

In [8]:
# # Debug area
# left = "../Data/RawData4ML/VersionOctober/05-10-13-15-lowlux-nothing/left2D_89Hz.csv"
# right = "../Data/RawData4ML/VersionOctober/05-10-13-15-lowlux-nothing/right2D_75Hz.csv"
# df = upsample_timestamps_sync(left, right)

### Deblinks and Blinking Rate Extraction

Reference: Check how David Linderbaure's group deals with the blinks. They use confidence to identify blinks and removed the data within 200ms; however, they did not interpolate the eliminated samples. Here I clean the data within 200ms before and after each blink. The input is the "confidence" column, and the function marks which indices are blinks before interpolating over them.


#### Configuration Part

In [9]:
# Deblink.
BLINK_EXTENSION_TIMEWINDOW = 0.2  # 200ms
MIN_CF, MIN_NUM_SAMPLE_BLINKS = 0.25, 2

# Smooth.
WIN_TYPE, HANN_WINDOW_SIZE = 'hann', 5

# Interpolate.
CURVE_TYPE, CURVE_ORDER = 'spline', 3

#### Implementation

In [10]:
def _find_blink_runs(is_low, min_samples):
    """Return (start, length) for each run of low-confidence samples of at least min_samples.

    is_low is a boolean sequence, one entry per sample.  A run is only
    started at a position that is not the very last sample.
    """
    runs = []
    num_samples = len(is_low)
    pos = 0
    while pos < num_samples - 1:
        if not is_low[pos]:
            pos += 1
            continue
        end = pos + 1
        while end < num_samples and is_low[end]:
            end += 1
        if end - pos >= min_samples:
            runs.append((pos, end - pos))
        pos = end
    return runs


def _blink_mask(timestamps, runs, window):
    """Return a boolean mask covering every run plus the samples within window around it."""
    num_samples = len(timestamps)
    mask = np.zeros(num_samples, dtype=bool)
    for start, length in runs:
        stop = start + length  # First sample after the run; equals num_samples when the run reaches the end.
        mask[start:stop] = True

        # Walk backwards from the first blink sample until the time window is exceeded.
        k = start
        while k >= 0:
            if timestamps[start] - timestamps[k] >= window:
                break
            mask[k] = True
            k -= 1

        # Walk forwards from the first sample after the blink.  When the blink
        # runs up to the last sample there is nothing after it to extend into
        # (indexing that position used to raise IndexError).
        if stop < num_samples:
            k = stop
            while k < num_samples:
                if timestamps[k] - timestamps[stop] >= window:
                    break
                mask[k] = True
                k += 1
    return mask


def deblinks(df_input, confidence_column_label, diameter_column_label,
             min_confidence=None, min_blink_samples=None, extension_window=None,
             curve_type=None, curve_order=None):
    """Remove blinks from one diameter column and interpolate over the gaps.

    A blink is a run of at least min_blink_samples consecutive samples whose
    confidence is below min_confidence.  The diameters of the blink itself,
    and of the samples less than extension_window before its first sample and
    after the first sample that follows it, are discarded and then
    re-created by interpolation (in both directions, so gaps at either end of
    the recording are filled too).

    Parameters:
        df_input                -- frame indexed by timestamps; the index
                                   values are subtracted from each other and
                                   compared with extension_window.
        confidence_column_label -- column holding the confidence values.
        diameter_column_label   -- column holding the diameters to clean.
        min_confidence          -- confidence threshold; defaults to MIN_CF.
        min_blink_samples       -- minimum run length; defaults to
                                   MIN_NUM_SAMPLE_BLINKS.
        extension_window        -- margin around a blink, in index units;
                                   defaults to BLINK_EXTENSION_TIMEWINDOW.
        curve_type, curve_order -- passed to pandas' interpolate as method
                                   and order; default to CURVE_TYPE and
                                   CURVE_ORDER.

    Returns a new frame; df_input itself is left untouched.
    """
    # Anything not given explicitly falls back to the module-level configuration.
    if min_confidence is None:
        min_confidence = MIN_CF
    if min_blink_samples is None:
        min_blink_samples = MIN_NUM_SAMPLE_BLINKS
    if extension_window is None:
        extension_window = BLINK_EXTENSION_TIMEWINDOW
    if curve_type is None:
        curve_type = CURVE_TYPE
    if curve_order is None:
        curve_order = CURVE_ORDER

    # Work on positions rather than timestamp labels: it avoids chained
    # assignment (unreliable, and a no-op under pandas copy-on-write) and
    # keeps working when timestamps repeat.
    confidences = df_input[confidence_column_label].to_numpy()
    timestamps = df_input.index.to_numpy()

    # Identify the blinks according to the low confidence values.
    runs = _find_blink_runs(confidences < min_confidence, min_blink_samples)
    # Mark the blinks and the samples around them for re-interpolation.
    mask = _blink_mask(timestamps, runs, extension_window)

    df_output = df_input.copy()
    diameters = df_output[diameter_column_label].to_numpy(dtype=float, copy=True)
    diameters[mask] = np.nan
    df_output[diameter_column_label] = diameters

    # Smooth the data - [Suspended] - Since the objective of frequency-based analysis was to detect singularities, smoothing should not be included here.
    # Besides, in Marshall's patent, https://patentimages.storage.googleapis.com/91/2f/5f/236d6711dcf6b6/US6090051.pdf, smoothing was not applied.
#     df_output[diameter_column_label] = df_output[diameter_column_label].rolling(window=HANN_WINDOW_SIZE, center=True, win_type=WIN_TYPE).mean()

    # Interpolate the data - included in Marshall's patent.
    df_output[diameter_column_label] = df_output[diameter_column_label].interpolate(
        method=curve_type, order=curve_order, limit_direction='both', axis=0)

    return df_output

In [11]:
# # Debug area.
# # Example debug, to be deleted later.
# df_ = deblinks(df, LCF, LDM)

### [Suspended] Artefact Rejection
This part is suspended for better results of the singularity detection.
#### Configuration Part

In [12]:
# Window half-width, in samples, that rej_artifact passes to the Hampel filter.
HAMPLE_WIN_SIZE = 10

#### Implementation

In [13]:
## This part is directly cited from Sam's work.

# Filtering outliers 
# Lan et al. 2020 - median filter with sliding window of 10s
# Testing with numba optimised for-loop implementation of a Hampel Filter
# Note to self: I think this filter is also commonly used for pupil diameter filtering
@jit(nopython=True)
def hampel_filter_forloop_numba(input_series, window_size, n_sigmas=3):
    """Replace outliers with the local median (Hampel filter).

    For every position i from window_size up to (not including)
    n - window_size, the NaN-ignoring median x0 of the slice
    input_series[i - window_size : i + window_size] is computed, together
    with S0 = k * median(|slice - x0|).  When the sample differs from x0 by
    more than n_sigmas * S0 it is replaced by x0.

    Parameters:
        input_series -- sequence of samples; presumably a 1-D NumPy array
                        (rej_artifact passes the result of .to_numpy()).
        window_size  -- number of samples taken on the left of i; the slice
                        stops just before i + window_size.
        n_sigmas     -- threshold multiplier, 3 by default.

    Returns (new_series, indices): a filtered copy of the input and the list
    of positions that were replaced.  The first and last window_size samples
    are never examined.
    """
    
    n = len(input_series)
    new_series = input_series.copy()
    k = 1.4826 # scale factor for Gaussian distribution
    indices = []
    
    # NOTE(review): the slice excludes i + window_size, so the window holds
    # 2 * window_size samples and is not centred on i - confirm this is intended.
    for i in range((window_size),(n - window_size)):
        x0 = np.nanmedian(input_series[(i - window_size):(i + window_size)])
        S0 = k * np.nanmedian(np.abs(input_series[(i - window_size):(i + window_size)] - x0))
        if (np.abs(input_series[i] - x0) > n_sigmas * S0):
            new_series[i] = x0
            indices.append(i)
    
    return new_series, indices

In [14]:
def rej_artifact(df_input, diameter_column_label):
    """Return a copy of df_input whose diameter column went through the Hampel filter.

    The filter is run with a window size of HAMPLE_WIN_SIZE; the list of
    replaced positions it also returns is discarded.  df_input is not modified.
    """
    df_filtered = df_input.copy()
    raw_diameters = df_filtered[diameter_column_label].to_numpy()
    cleaned_diameters, _ = hampel_filter_forloop_numba(raw_diameters, HAMPLE_WIN_SIZE)
    df_filtered[diameter_column_label] = cleaned_diameters.tolist()
    return df_filtered

## Segmenting smaller windows

## Wavelet Coefficient Extraction

## Run in Batch

In [15]:
# Argument configuration.
CONF_2DMODE = '2D'
CONF_LEFT = 'left'
CONF_RIGHT = 'right'
raw_data_path = '../Data/RawData4ML/VersionOctober/' # In this case, the dirpath is mypath, dirnames contains sub-folders's names I need, and no filenames since there is no files there.
dir_features = '../Data/Results/'

# Create a dataframe to store results details. TODO: to be modified to align to short time windows. Columns - features; Rows - short segmentations.
df_features = pd.DataFrame(columns=['Filename', 'Max level value', 
                                   'IPA 2', 'IPA 3', 'IPA 4', 'IPA 5', 'IPA 6', 'IPA 7', 'IPA 8', 'IPA 9',
                                   'LHIPA 2', 'LHIPA 3', 'LHIPA 4', 'LHIPA 5', 'LHIPA 6', 'LHIPA 7', 'LHIPA 8', 'LHIPA 9'])  # Needs to be changed according to requirements.

In [16]:
# List the names of the directories directly under raw_data_path: the first
# tuple yielded by walk() describes the top level, so nothing deeper is read.
dir_path, dir_names, file_names = next(walk(raw_data_path), (None, [], []))
dirs_list = list(dir_names)

In [17]:
%%time
# Traverse all the file names in a given directory.
for dir_name in dirs_list:
    dir_path = raw_data_path + dir_name + '/'

    # Only files directly inside dir_path are candidates, because their paths
    # are built as dir_path + file_name.  The first tuple yielded by walk()
    # describes dir_path itself; deeper levels are deliberately ignored.
    _, _, top_level_files = next(walk(dir_path), (dir_path, [], []))
    file_names_list = list(top_level_files)

    # Find the targeted files.  The paths are reset for every directory so a
    # missing file can never be silently replaced by the previous directory's.
    file_path_left = None
    file_path_right = None
    for file_name in file_names_list:
        if CONF_2DMODE not in file_name:
            continue
        if CONF_LEFT in file_name:
            file_path_left = dir_path + file_name
        elif CONF_RIGHT in file_name:
            file_path_right = dir_path + file_name

    if file_path_left is None or file_path_right is None:
        raise FileNotFoundError(
            'Expected a left and a right ' + CONF_2DMODE + ' file in ' + dir_path)

    # Synchronize 2 eyes' data.
    df_sync = upsample_timestamps_sync(left_eye_file_path=file_path_left, right_eye_file_path=file_path_right)

    # Deblink, and interpolate.
    # Process the left eye data.
    df_deblink = deblinks(df_input=df_sync, confidence_column_label=LCF, diameter_column_label=LDM)
    # Process the right eye data.
    df_deblink = deblinks(df_input=df_deblink, confidence_column_label=RCF, diameter_column_label=RDM)

    # NOTE(review): this break stops the batch after the first directory -
    # presumably a debugging leftover; remove it to process every directory.
    break

CPU times: total: 672 ms
Wall time: 661 ms


In [18]:
# Display the frame left in df_deblink by the batch loop.
df_deblink

Unnamed: 0_level_0,Left Diameter,Right Diameter,Left Confidence,Right Confidence
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
352723.490801,43.808819,45.411758,0.562328,0.602150
352723.492894,43.873134,41.347263,0.575907,0.602150
352723.498762,44.005180,45.536383,0.589486,0.708128
352723.509503,43.845901,45.697473,0.551540,0.814106
352723.517701,44.138456,45.596012,0.547389,0.920084
...,...,...,...,...
352783.046935,44.030834,50.373090,0.671217,0.618062
352783.049092,44.351574,50.720688,0.592153,0.643356
352783.060591,43.223335,50.387328,0.513089,0.604240
352783.064396,44.410248,49.592499,0.467182,0.565125
