Import useful modules

In [1]:
import os
import time
import scipy
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# custom module from ./mymodule.py
from mymodule import Preprocess

Declare the paths of the resources that will be used

In [2]:
# input
# Root folder of the raw UTA-RLDD data. The original location stays the default,
# but it can be overridden with the UTA_RLDD_ROOT environment variable so the
# notebook is not tied to one machine's drive layout.
_DEFAULT_DATASET_ROOT = "D:/datasets/UTA-RLDD/"
# Normalise to exactly one trailing "/" because the paths below (and the
# processing loop) are built by plain string concatenation.
_dataset_root = (os.environ.get("UTA_RLDD_ROOT") or _DEFAULT_DATASET_ROOT).rstrip("/\\") + "/"
DATASET_PATH = _dataset_root + "csv/"          # folder holding the per-subject CSV files
FPS_INFO_PATH = _dataset_root + "fps/fps.txt"  # ';'-separated table: subject;class;fps
# output
# One .npy file per subject is written into each of these folders.
BASE_PATH = "dataset/base/"
BASE_DERIVED_PATH = "dataset/base_derived/"
SEQ_BASE_PATH = "dataset/seq_base/"
SEQ_BASE_DERIVED_PATH = "dataset/seq_base_derived/"

Declare constants

In [3]:
# Window length in minutes; used both for calculating the accumulative
# features and as the length of each sequential window.
MINUTES_LENGTH = 3

# Eye / mouth thresholds handed to Preprocess.config.
EYE_THRESHOLD = 0.5
MOUTH_THRESHOLD = 0.5

# A subject's CSV is dropped when its share of NaN values is larger than this.
RATIO_THRESHOLD = 0.1

# Durations (in seconds) handed to Preprocess.config.
BTB_MICROSLEEP_SECOND = 1
BTB_YAWNING_SECOND = 6

# Target frame rate passed to Preprocess.change_fps.
NEW_FPS = 3

# Number of samples in one sequential window: minutes * 60 seconds * frames per second.
SEQ_LEN = int(MINUTES_LENGTH * 60 * NEW_FPS)

Configure the preprocessing module

In [4]:
# Pass the thresholds and durations declared above to the Preprocess helper.
Preprocess.config(
    EYE_THRESHOLD,
    MOUTH_THRESHOLD,
    BTB_MICROSLEEP_SECOND,
    BTB_YAWNING_SECOND,
)

Get FPS information for each video

In [5]:
# Read the per-video FPS table; column names are supplied explicitly and the
# "class" column is cast to int right after loading.
df_fps = pd.read_csv(
    FPS_INFO_PATH,
    delimiter=';',
    names=['subject', 'class', 'fps'],
    index_col=False,
).astype({"class": int})
# Remap the class labels with the project helper.
# NOTE(review): the meaning of the second argument (False) is defined in mymodule — confirm there.
df_fps = Preprocess.changeClassLabel(df_fps, False)
df_fps.head()

Unnamed: 0,subject,class,fps
0,1,0,29.9689
1,1,2,29.9689
2,1,1,29.9689
3,2,0,29.9689
4,2,2,29.9689


Loop through the CSV dataset folder and, for each subject, create four types of data (base, base derived, sequential base, sequential base derived)

In [6]:
%%time
for filename in os.listdir(DATASET_PATH):
    if filename.endswith('.csv'):
        subject = int(filename[:2])
        filepath = DATASET_PATH + filename
        
        # create dataframe from csv
        df = pd.read_csv(filepath, delimiter=';', names=['subject','class','frame','rEar','lEar','mar'])
        df = Preprocess.changeClassLabel(df)
        
        # pass or failed the NaN ratio test
        if not Preprocess.isSufficient(df, RATIO_THRESHOLD):
            # failed, there is more than 10 percent NaN values, continue to next failed
            print(f"processing failed for {filename}, there is more than 10% NaN values exists, continue to next file")
            continue
            
        # fill the NaN value and transform data
        df.fillna(method="ffill", inplace=True)
        df.fillna(method="bfill", inplace=True)
        df = Preprocess.transform(df)
        
        # data containers
        container_base = []
        container_seq_base = []
        container_base_derived = []
        container_seq_base_derived = []
        
        # Loops through eaach class
        groups = df.groupby("class")
        for idx_class in range(3):
            # DataFrame for each class
            df_class = groups.get_group(idx_class).set_index('frame')
            # get fps for this subject and class
            fps = df_fps.loc[(df_fps["class"] == idx_class) & (df_fps["subject"] == subject)].fps.values[0]
            # lowering fps, so that sequential data doesnt have so much data to save and process
            df_class = Preprocess.change_fps(df_class, fps, NEW_FPS)
            
            # Save Base Data
            container_base.append(df_class.iloc[:,2:].values)
             
            # Make sequential and save Seq Base Data
            container_seq_base.append(Preprocess.sequencialize(df_class, SEQ_LEN))
            
            # perform feature engineering
            df_class = Preprocess.feature(df_class, NEW_FPS, MINUTES_LENGTH)
             
            #Save Base Derived Data
            container_base_derived.append(df_class.iloc[:,2:].values)
            
            #Make sequential and save Seq Base Derived Data
            container_seq_base_derived.append(Preprocess.sequencialize(df_class, SEQ_LEN))
            
        np.save(file=f'{BASE_PATH}{filename[:2]}', arr=Preprocess.balancing_labelling(container_base))
        np.save(file=f'{BASE_DERIVED_PATH}{filename[:2]}', arr=Preprocess.balancing_labelling(container_base_derived))
        np.save(file=f'{SEQ_BASE_PATH}{filename[:2]}', arr=Preprocess.balancing_labelling(container_seq_base))
        np.save(file=f'{SEQ_BASE_DERIVED_PATH}{filename[:2]}', arr=Preprocess.balancing_labelling(container_seq_base_derived))
        
        print(f"processing finished for {filename}")

processing finished for 01.csv
processing finished for 02.csv
processing finished for 03.csv
processing finished for 04.csv
processing finished for 05.csv
processing finished for 06.csv
processing finished for 07.csv
processing finished for 08.csv
processing finished for 09.csv
processing finished for 10.csv
processing failed for 11.csv, there is more than 10% NaN values exists, continue to next file
processing finished for 12.csv
processing finished for 13.csv
processing finished for 14.csv
processing finished for 15.csv
processing finished for 16.csv
processing finished for 17.csv
processing finished for 18.csv
processing finished for 19.csv
processing finished for 20.csv
processing finished for 21.csv
processing finished for 22.csv
processing failed for 23.csv, there is more than 10% NaN values exists, continue to next file
processing finished for 24.csv
processing finished for 25.csv
processing finished for 26.csv
processing finished for 27.csv
processing finished for 28.csv
proces