# Data Preprocessing
**By: M. Alwi Sukra**

Preprocess transformed data until prepared for machine learning algorithm

#### import useful libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import time
from collections import deque

#### get transformed data

In [2]:
timestamp = "1593487344"

In [3]:
df = pd.read_csv(f"transformed-datas/{timestamp}_preprocessed-data.csv", index_col=False)

In [4]:
df.head()

Unnamed: 0,frame,subject,class_label,rEar_norm,lEar_norm,mar_norm,perclos,microsleep_rate,yawning_rate
0,4484,1,0,0.582719,0.622472,0.216977,0.033683,0.0,0.0
1,4485,1,0,0.576412,0.601768,0.210735,0.033683,0.0,0.0
2,4486,1,0,0.498003,0.554422,0.201034,0.03346,0.0,0.0
3,4487,1,0,0.595975,0.58195,0.208411,0.033237,0.0,0.0
4,4488,1,0,0.549539,0.53491,0.200352,0.033237,0.0,0.0


In [5]:
df_identical = pd.read_csv(f"transformed-datas/{timestamp}_preprocessed-data-identical_fps.csv", index_col=False)

In [6]:
df_identical.head()

Unnamed: 0,frame,subject,class_label,rEar_norm,lEar_norm,mar_norm,perclos,microsleep_rate,yawning_rate
0,4321,1,0,0.535363,0.553641,0.098949,0.030556,0.0,0.0
1,4324,1,0,0.547988,0.562705,0.083339,0.030556,0.0,0.0
2,4327,1,0,0.576507,0.554961,0.112021,0.030556,0.0,0.0
3,4330,1,0,0.555828,0.533281,0.085486,0.030556,0.0,0.0
4,4333,1,0,0.535523,0.520933,0.084005,0.030556,0.0,0.0


## Balancing Data

Balance both original fps and identical fps data so that **for each subject has the same amount of data class 0 and class 1**

#### get fps info

In [7]:
FPS_PATH = "D:/datasets/ngantuk/fps_info.txt"

In [8]:
df_fps = pd.read_csv(FPS_PATH, names=['subject','class_label','fps'], delimiter=';', index_col=False)
df_fps.head()

Unnamed: 0,subject,class_label,fps
0,1,0,24.91
1,1,1,24.91
2,2,0,30.0
3,2,1,30.0
4,3,0,30.03


Get data **just for the first 2 minutes**

In [9]:
MINUTES_LENGTH = 2

#### normal data

In [10]:
dfs_balanced = []

In [11]:
for subject, df_subject in df.groupby('subject'):
    min_fps = min(df_fps.loc[(df_fps['subject'] == subject)]['fps'].values)
    window_size = int(min_fps * 60 * MINUTES_LENGTH) + 1
    for class_label, df_class in df_subject.groupby('class_label'):
        _df = pd.DataFrame(df_class).reset_index(drop=True)
        dfs_balanced.append(_df.iloc[:window_size])

In [12]:
df_balanced = pd.concat(dfs_balanced)

In [13]:
df_balanced.head()

Unnamed: 0,frame,subject,class_label,rEar_norm,lEar_norm,mar_norm,perclos,microsleep_rate,yawning_rate
0,4484,1,0,0.582719,0.622472,0.216977,0.033683,0.0,0.0
1,4485,1,0,0.576412,0.601768,0.210735,0.033683,0.0,0.0
2,4486,1,0,0.498003,0.554422,0.201034,0.03346,0.0,0.0
3,4487,1,0,0.595975,0.58195,0.208411,0.033237,0.0,0.0
4,4488,1,0,0.549539,0.53491,0.200352,0.033237,0.0,0.0


In [14]:
df_balanced.groupby('class_label').count()['frame']

class_label
0    28267
1    28267
Name: frame, dtype: int64

#### fps identical data

In [15]:
dfs_identical_balanced = []

In [16]:
FPS = 10
WINDOW_SIZE = int(FPS * 60 * MINUTES_LENGTH) + 1

In [17]:
for subject, df_subject in df_identical.groupby('subject'):
    for class_label, df_class in df_subject.groupby('class_label'):
        _df = pd.DataFrame(df_class).reset_index(drop=True)
        dfs_identical_balanced.append(_df.iloc[:WINDOW_SIZE])

In [18]:
df_identical_balanced = pd.concat(dfs_identical_balanced)

In [19]:
df_identical_balanced.head()

Unnamed: 0,frame,subject,class_label,rEar_norm,lEar_norm,mar_norm,perclos,microsleep_rate,yawning_rate
0,4321,1,0,0.535363,0.553641,0.098949,0.030556,0.0,0.0
1,4324,1,0,0.547988,0.562705,0.083339,0.030556,0.0,0.0
2,4327,1,0,0.576507,0.554961,0.112021,0.030556,0.0,0.0
3,4330,1,0,0.555828,0.533281,0.085486,0.030556,0.0,0.0
4,4333,1,0,0.535523,0.520933,0.084005,0.030556,0.0,0.0


In [20]:
df_identical_balanced.groupby('class_label').count()['frame']

class_label
0    11667
1    11918
Name: frame, dtype: int64

## Data Preparation for ANN

#### get data

In [21]:
df_ann = df_balanced[df_balanced.columns[2:]]

#### shuffle data

In [22]:
df_ann = df_ann.sample(frac=1).reset_index(drop=True)

In [23]:
df_ann.head()

Unnamed: 0,class_label,rEar_norm,lEar_norm,mar_norm,perclos,microsleep_rate,yawning_rate
0,1,0.530295,0.526667,0.081011,0.511378,0.0,0.0
1,0,0.497504,0.491098,0.189768,0.161481,0.005556,0.0
2,1,0.488087,0.532871,0.14563,0.211481,0.011111,0.0
3,1,0.470621,0.534121,0.045105,0.254,0.005556,0.0
4,0,0.611368,0.613629,0.071422,0.048925,0.0,0.0


#### seperate into x and y

In [24]:
def dnn_xy_separator(data, start_idx = 1, end_idx = 8):
    X = []
    y = []
    for row in data:
        X.append(row[start_idx:end_idx])
        y.append(int(row[0]))
    return np.array(X), np.array(y)

In [25]:
%%time
# all feature
X_all, y_all = dnn_xy_separator(data=df_ann.values)
# only base feature
X_base, y_base = dnn_xy_separator(data=df_ann.values, end_idx=4)
# only aggregate feature
X_agg, y_agg = dnn_xy_separator(data=df_ann.values, start_idx=4)

Wall time: 205 ms


#### data spotlight

In [26]:
print(X_all[20],'\n',X_base[20],'\n',X_agg[20])

[0.64163624 0.59831496 0.06980418 0.28944444 0.03888889 0.        ] 
 [0.64163624 0.59831496 0.06980418] 
 [0.28944444 0.03888889 0.        ]


In [27]:
print(y_all[20],'\n',y_base[20],'\n',y_agg[20])

1 
 1 
 1


#### save data

In [28]:
prefix='datasets/dnn'
# all feature
np.save(file=f'{prefix}-all-X', arr=X_all)
np.save(file=f'{prefix}-all-y', arr=y_all)
# only base feature
np.save(file=f'{prefix}-base-X', arr=X_base)
np.save(file=f'{prefix}-base-y', arr=y_base)
# only aggregate feature
np.save(file=f'{prefix}-agg-X', arr=X_agg)
np.save(file=f'{prefix}-agg-y', arr=y_agg)

## Data Preparation for LSTM

**Data for LSTM is difference with data for ANN**. ANN required one frame for input at a time, while **LSTM requuired sequential collection of frame** for an input at a time. So first we need to sequentialize the data. the **window of sequentiial data is one minutes**

In [29]:
MINUTES_LENGTH = 1
FPS = 10
SEQ_LEN = int(FPS * 60 * MINUTES_LENGTH)

#### defince function to make a sequencial window

In [30]:
def sequencialize(df, seq_len, label):
    sequential_data = []
    prev = deque(maxlen=seq_len)
    for row in df.values:
        prev.append([data for data in row[3:]])
        if len(prev) == seq_len:
            sequential_data.append([np.array(prev),label])
    return sequential_data

#### get data

In [31]:
seq_datas = []

In [32]:
%%time
for subject, df_subject in df_identical_balanced.groupby('subject'):
    for class_label, df_class in df_subject.groupby('class_label'):
        _df = pd.DataFrame(df_class)
        seq_data = sequencialize(_df, SEQ_LEN, class_label)
        seq_datas.extend(seq_data)

Wall time: 4.1 s


#### shuffle data

In [33]:
random.shuffle(seq_datas)

#### seperate into x and y

In [34]:
def lstm_xy_separator(data, start_idx = 0, end_idx = 7):
    X = []
    y = []
    for seq, label in data:
        if(start_idx == 0 and end_idx == 7):
            X.append(seq)
        else:
            X.append([attr[start_idx:end_idx] for attr in seq])
        y.append(int(label))
    return np.array(X), np.array(y)

In [35]:
%%time
# all feature
X_all, y_all = lstm_xy_separator(data=seq_datas)
# only base feature
X_base, y_base = lstm_xy_separator(data=seq_datas, end_idx=3)
# only aggregate feature
X_agg, y_agg = lstm_xy_separator(data=seq_datas, start_idx=3)

Wall time: 10.8 s


#### data spotlight

In [36]:
print(X_all[100][:5],'\n',X_base[100][:5],'\n',X_agg[100][:5])

[[0.52601531 0.5141006  0.11373574 0.25277778 0.02222222 0.        ]
 [0.53527573 0.52016758 0.10531359 0.25277778 0.02222222 0.        ]
 [0.54453615 0.52623457 0.09689145 0.25277778 0.02222222 0.        ]
 [0.55379657 0.53230156 0.0884693  0.25277778 0.02222222 0.        ]
 [0.56383134 0.55940122 0.0505471  0.25277778 0.02222222 0.        ]] 
 [[0.52601531 0.5141006  0.11373574]
 [0.53527573 0.52016758 0.10531359]
 [0.54453615 0.52623457 0.09689145]
 [0.55379657 0.53230156 0.0884693 ]
 [0.56383134 0.55940122 0.0505471 ]] 
 [[0.25277778 0.02222222 0.        ]
 [0.25277778 0.02222222 0.        ]
 [0.25277778 0.02222222 0.        ]
 [0.25277778 0.02222222 0.        ]
 [0.25277778 0.02222222 0.        ]]


In [37]:
print(y_all[100],'\n',y_base[100],'\n',y_agg[100])

1 
 1 
 1


#### save data

In [38]:
prefix='datasets/lstm'
# all feature
np.save(file=f'{prefix}-all-X', arr=X_all)
np.save(file=f'{prefix}-all-y', arr=y_all)
# only base feature
np.save(file=f'{prefix}-base-X', arr=X_base)
np.save(file=f'{prefix}-base-y', arr=y_base)
# only aggregate feature
np.save(file=f'{prefix}-agg-X', arr=X_agg)
np.save(file=f'{prefix}-agg-y', arr=y_agg)