# 2. Processing action units

In [27]:
import pandas as pd
import pickle

# need to enable iterative imputer explicitly since its still experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Loading the file 

The file has 3174403 rows and 41 columns. 

In [28]:
filepath = "/Users/dionnespaltman/Desktop/V5/clean_actionunits.csv"
clean_data = pd.read_csv(filepath, sep=',')

if 'Unnamed: 0' in clean_data.columns:
    clean_data.drop(columns=['Unnamed: 0'], inplace=True)

display(clean_data)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3174398,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174399,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174400,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174401,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


In [29]:
print(clean_data.columns)

Index(['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r',
       'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r',
       'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r',
       'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c',
       'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c',
       'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage'],
      dtype='object')


When I made this notebook, I still thought I would get more data. However, we have to work with only the data I already received. This is data from 184 unique participants (missing IDs from 147 to 287). 

In [30]:
print(len(clean_data['ID'].unique()))

184


Here I check information from what stages is available. At first, I wanted to use the recordings from the waiting room to add them as a feature. However, this is not possible, since there is so little data from these timepoints. So I am only going to use data from stage 4, 5 and 6 which is when the participants are in the donation chair. 

In [31]:
# Count occurrences of each value in the "Stage" column
stage_counts = clean_data['Stage'].value_counts()

print(stage_counts)

[4, 5, 6]       2118735
[3, 4, 5, 6]     285556
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Stage, dtype: int64


# Missing values 

There are quite some missing values. This is possible when participants look away from the camera. Then naturally OpenFace wasn't able to get any information from their faces. 

In [32]:
nan_counts = clean_data.notna().sum()
print(nan_counts)

Frame         3174403
Face_id       3174403
Confidence    3174403
Success       3174403
AU01_r        2931009
AU02_r        2931009
AU04_r        2931009
AU05_r        2931009
AU06_r        2931009
AU07_r        2931009
AU09_r        2931009
AU10_r        2931009
AU12_r        2931009
AU14_r        2931009
AU15_r        2931009
AU17_r        2931009
AU20_r        2931009
AU23_r        2931009
AU25_r        2931009
AU26_r        2931009
AU45_r        2931009
AU01_c        2931009
AU02_c        2931009
AU04_c        2931009
AU05_c        2931009
AU06_c        2931009
AU07_c        2931009
AU09_c        2931009
AU10_c        2931009
AU12_c        2931009
AU14_c        2931009
AU15_c        2931009
AU17_c        2931009
AU20_c        2931009
AU23_c        2931009
AU25_c        2931009
AU26_c        2931009
AU28_c        2931009
AU45_c        2931009
ID            3174403
Stage         3174403
dtype: int64


In [33]:
nan_counts_au_df = clean_data.isna().sum()
print(nan_counts_au_df)

Frame              0
Face_id            0
Confidence         0
Success            0
AU01_r        243394
AU02_r        243394
AU04_r        243394
AU05_r        243394
AU06_r        243394
AU07_r        243394
AU09_r        243394
AU10_r        243394
AU12_r        243394
AU14_r        243394
AU15_r        243394
AU17_r        243394
AU20_r        243394
AU23_r        243394
AU25_r        243394
AU26_r        243394
AU45_r        243394
AU01_c        243394
AU02_c        243394
AU04_c        243394
AU05_c        243394
AU06_c        243394
AU07_c        243394
AU09_c        243394
AU10_c        243394
AU12_c        243394
AU14_c        243394
AU15_c        243394
AU17_c        243394
AU20_c        243394
AU23_c        243394
AU25_c        243394
AU26_c        243394
AU28_c        243394
AU45_c        243394
ID                 0
Stage              0
dtype: int64


In [34]:
print(243394/3174403*100)

7.667394467558151


# Dealing with missing values - MICE

Because we're missing about 7 percent of the action units, it would not be smart to simply delete these rows. We will use MICE to deal with the missing values. 
Link: https://www.machinelearningplus.com/machine-learning/mice-imputation/?utm_content=cmp-true 

In [35]:
# Define imputer
imputer = IterativeImputer(random_state=100, max_iter=10)

In [39]:
# Use Numeric Features
data = clean_data.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']]
data

Unnamed: 0,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.31,0.59,0.00,0.0,0.31,0.31,0.23,0.21,0.62,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,0.12,0.00,0.0,0.32,0.06,0.10,0.60,0.50,0.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.26,1.67,0.00,0.0,0.39,0.26,0.00,1.22,0.64,0.18,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3174398,0.62,0.36,0.30,0.0,0.70,0.47,0.00,0.06,1.54,1.10,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3174399,0.63,0.38,0.25,0.0,0.78,0.56,0.00,0.04,1.45,1.07,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3174400,0.69,0.51,0.26,0.0,0.79,0.56,0.00,0.00,1.43,1.04,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3174401,0.67,0.46,0.32,0.0,0.81,0.56,0.00,0.02,1.43,1.16,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


Running everything up until the imputer was very fast. Running the imputer took around 6 minutes. 

In [40]:
# fit on the dataset
imputer.fit(data)

In [18]:
action_units_imputed = imputer.transform(data)

In [19]:
# Replace with imputed values
clean_data.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 
                     'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 
                     'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 
                     'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']] = action_units_imputed
clean_data

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
103,104,0.0,0.88,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
104,105,0.0,0.98,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
111,112,0.0,0.98,1.0,0.31,0.59,0.0,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
114,115,0.0,0.98,1.0,0.0,0.12,0.0,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
153,154,0.0,0.98,1.0,1.26,1.67,0.0,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"


In [20]:
num_unique_ids = clean_data['ID'].nunique()
print(num_unique_ids)

184


Then we will double check if everything went correctly. We should not have any missing values anymore. This is important to be able to implement TS Fresh. 

In [21]:
nan_counts_au_df = clean_data.isna().sum()
print(nan_counts_au_df)

Frame         0
Face_id       0
Confidence    0
Success       0
AU01_r        0
AU02_r        0
AU04_r        0
AU05_r        0
AU06_r        0
AU07_r        0
AU09_r        0
AU10_r        0
AU12_r        0
AU14_r        0
AU15_r        0
AU17_r        0
AU20_r        0
AU23_r        0
AU25_r        0
AU26_r        0
AU45_r        0
AU01_c        0
AU02_c        0
AU04_c        0
AU05_c        0
AU06_c        0
AU07_c        0
AU09_c        0
AU10_c        0
AU12_c        0
AU14_c        0
AU15_c        0
AU17_c        0
AU20_c        0
AU23_c        0
AU25_c        0
AU26_c        0
AU28_c        0
AU45_c        0
ID            0
Stage         0
dtype: int64


# Saving the action units file with the imputed values 

In [22]:
clean_data.to_csv("/Users/dionnespaltman/Desktop/V5/imputed_data.csv", sep=',')

# Loading action units file with the imputed values 

In [24]:
imputed_df = pd.read_csv("/Users/dionnespaltman/Desktop/V5/imputed_data.csv")

display(imputed_df)

# Creating file with just Stage = [4, 5, 6]

In [27]:
# Count occurrences of Stage values in action_units DataFrame
stage_counts = imputed_df['Stage'].value_counts()
print(stage_counts)

[4, 5, 6]       2118735
[3, 4, 5, 6]     285556
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Stage, dtype: int64


In [28]:
imputed_df_456 = imputed_df[imputed_df['Stage'].apply(lambda x: set(x) == set([4, 5, 6]))]

num_unique_ids = imputed_df_456['ID'].nunique()
print(num_unique_ids)

display(imputed_df_456)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
103,104,0.0,0.88,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
104,105,0.0,0.98,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
111,112,0.0,0.98,1.0,0.31,0.59,0.00,0.00,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
114,115,0.0,0.98,1.0,0.00,0.12,0.00,0.00,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
153,154,0.0,0.98,1.0,1.26,1.67,0.00,0.00,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3181518,21069.0,0.0,0.93,1.0,2.08,2.22,0.89,0.16,0.22,0.47,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]"
3181519,21070.0,0.0,0.93,1.0,2.03,2.14,0.87,0.69,0.27,0.85,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]"
3181520,21071.0,0.0,0.93,1.0,2.02,2.11,1.12,0.49,0.47,0.67,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]"
3181521,21072.0,0.0,0.93,1.0,2.43,2.01,1.11,0.65,0.35,0.82,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]"


Okay, so we had data from 184 participants. But we only have information from stages 4, 5 and 6 from 104 IDs. 

In [31]:
print(imputed_df_456.shape)

(2118735, 41)


# Descriptives frames

In [32]:
# Group by 'ID' and count the number of frames per ID
frame_counts = imputed_df_456.groupby('ID').size()

# Calculate the minimum, maximum, and average number of frames per ID
min_frames_per_id = frame_counts.min()
max_frames_per_id = frame_counts.max()
average_frames_per_id = frame_counts.mean()

print("Minimum number of frames per ID:", min_frames_per_id)
print("Maximum number of frames per ID:", max_frames_per_id)
print("Average number of frames per ID:", average_frames_per_id)


Minimum number of frames per ID: 8
Maximum number of frames per ID: 37833
Average number of frames per ID: 20372.451923076922


In [33]:
# Group by 'ID' and count the number of frames per ID
frame_counts = imputed_df_456.groupby('ID').size()

# Convert the frame counts Series to a list and sort it in ascending order
sorted_frame_counts = sorted(frame_counts.tolist())

print("Counts of frames per ID sorted in ascending order:", sorted_frame_counts)


Counts of frames per ID sorted in ascending order: [8, 22, 91, 196, 798, 1172, 1233, 1421, 2068, 9114, 9686, 10702, 14706, 15372, 15429, 15478, 15747, 16382, 16407, 16446, 16562, 16662, 16744, 16872, 17407, 17413, 17458, 17500, 17611, 17666, 17718, 17975, 18239, 18291, 18346, 18432, 18567, 18689, 18703, 19148, 19300, 19544, 19573, 19672, 19857, 19884, 19998, 20219, 20261, 20330, 20695, 21019, 21043, 21189, 21339, 21777, 22020, 22137, 22145, 22376, 22435, 22592, 22879, 22898, 22963, 22997, 23031, 23044, 23099, 23127, 23572, 23807, 23892, 23906, 24287, 24338, 24662, 24744, 24917, 25133, 25270, 25897, 26036, 26107, 27205, 27518, 27661, 27804, 27874, 28201, 28324, 28716, 29140, 29523, 29700, 30361, 31039, 31310, 31749, 31842, 33323, 35252, 35868, 37833]


# TS Fresh requirements

Now we'll start to process the action units data using TS Fresh. First we import TS Fresh. 

In [34]:
# Necessary imports
!pip install tsfresh

import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.preprocessing import StandardScaler
from tsfresh.feature_extraction import ComprehensiveFCParameters



# Appling TS Fresh on action units - only on intensity!

In [35]:
print(list(imputed_df_456.columns))

['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage']


In [36]:
# au_long_format = pd.melt(imputed_df_456, id_vars=['ID', 'Frame'],
#                                    var_name='Measurement', value_name='Value',
#                                    value_vars=['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 
#                                                'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 
#                                                'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 
#                                                'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 
#                                                'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c'])

au_long_format = pd.melt(imputed_df_456, id_vars=['ID', 'Frame'],
                                   var_name='Measurement', value_name='Value',
                                   value_vars=['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 
                                               'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 
                                               'AU25_r', 'AU26_r', 'AU45_r'])

In [37]:
display(au_long_format)

Unnamed: 0,ID,Frame,Measurement,Value
0,80,104,AU01_r,0.00
1,80,105,AU01_r,0.00
2,80,112,AU01_r,0.31
3,80,115,AU01_r,0.00
4,80,154,AU01_r,1.26
...,...,...,...,...
36018490,27,21069.0,AU45_r,1.50
36018491,27,21070.0,AU45_r,1.23
36018492,27,21071.0,AU45_r,1.43
36018493,27,21072.0,AU45_r,1.23


In [38]:
num_unique_ids = au_long_format['ID'].nunique()
print(num_unique_ids)

104


In [40]:
nan_counts = au_long_format.isna().sum()
print(nan_counts)

ID             0
Frame          0
Measurement    0
Value          0
dtype: int64


In [41]:
# Convert 'ID' and 'Frame' columns to integer data type
au_long_format['ID'] = au_long_format['ID'].astype(int)
au_long_format['Frame'] = au_long_format['Frame'].astype(int)

# # Convert 'Measurement' column to string data type
au_long_format['Measurement'] = au_long_format['Measurement'].astype(str)

In [42]:
column_types = au_long_format.dtypes
print(column_types)

ID               int64
Frame            int64
Measurement     object
Value          float64
dtype: object


We will use the following six intensity level characteristics: sum, variance, standard deviation, maximum, mean, and mean-root square values. 
More information about the settings in TS Fresh can be found here: https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.feature_calculators.sum_values 

In [43]:
# Specificeer de gewenste kenmerken
settings = {
    'sum_values': {},
    'variance': {},
    'standard_deviation': {},
    'maximum': {},
    'mean': {},
    'root_mean_square': {}
}

# Extract features met aangepaste instellingen
extracted_features_au = extract_features(
    au_long_format,
    column_id='ID',  # Identifies the time series
    column_sort='Frame',  # Orders the time points
    column_kind='Measurement',  # Distinguishes between different variables if necessary
    column_value='Value',  # The actual measurements
    default_fc_parameters=settings
)

Feature Extraction: 100%|██████████| 20/20 [00:13<00:00,  1.43it/s]


In [44]:
num_unique_ids = extracted_features_au.index.nunique()
extracted_features_au['ID'] = extracted_features_au.index

print(num_unique_ids)

display(extracted_features_au.head(5))

104


Unnamed: 0,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,AU01_r__mean,AU01_r__root_mean_square,AU02_r__sum_values,AU02_r__variance,AU02_r__standard_deviation,AU02_r__maximum,...,AU26_r__maximum,AU26_r__mean,AU26_r__root_mean_square,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__mean,AU45_r__root_mean_square,ID
23,4982.48,0.425041,0.651952,5.0,0.338806,0.734732,2244.43,0.164346,0.405396,3.71,...,5.0,0.633284,0.955509,9231.74,0.825039,0.908316,4.91,0.627753,1.104135,23
24,9390.23,0.448366,0.669601,5.0,0.345166,0.753329,5122.35,0.277836,0.527102,5.0,...,5.0,1.484701,2.045023,11887.0,0.634554,0.796589,5.0,0.436942,0.908555,24
25,6954.35,0.599805,0.774471,4.53,0.424512,0.883185,2192.4,0.120917,0.347731,3.25,...,5.0,0.862301,1.317818,9020.78,0.750701,0.86643,4.04,0.550652,1.026605,25
26,9707.43,0.87328,0.934495,4.73,0.547885,1.083263,2641.0,0.150057,0.387372,3.25,...,5.0,0.552359,0.876652,6585.31,0.609348,0.780607,4.9,0.371673,0.864574,26
27,21049.9,1.475421,1.214669,5.99,1.000328,1.573555,16193.17,1.597831,1.264053,5.61,...,6.69,0.142027,1.377048,23027.73,1.160635,1.077328,5.04,1.094318,1.535632,27


# Saving the extracted featues to csv (not standardized)

In [45]:
extracted_features_au.to_csv('/Users/dionnespaltman/Desktop/V4/extracted_features.csv', sep=',')

In [46]:
display(extracted_features_au)

Unnamed: 0,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,AU01_r__mean,AU01_r__root_mean_square,AU02_r__sum_values,AU02_r__variance,AU02_r__standard_deviation,AU02_r__maximum,...,AU26_r__maximum,AU26_r__mean,AU26_r__root_mean_square,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__mean,AU45_r__root_mean_square,ID
23,4982.48,0.425041,0.651952,5.00,0.338806,0.734732,2244.43,0.164346,0.405396,3.71,...,5.00,0.633284,0.955509,9231.74,0.825039,0.908316,4.91,0.627753,1.104135,23
24,9390.23,0.448366,0.669601,5.00,0.345166,0.753329,5122.35,0.277836,0.527102,5.00,...,5.00,1.484701,2.045023,11887.00,0.634554,0.796589,5.00,0.436942,0.908555,24
25,6954.35,0.599805,0.774471,4.53,0.424512,0.883185,2192.40,0.120917,0.347731,3.25,...,5.00,0.862301,1.317818,9020.78,0.750701,0.866430,4.04,0.550652,1.026605,25
26,9707.43,0.873280,0.934495,4.73,0.547885,1.083263,2641.00,0.150057,0.387372,3.25,...,5.00,0.552359,0.876652,6585.31,0.609348,0.780607,4.90,0.371673,0.864574,26
27,21049.90,1.475421,1.214669,5.99,1.000328,1.573555,16193.17,1.597831,1.264053,5.61,...,6.69,0.142027,1.377048,23027.73,1.160635,1.077328,5.04,1.094318,1.535632,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,17123.56,1.294928,1.137949,5.00,0.661218,1.316107,7521.49,0.472388,0.687305,4.99,...,5.00,0.671385,1.044762,15141.36,0.816148,0.903409,4.48,0.584676,1.076101,140
142,11750.48,0.739290,0.859820,5.00,0.509915,0.999652,4726.74,0.271937,0.521476,5.00,...,5.00,0.448007,0.692926,10547.03,0.506052,0.711374,3.53,0.457691,0.845892,142
144,7175.00,0.461053,0.679009,4.97,0.383628,0.779887,3864.99,0.282831,0.531819,4.54,...,5.00,0.551210,0.832457,7319.31,0.418456,0.646882,3.78,0.391344,0.756047,144
145,11354.47,0.516678,0.718803,5.00,0.362647,0.805103,7260.07,0.303435,0.550850,4.93,...,4.89,0.641290,0.913377,16768.57,0.748187,0.864978,4.31,0.535566,1.017358,145


# Standardize 


In [162]:
# Get the unique IDs from the index
unique_ids = extracted_features_au.index.unique().tolist()

# Print the list of unique IDs
print("List of unique IDs:")
print(unique_ids)


List of unique IDs:
[23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 85, 87, 88, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 106, 107, 108, 111, 112, 113, 114, 115, 117, 118, 119, 120, 121, 122, 123, 124, 125, 127, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 142, 144, 145, 146]


In [165]:
from sklearn.preprocessing import StandardScaler

# Exclude the 'ID' column from scaling
columns_to_scale = extracted_features_au.columns.drop('ID')

# Extract the 'ID' column
id_column = list(extracted_features_au['ID'])
# print(id_column)

# Scale the data
scaler = StandardScaler()
scaler.fit(extracted_features_au[columns_to_scale])
scaled_data = scaler.transform(extracted_features_au[columns_to_scale])

# Create a DataFrame with the scaled data
extracted_features_au_standardized = pd.DataFrame(scaled_data, columns=columns_to_scale)
display(extracted_features_au_standardized)

# Add the 'ID' column back to the DataFrame
extracted_features_au_standardized['ID'] = id_column

# Display the standardized DataFrame
display(extracted_features_au_standardized)


Unnamed: 0,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,AU01_r__mean,AU01_r__root_mean_square,AU02_r__sum_values,AU02_r__variance,AU02_r__standard_deviation,AU02_r__maximum,...,AU26_r__standard_deviation,AU26_r__maximum,AU26_r__mean,AU26_r__root_mean_square,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__mean,AU45_r__root_mean_square
0,-0.831420,-0.293678,-0.056030,0.514376,-0.487054,-0.246817,-0.774358,-0.309942,-0.164329,-0.013710,...,-0.289518,0.458331,-0.574983,-0.554028,-0.333560,0.221977,0.345618,0.548923,0.603537,0.429001
1,0.145470,-0.226965,0.005772,0.514376,-0.452632,-0.185153,0.433448,0.148409,0.398905,0.898596,...,1.793425,0.458331,2.294469,2.317495,0.203487,-0.260024,-0.030426,0.619766,-0.659902,-0.281599
2,-0.394394,0.206189,0.372988,0.177477,-0.023172,0.245411,-0.796194,-0.485337,-0.431191,-0.339028,...,0.557817,0.458331,0.196854,0.400873,-0.376228,0.033873,0.204639,-0.135892,0.093018,0.147314
3,0.215771,0.988392,0.933333,0.320838,0.644587,0.908811,-0.607925,-0.367648,-0.247737,-0.339028,...,-0.394321,0.458331,-0.847716,-0.761866,-0.868821,-0.323805,-0.084215,0.541052,-1.092069,-0.441394
4,2.729605,2.710656,1.914401,1.224014,3.093433,2.534479,5.079650,5.479450,3.809395,1.329997,...,1.682971,1.863336,-2.230621,0.556978,2.456784,1.071168,0.914462,0.651252,3.692845,1.996765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,1.859409,2.194404,1.645755,0.514376,1.258001,1.680854,1.440319,0.934144,1.140301,0.891524,...,-0.033310,0.458331,-0.446575,-0.318795,0.861706,0.199480,0.329100,0.210451,0.318307,0.327148
100,0.668573,0.605149,0.671849,0.514376,0.439073,0.631581,0.267418,0.124587,0.372873,0.898596,...,-0.853016,0.458331,-1.199405,-1.246095,-0.067533,-0.585183,-0.317237,-0.537335,-0.522513,-0.509273
101,-0.345492,-0.190675,0.038715,0.492872,-0.244453,-0.097095,-0.094241,0.168584,0.420737,0.573278,...,-0.565965,0.458331,-0.851589,-0.878346,-0.720364,-0.806836,-0.534298,-0.340549,-0.961821,-0.835709
102,0.580805,-0.031575,0.178060,0.514376,-0.358016,-0.013487,1.330607,0.251797,0.508808,0.849091,...,-0.485857,0.366881,-0.548002,-0.665071,1.190821,0.027513,0.199753,0.076637,-0.006872,0.113717


Unnamed: 0,AU01_r__sum_values,AU01_r__variance,AU01_r__standard_deviation,AU01_r__maximum,AU01_r__mean,AU01_r__root_mean_square,AU02_r__sum_values,AU02_r__variance,AU02_r__standard_deviation,AU02_r__maximum,...,AU26_r__maximum,AU26_r__mean,AU26_r__root_mean_square,AU45_r__sum_values,AU45_r__variance,AU45_r__standard_deviation,AU45_r__maximum,AU45_r__mean,AU45_r__root_mean_square,ID
0,-0.831420,-0.293678,-0.056030,0.514376,-0.487054,-0.246817,-0.774358,-0.309942,-0.164329,-0.013710,...,0.458331,-0.574983,-0.554028,-0.333560,0.221977,0.345618,0.548923,0.603537,0.429001,23
1,0.145470,-0.226965,0.005772,0.514376,-0.452632,-0.185153,0.433448,0.148409,0.398905,0.898596,...,0.458331,2.294469,2.317495,0.203487,-0.260024,-0.030426,0.619766,-0.659902,-0.281599,24
2,-0.394394,0.206189,0.372988,0.177477,-0.023172,0.245411,-0.796194,-0.485337,-0.431191,-0.339028,...,0.458331,0.196854,0.400873,-0.376228,0.033873,0.204639,-0.135892,0.093018,0.147314,25
3,0.215771,0.988392,0.933333,0.320838,0.644587,0.908811,-0.607925,-0.367648,-0.247737,-0.339028,...,0.458331,-0.847716,-0.761866,-0.868821,-0.323805,-0.084215,0.541052,-1.092069,-0.441394,26
4,2.729605,2.710656,1.914401,1.224014,3.093433,2.534479,5.079650,5.479450,3.809395,1.329997,...,1.863336,-2.230621,0.556978,2.456784,1.071168,0.914462,0.651252,3.692845,1.996765,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,1.859409,2.194404,1.645755,0.514376,1.258001,1.680854,1.440319,0.934144,1.140301,0.891524,...,0.458331,-0.446575,-0.318795,0.861706,0.199480,0.329100,0.210451,0.318307,0.327148,140
100,0.668573,0.605149,0.671849,0.514376,0.439073,0.631581,0.267418,0.124587,0.372873,0.898596,...,0.458331,-1.199405,-1.246095,-0.067533,-0.585183,-0.317237,-0.537335,-0.522513,-0.509273,142
101,-0.345492,-0.190675,0.038715,0.492872,-0.244453,-0.097095,-0.094241,0.168584,0.420737,0.573278,...,0.458331,-0.851589,-0.878346,-0.720364,-0.806836,-0.534298,-0.340549,-0.961821,-0.835709,144
102,0.580805,-0.031575,0.178060,0.514376,-0.358016,-0.013487,1.330607,0.251797,0.508808,0.849091,...,0.366881,-0.548002,-0.665071,1.190821,0.027513,0.199753,0.076637,-0.006872,0.113717,145


# Saving the standardized extracted features to csv 

In [166]:
extracted_features_au_standardized.to_csv('/Users/dionnespaltman/Desktop/V4/extracted_features_standardized.csv', sep=',')

# Save columns names of extracted features to json 

In [138]:
columns_action_units = list(extracted_features_au.columns) 
print(len(columns_action_units))

# print(columns_action_units)

350


In [139]:
import json

# Save columns_action_units list to a JSON file
with open('/Users/dionnespaltman/Desktop/V4/columns_action_units.json', 'w') as f:
    json.dump(columns_action_units, f)

# Understanding extracted features 

In [84]:
display(extracted_features_au)

Unnamed: 0,AU10_c__sum_values,AU10_c__variance,AU10_c__standard_deviation,AU10_c__maximum,AU10_c__minimum,AU10_c__median,AU10_c__mean,AU10_c__mean_abs_change,"AU10_c__agg_linear_trend__attr_""slope""__chunk_len_5__f_agg_""max""","AU10_c__agg_linear_trend__attr_""slope""__chunk_len_5__f_agg_""min""",...,AU09_r__sum_values,AU09_r__variance,AU09_r__standard_deviation,AU09_r__maximum,AU09_r__minimum,AU09_r__median,AU09_r__mean,AU09_r__mean_abs_change,"AU09_r__agg_linear_trend__attr_""slope""__chunk_len_5__f_agg_""max""","AU09_r__agg_linear_trend__attr_""slope""__chunk_len_5__f_agg_""min"""
23,7873.0,0.248750,0.498748,1.0,0.0,1.0,0.535360,0.024685,-0.000017,-0.000013,...,1772.86,0.109496,0.330902,4.98,0.00,0.00,0.120554,0.019213,-0.000032,-0.000022
24,16831.0,0.235917,0.485713,1.0,0.0,1.0,0.618673,0.010881,0.000053,0.000053,...,3197.59,0.063365,0.251725,2.59,0.00,0.00,0.117537,0.024182,-0.000029,-0.000019
25,3506.0,0.168213,0.410138,1.0,0.0,0.0,0.214015,0.011110,-0.000043,-0.000027,...,2275.65,0.153114,0.391297,4.10,0.00,0.00,0.138912,0.024858,0.000048,0.000033
26,867.0,0.046539,0.215729,1.0,0.0,0.0,0.048933,0.003612,-0.000031,-0.000027,...,1755.79,0.052783,0.229745,3.25,0.00,0.00,0.099096,0.016996,-0.000013,-0.000012
27,200.0,0.009414,0.097026,1.0,0.0,0.0,0.009504,0.002281,-0.000005,-0.000002,...,20877.08,0.779602,0.882951,3.45,-4.43,1.05,0.992115,0.150508,0.000089,0.000095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,4704.0,0.148649,0.385550,1.0,0.0,0.0,0.181643,0.010967,0.000057,0.000048,...,2324.06,0.044894,0.211882,3.71,0.00,0.00,0.089742,0.014612,-0.000008,-0.000007
142,9082.0,0.238788,0.488660,1.0,0.0,0.0,0.394116,0.020049,0.000013,0.000019,...,3068.82,0.144935,0.380703,4.70,0.00,0.00,0.133172,0.022507,0.000057,0.000042
144,252.0,0.013292,0.115292,1.0,0.0,0.0,0.013474,0.001925,0.000002,0.000001,...,2307.91,0.123643,0.351629,5.00,0.00,0.00,0.123398,0.018496,-0.000025,-0.000019
145,10297.0,0.220715,0.469804,1.0,0.0,0.0,0.328873,0.011115,0.000052,0.000050,...,3768.23,0.065670,0.256262,3.08,0.00,0.00,0.120352,0.016477,0.000025,0.000020


In [86]:
indexes = extracted_features_au.index.tolist()
print(len(indexes))

104
