# 2. Processing action units

In [2]:
import pandas as pd
import pickle

# need to enable iterative imputer explicitly since its still experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Loading the file 

The file has 3174403 rows and 41 columns. 

In [28]:
filepath = "/Users/dionnespaltman/Desktop/V5/clean_actionunits.csv"
clean_data = pd.read_csv(filepath, sep=',')

if 'Unnamed: 0' in clean_data.columns:
    clean_data.drop(columns=['Unnamed: 0'], inplace=True)

display(clean_data)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3174398,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174399,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174400,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174401,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


In [29]:
print(clean_data.columns)

Index(['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r',
       'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r',
       'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r',
       'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c',
       'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c',
       'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage'],
      dtype='object')


When I made this notebook, I still thought I would get more data. However, we have to work with only the data I already received. This is data from 184 unique participants (missing IDs from 147 to 287). 

In [30]:
print(len(clean_data['ID'].unique()))

184


Here I check information from what stages is available. At first, I wanted to use the recordings from the waiting room to add them as a feature. However, this is not possible, since there is so little data from these timepoints. So I am only going to use data from stage 4, 5 and 6 which is when the participants are in the donation chair. 

In [31]:
# Count occurrences of each value in the "Stage" column
stage_counts = clean_data['Stage'].value_counts()

print(stage_counts)

[4, 5, 6]       2118735
[3, 4, 5, 6]     285556
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Stage, dtype: int64


# Missing values 

There are quite some missing values. This is possible when participants look away from the camera. Then naturally OpenFace wasn't able to get any information from their faces. 

In [32]:
nan_counts = clean_data.notna().sum()
print(nan_counts)

Frame         3174403
Face_id       3174403
Confidence    3174403
Success       3174403
AU01_r        2931009
AU02_r        2931009
AU04_r        2931009
AU05_r        2931009
AU06_r        2931009
AU07_r        2931009
AU09_r        2931009
AU10_r        2931009
AU12_r        2931009
AU14_r        2931009
AU15_r        2931009
AU17_r        2931009
AU20_r        2931009
AU23_r        2931009
AU25_r        2931009
AU26_r        2931009
AU45_r        2931009
AU01_c        2931009
AU02_c        2931009
AU04_c        2931009
AU05_c        2931009
AU06_c        2931009
AU07_c        2931009
AU09_c        2931009
AU10_c        2931009
AU12_c        2931009
AU14_c        2931009
AU15_c        2931009
AU17_c        2931009
AU20_c        2931009
AU23_c        2931009
AU25_c        2931009
AU26_c        2931009
AU28_c        2931009
AU45_c        2931009
ID            3174403
Stage         3174403
dtype: int64


In [33]:
nan_counts_au_df = clean_data.isna().sum()
print(nan_counts_au_df)

Frame              0
Face_id            0
Confidence         0
Success            0
AU01_r        243394
AU02_r        243394
AU04_r        243394
AU05_r        243394
AU06_r        243394
AU07_r        243394
AU09_r        243394
AU10_r        243394
AU12_r        243394
AU14_r        243394
AU15_r        243394
AU17_r        243394
AU20_r        243394
AU23_r        243394
AU25_r        243394
AU26_r        243394
AU45_r        243394
AU01_c        243394
AU02_c        243394
AU04_c        243394
AU05_c        243394
AU06_c        243394
AU07_c        243394
AU09_c        243394
AU10_c        243394
AU12_c        243394
AU14_c        243394
AU15_c        243394
AU17_c        243394
AU20_c        243394
AU23_c        243394
AU25_c        243394
AU26_c        243394
AU28_c        243394
AU45_c        243394
ID                 0
Stage              0
dtype: int64


In [34]:
print(243394/3174403*100)

7.667394467558151


# Dealing with missing values - MICE

Because we're missing about 7 percent of the action units, it would not be smart to simply delete these rows. We will use MICE to deal with the missing values. 
Link: https://www.machinelearningplus.com/machine-learning/mice-imputation/?utm_content=cmp-true 

In [35]:
# Define imputer
imputer = IterativeImputer(random_state=100, max_iter=10)

In [39]:
# Use Numeric Features
data = clean_data.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']]
data

Unnamed: 0,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.31,0.59,0.00,0.0,0.31,0.31,0.23,0.21,0.62,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,0.12,0.00,0.0,0.32,0.06,0.10,0.60,0.50,0.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.26,1.67,0.00,0.0,0.39,0.26,0.00,1.22,0.64,0.18,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3174398,0.62,0.36,0.30,0.0,0.70,0.47,0.00,0.06,1.54,1.10,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3174399,0.63,0.38,0.25,0.0,0.78,0.56,0.00,0.04,1.45,1.07,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3174400,0.69,0.51,0.26,0.0,0.79,0.56,0.00,0.00,1.43,1.04,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3174401,0.67,0.46,0.32,0.0,0.81,0.56,0.00,0.02,1.43,1.16,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


Running everything up until the imputer was very fast. Running the imputer took around 6 minutes. 

In [40]:
# fit on the dataset
imputer.fit(data)

In [18]:
action_units_imputed = imputer.transform(data)

In [19]:
# Replace with imputed values
clean_data.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 
                     'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 
                     'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 
                     'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']] = action_units_imputed
clean_data

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
103,104,0.0,0.88,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
104,105,0.0,0.98,1.0,0.0,0.0,0.0,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
111,112,0.0,0.98,1.0,0.31,0.59,0.0,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
114,115,0.0,0.98,1.0,0.0,0.12,0.0,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
153,154,0.0,0.98,1.0,1.26,1.67,0.0,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"


In [20]:
num_unique_ids = clean_data['ID'].nunique()
print(num_unique_ids)

184


Then we will double check if everything went correctly. We should not have any missing values anymore. This is important to be able to implement TS Fresh. 

In [21]:
nan_counts_au_df = clean_data.isna().sum()
print(nan_counts_au_df)

Frame         0
Face_id       0
Confidence    0
Success       0
AU01_r        0
AU02_r        0
AU04_r        0
AU05_r        0
AU06_r        0
AU07_r        0
AU09_r        0
AU10_r        0
AU12_r        0
AU14_r        0
AU15_r        0
AU17_r        0
AU20_r        0
AU23_r        0
AU25_r        0
AU26_r        0
AU45_r        0
AU01_c        0
AU02_c        0
AU04_c        0
AU05_c        0
AU06_c        0
AU07_c        0
AU09_c        0
AU10_c        0
AU12_c        0
AU14_c        0
AU15_c        0
AU17_c        0
AU20_c        0
AU23_c        0
AU25_c        0
AU26_c        0
AU28_c        0
AU45_c        0
ID            0
Stage         0
dtype: int64


# Saving the action units file with the imputed values 

In [22]:
clean_data.to_csv("/Users/dionnespaltman/Desktop/V5/imputed_data.csv", sep=',')

# Loading action units file with the imputed values 

In [3]:
imputed_df = pd.read_csv("/Users/dionnespaltman/Desktop/V5/imputed_data.csv")

display(imputed_df)

Unnamed: 0.1,Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3174398,3174398,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174399,3174399,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174400,3174400,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3174401,3174401,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


# Creating file with just Stage = [4, 5, 6]

In [9]:
# Count occurrences of Stage values in action_units DataFrame
stage_counts = imputed_df['Stage'].value_counts()
print(stage_counts)

[4, 5, 6]       2118735
[3, 4, 5, 6]     285556
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Stage, dtype: int64


In [10]:
# Convert the 'Stage' column to a string representation
imputed_df['Stage_str'] = imputed_df['Stage'].apply(lambda x: str(x))

# Display the DataFrame to check the conversion
print("DataFrame with 'Stage' as string:")
display(imputed_df)

# Filter rows where 'Stage_str' is exactly "[4, 5, 6]"
filtered_df = imputed_df[imputed_df['Stage_str'] == str([4, 5, 6])]

# Display the filtered DataFrame
print("Filtered DataFrame where Stage is exactly '[4, 5, 6]':")
display(filtered_df)

DataFrame with 'Stage' as string:


Unnamed: 0.1,Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3174398,3174398,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3174399,3174399,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3174400,3174400,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3174401,3174401,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]


Filtered DataFrame where Stage is exactly '[4, 5, 6]':


Unnamed: 0.1,Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.00,1.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.00,1.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.00,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.00,0.32,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.00,0.39,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106523,3106523,21069.0,0.0,0.93,1.0,2.08,2.22,0.89,0.16,0.22,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3106524,3106524,21070.0,0.0,0.93,1.0,2.03,2.14,0.87,0.69,0.27,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3106525,3106525,21071.0,0.0,0.93,1.0,2.02,2.11,1.12,0.49,0.47,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3106526,3106526,21072.0,0.0,0.93,1.0,2.43,2.01,1.11,0.65,0.35,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"


Okay, so we had data from 184 participants. But we only have information from stages 4, 5 and 6 from 104 IDs. 

In [12]:
# Count the unique IDs in the filtered DataFrame
num_unique_ids = filtered_df['ID'].nunique()
print("Number of unique IDs in filtered DataFrame:", num_unique_ids)

print(filtered_df.shape)

Number of unique IDs in filtered DataFrame: 104
(2118735, 43)


# Descriptives frames

In [13]:
# Group by 'ID' and count the number of frames per ID
frame_counts = filtered_df.groupby('ID').size()

# Calculate the minimum, maximum, and average number of frames per ID
min_frames_per_id = frame_counts.min()
max_frames_per_id = frame_counts.max()
average_frames_per_id = frame_counts.mean()

print("Minimum number of frames per ID:", min_frames_per_id)
print("Maximum number of frames per ID:", max_frames_per_id)
print("Average number of frames per ID:", average_frames_per_id)


Minimum number of frames per ID: 8
Maximum number of frames per ID: 37833
Average number of frames per ID: 20372.451923076922


In [14]:
# Group by 'ID' and count the number of frames per ID
frame_counts = filtered_df.groupby('ID').size()

# Convert the frame counts Series to a list and sort it in ascending order
sorted_frame_counts = sorted(frame_counts.tolist())

print("Counts of frames per ID sorted in ascending order:", sorted_frame_counts)


Counts of frames per ID sorted in ascending order: [8, 22, 91, 196, 798, 1172, 1233, 1421, 2068, 9114, 9686, 10702, 14706, 15372, 15429, 15478, 15747, 16382, 16407, 16446, 16562, 16662, 16744, 16872, 17407, 17413, 17458, 17500, 17611, 17666, 17718, 17975, 18239, 18291, 18346, 18432, 18567, 18689, 18703, 19148, 19300, 19544, 19573, 19672, 19857, 19884, 19998, 20219, 20261, 20330, 20695, 21019, 21043, 21189, 21339, 21777, 22020, 22137, 22145, 22376, 22435, 22592, 22879, 22898, 22963, 22997, 23031, 23044, 23099, 23127, 23572, 23807, 23892, 23906, 24287, 24338, 24662, 24744, 24917, 25133, 25270, 25897, 26036, 26107, 27205, 27518, 27661, 27804, 27874, 28201, 28324, 28716, 29140, 29523, 29700, 30361, 31039, 31310, 31749, 31842, 33323, 35252, 35868, 37833]


# Save filtered_df 

In [15]:
filtered_df.to_csv("/Users/dionnespaltman/Desktop/V5/filtered_df.csv", sep=',')