# Cleaning action units

In this notebook, the files we got from the FAINT project team is looked at, cleaned and made ready for preprocessing in TS Fresh. 

In [1]:
# import 
import zipfile
import os
import pandas as pd
import csv
import socket  
import pickle
import re

# need to enable iterative imputer explicitly since its still experimental
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Unzipping

In [3]:
# Define paths to the zipped and unzipped folders 
zip_file_path = '/Users/dionnespaltman/Desktop/V4/full_donation_data.zip'
# output_folder = '/Users/dionnespaltman/Desktop/V4/full donation data - unzipped'
output_folder = '/Users/dionnespaltman/Desktop/V4/full_donation_data_unzipped_03-06-2024'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Unzip the folder
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(output_folder)

print("Unzipped folder created at:", output_folder)

Unzipped folder created at: /Users/dionnespaltman/Desktop/V4/full_donation_data_unzipped_03-06-2024


Check the size of the folder. It's 18684.90839099884 MB. 

In [5]:
# Define the path to the folder
folder_path_unzipped_data = '/Users/dionnespaltman/Desktop/V4/full_donation_data_unzipped_03-06-2024'

# Initialize total size variable
total_size = 0

# Iterate over all files in the folder
for filename in os.listdir(folder_path_unzipped_data):
    file_path = os.path.join(folder_path_unzipped_data, filename)
    # Check if it's a file (not a directory)
    if os.path.isfile(file_path):
        # Get the size of the file and add it to total_size
        total_size += os.path.getsize(file_path)

# Convert total size to a human-readable format (e.g., bytes to megabytes)
total_size_mb = total_size / (1024 * 1024)  # Convert bytes to megabytes

print("Total size of all files in the folder:", total_size_mb, "MB")

Total size of all files in the folder: 18684.916207313538 MB


# Creating a dictionary with all the files 

I want to access the file in a easy way. Since the files have important information in their filename, I will put all the files in a pandas dictionary (very efficient in my experience). The filename will serve as a key. In total 403 files were uploaded in the dictionary, that is correct. 

I tried to load it again on May 30th, and only could open 326 files (it took 35 minutes). Will try again later. 

On June 3rd, I could open 411 files. 

In [6]:
# List all files in the folder
file_names = os.listdir(folder_path_unzipped_data)

# Read a subset of files into a dictionary
dictionary = {}
num_files_to_read = 450  # Adjust the number of files to read as needed
for i, file_name in enumerate(file_names):
    if i >= num_files_to_read:
        break
    if file_name.endswith('.csv'):  # Assuming the files are CSV format
        file_path = os.path.join(folder_path_unzipped_data, file_name)
        try:
            dictionary[file_name] = pd.read_csv(file_path)
            print("File loaded successfully:", file_name)
        except Exception as e:
            print("Error loading file:", file_name, "- Error:", e)

# Now you have a dictionary 'data' containing DataFrames for each file (up to the specified number)
# Access them using keys (file names)
# For example:
print("Number of files loaded:", len(dictionary))

File loaded successfully: 16-07.csv
File loaded successfully: 80_04,05,06.csv
File loaded successfully: 7-04-05-06.csv
File loaded successfully: 324_02.csv
File loaded successfully: 87_04,05,06.csv
File loaded successfully: 78_04,05,06.csv
File loaded successfully: 328_02.csv
File loaded successfully: 105_04 donation not completed. No blood flow.csv
File loaded successfully: 92_04,05,06.csv
File loaded successfully: 38_4,5,6.csv
File loaded successfully: 129_04,05,06.csv
File loaded successfully: 85_07.csv
File loaded successfully: DSCN2370.csv
File loaded successfully: 95_04,05,06.csv
File loaded successfully: 300_02.csv
File loaded successfully: 20-07.csv
File loaded successfully: 97_07.csv
File loaded successfully: 290_01.csv
File loaded successfully: 312_02.csv
File loaded successfully: 127_03.csv
File loaded successfully: 40_07.csv
File loaded successfully: 101_07.csv
File loaded successfully: 118_04,05,06.csv
File loaded successfully: 14-03,04,05,06.csv
File loaded successfully: 

  dictionary[file_name] = pd.read_csv(file_path)


File loaded successfully: 108_04,05,06.csv
File loaded successfully: 80_07.csv
File loaded successfully: 305_02.csv
File loaded successfully: 299_01.csv
Number of files loaded: 411


In [7]:
import os
import pandas as pd

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path_unzipped_data) if f.endswith('.csv')]

# Read a subset of files into a dictionary
dictionary_optimized = {}
num_files_to_read = min(450, len(csv_files))  # Adjust the number of files to read as needed
errors = []

for file_name in csv_files[:num_files_to_read]:
    file_path = os.path.join(folder_path_unzipped_data, file_name)
    try:
        dictionary_optimized[file_name] = pd.read_csv(file_path)
    except Exception as e:
        errors.append(f"Error loading file: {file_name} - Error: {e}")

# Print summary
print("Number of files loaded:", len(dictionary_optimized))
if errors:
    print("Errors encountered with the following files:")
    for error in errors:
        print(error)

  dictionary_optimized[file_name] = pd.read_csv(file_path)


Number of files loaded: 411
Errors encountered with the following files:
Error loading file: 141_04,05,06.csv - Error: Error tokenizing data. C error: Expected 714 fields in line 13173, saw 1270

Error loading file: 128_06.csv - Error: Error tokenizing data. C error: Expected 679 fields in line 1539, saw 924

Error loading file: 112_07.csv - Error: No columns to parse from file


Double check if the dictionary is not empty. Check the structure. 

In [8]:
# Check if the dictionary is not empty
if dictionary:
    # Get the first key (file name) and its corresponding DataFrame
    first_file_name = next(iter(dictionary.keys()))
    first_df = dictionary[first_file_name]
    
    # Display the DataFrame
    print("DataFrame for the first file '{}' in the dictionary:".format(first_file_name))
    display(first_df.head(10))
else:
    print("The dictionary is empty. No files loaded.")

DataFrame for the first file '16-07.csv' in the dictionary:


Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,1,0,0.0,0.98,1,-0.186231,-0.071084,-0.979931,-0.32102,-0.025671,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,0,0.04,0.98,1,-0.195272,-0.066113,-0.978518,-0.321741,-0.022495,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,0,0.08,0.98,1,-0.183891,-0.07279,-0.980248,-0.325824,-0.032522,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,0,0.12,0.98,1,-0.18905,-0.06148,-0.980041,-0.323833,-0.018319,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,0,0.16,0.98,1,-0.193372,-0.059505,-0.979319,-0.317909,-0.020461,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,6,0,0.2,0.98,1,-0.182038,-0.053976,-0.981809,-0.320776,-0.025344,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,7,0,0.24,0.98,1,-0.17406,-0.062096,-0.982775,-0.332763,-0.027208,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,8,0,0.28,0.98,1,-0.183968,-0.068336,-0.980554,-0.32563,-0.040619,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,9,0,0.32,0.98,1,-0.172816,-0.076083,-0.982011,-0.330309,-0.047691,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,10,0,0.36,0.98,1,-0.162287,-0.070443,-0.984226,-0.325321,-0.033416,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Dropping columns 

I want to check one single dictionary and use a key to see if everything works correctly as it should be. The shape of the dataframe is (2750, 714). It has 2750 rows and 714 columns. A lot of the rows are not interesting for the current research. 

In [9]:
# Get the DataFrame with the key '16-07.csv'
random_df = dictionary['16-07.csv']

# Get the shape of the DataFrame
shape_random_df = random_df.shape

# Print the shape
print("Shape of the DataFrame with key '16-07.csv':", shape_random_df)

Shape of the DataFrame with key '16-07.csv': (2750, 714)


Here I'm making a function to be able to delete the columns I'm not interested in. These include columns with the following substrings: 'gaze', 'p', 'x', 'X', 'Y', 'y', 'Z',  'pose', 'eye'. After I've made the function, I test it using some examples. 

In [10]:
def get_columns_to_delete(columns):
    # List of substrings to search for in column names
    substrings = ['gaze', 'p', 'x', 'X', 'Y', 'y', 'Z',  'pose', 'eye']

    # Initialize an empty list to store column names to delete
    columns_to_delete = []

    # Iterate through each column name
    for column in columns:
        # Check if any of the substrings are present in the column name
        if any(sub in column for sub in substrings):
            # If present, add the column name to the list of columns to delete
            columns_to_delete.append(column)

    return columns_to_delete

In [11]:
# Example usage:
columns = ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'gaze_0_x', 'gaze_0_y', 'pose_Rx', 'pose_Ry']
columns_to_delete = get_columns_to_delete(columns)
print(columns_to_delete)

['gaze_0_x', 'gaze_0_y', 'pose_Rx', 'pose_Ry']


Then I test it on a seperate dataframe in the dictionary. From the 714 columsn in total, there are 675 columns that can be deleted. 

In [12]:
# Get the DataFrame from the dictionary
df_to_process = dictionary['16-07.csv']

# Get the list of column names
columns = df_to_process.columns
print(len(columns))

# Get the list of columns to delete
columns_to_delete = get_columns_to_delete(columns)

# Print the list of columns to delete
print(len(columns_to_delete))

714
675


In [13]:
# Define the new dictionary to store modified DataFrames
filtered_dictionary = {}

# Iterate through all the DataFrames in the original dictionary
for key, df in dictionary.items():
    # Get the list of column names for the current DataFrame
    columns = df.columns
    
    # Get the list of columns to delete
    columns_to_delete = get_columns_to_delete(columns)
    
    # Create a new DataFrame without the columns to delete
    df_dropped = df.drop(columns=columns_to_delete)
    
    # Add the new DataFrame to the new dictionary
    filtered_dictionary[key] = df_dropped

New shape of a single dataframe is (2750, 39). 

In [14]:
# Get the DataFrame with the key '16-07.csv'
random_df = filtered_dictionary['16-07.csv']

# Print the shape
print("Shape of the DataFrame with key '16-07.csv':", random_df.shape)

# Print column names
print(random_df.columns)

Shape of the DataFrame with key '16-07.csv': (2750, 39)
Index(['frame', ' face_id', ' confidence', ' success', ' AU01_r', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU01_c', ' AU02_c', ' AU04_c',
       ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c',
       ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c',
       ' AU26_c', ' AU28_c', ' AU45_c'],
      dtype='object')


# Adding ID 

It is important to be able to easily access the ID. Here I create a new column in each dataframe and store the ID there. 

In [15]:
# Iterate over the keys of the dictionary
for key in filtered_dictionary.keys():
    # Extract the ID from the key
    id = re.search(r'\d+', key).group()
    
    # Add a new column to the dataframe with the extracted ID
    filtered_dictionary[key]['ID'] = id

Check if it's correctly implemented. 

In [16]:
# Get the DataFrame from the dictionary
random_df = filtered_dictionary['16-07.csv']

display(random_df)

Unnamed: 0,frame,face_id,confidence,success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID
0,1,0,0.98,1,0.00,0.00,0.92,0.0,0.23,1.15,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
1,2,0,0.98,1,0.07,0.00,0.78,0.0,0.16,0.95,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2,3,0,0.98,1,0.15,0.00,0.81,0.0,0.16,0.91,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,16
3,4,0,0.98,1,0.14,0.10,0.82,0.0,0.15,1.01,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
4,5,0,0.98,1,0.15,0.11,0.87,0.0,0.15,0.99,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2745,2746,0,0.98,1,0.17,0.10,0.54,0.0,0.01,1.33,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2746,2747,0,0.98,1,0.22,0.13,0.57,0.0,0.01,1.29,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2747,2748,0,0.98,1,0.14,0.03,0.61,0.0,0.00,1.24,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2748,2749,0,0.98,1,0.09,0.03,0.60,0.0,0.00,1.06,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16


# Adding Stage

Each file has the information of one or more stages. The stages are the different stages of the entire donation proccess. 

- Stage 1: Recorded a one-minute video. Initial VVR measurement was taken.
- Stage 2: Donors completed standard registration forms at Sanquin. Participants were recorded as they progressed through the blood donation procedure. Continued to take VVR measurements.
- Stage 3: Donors either waited in the waiting area or were sent directly to the donation chair, depending on BCC location. Recorded a third video, lasting 1 to 2 minutes, for those waiting. Donors self-reported VVR during this time.
- Stages 4–6: Continuous video recording in the donation chair. VVR levels assessed three times: at needle insertion (stage 4), around 300 mL of donated blood (stage 5), and during needle uncoupling (stage 6).
- Stage 7: Final recording and VVR level assessment in the waiting area. Donors recuperated from the donation process.

Unfortunately the formatting is not the same for each file. So it was necessary to do some testing. 

In [17]:
def extract_timeframe(filename):
    # Extract numerical values from the filename
    numerical_values = re.findall(r'\d+', filename)
    
    # Convert each numerical value to an integer
    stages = [int(value) for value in numerical_values]
    
    return stages

In [18]:
# Test cases
filename1 = '16-07.csv'
filename2 = '6-03,04,05,06.csv'

stages_file1 = extract_timeframe(filename1)
stages_file2 = extract_timeframe(filename2)

print("Timeframes from filename 1:", stages_file1)
print("Timeframes from filename 2:", stages_file2)

Timeframes from filename 1: [16, 7]
Timeframes from filename 2: [6, 3, 4, 5, 6]


In [19]:
# Check to see if all the keys have the same format 
def check_key_format(keys):
    # Regular expression patterns to match the two formats
    pattern_1 = re.compile(r'^\d+_\d+(\,\d+)*\.csv$')
    pattern_2 = re.compile(r'^\d+(-\d+(\,\d+)*)*\.csv$')

    # Lists to store keys with different formats
    format_1_keys = []
    format_2_keys = []
    other_keys = []

    # Iterate through all keys
    for key in keys:
        # Match the key format with both patterns
        match_1 = pattern_1.match(key)
        match_2 = pattern_2.match(key)
        if match_1:
            format_1_keys.append(key)
        elif match_2:
            format_2_keys.append(key)
        else:
            other_keys.append(key)

    return format_1_keys, format_2_keys, other_keys

# Usage example:
keys = filtered_dictionary.keys()  
format_1_keys, format_2_keys, other_keys = check_key_format(keys)
print("Keys with format 1:", format_1_keys)
print("Keys with format 2:", format_2_keys)
print("Other keys:", other_keys)


Keys with format 1: ['80_04,05,06.csv', '324_02.csv', '87_04,05,06.csv', '78_04,05,06.csv', '328_02.csv', '92_04,05,06.csv', '38_4,5,6.csv', '129_04,05,06.csv', '85_07.csv', '95_04,05,06.csv', '300_02.csv', '97_07.csv', '290_01.csv', '312_02.csv', '127_03.csv', '40_07.csv', '101_07.csv', '118_04,05,06.csv', '324_01.csv', '144_03.csv', '113_07.csv', '52_07.csv', '328_01.csv', '31_07.csv', '125_07.csv', '64_07.csv', '146_07.csv', '300_01.csv', '49_04,05,06.csv', '89_04.csv', '76_07.csv', '68_07.csv', '129_07.csv', '290_02.csv', '312_01.csv', '68_03.csv', '33_07.csv', '94_04,05,06.csv', '111_07.csv', '39_04,05,06.csv', '93_04,05,06.csv', '50_07.csv', '326_01.csv', '64_03.csv', '42_07.csv', '103_07.csv', '79_04,05,06.csv', '292_02.csv', '310_01.csv', '31_03.csv', '81_04,05,06.csv', '52_03.csv', '113_03.csv', '74_07.csv', '135_07.csv', '144_07.csv', '302_01.csv', '139_07.csv', '78_07.csv', '127_07.csv', '66_07.csv', '48_04,05,06.csv', '326_02.csv', '95_07.csv', '292_01.csv', '310_02.csv', '

There are a few keys with a different format: ['105_04 donation not completed. No blood flow.csv', 'DSCN2370.csv', '126_01 (5).csv', '126_01 (4).csv', '126_01 (3).csv']. 

Since it seems these are not correct or double, we delete these from the data. 

Length of the filtered dictionary on June 3, 2024 is 406. 

In [20]:
# Filter out dataframes with keys in other_keys
temp = {key: value for key, value in filtered_dictionary.items() if key not in other_keys}

filtered_dictionary = temp

# Verify that dataframes with keys in other_keys are deleted
print(len(filtered_dictionary))

406


# Modify keys

We used to have 403 files (all were put into a dictionary). Then 5 dataframes in the dictionary were dropped, so we have a length of 398. 

May 30: only 322 files. 
June 3: 406 files. 

In [21]:
# Iterate through the keys and modify keys to the desired format
for key in list(filtered_dictionary.keys()):
    parts = key.split('-')
    if len(parts) >= 2 and parts[-1].endswith('.csv'):
        id_part = parts[0]
        stage_part = ",".join(parts[1].split(','))
        new_key = f"{id_part}_{stage_part}.csv"
        filtered_dictionary[new_key] = filtered_dictionary.pop(key)

# Verify the modified keys
# print("Modified keys in desired format:", data_filtered.keys())
print(len(filtered_dictionary))

406


Create a column in the dataframes for stage. 

In [22]:
# Iterate through the dataframes in the data_filtered dictionary
for key, df in filtered_dictionary.items():
    # Remove the '.csv' suffix from the key
    key = key.replace('.csv', '')
    # Extract the timepoints from the key
    stages_str = key.split('_')[1]
    # Remove leading zeros from the timepoints string
    stages_str = ','.join(str(int(tp.lstrip('0'))) for tp in stages_str.split(','))
    # Convert the timepoints string into a list of integers
    stages_list = [int(tp) for tp in stages_str.split(',')]
    # Add the Timeframe column to the dataframe
    df['Stage'] = [stages_list] * len(df)

# Get the DataFrame from the dictionary
random_df = filtered_dictionary['95_04,05,06.csv']

display(random_df)

Unnamed: 0,frame,face_id,confidence,success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,1,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.43,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
1,2,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.41,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
2,3,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.37,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
3,4,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.29,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
4,5,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.18,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17880,17881,0.0,0.98,1.0,2.59,0.59,0.58,0.0,0.00,0.16,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"
17881,17882,0.0,0.98,1.0,2.73,0.81,0.65,0.0,0.00,0.26,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"
17882,17883,0.0,0.98,1.0,2.73,0.79,0.55,0.0,0.00,0.19,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"
17883,17884,0.0,0.98,1.0,2.77,0.82,0.44,0.0,0.00,0.32,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"


# Merge

To be able to work sufficiently and preprocess using TS Fresh, all the dataframes in the dictionary should be merged into one. 
The new big dataframe is just called df. 

In [23]:
action_units = pd.concat(filtered_dictionary.values(), ignore_index=True)

# Rename columns using a dictionary where keys are the current column names and values are the new column names
action_units = action_units.rename(columns={'frame': 'Frame',
                        ' face_id': 'Face_id', 
                        ' confidence': 'Confidence', 
                        ' success': 'Success'
                        })

display(action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,1,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,2,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,3,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,4,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,5,0.0,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3487088,2196,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3487089,2197,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3487090,2198,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3487091,2199,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


# Drop columns with confidence below 0.80

If values are not confident enough, they should be dropped. We go from 3250418 to 3174403. 

May 30: we go from 2555651 to 2487193. 
June 3: we go from 3487093 to 3395916. 

In [24]:
print(action_units.shape)
action_units = action_units[action_units['Confidence'] >= 0.80]
print(action_units.shape)

(3487093, 41)
(3395916, 41)


# Check for missing values

In [25]:
# Check for missing values in 'ID' and 'Timeframe' columns
missing_id = action_units['ID'].isna().any()
missing_stage = action_units['Stage'].isna().any()

# Print the results
print("Missing values in 'ID' column:", missing_id)
print("Missing values in 'Timeframe' column:", missing_stage)

Missing values in 'ID' column: False
Missing values in 'Timeframe' column: False


# Check column names

The column names should be consistent, so no spaces at the beginning and all starting with a capital letter. 

In [26]:
# Print the column names
print("Column names:", action_units.columns.tolist())
print(len(action_units.columns.tolist()))

Column names: ['Frame', 'Face_id', 'Confidence', 'Success', ' AU01_r', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r', ' AU01_c', ' AU02_c', ' AU04_c', ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c', ' AU28_c', ' AU45_c', 'ID', 'Stage']
41


In [27]:
# Rename columns using a dictionary where keys are the current column names and values are the new column names
action_units = action_units.rename(columns={' AU01_r': 'AU01_r',
                                            ' AU02_r': 'AU02_r',
                                            ' AU04_r': 'AU04_r',
                                            ' AU05_r': 'AU05_r',
                                            ' AU06_r': 'AU06_r',
                                            ' AU07_r': 'AU07_r',
                                            ' AU09_r': 'AU09_r',
                                            ' AU10_r': 'AU10_r',
                                            ' AU12_r': 'AU12_r',
                                            ' AU14_r': 'AU14_r',
                                            ' AU15_r': 'AU15_r',
                                            ' AU17_r': 'AU17_r',
                                            ' AU20_r': 'AU20_r',
                                            ' AU23_r': 'AU23_r',
                                            ' AU25_r': 'AU25_r',
                                            ' AU26_r': 'AU26_r',
                                            ' AU45_r': 'AU45_r',
                                            ' AU01_c': 'AU01_c',
                                            ' AU02_c': 'AU02_c',
                                            ' AU04_c': 'AU04_c',
                                            ' AU05_c': 'AU05_c',
                                            ' AU06_c': 'AU06_c',
                                            ' AU07_c': 'AU07_c',
                                            ' AU09_c': 'AU09_c',
                                            ' AU10_c': 'AU10_c',
                                            ' AU12_c': 'AU12_c',
                                            ' AU14_c': 'AU14_c',
                                            ' AU15_c': 'AU15_c',
                                            ' AU17_c': 'AU17_c',
                                            ' AU20_c': 'AU20_c',
                                            ' AU23_c': 'AU23_c',
                                            ' AU25_c': 'AU25_c',
                                            ' AU26_c': 'AU26_c',
                                            ' AU28_c': 'AU28_c',
                                            ' AU45_c': 'AU45_c'
                                            })

In [28]:
# Print the column names
print("Column names:", action_units.columns.tolist())
print(len(action_units.columns.tolist()))

Column names: ['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage']
41


# Check number of unique IDs

Unfortunately, the data I received is not complete. We only have data from 184 IDs. However, we don't have information from all stages for every one of those IDs. So the data is very limited. 

May 30: Only 174 unique IDs. 
June 3: 184 unique IDs. 

In [29]:
# Get number of unique IDs
num_unique_ids = len(action_units['ID'].unique())
print("Number of Unique IDs:", num_unique_ids)

Number of Unique IDs: 184


# Save file 

The file we end up with has 3174403 rows and 41 columns. 

In [31]:
print(action_units.columns)

Index(['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r',
       'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r',
       'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r',
       'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c',
       'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c',
       'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage'],
      dtype='object')


Edit the file below to the correct date. 

In [32]:
action_units.to_csv('/Users/dionnespaltman/Desktop/V6/action_units_03-06-2024.csv', sep=',')

# Loading df 

In [33]:
action_units = pd.read_csv('/Users/dionnespaltman/Desktop/V6/action_units_03-06-2024.csv', sep=',')

if 'Unnamed: 0' in action_units.columns:
    action_units.drop(columns=['Unnamed: 0'], inplace=True)

In [34]:
display(action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395912,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395913,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395914,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


In [35]:
print(len(action_units['ID'].unique()))

184


# Stages

Here I check information from what stages is available. At first, I wanted to use the recordings from the waiting room to add them as a feature. However, this is not possible, since there is so little data from these timepoints. So I am only going to use data from stage 4, 5 and 6 which is when the participants are in the donation chair. 

June 3: 

[4, 5, 6]       2313881

[3, 4, 5, 6]     311923

[7]              269554

[1]              163222

[3]              134364

[2]               78159

[4]               68762

[4, 5]            40338

[4, 6]            15713


In [36]:
# Count occurrences of each value in the "Stage" column
stage_counts = action_units['Stage'].value_counts()

print(stage_counts)

[4, 5, 6]       2313881
[3, 4, 5, 6]     311923
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Stage, dtype: int64


# Missing values 

There are quite some missing values. This is possible when participants look away from the camera. Then naturally OpenFace wasn't able to get any information from their faces. 

In [37]:
nan_counts = action_units.notna().sum()
print(nan_counts)

Frame         3395916
Face_id       3395916
Confidence    3395916
Success       3395916
AU01_r        3152522
AU02_r        3152522
AU04_r        3152522
AU05_r        3152522
AU06_r        3152522
AU07_r        3152522
AU09_r        3152522
AU10_r        3152522
AU12_r        3152522
AU14_r        3152522
AU15_r        3152522
AU17_r        3152522
AU20_r        3152522
AU23_r        3152522
AU25_r        3152522
AU26_r        3152522
AU45_r        3152522
AU01_c        3152522
AU02_c        3152522
AU04_c        3152522
AU05_c        3152522
AU06_c        3152522
AU07_c        3152522
AU09_c        3152522
AU10_c        3152522
AU12_c        3152522
AU14_c        3152522
AU15_c        3152522
AU17_c        3152522
AU20_c        3152522
AU23_c        3152522
AU25_c        3152522
AU26_c        3152522
AU28_c        3152522
AU45_c        3152522
ID            3395916
Stage         3395916
dtype: int64


In [38]:
na_counts = action_units.isna().sum()
print(na_counts)

Frame              0
Face_id            0
Confidence         0
Success            0
AU01_r        243394
AU02_r        243394
AU04_r        243394
AU05_r        243394
AU06_r        243394
AU07_r        243394
AU09_r        243394
AU10_r        243394
AU12_r        243394
AU14_r        243394
AU15_r        243394
AU17_r        243394
AU20_r        243394
AU23_r        243394
AU25_r        243394
AU26_r        243394
AU45_r        243394
AU01_c        243394
AU02_c        243394
AU04_c        243394
AU05_c        243394
AU06_c        243394
AU07_c        243394
AU09_c        243394
AU10_c        243394
AU12_c        243394
AU14_c        243394
AU15_c        243394
AU17_c        243394
AU20_c        243394
AU23_c        243394
AU25_c        243394
AU26_c        243394
AU28_c        243394
AU45_c        243394
ID                 0
Stage              0
dtype: int64


Percentage of missing values is 243394 / 3395916 * 100 = 7.167256198327639

In [39]:
print(243394/3395916*100)

7.167256198327639


# Dealing with missing values - MICE

Because we're missing about 7 percent of the action units, it would not be smart to simply delete these rows. We will use MICE to deal with the missing values. 
Link: https://www.machinelearningplus.com/machine-learning/mice-imputation/?utm_content=cmp-true 

In [42]:
# Define imputer
imputer = IterativeImputer(random_state=100, max_iter=10)

In [43]:
# Use Numeric Features
to_be_imputed = action_units.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']]
to_be_imputed

Unnamed: 0,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.31,0.59,0.00,0.0,0.31,0.31,0.23,0.21,0.62,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,0.12,0.00,0.0,0.32,0.06,0.10,0.60,0.50,0.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.26,1.67,0.00,0.0,0.39,0.26,0.00,1.22,0.64,0.18,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,0.62,0.36,0.30,0.0,0.70,0.47,0.00,0.06,1.54,1.10,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3395912,0.63,0.38,0.25,0.0,0.78,0.56,0.00,0.04,1.45,1.07,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3395913,0.69,0.51,0.26,0.0,0.79,0.56,0.00,0.00,1.43,1.04,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3395914,0.67,0.46,0.32,0.0,0.81,0.56,0.00,0.02,1.43,1.16,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


Running everything up until the imputer was very fast. Running the imputer took around 6 minutes. 

June 3: running the imputer took around 7 minutes. 

In [44]:
# fit on the dataset
imputer.fit(to_be_imputed)

In [45]:
imputed = imputer.transform(to_be_imputed)

In [46]:
# Replace with imputed values
action_units.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 
                     'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 
                     'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 
                     'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']] = imputed

imputed_action_units = action_units

imputed_action_units

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395912,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395913,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395914,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


In [47]:
num_unique_ids = imputed_action_units['ID'].nunique()

print(num_unique_ids)

184


Then we will double check if everything went correctly. We should not have any missing values anymore. This is important to be able to implement TS Fresh. 

In [48]:
nan_counts = imputed_action_units.isna().sum()
print(nan_counts)

Frame         0
Face_id       0
Confidence    0
Success       0
AU01_r        0
AU02_r        0
AU04_r        0
AU05_r        0
AU06_r        0
AU07_r        0
AU09_r        0
AU10_r        0
AU12_r        0
AU14_r        0
AU15_r        0
AU17_r        0
AU20_r        0
AU23_r        0
AU25_r        0
AU26_r        0
AU45_r        0
AU01_c        0
AU02_c        0
AU04_c        0
AU05_c        0
AU06_c        0
AU07_c        0
AU09_c        0
AU10_c        0
AU12_c        0
AU14_c        0
AU15_c        0
AU17_c        0
AU20_c        0
AU23_c        0
AU25_c        0
AU26_c        0
AU28_c        0
AU45_c        0
ID            0
Stage         0
dtype: int64


# Saving the action units file with the imputed values 

Don't forget to change the filename. 

In [49]:
imputed_action_units.to_csv("/Users/dionnespaltman/Desktop/V6/imputed_action_units_03-06-2024.csv", sep=',')

# Loading action units file with the imputed values 

In [50]:
imputed_action_units = pd.read_csv("/Users/dionnespaltman/Desktop/V6/imputed_action_units_03-06-2024.csv")

if 'Unnamed: 0' in imputed_action_units.columns:
    imputed_action_units.drop(columns=['Unnamed: 0'], inplace=True)

display(imputed_action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395912,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395913,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395914,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


# Creating file with just Stage = [4, 5, 6]

Stage counts June 3: 

[4, 5, 6]       2313881

[3, 4, 5, 6]     311923

[7]              269554

[1]              163222

[3]              134364

[2]               78159

[4]               68762

[4, 5]            40338

[4, 6]            15713

In [51]:
# Count occurrences of Stage values in action_units DataFrame
stage_counts = imputed_action_units['Stage'].value_counts()
print(stage_counts)

[4, 5, 6]       2313881
[3, 4, 5, 6]     311923
[7]              269554
[1]              163222
[3]              134364
[2]               78159
[4]               68762
[4, 5]            40338
[4, 6]            15713
Name: Stage, dtype: int64


In [52]:
# Convert the 'Stage' column to a string representation
imputed_action_units['Stage_str'] = imputed_action_units['Stage'].apply(lambda x: str(x))

# Display the DataFrame to check the conversion
print("DataFrame with 'Stage' as string:")
display(imputed_action_units)

# Filter rows where 'Stage_str' is exactly "[4, 5, 6]"
filtered_action_units = imputed_action_units[imputed_action_units['Stage_str'] == str([4, 5, 6])]

# Display the filtered DataFrame
print("Filtered DataFrame where Stage is exactly '[4, 5, 6]':")
display(filtered_action_units)

DataFrame with 'Stage' as string:


Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395912,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395913,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395914,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]


Filtered DataFrame where Stage is exactly '[4, 5, 6]':


Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.00,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.00,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.00,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3301669,21069.0,0.0,0.93,1.0,2.08,2.22,0.89,0.16,0.22,0.47,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301670,21070.0,0.0,0.93,1.0,2.03,2.14,0.87,0.69,0.27,0.85,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301671,21071.0,0.0,0.93,1.0,2.02,2.11,1.12,0.49,0.47,0.67,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301672,21072.0,0.0,0.93,1.0,2.43,2.01,1.11,0.65,0.35,0.82,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"


Okay, so we had data from 184 participants. But we only have information from stages 4, 5 and 6 from 104 IDs. 

May 30: only 82 participants in stages 4, 5, 6. 

June 3: 111 participants in stages 4, 5, 6. 

In [53]:
# Count the unique IDs in the filtered DataFrame
num_unique_ids = filtered_action_units['ID'].nunique()
print("Number of unique IDs in filtered DataFrame:", num_unique_ids)

print(filtered_action_units.shape)

Number of unique IDs in filtered DataFrame: 111
(2313881, 42)


# Get list of available IDs 

May 30: [ 80  87  78  92  38 129  95 118  49  94  39  93  79  81  48 119 114  57
 146 113  50  45 106  42 101  74 130  73  99  66  33 125  61  34 122  43
 100  44 107 112  51 115 140  35  60 123  32  98 124 131  72 136  75  46
  41 102 142 117  54 145  53  65  30  62  37 121 134  77  88 133  67 111
 144  52  40 103  47 132 135  36  63 120  31  64 127  91  69  96  29 138
  58  85 139  82  28  97  68  59 108  25  23  24  26  27]

June 3: [ 80  87  78  92  38 129  95 118  49  94  39  93  79  81  48 119 114  57
 146 113  50  45 106  42 101  74 130  73  99  66  33 125  61  34 122  43
 100  44 107 112  51 115 140  35  60 123  32  98 124 131  72 136  75  46
  41 102 142 117  54 145  53  65  30  62  37 121 134  77  88 133  67 111
 144  52 116 143  40 103  47 104 132  71 135  76  36  63 120  31  64 127
  91  69  96  29  83  84 138  58  85 139  82  28  97  68  59 108  25  23
  24  26  27]

In [58]:
# Get the list of unique IDs in the filtered DataFrame
unique_ids = filtered_action_units['ID'].unique()

# Print the list of unique IDs
print("List of unique IDs in filtered DataFrame:")
print(unique_ids)
print(len(unique_ids))

List of unique IDs in filtered DataFrame:
[ 80  87  78  92  38 129  95 118  49  94  39  93  79  81  48 119 114  57
 146 113  50  45 106  42 101  74 130  73  99  66  33 125  61  34 122  43
 100  44 107 112  51 115 140  35  60 123  32  98 124 131  72 136  75  46
  41 102 142 117  54 145  53  65  30  62  37 121 134  77  88 133  67 111
 144  52 116 143  40 103  47 104 132  71 135  76  36  63 120  31  64 127
  91  69  96  29  83  84 138  58  85 139  82  28  97  68  59 108  25  23
  24  26  27]
111


# Descriptives frames

In [55]:
# Group by 'ID' and count the number of frames per ID
frame_counts = filtered_action_units.groupby('ID').size()

# Calculate the minimum, maximum, and average number of frames per ID
min_frames_per_id = frame_counts.min()
max_frames_per_id = frame_counts.max()
average_frames_per_id = frame_counts.mean()

print("Minimum number of frames per ID:", min_frames_per_id)
print("Maximum number of frames per ID:", max_frames_per_id)
print("Average number of frames per ID:", average_frames_per_id)


Minimum number of frames per ID: 8
Maximum number of frames per ID: 37833
Average number of frames per ID: 20845.774774774774


In [56]:
# Group by 'ID' and count the number of frames per ID
frame_counts = filtered_action_units.groupby('ID').size()

# Convert the frame counts Series to a list and sort it in ascending order
sorted_frame_counts = sorted(frame_counts.tolist())

print("Counts of frames per ID sorted in ascending order:", sorted_frame_counts)

Counts of frames per ID sorted in ascending order: [8, 22, 91, 196, 798, 1172, 1233, 1421, 2068, 9114, 9686, 10702, 14706, 15372, 15429, 15478, 15747, 16382, 16407, 16446, 16562, 16662, 16744, 16872, 17407, 17413, 17458, 17500, 17611, 17666, 17718, 17975, 18239, 18291, 18346, 18432, 18567, 18689, 18703, 19148, 19300, 19544, 19573, 19672, 19857, 19884, 19998, 20219, 20261, 20330, 20695, 21019, 21043, 21189, 21339, 21777, 22020, 22137, 22145, 22222, 22376, 22435, 22592, 22658, 22879, 22898, 22963, 22997, 23031, 23044, 23099, 23127, 23572, 23807, 23892, 23906, 24287, 24338, 24662, 24744, 24917, 25133, 25270, 25897, 25902, 26036, 26107, 27205, 27518, 27661, 27804, 27874, 28201, 28324, 28497, 28716, 29140, 29523, 29700, 30314, 30361, 31039, 31310, 31610, 31749, 31842, 33323, 33943, 35252, 35868, 37833]


# Save filtered_action_units

In [57]:
filtered_action_units.to_csv("/Users/dionnespaltman/Desktop/V6/filtered_action_units_03-06-2024.csv", sep=',')

In [59]:
display(filtered_action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.00,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.00,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.00,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3301669,21069.0,0.0,0.93,1.0,2.08,2.22,0.89,0.16,0.22,0.47,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301670,21070.0,0.0,0.93,1.0,2.03,2.14,0.87,0.69,0.27,0.85,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301671,21071.0,0.0,0.93,1.0,2.02,2.11,1.12,0.49,0.47,0.67,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301672,21072.0,0.0,0.93,1.0,2.43,2.01,1.11,0.65,0.35,0.82,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
