# Cleaning action units
In this notebook, the files we got from the FAINT project team is looked at, cleaned and made ready for preprocessing in TS Fresh. 

In [7]:
# import 
import os
import pandas as pd
import csv
import re

# need to enable iterative imputer explicitly since its still experimental
from sklearn.impute import IterativeImputer

# Open data

In [3]:
# Define the path to the folder
folder_path = '/Users/dionnespaltman/11-06-2024'

# Initialize total size variable
total_size = 0

# Iterate over all files in the folder
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Check if it's a file (not a directory)
    if os.path.isfile(file_path):
        # Get the size of the file and add it to total_size
        total_size += os.path.getsize(file_path)

# Convert total size to a human-readable format (e.g., bytes to megabytes)
total_size_mb = total_size / (1024 * 1024)  # Convert bytes to megabytes

print("Total size of all files in the folder:", total_size_mb, "MB")

Total size of all files in the folder: 35614.811098098755 MB


# Creating a dictionary with all the files 
June 11, 2024: Loading the files took around 11 minutes. 

In [None]:
# List all files in the folder
file_names = sorted(os.listdir(folder_path))

In [5]:
# Read a subset of files into a dictionary
dictionary = {}
for i, file_name in enumerate(file_names):
    if i >= 328:
        break
    if file_name.endswith('.csv'): 
        file_path = os.path.join(folder_path, file_name)
        try:
            dictionary[file_name] = pd.read_csv(file_path)
            print("File loaded successfully:", file_name)
        except Exception as e:
            print("Error loading file:", file_name, "- Error:", e)

# Now you have a dictionary 'data' containing DataFrames for each file (up to the specified number)
print("Number of files loaded:", len(dictionary))

File loaded successfully: 100_04,05,06.csv
File loaded successfully: 101_04,05,06.csv
File loaded successfully: 102_04,05,06.csv
File loaded successfully: 103_04,05,06.csv
File loaded successfully: 104_04,05,06.csv
File loaded successfully: 105_04,05,06.csv
File loaded successfully: 106_04,05,06.csv
File loaded successfully: 107_04,05,06.csv
File loaded successfully: 108_04,05,06.csv
File loaded successfully: 109_04,05,06.csv
File loaded successfully: 110_04,05,06.csv
File loaded successfully: 111_04,05,06.csv
File loaded successfully: 112_04,05,06.csv
File loaded successfully: 113_04,05,06.csv
File loaded successfully: 114_04,05,06.csv
File loaded successfully: 115_04,05,06.csv
File loaded successfully: 116_04,05,06.csv
File loaded successfully: 117_04,05,06.csv
File loaded successfully: 118_04,05,06.csv
File loaded successfully: 119_04,05,06.csv
File loaded successfully: 120_04,05,06.csv
File loaded successfully: 121_04,05,06.csv
File loaded successfully: 122_04,05,06.csv
File loaded

June 11, 2024: Loading the files into a dictionary took around 11 minutes. 

In [6]:
# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Read the files into a dictionary
dictionary_optimized = {}
num_files_to_read = min(328, len(csv_files)) 
errors = []

for file_name in csv_files[:num_files_to_read]:
    file_path = os.path.join(folder_path, file_name)
    try:
        dictionary_optimized[file_name] = pd.read_csv(file_path)
    except Exception as e:
        errors.append(f"Error loading file: {file_name} - Error: {e}")

# Print summary
print("Number of files loaded:", len(dictionary_optimized))
if errors:
    print("Errors encountered with the following files:")
    for error in errors:
        print(error)

Number of files loaded: 281


Dataframe '100_04,05,06.csv' has 24650 rows and 714 columns. 

In [9]:
# Check if the dictionary is not empty
if dictionary:
    # Get the first key (file name) and its corresponding DataFrame
    first_file_name = next(iter(dictionary.keys()))
    first_df = dictionary[first_file_name]
    
    # Display the DataFrame
    print("DataFrame for the first file '{}' in the dictionary:".format(first_file_name))
    display(first_df.head(10))
    print(first_df.shape)
else:
    print("The dictionary is empty. No files loaded.")

DataFrame for the first file '100_04,05,06.csv' in the dictionary:


Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,1,0,0.0,0.03,1,0.219065,0.232502,-0.947604,0.110053,0.148434,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,2,0,0.04,0.03,0,0.192279,0.251568,-0.948548,0.088629,0.202299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,0.08,0.03,0,0.289762,0.30499,-0.907204,0.111347,0.213317,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,0.12,0.03,0,0.255982,0.309473,-0.915805,0.102882,0.256116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,0.16,0.03,0,0.367767,0.342167,-0.864679,0.08207,0.215777,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,0,0.2,0.03,0,0.222987,0.398617,-0.889596,0.061008,0.273352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,0,0.24,0.03,0,0.346051,0.39699,-0.850087,0.09182,0.225025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,0,0.28,0.03,0,0.25552,0.411072,-0.87506,0.092869,0.193613,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,0,0.32,0.03,0,0.29572,0.409374,-0.863112,0.109991,0.238213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,0,0.36,0.03,0,0.268667,0.367646,-0.890311,0.089827,0.354228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(24650, 714)


# Dropping columns 

In [10]:
# Get the DataFrame 
random_df = dictionary['23-04,05,06.csv']

# Get the shape of the DataFrame
shape_random_df = random_df.shape

# Print the shape
print("Shape of the DataFrame with key '23-04,05,06.csv':", shape_random_df)

Shape of the DataFrame with key '23-04,05,06.csv': (15600, 714)


Here I'm making a function to be able to delete the columns I'm not interested in. These include columns with the following substrings: 'gaze', 'p', 'x', 'X', 'Y', 'y', 'Z',  'pose', 'eye'. After I've made the function, I test it using some examples. 

In [11]:
def get_columns_to_delete(columns):
    # List of substrings to search for in column names
    substrings = ['gaze', 'p', 'x', 'X', 'Y', 'y', 'Z',  'pose', 'eye']

    # Initialize an empty list to store column names to delete
    columns_to_delete = []

    # Iterate through each column name
    for column in columns:
        # Check if any of the substrings are present in the column name
        if any(sub in column for sub in substrings):
            # If present, add the column name to the list of columns to delete
            columns_to_delete.append(column)

    return columns_to_delete

In [12]:
# Example usage
columns = ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'gaze_0_x', 'gaze_0_y', 'pose_Rx', 'pose_Ry']
columns_to_delete = get_columns_to_delete(columns)
print(columns_to_delete)

['gaze_0_x', 'gaze_0_y', 'pose_Rx', 'pose_Ry']


Then I test it on a seperate dataframe in the dictionary. From the 714 columsn in total, there are 675 columns that can be deleted. 

In [13]:
# Get the DataFrame from the dictionary
df_to_process = dictionary['23-04,05,06.csv']

# Get the list of column names
columns = df_to_process.columns
print(len(columns))

# Get the list of columns to delete
columns_to_delete = get_columns_to_delete(columns)

# Print the list of columns to delete
print(len(columns_to_delete))

714
675


June 11, 2024: Running the code below takes about 10 seconds. 

In [14]:
# Define the new dictionary to store modified DataFrames
filtered_dictionary = {}

# Iterate through all the DataFrames in the original dictionary
for key, df in dictionary.items():
    # Get the list of column names for the current DataFrame
    columns = df.columns
    
    # Get the list of columns to delete
    columns_to_delete = get_columns_to_delete(columns)
    
    # Create a new DataFrame without the columns to delete
    df_dropped = df.drop(columns=columns_to_delete)
    
    # Add the new DataFrame to the new dictionary
    filtered_dictionary[key] = df_dropped

In [15]:
# Get the DataFrame with the key 23-04,05,06.csv
random_df = filtered_dictionary['23-04,05,06.csv']

# Print the shape
print("Shape of the DataFrame with key '23-04,05,06.csv':", random_df.shape)

# Print column names
print(random_df.columns)

Shape of the DataFrame with key '23-04,05,06.csv': (15600, 39)
Index(['frame', ' face_id', ' confidence', ' success', ' AU01_r', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU01_c', ' AU02_c', ' AU04_c',
       ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c',
       ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c',
       ' AU26_c', ' AU28_c', ' AU45_c'],
      dtype='object')


# Adding ID 

It is important to be able to easily access the ID. Here I create a new column in each dataframe and store the ID there. 

In [16]:
# Iterate over the keys of the dictionary
for key in filtered_dictionary.keys():
    # Extract the ID from the key
    id = re.search(r'\d+', key).group()
    
    # Add a new column to the dataframe with the extracted ID
    filtered_dictionary[key]['ID'] = id

Check if it's correctly implemented. 

In [17]:
# Get the DataFrame from the dictionary
random_df = filtered_dictionary['23-04,05,06.csv']

display(random_df)

Unnamed: 0,frame,face_id,confidence,success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID
0,1,0,0.88,1,1.67,0.00,0.0,0.0,0.36,1.07,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,23
1,2,0,0.98,1,2.32,0.55,0.0,0.0,0.56,1.67,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,23
2,3,0,0.98,1,2.50,0.89,0.0,0.0,0.59,1.89,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,23
3,4,0,0.98,1,2.20,0.96,0.0,0.0,0.47,1.80,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,23
4,5,0,0.98,1,2.27,0.97,0.0,0.0,0.27,1.82,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15595,15596,0,0.98,1,0.00,0.00,0.0,0.0,0.51,0.98,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,23
15596,15597,0,0.98,1,0.00,0.00,0.0,0.0,0.46,1.18,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,23
15597,15598,0,0.98,1,0.00,0.00,0.0,0.0,0.49,1.22,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,23
15598,15599,0,0.98,1,0.00,0.00,0.0,0.0,0.49,1.27,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,23


# Adding Stage

Each file has the information of one or more stages. The stages are the different stages of the entire donation proccess. 

- Stage 1: Recorded a one-minute video. Initial VVR measurement was taken.
- Stage 2: Donors completed standard registration forms at Sanquin. Participants were recorded as they progressed through the blood donation procedure. Continued to take VVR measurements.
- Stage 3: Donors either waited in the waiting area or were sent directly to the donation chair, depending on BCC location. Recorded a third video, lasting 1 to 2 minutes, for those waiting. Donors self-reported VVR during this time.
- Stages 4–6: Continuous video recording in the donation chair. VVR levels assessed three times: at needle insertion (stage 4), around 300 mL of donated blood (stage 5), and during needle uncoupling (stage 6).
- Stage 7: Final recording and VVR level assessment in the waiting area. Donors recuperated from the donation process.

Initially, the formatting of the files were not the same so it was necessary to do some testing. 

In [18]:
def extract_timeframe(filename):
    # Extract numerical values from the filename
    numerical_values = re.findall(r'\d+', filename)
    
    # Convert each numerical value to an integer
    stages = [int(value) for value in numerical_values]
    
    return stages

In [19]:
# Test cases
filename1 = '16-07.csv'
filename2 = '6-03,04,05,06.csv'

stages_file1 = extract_timeframe(filename1)
stages_file2 = extract_timeframe(filename2)

print("Timeframes from filename 1:", stages_file1)
print("Timeframes from filename 2:", stages_file2)

Timeframes from filename 1: [16, 7]
Timeframes from filename 2: [6, 3, 4, 5, 6]


In [20]:
# Check to see if all the keys have the same format 
def check_key_format(keys):
    # Regular expression patterns to match the two formats
    pattern_1 = re.compile(r'^\d+_\d+(\,\d+)*\.csv$')
    pattern_2 = re.compile(r'^\d+(-\d+(\,\d+)*)*\.csv$')

    # Lists to store keys with different formats
    format_1_keys = []
    format_2_keys = []
    other_keys = []

    # Iterate through all keys
    for key in keys:
        # Match the key format with both patterns
        match_1 = pattern_1.match(key)
        match_2 = pattern_2.match(key)
        if match_1:
            format_1_keys.append(key)
        elif match_2:
            format_2_keys.append(key)
        else:
            other_keys.append(key)

    return format_1_keys, format_2_keys, other_keys

# Usage example:
keys = filtered_dictionary.keys()  
format_1_keys, format_2_keys, other_keys = check_key_format(keys)
print("Keys with format 1:", format_1_keys)
print("Keys with format 2:", format_2_keys)
print("Other keys:", other_keys)


Keys with format 1: ['100_04,05,06.csv', '101_04,05,06.csv', '102_04,05,06.csv', '103_04,05,06.csv', '104_04,05,06.csv', '105_04,05,06.csv', '106_04,05,06.csv', '107_04,05,06.csv', '108_04,05,06.csv', '109_04,05,06.csv', '110_04,05,06.csv', '111_04,05,06.csv', '112_04,05,06.csv', '113_04,05,06.csv', '114_04,05,06.csv', '115_04,05,06.csv', '116_04,05,06.csv', '117_04,05,06.csv', '118_04,05,06.csv', '119_04,05,06.csv', '120_04,05,06.csv', '121_04,05,06.csv', '122_04,05,06.csv', '123_04,05,06.csv', '124_04,05,06.csv', '125_04,05,06.csv', '126_04,05,06.csv', '127_04,05,06.csv', '128_04,05,06.csv', '129_04,05,06.csv', '130_04,05,06.csv', '131_04,05,06.csv', '132_04,05,06.csv', '133_04,05,06.csv', '134_04,05,06.csv', '135_04,05,06.csv', '136_04,05,06.csv', '137_04,05,06.csv', '138_04,05,06.csv', '139_04,05,06.csv', '140_04,05,06.csv', '141_04,05,06.csv', '142_04,05,06.csv', '143_04,05,06.csv', '144_04,05,06.csv', '145_04,05,06.csv', '146_04,05,06.csv', '147_04,05,06.csv', '148_04,05,06.csv',

June 3, 2024: length of the filtered dictionary is 406 (also including files with the wrong stage). 

June 11, 2024: length of the filtered dictionary is 281. 

In [21]:
# Filter out dataframes with keys in other_keys
temp = {key: value for key, value in filtered_dictionary.items() if key not in other_keys}

filtered_dictionary = temp

# Verify that dataframes with keys in other_keys are deleted
print(len(filtered_dictionary))

281


# Modify keys

We used to have 403 files (all were put into a dictionary). Then 5 dataframes in the dictionary were dropped, so we have a length of 398. 

May 30: only 322 files. 

June 3: 406 files. 

June 11: 281 files. 

In [22]:
# Iterate through the keys and modify keys to the desired format
for key in list(filtered_dictionary.keys()):
    parts = key.split('-')
    if len(parts) >= 2 and parts[-1].endswith('.csv'):
        id_part = parts[0]
        stage_part = ",".join(parts[1].split(','))
        new_key = f"{id_part}_{stage_part}.csv"
        filtered_dictionary[new_key] = filtered_dictionary.pop(key)

# Verify the modified keys
# print("Modified keys in desired format:", data_filtered.keys())
print(len(filtered_dictionary))

281


Create a column in the dataframes for stage. 

In [23]:
# Iterate through the dataframes in the data_filtered dictionary
for key, df in filtered_dictionary.items():
    # Remove the '.csv' suffix from the key
    key = key.replace('.csv', '')
    # Extract the timepoints from the key
    stages_str = key.split('_')[1]
    # Remove leading zeros from the timepoints string
    stages_str = ','.join(str(int(tp.lstrip('0'))) for tp in stages_str.split(','))
    # Convert the timepoints string into a list of integers
    stages_list = [int(tp) for tp in stages_str.split(',')]
    # Add the Timeframe column to the dataframe
    df['Stage'] = [stages_list] * len(df)

# Get the DataFrame from the dictionary
random_df = filtered_dictionary['95_04,05,06.csv']

display(random_df)

Unnamed: 0,frame,face_id,confidence,success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,1,0,0.98,1,0.00,0.00,0.00,0.0,0.43,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
1,2,0,0.98,1,0.00,0.00,0.00,0.0,0.41,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
2,3,0,0.98,1,0.00,0.00,0.00,0.0,0.29,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
3,4,0,0.98,1,0.00,0.00,0.00,0.0,0.21,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
4,5,0,0.98,1,0.00,0.00,0.00,0.0,0.12,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28670,28671,0,0.93,1,4.89,1.69,0.88,0.0,0.00,0.00,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,95,"[4, 5, 6]"
28671,28672,0,0.93,1,4.93,1.96,1.07,0.0,0.00,0.00,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,95,"[4, 5, 6]"
28672,28673,0,0.93,1,4.97,2.35,1.17,0.0,0.00,0.01,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,95,"[4, 5, 6]"
28673,28674,0,0.93,1,5.00,2.60,1.29,0.0,0.00,0.12,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,95,"[4, 5, 6]"


# Merge

To be able to work sufficiently and preprocess using TS Fresh, all the dataframes in the dictionary should be merged into one. 
The new big dataframe is just called df. 

June 11, 2024: Running the code below took 22 seconds. 

In [24]:
action_units = pd.concat(filtered_dictionary.values(), ignore_index=True)

# Rename columns using a dictionary where keys are the current column names and values are the new column names
action_units = action_units.rename(columns={'frame': 'Frame',
                        ' face_id': 'Face_id', 
                        ' confidence': 'Confidence', 
                        ' success': 'Success'
                        })

display(action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,1.0,0.0,0.03,1.0,0.00,0.00,0.00,0.0,0.77,1.39,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,100,"[4, 5, 6]"
1,2.0,0.0,0.03,0.0,0.00,0.00,0.00,0.0,0.26,0.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
2,3.0,0.0,0.03,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
3,4.0,0.0,0.03,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
4,5.0,0.0,0.03,0.0,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6677962,26796.0,0.0,0.98,1.0,0.91,0.07,0.73,0.0,1.01,0.88,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]
6677963,26797.0,0.0,0.98,1.0,0.93,0.02,0.63,0.0,0.93,0.77,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]
6677964,26798.0,0.0,0.98,1.0,0.87,0.08,0.57,0.0,0.88,0.70,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]
6677965,26799.0,0.0,0.98,1.0,0.86,0.16,0.62,0.0,0.83,0.60,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]


# Drop columns with confidence below 0.80

If values are not confident enough, they should be dropped. 

May 30: we go from 2555651 to 2487193. 

June 3: we go from 3487093 to 3395916. 

June 11: we go from 6677967 to 6521676. So 2.39 percent of rows had a confidence below 0.80. 

In [25]:
print(action_units.shape)
action_units = action_units[action_units['Confidence'] >= 0.80]
print(action_units.shape)

(6677967, 41)
(6521676, 41)


In [27]:
print((6677967 - 6521676)/ 6521676 * 100)

2.3964851979767166


# Check for missing values

In [28]:
# Check for missing values in 'ID' and 'Timeframe' columns
missing_id = action_units['ID'].isna().any()
missing_stage = action_units['Stage'].isna().any()

# Print the results
print("Missing values in 'ID' column:", missing_id)
print("Missing values in 'Timeframe' column:", missing_stage)

Missing values in 'ID' column: False
Missing values in 'Timeframe' column: False


# Check column names
The column names should be consistent, so no spaces at the beginning and all starting with a capital letter. 

In [29]:
# Print the column names
print("Column names:", action_units.columns.tolist())
print(len(action_units.columns.tolist()))

Column names: ['Frame', 'Face_id', 'Confidence', 'Success', ' AU01_r', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r', ' AU01_c', ' AU02_c', ' AU04_c', ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c', ' AU28_c', ' AU45_c', 'ID', 'Stage']
41


In [30]:
# Rename columns using a dictionary where keys are the current column names and values are the new column names
action_units = action_units.rename(columns={' AU01_r': 'AU01_r',
                                            ' AU02_r': 'AU02_r',
                                            ' AU04_r': 'AU04_r',
                                            ' AU05_r': 'AU05_r',
                                            ' AU06_r': 'AU06_r',
                                            ' AU07_r': 'AU07_r',
                                            ' AU09_r': 'AU09_r',
                                            ' AU10_r': 'AU10_r',
                                            ' AU12_r': 'AU12_r',
                                            ' AU14_r': 'AU14_r',
                                            ' AU15_r': 'AU15_r',
                                            ' AU17_r': 'AU17_r',
                                            ' AU20_r': 'AU20_r',
                                            ' AU23_r': 'AU23_r',
                                            ' AU25_r': 'AU25_r',
                                            ' AU26_r': 'AU26_r',
                                            ' AU45_r': 'AU45_r',
                                            ' AU01_c': 'AU01_c',
                                            ' AU02_c': 'AU02_c',
                                            ' AU04_c': 'AU04_c',
                                            ' AU05_c': 'AU05_c',
                                            ' AU06_c': 'AU06_c',
                                            ' AU07_c': 'AU07_c',
                                            ' AU09_c': 'AU09_c',
                                            ' AU10_c': 'AU10_c',
                                            ' AU12_c': 'AU12_c',
                                            ' AU14_c': 'AU14_c',
                                            ' AU15_c': 'AU15_c',
                                            ' AU17_c': 'AU17_c',
                                            ' AU20_c': 'AU20_c',
                                            ' AU23_c': 'AU23_c',
                                            ' AU25_c': 'AU25_c',
                                            ' AU26_c': 'AU26_c',
                                            ' AU28_c': 'AU28_c',
                                            ' AU45_c': 'AU45_c'
                                            })

In [31]:
# Print the column names
print("Column names:", action_units.columns.tolist())
print(len(action_units.columns.tolist()))

Column names: ['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage']
41


# Check number of unique IDs
Unfortunately, the data I received is not complete. We only have data from 184 IDs. However, we don't have information from all stages for every one of those IDs. So the data is very limited. 

May 30: 174 unique IDs. 

June 3: 184 unique IDs. 

June 11: 281 unique IDs. 

In [32]:
# Get number of unique IDs
num_unique_ids = len(action_units['ID'].unique())
print("Number of Unique IDs:", num_unique_ids)

Number of Unique IDs: 281


# Save file 
Edit the file below to the correct date. 

Saving the csv file took around 2 minutes. 

In [33]:
print(action_units.columns)

Index(['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r',
       'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r',
       'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r',
       'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c',
       'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c',
       'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage'],
      dtype='object')


In [37]:
action_units.to_csv('/Users/dionnespaltman/Desktop/V6/action_units_11-06-2024.csv', sep=',')

# Loading df 
The dataframe has 6521676 rows and 41

In [33]:
action_units = pd.read_csv('/Users/dionnespaltman/Desktop/V6/action_units_11-06-2024.csv', sep=',')

if 'Unnamed: 0' in action_units.columns:
    action_units.drop(columns=['Unnamed: 0'], inplace=True)

In [38]:
display(action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
19,20.0,0.0,0.88,1.0,0.43,0.05,0.02,0.0,0.80,0.79,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
20,21.0,0.0,0.88,1.0,0.43,0.05,0.11,0.0,1.26,1.11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
21,22.0,0.0,0.88,1.0,0.20,0.02,0.09,0.0,0.90,0.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
58,59.0,0.0,0.98,1.0,0.14,0.47,0.13,0.0,1.22,0.35,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
59,60.0,0.0,0.98,1.0,0.14,0.78,0.21,0.0,1.98,0.47,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,100,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6677962,26796.0,0.0,0.98,1.0,0.91,0.07,0.73,0.0,1.01,0.88,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]
6677963,26797.0,0.0,0.98,1.0,0.93,0.02,0.63,0.0,0.93,0.77,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]
6677964,26798.0,0.0,0.98,1.0,0.87,0.08,0.57,0.0,0.88,0.70,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]
6677965,26799.0,0.0,0.98,1.0,0.86,0.16,0.62,0.0,0.83,0.60,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,7,[4]


# Stages
Here I check information from what stages is available. At first, I wanted to use the recordings from the waiting room to add them as a feature. However, this is not possible, since there is so little data from these timepoints. So I am only going to use data from stage 4, 5 and 6 which is when the participants are in the donation chair. 

June 3: 

[4, 5, 6]       2313881

[3, 4, 5, 6]     311923

[7]              269554

[1]              163222

[3]              134364

[2]               78159

[4]               68762

[4, 5]            40338

[4, 6]            15713

June 11: 

[4, 5, 6]    6502500

[4]            19176

In [39]:
# Count occurrences of each value in the "Stage" column
stage_counts = action_units['Stage'].value_counts()

print(stage_counts)

[4, 5, 6]    6502500
[4]            19176
Name: Stage, dtype: int64


In [40]:
# Filter rows where Stage is 4
stage_4_ids = action_units[action_units['Stage'] == 4]['ID']

# Print the IDs
print(stage_4_ids)

Series([], Name: ID, dtype: object)


# Missing values 
June 3, 2024: There are quite some missing values. This is possible when participants look away from the camera. Then naturally OpenFace wasn't able to get any information from their faces. 

June 11, 2024: There are no missing values. 

In [43]:
nan_counts = action_units.isna().sum()
print(nan_counts)

Frame         0
Face_id       0
Confidence    0
Success       0
AU01_r        0
AU02_r        0
AU04_r        0
AU05_r        0
AU06_r        0
AU07_r        0
AU09_r        0
AU10_r        0
AU12_r        0
AU14_r        0
AU15_r        0
AU17_r        0
AU20_r        0
AU23_r        0
AU25_r        0
AU26_r        0
AU45_r        0
AU01_c        0
AU02_c        0
AU04_c        0
AU05_c        0
AU06_c        0
AU07_c        0
AU09_c        0
AU10_c        0
AU12_c        0
AU14_c        0
AU15_c        0
AU17_c        0
AU20_c        0
AU23_c        0
AU25_c        0
AU26_c        0
AU28_c        0
AU45_c        0
ID            0
Stage         0
dtype: int64


JUne 3, 2024: Percentage of missing values is 243394 / 3395916 * 100 = 7.167256198327639

In [39]:
print(243394/3395916*100)

7.167256198327639


# June 3, 2024: Dealing with missing values - MICE
Because we're missing about 7 percent of the action units, it would not be smart to simply delete these rows. We will use MICE to deal with the missing values. 
Link: https://www.machinelearningplus.com/machine-learning/mice-imputation/?utm_content=cmp-true 

In [42]:
# Define imputer
imputer = IterativeImputer(random_state=100, max_iter=10)

In [43]:
# Use Numeric Features
to_be_imputed = action_units.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']]
to_be_imputed

Unnamed: 0,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00,0.00,0.00,0.0,1.34,0.22,0.57,1.56,1.52,0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.31,0.59,0.00,0.0,0.31,0.31,0.23,0.21,0.62,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,0.12,0.00,0.0,0.32,0.06,0.10,0.60,0.50,0.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.26,1.67,0.00,0.0,0.39,0.26,0.00,1.22,0.64,0.18,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,0.62,0.36,0.30,0.0,0.70,0.47,0.00,0.06,1.54,1.10,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3395912,0.63,0.38,0.25,0.0,0.78,0.56,0.00,0.04,1.45,1.07,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3395913,0.69,0.51,0.26,0.0,0.79,0.56,0.00,0.00,1.43,1.04,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3395914,0.67,0.46,0.32,0.0,0.81,0.56,0.00,0.02,1.43,1.16,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


Running everything up until the imputer was very fast. Running the imputer took around 6 minutes. 

June 3: running the imputer took around 7 minutes. 

In [44]:
# fit on the dataset
imputer.fit(to_be_imputed)

In [45]:
imputed = imputer.transform(to_be_imputed)

In [46]:
# Replace with imputed values
action_units.loc[:, ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 
                     'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 
                     'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 
                     'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c']] = imputed

imputed_action_units = action_units

imputed_action_units

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395912,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395913,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395914,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


In [44]:
num_unique_ids = imputed_action_units['ID'].nunique()
print(num_unique_ids)

NameError: name 'imputed_action_units' is not defined

Then we will double check if everything went correctly. We should not have any missing values anymore. This is important to be able to implement TS Fresh. 

In [48]:
nan_counts = imputed_action_units.isna().sum()
print(nan_counts)

Frame         0
Face_id       0
Confidence    0
Success       0
AU01_r        0
AU02_r        0
AU04_r        0
AU05_r        0
AU06_r        0
AU07_r        0
AU09_r        0
AU10_r        0
AU12_r        0
AU14_r        0
AU15_r        0
AU17_r        0
AU20_r        0
AU23_r        0
AU25_r        0
AU26_r        0
AU45_r        0
AU01_c        0
AU02_c        0
AU04_c        0
AU05_c        0
AU06_c        0
AU07_c        0
AU09_c        0
AU10_c        0
AU12_c        0
AU14_c        0
AU15_c        0
AU17_c        0
AU20_c        0
AU23_c        0
AU25_c        0
AU26_c        0
AU28_c        0
AU45_c        0
ID            0
Stage         0
dtype: int64


# June 3, 2024: Saving the action units file with the imputed values 
Don't forget to change the filename. 

In [49]:
imputed_action_units.to_csv("/Users/dionnespaltman/Desktop/V6/imputed_action_units_03-06-2024.csv", sep=',')

# June 3, 2024: Loading action units file with the imputed values 

In [50]:
imputed_action_units = pd.read_csv("/Users/dionnespaltman/Desktop/V6/imputed_action_units_03-06-2024.csv")

if 'Unnamed: 0' in imputed_action_units.columns:
    imputed_action_units.drop(columns=['Unnamed: 0'], inplace=True)

display(imputed_action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395912,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395913,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3395914,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]


# Creating file with just Stage = [4, 5, 6]
Stage counts June 3, 2024: 

[4, 5, 6]       2313881

[3, 4, 5, 6]     311923

[7]              269554

[1]              163222

[3]              134364

[2]               78159

[4]               68762

[4, 5]            40338

[4, 6]            15713



Stage counts June 11, 2024: 

[4, 5, 6]    6502500

[4]            19176

In [45]:
# Count occurrences of Stage values in action_units DataFrame
stage_counts = action_units['Stage'].value_counts()
print(stage_counts)

[4, 5, 6]    6502500
[4]            19176
Name: Stage, dtype: int64


In [52]:
# Convert the 'Stage' column to a string representation
imputed_action_units['Stage_str'] = imputed_action_units['Stage'].apply(lambda x: str(x))

# Display the DataFrame to check the conversion
print("DataFrame with 'Stage' as string:")
display(imputed_action_units)

# Filter rows where 'Stage_str' is exactly "[4, 5, 6]"
filtered_action_units = imputed_action_units[imputed_action_units['Stage_str'] == str([4, 5, 6])]

# Display the filtered DataFrame
print("Filtered DataFrame where Stage is exactly '[4, 5, 6]':")
display(filtered_action_units)

DataFrame with 'Stage' as string:


Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3395911,2196.0,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395912,2197.0,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395913,2198.0,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]
3395914,2199.0,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7],[7]


Filtered DataFrame where Stage is exactly '[4, 5, 6]':


Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.00,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.00,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.00,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3301669,21069.0,0.0,0.93,1.0,2.08,2.22,0.89,0.16,0.22,0.47,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301670,21070.0,0.0,0.93,1.0,2.03,2.14,0.87,0.69,0.27,0.85,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301671,21071.0,0.0,0.93,1.0,2.02,2.11,1.12,0.49,0.47,0.67,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301672,21072.0,0.0,0.93,1.0,2.43,2.01,1.11,0.65,0.35,0.82,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"


May 30: only 82 participants in stages 4, 5, 6. 

June 3: 111 participants in stages 4, 5, 6. 

June 11: 

In [53]:
# Count the unique IDs in the filtered DataFrame
num_unique_ids = filtered_action_units['ID'].nunique()
print("Number of unique IDs in filtered DataFrame:", num_unique_ids)

print(filtered_action_units.shape)

Number of unique IDs in filtered DataFrame: 111
(2313881, 42)


# Get list of available IDs 

May 30: [ 80  87  78  92  38 129  95 118  49  94  39  93  79  81  48 119 114  57
 146 113  50  45 106  42 101  74 130  73  99  66  33 125  61  34 122  43
 100  44 107 112  51 115 140  35  60 123  32  98 124 131  72 136  75  46
  41 102 142 117  54 145  53  65  30  62  37 121 134  77  88 133  67 111
 144  52  40 103  47 132 135  36  63 120  31  64 127  91  69  96  29 138
  58  85 139  82  28  97  68  59 108  25  23  24  26  27]

June 3: [ 80  87  78  92  38 129  95 118  49  94  39  93  79  81  48 119 114  57
 146 113  50  45 106  42 101  74 130  73  99  66  33 125  61  34 122  43
 100  44 107 112  51 115 140  35  60 123  32  98 124 131  72 136  75  46
  41 102 142 117  54 145  53  65  30  62  37 121 134  77  88 133  67 111
 144  52 116 143  40 103  47 104 132  71 135  76  36  63 120  31  64 127
  91  69  96  29  83  84 138  58  85 139  82  28  97  68  59 108  25  23
  24  26  27]

In [58]:
# Get the list of unique IDs in the filtered DataFrame
unique_ids = filtered_action_units['ID'].unique()

# Print the list of unique IDs
print("List of unique IDs in filtered DataFrame:")
print(unique_ids)
print(len(unique_ids))

List of unique IDs in filtered DataFrame:
[ 80  87  78  92  38 129  95 118  49  94  39  93  79  81  48 119 114  57
 146 113  50  45 106  42 101  74 130  73  99  66  33 125  61  34 122  43
 100  44 107 112  51 115 140  35  60 123  32  98 124 131  72 136  75  46
  41 102 142 117  54 145  53  65  30  62  37 121 134  77  88 133  67 111
 144  52 116 143  40 103  47 104 132  71 135  76  36  63 120  31  64 127
  91  69  96  29  83  84 138  58  85 139  82  28  97  68  59 108  25  23
  24  26  27]
111


# Descriptives frames

In [55]:
# Group by 'ID' and count the number of frames per ID
frame_counts = filtered_action_units.groupby('ID').size()

# Calculate the minimum, maximum, and average number of frames per ID
min_frames_per_id = frame_counts.min()
max_frames_per_id = frame_counts.max()
average_frames_per_id = frame_counts.mean()

print("Minimum number of frames per ID:", min_frames_per_id)
print("Maximum number of frames per ID:", max_frames_per_id)
print("Average number of frames per ID:", average_frames_per_id)


Minimum number of frames per ID: 8
Maximum number of frames per ID: 37833
Average number of frames per ID: 20845.774774774774


In [56]:
# Group by 'ID' and count the number of frames per ID
frame_counts = filtered_action_units.groupby('ID').size()

# Convert the frame counts Series to a list and sort it in ascending order
sorted_frame_counts = sorted(frame_counts.tolist())

print("Counts of frames per ID sorted in ascending order:", sorted_frame_counts)

Counts of frames per ID sorted in ascending order: [8, 22, 91, 196, 798, 1172, 1233, 1421, 2068, 9114, 9686, 10702, 14706, 15372, 15429, 15478, 15747, 16382, 16407, 16446, 16562, 16662, 16744, 16872, 17407, 17413, 17458, 17500, 17611, 17666, 17718, 17975, 18239, 18291, 18346, 18432, 18567, 18689, 18703, 19148, 19300, 19544, 19573, 19672, 19857, 19884, 19998, 20219, 20261, 20330, 20695, 21019, 21043, 21189, 21339, 21777, 22020, 22137, 22145, 22222, 22376, 22435, 22592, 22658, 22879, 22898, 22963, 22997, 23031, 23044, 23099, 23127, 23572, 23807, 23892, 23906, 24287, 24338, 24662, 24744, 24917, 25133, 25270, 25897, 25902, 26036, 26107, 27205, 27518, 27661, 27804, 27874, 28201, 28324, 28497, 28716, 29140, 29523, 29700, 30314, 30361, 31039, 31310, 31610, 31749, 31842, 33323, 33943, 35252, 35868, 37833]


# Save filtered_action_units

In [57]:
filtered_action_units.to_csv("/Users/dionnespaltman/Desktop/V6/filtered_action_units_03-06-2024.csv", sep=',')

In [59]:
display(filtered_action_units)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage,Stage_str
0,104.0,0.0,0.88,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
1,105.0,0.0,0.98,1.0,0.00,0.00,0.00,0.00,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
2,112.0,0.0,0.98,1.0,0.31,0.59,0.00,0.00,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
3,115.0,0.0,0.98,1.0,0.00,0.12,0.00,0.00,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
4,154.0,0.0,0.98,1.0,1.26,1.67,0.00,0.00,0.39,0.26,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]","[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3301669,21069.0,0.0,0.93,1.0,2.08,2.22,0.89,0.16,0.22,0.47,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301670,21070.0,0.0,0.93,1.0,2.03,2.14,0.87,0.69,0.27,0.85,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301671,21071.0,0.0,0.93,1.0,2.02,2.11,1.12,0.49,0.47,0.67,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
3301672,21072.0,0.0,0.93,1.0,2.43,2.01,1.11,0.65,0.35,0.82,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,27,"[4, 5, 6]","[4, 5, 6]"
