# Action units - cleaning

In this notebook, the files we got from the FAINT project team is looked at, cleaned and made ready for preprocessing in TS Fresh. 

In [75]:
# import 
import zipfile
import os
import pandas as pd
import csv
import socket  
import pickle
import re

# Unzipping

The files I received from the FAINT project team is a zipped folder. So first, I need to unzip this. Doing this cost around 19 minutes. 

In [2]:
# Define paths to the zipped and unzipped folders 
zip_file_path = '/Users/dionnespaltman/Desktop/V4/full donation data.zip'
output_folder = '/Users/dionnespaltman/Desktop/V4/full donation data - unzipped'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Unzip the folder
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(output_folder)

print("Unzipped folder created at:", output_folder)

Unzipped folder created at: /Users/dionnespaltman/Desktop/V4/full donation data - unzipped


Check the size of the folder. It's 18684.90839099884 MB. 

In [6]:
# Define the path to the folder
folder_path_unzipped_data = '/Users/dionnespaltman/Desktop/V4/full donation data - unzipped/full donation data'

# Initialize total size variable
total_size = 0

# Iterate over all files in the folder
for filename in os.listdir(folder_path_unzipped_data):
    file_path = os.path.join(folder_path_unzipped_data, filename)
    # Check if it's a file (not a directory)
    if os.path.isfile(file_path):
        # Get the size of the file and add it to total_size
        total_size += os.path.getsize(file_path)

# Convert total size to a human-readable format (e.g., bytes to megabytes)
total_size_mb = total_size / (1024 * 1024)  # Convert bytes to megabytes

print("Total size of all files in the folder:", total_size_mb, "MB")

Total size of all files in the folder: 18684.90839099884 MB


# Creating a dictionary with all the files 

I want to access the file in a easy way. Since the files have important information in their filename, I will put all the files in a pandas dictionary (very efficient in my experience). The filename will serve as a key. In total 403 files were uploaded in the dictionary, that is correct. 

In [None]:
# List all files in the folder
file_names = os.listdir(folder_path_unzipped_data)

# Read a subset of files into a dictionary
data = {}
num_files_to_read = 450  # Adjust the number of files to read as needed
for i, file_name in enumerate(file_names):
    if i >= num_files_to_read:
        break
    if file_name.endswith('.csv'):  # Assuming the files are CSV format
        file_path = os.path.join(folder_path_unzipped_data, file_name)
        try:
            data[file_name] = pd.read_csv(file_path)
            print("File loaded successfully:", file_name)
        except Exception as e:
            print("Error loading file:", file_name, "- Error:", e)

# Now you have a dictionary 'data' containing DataFrames for each file (up to the specified number)
# Access them using keys (file names)
# For example:
print("Number of files loaded:", len(data))

Double check if the dicitionary is not empty. Check the structure. 

In [4]:
# Check if the dictionary is not empty
if data:
    # Get the first key (file name) and its corresponding DataFrame
    first_file_name = next(iter(data.keys()))
    first_df = data[first_file_name]
    
    # Display the DataFrame
    print("DataFrame for the first file '{}' in the dictionary:".format(first_file_name))
    display(first_df.head(10))
else:
    print("The dictionary is empty. No files loaded.")

DataFrame for the first file '16-07.csv' in the dictionary:


Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU12_c,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c
0,1,0,0.0,0.98,1,-0.186231,-0.071084,-0.979931,-0.32102,-0.025671,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,0,0.04,0.98,1,-0.195272,-0.066113,-0.978518,-0.321741,-0.022495,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,0,0.08,0.98,1,-0.183891,-0.07279,-0.980248,-0.325824,-0.032522,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,0,0.12,0.98,1,-0.18905,-0.06148,-0.980041,-0.323833,-0.018319,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,0,0.16,0.98,1,-0.193372,-0.059505,-0.979319,-0.317909,-0.020461,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,6,0,0.2,0.98,1,-0.182038,-0.053976,-0.981809,-0.320776,-0.025344,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,7,0,0.24,0.98,1,-0.17406,-0.062096,-0.982775,-0.332763,-0.027208,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,8,0,0.28,0.98,1,-0.183968,-0.068336,-0.980554,-0.32563,-0.040619,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,9,0,0.32,0.98,1,-0.172816,-0.076083,-0.982011,-0.330309,-0.047691,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,10,0,0.36,0.98,1,-0.162287,-0.070443,-0.984226,-0.325321,-0.033416,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


Saving the dictionary with all files as a pickle file. 

In [5]:
# Specify the file path to save the dictionary
save_path = '/Users/dionnespaltman/Desktop/V4/action_units/dictionary.pkl'

# Save the dictionary to a file using pickle
with open(save_path, 'wb') as file:
    pickle.dump(data, file)

print("Dictionary saved to:", save_path)

Dictionary saved to: /Users/dionnespaltman/Desktop/V3/data_dictionary_temp.pkl


# Loading the dictionary 

If you've already done the previous steps, then you can go immediately to this step and load the dictionary. 

In [76]:
# Specify the file path from which to load the dictionary
load_path = '/Users/dionnespaltman/Desktop/V4/action_units/dictionary.pkl'  

# Load the dictionary from the file using pickle
with open(load_path, 'rb') as file:
    data = pickle.load(file)

print("Dictionary loaded from:", data)

Dictionary loaded from: {'16-07.csv':       frame   face_id   timestamp   confidence   success   gaze_0_x  \
0         1         0        0.00         0.98         1  -0.186231   
1         2         0        0.04         0.98         1  -0.195272   
2         3         0        0.08         0.98         1  -0.183891   
3         4         0        0.12         0.98         1  -0.189050   
4         5         0        0.16         0.98         1  -0.193372   
...     ...       ...         ...          ...       ...        ...   
2745   2746         0      109.80         0.98         1  -0.152439   
2746   2747         0      109.84         0.98         1  -0.148107   
2747   2748         0      109.88         0.98         1  -0.154183   
2748   2749         0      109.92         0.98         1  -0.152587   
2749   2750         0      109.96         0.98         1  -0.157160   

       gaze_0_y   gaze_0_z   gaze_1_x   gaze_1_y  ...   AU12_c   AU14_c  \
0     -0.071084  -0.979931  -0.321

# Dropping columns 

Now we are going to clean the dictionary and make it ready to use. 

In [None]:
# Check if the dictionary is not empty
if data:
    # Get the first key (file name) and its corresponding DataFrame
    first_file_name = next(iter(data.keys()))
    first_df = data[first_file_name]
    
    # Print the column names of the DataFrame
    print("Column names of the first file '{}' in the dictionary:".format(first_file_name))
    print(first_df.columns.tolist())
else:
    print("The dictionary is empty. No files loaded.")

I want to check one single dictionary and use a key to see if everything works correctly as it should be. The shape of the dataframe is (2750, 714). It has 2750 rows and 714 columns. A lot of the rows are not interesting for the current research. 

In [78]:
# Get the DataFrame with the key '16-07.csv'
df_16_07 = data['16-07.csv']

# Get the shape of the DataFrame
shape_16_07 = df_16_07.shape

# Print the shape
print("Shape of the DataFrame with key '16-07.csv':", shape_16_07)

Shape of the DataFrame with key '16-07.csv': (2750, 714)


Here I'm making a function to be able to delete the columns I'm not interested in. These include columns with the following substrings: 'gaze', 'p', 'x', 'X', 'Y', 'y', 'Z',  'pose', 'eye'. After I've made the function, I test it using some examples. 

In [79]:
def get_columns_to_delete(columns):
    # List of substrings to search for in column names
    substrings = ['gaze', 'p', 'x', 'X', 'Y', 'y', 'Z',  'pose', 'eye']

    # Initialize an empty list to store column names to delete
    columns_to_delete = []

    # Iterate through each column name
    for column in columns:
        # Check if any of the substrings are present in the column name
        if any(sub in column for sub in substrings):
            # If present, add the column name to the list of columns to delete
            columns_to_delete.append(column)

    return columns_to_delete

In [80]:
# Example usage:
columns = ['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'gaze_0_x', 'gaze_0_y', 'pose_Rx', 'pose_Ry']
columns_to_delete = get_columns_to_delete(columns)
print(columns_to_delete)

['gaze_0_x', 'gaze_0_y', 'pose_Rx', 'pose_Ry']


Then I test it on a seperate dataframe in the dictionary. From the 714 columsn in total, there are 675 columns that can be deleted. 

In [81]:
# Get the DataFrame from the dictionary
df_to_process = data['16-07.csv']

# Get the list of column names
columns = df_to_process.columns
print(len(columns))
#print(columns)

# Get the list of columns to delete
columns_to_delete = get_columns_to_delete(columns)

# Print the list of columns to delete
#print(list(columns))
#print(columns_to_delete)
print(len(columns_to_delete))

714
675


The new, cleaner version of the dictionary is called data_dropped. I iterate through all the keys and dataframes in the old dictionary data. 

In [82]:
# Define the new dictionary to store modified DataFrames
data_dropped = {}

# Iterate through all the DataFrames in the original dictionary
for key, df in data.items():
    # Get the list of column names for the current DataFrame
    columns = df.columns
    
    # Get the list of columns to delete
    columns_to_delete = get_columns_to_delete(columns)
    
    # Create a new DataFrame without the columns to delete
    df_dropped = df.drop(columns=columns_to_delete)
    
    # Add the new DataFrame to the new dictionary
    data_dropped[key] = df_dropped

Save the file. 

In [83]:
# Specify the file path to save the dictionary
save_path = '/Users/dionnespaltman/Desktop/V4/action_units/dictionary_dropped.pkl'

# Save the dictionary to a file using pickle
with open(save_path, 'wb') as file:
    pickle.dump(data_dropped, file)

print("Dictionary saved to:", save_path)

Dictionary saved to: /Users/dionnespaltman/Desktop/V4/action_units/dictionary_dropped.pkl


New shape of a single dataframe is (2750, 39). 

In [84]:
# Get the DataFrame with the key '16-07.csv'
df_16_07 = data_dropped['16-07.csv']

# Get the shape of the DataFrame
shape_16_07 = df_16_07.shape

# Print the shape
print("Shape of the DataFrame with key '16-07.csv':", shape_16_07)

# Print column names
print(df_16_07.columns)

Shape of the DataFrame with key '16-07.csv': (2750, 39)
Index(['frame', ' face_id', ' confidence', ' success', ' AU01_r', ' AU02_r',
       ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r',
       ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r',
       ' AU25_r', ' AU26_r', ' AU45_r', ' AU01_c', ' AU02_c', ' AU04_c',
       ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c',
       ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c',
       ' AU26_c', ' AU28_c', ' AU45_c'],
      dtype='object')


# Adding ID 

It is important to be able to easily access the ID. Here I create a new column in each dataframe and store the ID there. 

In [85]:
# Iterate over the keys of the dictionary
for key in data_dropped.keys():
    # Extract the ID from the key
    id = re.search(r'\d+', key).group()
    
    # Add a new column to the dataframe with the extracted ID
    data_dropped[key]['ID'] = id

Check if it's correctly implemented. 

In [86]:
# Get the DataFrame from the dictionary
df_16_07 = data_dropped['16-07.csv']

display(df_16_07)

Unnamed: 0,frame,face_id,confidence,success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID
0,1,0,0.98,1,0.00,0.00,0.92,0.0,0.23,1.15,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
1,2,0,0.98,1,0.07,0.00,0.78,0.0,0.16,0.95,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2,3,0,0.98,1,0.15,0.00,0.81,0.0,0.16,0.91,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,16
3,4,0,0.98,1,0.14,0.10,0.82,0.0,0.15,1.01,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
4,5,0,0.98,1,0.15,0.11,0.87,0.0,0.15,0.99,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2745,2746,0,0.98,1,0.17,0.10,0.54,0.0,0.01,1.33,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2746,2747,0,0.98,1,0.22,0.13,0.57,0.0,0.01,1.29,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2747,2748,0,0.98,1,0.14,0.03,0.61,0.0,0.00,1.24,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16
2748,2749,0,0.98,1,0.09,0.03,0.60,0.0,0.00,1.06,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16


# Adding Stage (timepoint)

Each file has the information of one or more stages. The stages are the different stages of the entire donation proccess. 

- Stage 1: Recorded a one-minute video. Initial VVR measurement was taken.
- Stage 2: Donors completed standard registration forms at Sanquin. Participants were recorded as they progressed through the blood donation procedure. Continued to take VVR measurements.
- Stage 3: Donors either waited in the waiting area or were sent directly to the donation chair, depending on BCC location. Recorded a third video, lasting 1 to 2 minutes, for those waiting. Donors self-reported VVR during this time.
- Stages 4–6: Continuous video recording in the donation chair. VVR levels assessed three times: at needle insertion (stage 4), around 300 mL of donated blood (stage 5), and during needle uncoupling (stage 6).
- Stage 7: Final recording and VVR level assessment in the waiting area. Donors recuperated from the donation process.

Unfortunately the formatting is not the same for each file. So it was necessary to do some testing. 

In [87]:
def extract_timeframe(filename):
    # Extract numerical values from the filename
    numerical_values = re.findall(r'\d+', filename)
    
    # Convert each numerical value to an integer
    stages = [int(value) for value in numerical_values]
    
    return stages

In [88]:
# Test cases
filename1 = '16-07.csv'
filename2 = '6-03,04,05,06.csv'

stages_file1 = extract_timeframe(filename1)
stages_file2 = extract_timeframe(filename2)

print("Timeframes from filename 1:", stages_file1)
print("Timeframes from filename 2:", stages_file2)

Timeframes from filename 1: [16, 7]
Timeframes from filename 2: [6, 3, 4, 5, 6]


In [90]:
# Check to see if all the keys have the same format 
def check_key_format(keys):
    # Regular expression patterns to match the two formats
    pattern_1 = re.compile(r'^\d+_\d+(\,\d+)*\.csv$')
    pattern_2 = re.compile(r'^\d+(-\d+(\,\d+)*)*\.csv$')

    # Lists to store keys with different formats
    format_1_keys = []
    format_2_keys = []
    other_keys = []

    # Iterate through all keys
    for key in keys:
        # Match the key format with both patterns
        match_1 = pattern_1.match(key)
        match_2 = pattern_2.match(key)
        if match_1:
            format_1_keys.append(key)
        elif match_2:
            format_2_keys.append(key)
        else:
            other_keys.append(key)

    return format_1_keys, format_2_keys, other_keys

# Usage example:
keys = data_dropped.keys()  
format_1_keys, format_2_keys, other_keys = check_key_format(keys)
print("Keys with format 1:", format_1_keys)
print("Keys with format 2:", format_2_keys)
print("Other keys:", other_keys)


Keys with format 1: ['80_04,05,06.csv', '324_02.csv', '87_04,05,06.csv', '78_04,05,06.csv', '328_02.csv', '92_04,05,06.csv', '38_4,5,6.csv', '129_04,05,06.csv', '85_07.csv', '95_04,05,06.csv', '300_02.csv', '97_07.csv', '290_01.csv', '312_02.csv', '127_03.csv', '40_07.csv', '101_07.csv', '118_04,05,06.csv', '324_01.csv', '144_03.csv', '113_07.csv', '52_07.csv', '328_01.csv', '31_07.csv', '125_07.csv', '64_07.csv', '146_07.csv', '300_01.csv', '49_04,05,06.csv', '89_04.csv', '76_07.csv', '68_07.csv', '129_07.csv', '290_02.csv', '312_01.csv', '68_03.csv', '33_07.csv', '94_04,05,06.csv', '111_07.csv', '39_04,05,06.csv', '93_04,05,06.csv', '50_07.csv', '326_01.csv', '64_03.csv', '42_07.csv', '103_07.csv', '79_04,05,06.csv', '292_02.csv', '310_01.csv', '31_03.csv', '81_04,05,06.csv', '52_03.csv', '113_03.csv', '74_07.csv', '135_07.csv', '144_07.csv', '302_01.csv', '139_07.csv', '78_07.csv', '127_07.csv', '66_07.csv', '48_04,05,06.csv', '326_02.csv', '95_07.csv', '292_01.csv', '310_02.csv', '

There are a few keys with a different format: ['105_04 donation not completed. No blood flow.csv', 'DSCN2370.csv', '126_01 (5).csv', '126_01 (4).csv', '126_01 (3).csv']. 

Since it seems these are not correct or double, we delete these from the data. 

Data_filtered is now the new dataframe we're working with (so from data > data_dropped > data_filtered). 

In [91]:
# Filter out dataframes with keys in other_keys
data_filtered = {key: value for key, value in data_dropped.items() if key not in other_keys}

# Verify that dataframes with keys in other_keys are deleted
# print("Remaining keys after deletion:", data_filtered.keys())
print(len(data_filtered))

398


In [92]:
# Specify the file path to save the dictionary
save_path = '/Users/dionnespaltman/Desktop/V4/action_units/dictionary_filtered.pkl'

# Save the dictionary to a file using pickle
with open(save_path, 'wb') as file:
    pickle.dump(data_filtered, file)

print("Dictionary saved to:", save_path)

Dictionary saved to: /Users/dionnespaltman/Desktop/V4/action_units/dictionary_filtered.pkl


We used to have 403 files (all were put into a dictionary). Then 5 dataframes in the dictionary were dropped, so we have a length of 398. 

In [None]:
# Usage example:
keys = data_filtered.keys()  
format_1_keys, format_2_keys, other_keys = check_key_format(keys)
print("Keys with format 1:", format_1_keys)
print("Keys with format 2:", format_2_keys)
print("Other keys:", other_keys)

Then we iterate through all the keys to make them into one format. 

In [94]:
# Iterate through the keys and modify keys to the desired format
for key in list(data_filtered.keys()):
    parts = key.split('-')
    if len(parts) >= 2 and parts[-1].endswith('.csv'):
        id_part = parts[0]
        stage_part = ",".join(parts[1].split(','))
        new_key = f"{id_part}_{stage_part}.csv"
        data_filtered[new_key] = data_filtered.pop(key)

# Verify the modified keys
print("Modified keys in desired format:", data_filtered.keys())
print(len(data_filtered))

Modified keys in desired format: dict_keys(['80_04,05,06.csv', '324_02.csv', '87_04,05,06.csv', '78_04,05,06.csv', '328_02.csv', '92_04,05,06.csv', '38_4,5,6.csv', '129_04,05,06.csv', '85_07.csv', '95_04,05,06.csv', '300_02.csv', '97_07.csv', '290_01.csv', '312_02.csv', '127_03.csv', '40_07.csv', '101_07.csv', '118_04,05,06.csv', '324_01.csv', '144_03.csv', '113_07.csv', '52_07.csv', '328_01.csv', '31_07.csv', '125_07.csv', '64_07.csv', '146_07.csv', '300_01.csv', '49_04,05,06.csv', '89_04.csv', '76_07.csv', '68_07.csv', '129_07.csv', '290_02.csv', '312_01.csv', '68_03.csv', '33_07.csv', '94_04,05,06.csv', '111_07.csv', '39_04,05,06.csv', '93_04,05,06.csv', '50_07.csv', '326_01.csv', '64_03.csv', '42_07.csv', '103_07.csv', '79_04,05,06.csv', '292_02.csv', '310_01.csv', '31_03.csv', '81_04,05,06.csv', '52_03.csv', '113_03.csv', '74_07.csv', '135_07.csv', '144_07.csv', '302_01.csv', '139_07.csv', '78_07.csv', '127_07.csv', '66_07.csv', '48_04,05,06.csv', '326_02.csv', '95_07.csv', '292_0

Create a column in the dataframes for stage. 

In [95]:
# Iterate through the dataframes in the data_filtered dictionary
for key, df in data_filtered.items():
    # Remove the '.csv' suffix from the key
    key = key.replace('.csv', '')
    # Extract the timepoints from the key
    stages_str = key.split('_')[1]
    # Remove leading zeros from the timepoints string
    stages_str = ','.join(str(int(tp.lstrip('0'))) for tp in stages_str.split(','))
    # Convert the timepoints string into a list of integers
    stages_list = [int(tp) for tp in stages_str.split(',')]
    # Add the Timeframe column to the dataframe
    df['Stage'] = [stages_list] * len(df)

Check if it worked. 

In [96]:
# Get the DataFrame from the dictionary
df_95 = data_filtered['95_04,05,06.csv']

display(df_95)

Unnamed: 0,frame,face_id,confidence,success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,1,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.43,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
1,2,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.41,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
2,3,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.37,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
3,4,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.29,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
4,5,0.0,0.98,1.0,0.00,0.00,0.00,0.0,0.18,0.00,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,95,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17880,17881,0.0,0.98,1.0,2.59,0.59,0.58,0.0,0.00,0.16,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"
17881,17882,0.0,0.98,1.0,2.73,0.81,0.65,0.0,0.00,0.26,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"
17882,17883,0.0,0.98,1.0,2.73,0.79,0.55,0.0,0.00,0.19,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"
17883,17884,0.0,0.98,1.0,2.77,0.82,0.44,0.0,0.00,0.32,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,95,"[4, 5, 6]"


# Merge into one big dataframe 

To be able to work sufficiently and preprocess using TS Fresh, all the dataframes in the dicitonary should be merged into one. 
The new big dataframe is just called df. 

In [97]:
df = pd.concat(data_filtered.values(), ignore_index=True)

In [98]:
# Rename columns using a dictionary where keys are the current column names and values are the new column names
df = df.rename(columns={'frame': 'Frame',
                        ' face_id': 'Face_id', 
                        ' confidence': 'Confidence', 
                        ' success': 'Success'
                        # Add more columns as needed
                        })

In [99]:
display(df.head(5))

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"


Save this first merged df. 

In [100]:
# Specify the file path to save the dictionary
save_path = '/Users/dionnespaltman/Desktop/V4/action_units/df.pkl'

# Save the dictionary to a file using pickle
with open(save_path, 'wb') as file:
    pickle.dump(df, file)

print("Dictionary saved to:", save_path)

Dictionary saved to: /Users/dionnespaltman/Desktop/V4/action_units/df.pkl


# Drop columns with confidence below 0.80

If values are not confident enough, they should be dropped. We go from 3250418 to 3174403. 

In [101]:
print(df.shape)
df = df[df['Confidence'] >= 0.80]
print(df.shape)

(3250418, 41)
(3174403, 41)


# Check for missing values

In [102]:
# Check for missing values in 'ID' and 'Timeframe' columns
missing_id = df['ID'].isna().any()
missing_stage = df['Stage'].isna().any()

# Print the results
print("Missing values in 'ID' column:", missing_id)
print("Missing values in 'Timeframe' column:", missing_stage)

Missing values in 'ID' column: False
Missing values in 'Timeframe' column: False


# Check column names

The column names should be consistent, so no spaces at the beginning and all starting with a capital letter. 

In [103]:
# Print the column names
print("Column names:", df.columns.tolist())
print(len(df.columns.tolist()))

Column names: ['Frame', 'Face_id', 'Confidence', 'Success', ' AU01_r', ' AU02_r', ' AU04_r', ' AU05_r', ' AU06_r', ' AU07_r', ' AU09_r', ' AU10_r', ' AU12_r', ' AU14_r', ' AU15_r', ' AU17_r', ' AU20_r', ' AU23_r', ' AU25_r', ' AU26_r', ' AU45_r', ' AU01_c', ' AU02_c', ' AU04_c', ' AU05_c', ' AU06_c', ' AU07_c', ' AU09_c', ' AU10_c', ' AU12_c', ' AU14_c', ' AU15_c', ' AU17_c', ' AU20_c', ' AU23_c', ' AU25_c', ' AU26_c', ' AU28_c', ' AU45_c', 'ID', 'Stage']
41


In [104]:
# Rename columns using a dictionary where keys are the current column names and values are the new column names
df = df.rename(columns={' AU01_r': 'AU01_r',
                        ' AU02_r': 'AU02_r',
                        ' AU04_r': 'AU04_r',
                        ' AU05_r': 'AU05_r',
                        ' AU06_r': 'AU06_r',
                        ' AU07_r': 'AU07_r',
                        ' AU09_r': 'AU09_r',
                        ' AU10_r': 'AU10_r',
                        ' AU12_r': 'AU12_r',
                        ' AU14_r': 'AU14_r',
                        ' AU15_r': 'AU15_r',
                        ' AU17_r': 'AU17_r',
                        ' AU20_r': 'AU20_r',
                        ' AU23_r': 'AU23_r',
                        ' AU25_r': 'AU25_r',
                        ' AU26_r': 'AU26_r',
                        ' AU45_r': 'AU45_r',
                        ' AU01_c': 'AU01_c',
                        ' AU02_c': 'AU02_c',
                        ' AU04_c': 'AU04_c',
                        ' AU05_c': 'AU05_c',
                        ' AU06_c': 'AU06_c',
                        ' AU07_c': 'AU07_c',
                        ' AU09_c': 'AU09_c',
                        ' AU10_c': 'AU10_c',
                        ' AU12_c': 'AU12_c',
                        ' AU14_c': 'AU14_c',
                        ' AU15_c': 'AU15_c',
                        ' AU17_c': 'AU17_c',
                        ' AU20_c': 'AU20_c',
                        ' AU23_c': 'AU23_c',
                        ' AU25_c': 'AU25_c',
                        ' AU26_c': 'AU26_c',
                        ' AU28_c': 'AU28_c',
                        ' AU45_c': 'AU45_c'
                        })

In [105]:
# Print the column names
print("Column names:", df.columns.tolist())
print(len(df.columns.tolist()))

Column names: ['Frame', 'Face_id', 'Confidence', 'Success', 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c', 'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c', 'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c', 'AU45_c', 'ID', 'Stage']
41


# Check number of unique IDs

Unfortunately, the data I received is not complete. We only have data from 184 IDs. However, we don't have information from all stages for every one of those IDs. So the data is very limited. 

In [106]:
# Get number of unique IDs
num_unique_ids = len(df['ID'].unique())

print("Number of Unique IDs:", num_unique_ids)

Number of Unique IDs: 184


# Saving my clean dataframe 

In [107]:
# Save big_dataframe as a pickle file
df.to_pickle("/Users/dionnespaltman/Desktop/V4/action_units/clean_df.pkl")

# Loading pickle file

Load it again to see if it's clean and how it's supposed to be. 

In [108]:
# Read the pickle file into a DataFrame
clean_df = pd.read_pickle('/Users/dionnespaltman/Desktop/V4/action_units/clean_df.pkl')

# Display the DataFrame
display(clean_df)

Unnamed: 0,Frame,Face_id,Confidence,Success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,...,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,ID,Stage
103,104,0.0,0.88,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
104,105,0.0,0.98,1.0,0.00,0.00,0.00,0.0,1.34,0.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
111,112,0.0,0.98,1.0,0.31,0.59,0.00,0.0,0.31,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
114,115,0.0,0.98,1.0,0.00,0.12,0.00,0.0,0.32,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80,"[4, 5, 6]"
153,154,0.0,0.98,1.0,1.26,1.67,0.00,0.0,0.39,0.26,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,80,"[4, 5, 6]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3250413,2196,0.0,0.98,1.0,0.62,0.36,0.30,0.0,0.70,0.47,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3250414,2197,0.0,0.98,1.0,0.63,0.38,0.25,0.0,0.78,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3250415,2198,0.0,0.98,1.0,0.69,0.51,0.26,0.0,0.79,0.56,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
3250416,2199,0.0,0.98,1.0,0.67,0.46,0.32,0.0,0.81,0.56,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,25,[7]
