# Processing 
In this file, the following steps will be taken: 
1. New versions of the AUs files will be created with the dropped columns. By doing this step first, we gain time when all the files will be added together.  
-  Output: "full donation data - dropped columns" 
2. Once we have a new folder with all the files with dropped columns, we will add two columns to each file. One column will be 'ID' and the other will be 'Time_point'. 
- Output: "full donation data - ID and time point"
3. Then we will create the big file in which we combine all the files in the folder 'full donation data'. 
- Output: "processed data"

In [23]:
# import 
import zipfile
import os
import pandas as pd
import csv
import socket  # Import the socket module

# Unzipping the zipped folder (only have to do this once!)

Running it once, took me around 2 minutes and 5 seconds. 
From then, the file path will be: '/Users/dionnespaltman/Desktop/downloading/full donation data - unzipped/full donation data'

In [10]:
# Define paths
zip_file_path = '/Users/dionnespaltman/Desktop/downloading/full donation data.zip'
output_folder = '/Users/dionnespaltman/Desktop/downloading/full donation data - unzipped'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Unzip the folder
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(output_folder)

print("Unzipped folder created at:", output_folder)

Unzipped folder created at: /Users/dionnespaltman/Desktop/downloading/full donation data - unzipped


# If you've unzipped already, only load the unzipped folder 

### Size of all the files 

In [16]:
import os

# Define the path to the folder
folder_path = '/Users/dionnespaltman/Desktop/downloading/full donation data - unzipped/full donation data'

# Initialize total size variable
total_size = 0

# Iterate over all files in the folder
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # Check if it's a file (not a directory)
    if os.path.isfile(file_path):
        # Get the size of the file and add it to total_size
        total_size += os.path.getsize(file_path)

# Convert total size to a human-readable format (e.g., bytes to megabytes)
total_size_mb = total_size / (1024 * 1024)  # Convert bytes to megabytes

print("Total size of all files in the folder:", total_size_mb, "MB")


Total size of all files in the folder: 18684.90839099884 MB


### Loading all the files into a dictionary 

#### I got a lot of errors when I wanted to load all files at once. So first trying to load 10 files. That took around 14.2 seconds. 

In [21]:
# Set the timeout limit to 60 seconds (adjust as needed)
socket.setdefaulttimeout(60)

# Define the path to the folder containing the unzipped files
folder_path = '/Users/dionnespaltman/Desktop/downloading/full donation data - unzipped/full donation data'

# List all files in the folder
file_names = os.listdir(folder_path)

# Read a subset of files into a dictionary
data = {}
num_files_to_read = 10  # Adjust the number of files to read as needed
for i, file_name in enumerate(file_names):
    if i >= num_files_to_read:
        break
    if file_name.endswith('.csv'):  # Assuming the files are CSV format
        file_path = os.path.join(folder_path, file_name)
        try:
            data[file_name] = pd.read_csv(file_path)
            print("File loaded successfully:", file_name)
        except Exception as e:
            print("Error loading file:", file_name, "- Error:", e)

# Now you have a dictionary 'data' containing DataFrames for each file (up to the specified number)
# Access them using keys (file names)
# For example:
print("Number of files loaded:", len(data))


File loaded successfully: 16-07.csv
File loaded successfully: 80_04,05,06.csv
File loaded successfully: 7-04-05-06.csv
File loaded successfully: 324_02.csv
File loaded successfully: 87_04,05,06.csv
File loaded successfully: 78_04,05,06.csv
File loaded successfully: 328_02.csv
File loaded successfully: 105_04 donation not completed. No blood flow.csv
File loaded successfully: 92_04,05,06.csv
File loaded successfully: 38_4,5,6.csv
Number of files loaded: 10


#### Loading 99 files took a little over 2 minutes. 

In [24]:
# Define the path to the folder containing the unzipped files
folder_path = '/Users/dionnespaltman/Desktop/downloading/full donation data - unzipped/full donation data'

# List all files in the folder
file_names = os.listdir(folder_path)

# Read a subset of files into a dictionary
data = {}
num_files_to_read = 100  # Adjust the number of files to read as needed
for i, file_name in enumerate(file_names):
    if i >= num_files_to_read:
        break
    if file_name.endswith('.csv'):  # Assuming the files are CSV format
        file_path = os.path.join(folder_path, file_name)
        try:
            data[file_name] = pd.read_csv(file_path)
            print("File loaded successfully:", file_name)
        except Exception as e:
            print("Error loading file:", file_name, "- Error:", e)

# Now you have a dictionary 'data' containing DataFrames for each file (up to the specified number)
# Access them using keys (file names)
# For example:
print("Number of files loaded:", len(data))


File loaded successfully: 16-07.csv
Error loading file: 80_04,05,06.csv - Error: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.
File loaded successfully: 7-04-05-06.csv
File loaded successfully: 324_02.csv
File loaded successfully: 87_04,05,06.csv
File loaded successfully: 78_04,05,06.csv
File loaded successfully: 328_02.csv
File loaded successfully: 105_04 donation not completed. No blood flow.csv
File loaded successfully: 92_04,05,06.csv
File loaded successfully: 38_4,5,6.csv
File loaded successfully: 129_04,05,06.csv
File loaded successfully: 85_07.csv
File loaded successfully: DSCN2370.csv
File loaded successfully: 95_04,05,06.csv
File loaded successfully: 300_02.csv
File loaded successfully: 20-07.csv
File loaded successfully: 97_07.csv
File loaded successfully: 290_01.csv
File loaded successfully: 312_02.csv
File loaded successfully: 127_03.csv
File loaded successfully: 40_07.csv
File loaded successfully: 101_07.csv
File loaded success

#### Loading all files (took a little less than 8 minutes)

In [25]:
# Define the path to the folder containing the unzipped files
folder_path = '/Users/dionnespaltman/Desktop/downloading/full donation data - unzipped/full donation data'

# List all files in the folder
file_names = os.listdir(folder_path)

# Read a subset of files into a dictionary
data = {}
num_files_to_read = 450  # Adjust the number of files to read as needed
for i, file_name in enumerate(file_names):
    if i >= num_files_to_read:
        break
    if file_name.endswith('.csv'):  # Assuming the files are CSV format
        file_path = os.path.join(folder_path, file_name)
        try:
            data[file_name] = pd.read_csv(file_path)
            print("File loaded successfully:", file_name)
        except Exception as e:
            print("Error loading file:", file_name, "- Error:", e)

# Now you have a dictionary 'data' containing DataFrames for each file (up to the specified number)
# Access them using keys (file names)
# For example:
print("Number of files loaded:", len(data))


File loaded successfully: 16-07.csv
File loaded successfully: 80_04,05,06.csv
File loaded successfully: 7-04-05-06.csv
File loaded successfully: 324_02.csv
File loaded successfully: 87_04,05,06.csv
File loaded successfully: 78_04,05,06.csv
File loaded successfully: 328_02.csv
File loaded successfully: 105_04 donation not completed. No blood flow.csv
File loaded successfully: 92_04,05,06.csv
File loaded successfully: 38_4,5,6.csv
File loaded successfully: 129_04,05,06.csv
File loaded successfully: 85_07.csv
File loaded successfully: DSCN2370.csv
File loaded successfully: 95_04,05,06.csv
File loaded successfully: 300_02.csv
File loaded successfully: 20-07.csv
File loaded successfully: 97_07.csv
File loaded successfully: 290_01.csv
File loaded successfully: 312_02.csv
File loaded successfully: 127_03.csv
File loaded successfully: 40_07.csv
File loaded successfully: 101_07.csv
File loaded successfully: 118_04,05,06.csv
File loaded successfully: 14-03,04,05,06.csv
File loaded successfully: 

  data[file_name] = pd.read_csv(file_path)


File loaded successfully: 108_04,05,06.csv
File loaded successfully: 80_07.csv
File loaded successfully: 305_02.csv
File loaded successfully: 299_01.csv
Number of files loaded: 411


Double check if it worked. 

In [28]:
# Check if the dictionary is not empty
if data:
    # Get the first key (file name) and its corresponding DataFrame
    first_file_name = next(iter(data.keys()))
    first_df = data[first_file_name]
    
    # Display the DataFrame
    print("DataFrame for the first file '{}' in the dictionary:".format(first_file_name))
    display(first_df)
else:
    print("The dictionary is empty. No files loaded.")


DataFrame for the first file '16-07.csv' in the dictionary:


Unnamed: 0,frame,face_id,timestamp,confidence,success,gaze_0_x,gaze_0_y,gaze_0_z,gaze_1_x,gaze_1_y,...,AU14_c,AU15_c,AU17_c,AU20_c,AU23_c,AU25_c,AU26_c,AU28_c,AU45_c,Filename
0,1,0,0.00,0.98,1,-0.186231,-0.071084,-0.979931,-0.321020,-0.025671,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
1,2,0,0.04,0.98,1,-0.195272,-0.066113,-0.978518,-0.321741,-0.022495,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
2,3,0,0.08,0.98,1,-0.183891,-0.072790,-0.980248,-0.325824,-0.032522,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
3,4,0,0.12,0.98,1,-0.189050,-0.061480,-0.980041,-0.323833,-0.018319,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
4,5,0,0.16,0.98,1,-0.193372,-0.059505,-0.979319,-0.317909,-0.020461,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2745,2746,0,109.80,0.98,1,-0.152439,0.008936,-0.988272,-0.382937,0.032291,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
2746,2747,0,109.84,0.98,1,-0.148107,0.010306,-0.988918,-0.384507,0.019742,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
2747,2748,0,109.88,0.98,1,-0.154183,0.015469,-0.987921,-0.387649,0.025655,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv
2748,2749,0,109.92,0.98,1,-0.152587,0.013783,-0.988194,-0.396512,0.031367,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,16-07.csv


# Saving my progress so far (dictionary with all files including the filename as a column)

Saving the dictionary took a bit more than 3 minutes. 

In [30]:
import pickle

# Specify the file path to save the dictionary
save_path = '/Users/dionnespaltman/Desktop/downloading/data_dictionary.pkl'

# Save the dictionary to a file using pickle
with open(save_path, 'wb') as file:
    pickle.dump(data, file)

print("Dictionary saved to:", save_path)


Dictionary saved to: /Users/dionnespaltman/Desktop/downloading/data_dictionary.pkl


# Dropping columns 

### First creating the variable that contains all the column names that need to be deleted 

First getting a list of all column names. 

In [31]:
# Check if the dictionary is not empty
if data:
    # Get the first key (file name) and its corresponding DataFrame
    first_file_name = next(iter(data.keys()))
    first_df = data[first_file_name]
    
    # Print the column names of the DataFrame
    print("Column names of the first file '{}' in the dictionary:".format(first_file_name))
    print(first_df.columns.tolist())
else:
    print("The dictionary is empty. No files loaded.")


Column names of the first file '16-07.csv' in the dictionary:
['frame', ' face_id', ' timestamp', ' confidence', ' success', ' gaze_0_x', ' gaze_0_y', ' gaze_0_z', ' gaze_1_x', ' gaze_1_y', ' gaze_1_z', ' gaze_angle_x', ' gaze_angle_y', ' eye_lmk_x_0', ' eye_lmk_x_1', ' eye_lmk_x_2', ' eye_lmk_x_3', ' eye_lmk_x_4', ' eye_lmk_x_5', ' eye_lmk_x_6', ' eye_lmk_x_7', ' eye_lmk_x_8', ' eye_lmk_x_9', ' eye_lmk_x_10', ' eye_lmk_x_11', ' eye_lmk_x_12', ' eye_lmk_x_13', ' eye_lmk_x_14', ' eye_lmk_x_15', ' eye_lmk_x_16', ' eye_lmk_x_17', ' eye_lmk_x_18', ' eye_lmk_x_19', ' eye_lmk_x_20', ' eye_lmk_x_21', ' eye_lmk_x_22', ' eye_lmk_x_23', ' eye_lmk_x_24', ' eye_lmk_x_25', ' eye_lmk_x_26', ' eye_lmk_x_27', ' eye_lmk_x_28', ' eye_lmk_x_29', ' eye_lmk_x_30', ' eye_lmk_x_31', ' eye_lmk_x_32', ' eye_lmk_x_33', ' eye_lmk_x_34', ' eye_lmk_x_35', ' eye_lmk_x_36', ' eye_lmk_x_37', ' eye_lmk_x_38', ' eye_lmk_x_39', ' eye_lmk_x_40', ' eye_lmk_x_41', ' eye_lmk_x_42', ' eye_lmk_x_43', ' eye_lmk_x_44', ' eye_lm

I am interested in all the fields that have AU and end with 'r', e.g. AU45_r - these columns show the intensity of each extracted AU. 
So if it includes ... 

### The function to drop the columns + doing so 

In [None]:
# DOES NOT WORK YET

# Define the file path and the folder to save modified files
input_file_path = '/Users/dionnespaltman/Desktop/downloading/full donation data - unzipped'
output_folder = '/Users/dionnespaltman/Desktop/downloading/full donation data - dropped columns'

# Function to drop columns 
def drop_columns(file_name): 
    #name_file = os.path.basename(file_path)
    df = pd.read_csv(input_file_path + '/' + file_name)
    df_dropped_columns = df.drop(columns=columns_to_drop)
    return df_dropped_columns 

# List all files in the directory
file_names = os.listdir(input_file_path)

# Loop through each file
for file_name in file_names:
    if file_name.endswith('.csv'):
        # Call the function to drop columns and save the modified file
        df_dropped_columns = drop_columns(os.path.join(output_folder, file_name))
                
        # Save the modified DataFrame to the output folder with the original file name
        output_file_path = os.path.join('/Users/dionnespaltman/Desktop/downloading/full donation data - dropped columns', file_name)
        df_dropped_columns.to_csv(output_file_path, index=False)

In [10]:
file_path = '/Users/dionnespaltman/Desktop/downloading/full donation data.zip'
name_file = os.path.basename(file_path)

print(name_file)

full donation data.zip


See if it really worked: 

In [7]:
path_name = "/Users/dionnespaltman/Desktop/downloading/full donation data - dropped columns/full donation data/5-03-04-05-06.csv"

df_example = pd.read_csv(path_name)

# Get the number of rows and columns
num_rows, num_cols = df_example.shape

# Print the number of rows and columns
print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

Number of rows: 23000
Number of columns: 714


In [None]:
display(df_example)

# Adding ID 

In [2]:


output_folder = '/Users/dionnespaltman/Desktop/downloading/modified_data'

# Get list of files in the modified_data folder
file_names = os.listdir(output_folder)

# Iterate over each file
for file_name in file_names:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(os.path.join(output_folder, file_name))
    
    # Extract ID and timeframe from file name
    ID = ''
    timeframe = ''
    parts = file_name.split('_')
    if len(parts) > 1:  # If underscore is present
        ID = parts[0]  # ID is the part before the underscore
        timeframe = parts[1].split('.')[0]  # Timeframe is the part after the underscore, removing the file extension
    else:
        parts = file_name.split('-')
        ID = parts[0][:3]  # ID is the first three characters before the hyphen
        timeframe = '-'.join(parts[1:]).split('.')[0]  # Timeframe is the part after the hyphen, removing the file extension
    
    # Add ID and timeframe columns
    df['ID'] = ID
    df['Timeframe'] = timeframe
    
    # Display the DataFrame
    pd.set_option('display.max_columns', None)  # Display all columns
    pd.set_option('display.expand_frame_repr', False)  # Do not wrap DataFrame


# Adding timeframe 

# Creating processed_data.csv