In [3]:
# Import necessary modules
import numpy as np
import pandas as pd
import os
from itertools import product  # This line imports the product function

## This is the preprocessed python scripit given when extracted

In [3]:
# Define the preprocessed folder and file index
preprocessed_dir = 'preprocessed/'
file_idx = 78

# Channels
channels = ['Dribbiling', 'Taste/smelling', 'Swallowing', 'Vomiting', 'Constipation',
            'Bowel inconsistence', 'Bowel emptying incomplete', 'Urgency', 'Nocturia', 'Pains',
            'Weight', 'Remembering', 'Loss of interest', 'Hallucinations', 'Concentrating',
            'Sad, blues', 'Anxiety', 'Sex drive', 'Sex difficulty', 'Dizzy',
            'Falling', 'Daytime sleepiness', 'Insomnia', 'Intense vivid dreams', 'Acting out during dreams',
            'Restless legs', 'Swelling', 'Sweating', 'Diplopia', 'Delusions']

# Construct the path for the file_list.csv
file_list_path = os.path.join(preprocessed_dir, 'file_list.csv')

# Check if the file exists
if not os.path.exists(file_list_path):
    print(f"Error: The file '{file_list_path}' does not exist!")
else:
    # Read the file list CSV
    df = pd.read_csv(file_list_path)

    # Filter for the subject with the specified file index
    subject = df[df['id'] == file_idx].reset_index().loc[0, :]
    print(subject)

    # Construct the path for the binary data file
    bin_file_path = os.path.join(preprocessed_dir, 'questionnaire', f'{file_idx:03d}_ml.bin')

    # Check if the binary file exists
    if not os.path.exists(bin_file_path):
        print(f"Error: The binary file '{bin_file_path}' does not exist!")
    else:
        # Read the binary file data as np.float32
        data = np.fromfile(bin_file_path, dtype=np.float32)
        
        # Transform binary values into 'yes' and 'no'
        data = np.where(data == 1.0, 'yes', 'no')

        # Convert the data to a DataFrame
        data_df = pd.DataFrame([data], columns=channels)

        # Save DataFrame to CSV
        output_csv_path = os.path.join(preprocessed_dir, f'hc_{file_idx:03d}_questionnaire.csv')
        #data_df.to_csv(output_csv_path, index=False)

        print(f"Data saved as '{output_csv_path}'")


index                                     77
resource_type                        patient
id                                        78
study_id                                PADS
condition                            Healthy
disease_comment                            -
age_at_diagnosis                          63
age                                       63
height                                   154
weight                                    56
gender                                female
handedness                              left
appearance_in_kinship                  False
appearance_in_first_grade_kinship      False
effect_of_alcohol_on_tremor          Unknown
label                                      0
Name: 0, dtype: object
Data saved as 'preprocessed/hc_078_questionnaire.csv'


In [None]:
# Define the preprocessed folder and channels
preprocessed_dir = 'preprocessed/'
questionnaire_dir = os.path.join(preprocessed_dir, 'questionnaire')
channels = [
    'Dribbling', 'Taste/smelling', 'Swallowing', 'Vomiting', 'Constipation',
    'Bowel inconsistence', 'Bowel emptying incomplete', 'Urgency', 'Nocturia', 'Pains',
    'Weight', 'Remembering', 'Loss of interest', 'Hallucinations', 'Concentrating',
    'Sad, blues', 'Anxiety', 'Sex drive', 'Sex difficulty', 'Dizzy',
    'Falling', 'Daytime sleepiness', 'Insomnia', 'Intense vivid dreams', 'Acting out during dreams',
    'Restless legs', 'Swelling', 'Sweating', 'Diplopia', 'Delusions'
]

# Construct the path for the file_list.csv
file_list_path = os.path.join(preprocessed_dir, 'file_list.csv')

# Check if file_list.csv exists
if not os.path.exists(file_list_path):
    print(f"Error: The file '{file_list_path}' does not exist!")
else:
    # Read the file list CSV
    df = pd.read_csv(file_list_path)

    # Initialize an empty DataFrame to hold all data
    all_data_df = pd.DataFrame(columns=['id'] + channels)

    # Process all .bin files in the questionnaire directory
    for _, row in df.iterrows():
        file_idx = int(row['id'])
        bin_file_path = os.path.join(questionnaire_dir, f'{file_idx:03d}_ml.bin')

        if not os.path.exists(bin_file_path):
            print(f"Warning: The binary file '{bin_file_path}' does not exist, skipping...")
            continue

        # Read the binary file data as np.float32
        data = np.fromfile(bin_file_path, dtype=np.float32)

        # Transform binary values into 'yes' and 'no'
        data = np.where(data == 1.0, 'yes', 'no')

        # Add the data to the cumulative DataFrame
        row_data = pd.DataFrame([[file_idx] + data.tolist()], columns=['id'] + channels)
        all_data_df = pd.concat([all_data_df, row_data], ignore_index=True)

    # Save the consolidated DataFrame to a single CSV file
    consolidated_csv_path = os.path.join(preprocessed_dir, 'all_questionnaire_data.csv')
    all_data_df.to_csv(consolidated_csv_path, index=False)
    print(f"Consolidated data saved as '{consolidated_csv_path}'")


In [7]:
# Define the preprocessed folder and channels
preprocessed_dir = 'preprocessed/'
questionnaire_dir = os.path.join(preprocessed_dir, 'questionnaire')
channels = [
    'Dribbling', 'Taste/smelling', 'Swallowing', 'Vomiting', 'Constipation',
    'Bowel inconsistence', 'Bowel emptying incomplete', 'Urgency', 'Nocturia', 'Pains',
    'Weight', 'Remembering', 'Loss of interest', 'Hallucinations', 'Concentrating',
    'Sad, blues', 'Anxiety', 'Sex drive', 'Sex difficulty', 'Dizzy',
    'Falling', 'Daytime sleepiness', 'Insomnia', 'Intense vivid dreams', 'Acting out during dreams',
    'Restless legs', 'Swelling', 'Sweating', 'Diplopia', 'Delusions'
]

# Construct the path for the file_list.csv
file_list_path = os.path.join(preprocessed_dir, 'file_list.csv')

# Check if file_list.csv exists
if not os.path.exists(file_list_path):
    print(f"Error: The file '{file_list_path}' does not exist!")
else:
    # Read the file list CSV
    df = pd.read_csv(file_list_path)

    # Initialize an empty DataFrame to hold all data
    all_data_df = pd.DataFrame(columns=['id'] + channels)

    # Process all .bin files in the questionnaire directory
    for _, row in df.iterrows():
        file_idx = int(row['id'])
        bin_file_path = os.path.join(questionnaire_dir, f'{file_idx:03d}_ml.bin')

        if not os.path.exists(bin_file_path):
            print(f"Warning: The binary file '{bin_file_path}' does not exist, skipping...")
            continue

        # Read the binary file data as np.float32
        data = np.fromfile(bin_file_path, dtype=np.float32)

        # Add the data to the cumulative DataFrame
        row_data = pd.DataFrame([[file_idx] + data.tolist()], columns=['id'] + channels)
        all_data_df = pd.concat([all_data_df, row_data], ignore_index=True)

    # Save the consolidated DataFrame to a single CSV file
    consolidated_csv_path = os.path.join(preprocessed_dir, 'all_questionnaire_data.csv')
    all_data_df.to_csv(consolidated_csv_path, index=False)
    print(f"Consolidated data saved as '{consolidated_csv_path}'")


  all_data_df = pd.concat([all_data_df, row_data], ignore_index=True)


Consolidated data saved as 'preprocessed/all_questionnaire_data.csv'


In [9]:
# Define the preprocessed folder and movement channels
preprocessed_dir = 'preprocessed/'
movement_dir = os.path.join(preprocessed_dir, 'movement')

# Function to load all files from a directory
def load_all_files(directory):
    file_list = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.bin'):
            file_list.append(os.path.join(directory, filename))
    return file_list

# Check if movement directory exists
if not os.path.exists(movement_dir):
    print(f"Error: The directory '{movement_dir}' does not exist!")
else:
    # Load all binary files from the movement directory
    bin_files = load_all_files(movement_dir)

    # Initialize an empty DataFrame to hold all data
    all_data_df = pd.DataFrame()

    for bin_file_path in bin_files:
        # Extract file index from the file name
        file_name = os.path.basename(bin_file_path)
        file_idx = int(file_name.split('_')[0])

        # Read the binary file data as np.float32
        data = np.fromfile(bin_file_path, dtype=np.float32)

        # Convert the binary data into a DataFrame row
        row_data = pd.DataFrame([[file_idx] + data.tolist()], columns=['id'] + [f'feature_{i+1}' for i in range(len(data))])
        all_data_df = pd.concat([all_data_df, row_data], ignore_index=True)

    # Save the consolidated DataFrame to a single CSV file
    consolidated_csv_path = os.path.join(preprocessed_dir, 'movement_file_list.csv')
    all_data_df.to_csv(consolidated_csv_path, index=False, sep=',')
    print(f"Consolidated movement data saved as '{consolidated_csv_path}'")


Consolidated movement data saved as 'preprocessed/movement_file_list.csv'


In [35]:
# Directory where the binary files are stored
movement_dir = 'preprocessed/movement/'

# Task, wrist, sensor, and axis options
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold",
         "HoldWeight", "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Generate the channel names by combining task, wrist, sensor, and axis
channels = [f"{task}_{wrist}_{sensor}_{axis}" for task, wrist, sensor, axis in 
            product(tasks, wrists, sensors, axes)]

# Verify the number of channels
print(f"Number of channels: {len(channels)}")

# Function to process each binary file
def process_movement_file(file_path, channels):
    data = np.fromfile(file_path, dtype=np.float32)
    
    # Check if the length of data matches the number of channels
    print(f"File '{file_path}' contains {len(data)} data points.")
    
    # If the data length doesn't match, we attempt reshaping
    expected_data_points = len(channels)
    if len(data) % expected_data_points != 0:
        print(f"Warning: Data length in file '{file_path}' does not divide evenly by {expected_data_points}.")
        # Handle reshaping: Try reshaping the data into multiple rows corresponding to time steps
        num_samples = len(data) // expected_data_points
        reshaped_data = data[:num_samples * expected_data_points].reshape((num_samples, expected_data_points))
    else:
        reshaped_data = data.reshape((-1, expected_data_points))  # Reshape into the expected number of channels per time step
    
    # Create a DataFrame with reshaped data
    data_df = pd.DataFrame(reshaped_data, columns=channels)
    
    return data_df

# List of CSV outputs to save
output_csv_files = []

# Process each binary file in the directory
for file_name in sorted(os.listdir(movement_dir)):
    if file_name.endswith('.bin'):
        file_path = os.path.join(movement_dir, file_name)
        
        # Process the binary file and map data to channels
        result_df = process_movement_file(file_path, channels)
        
        if result_df is not None:
            # Construct output CSV path
            output_csv_path = file_path.replace('.bin', '_movement.csv')
            result_df.to_csv(output_csv_path, index=False, sep=',')
            print(f"Data for {file_name} saved as '{output_csv_path}'")
            output_csv_files.append(output_csv_path)

# Combine all the CSVs into a single file (optional)
combined_df = pd.concat([pd.read_csv(file) for file in output_csv_files], ignore_index=True)
combined_df.to_csv('combined_movement_data.csv', index=False, sep=',')
print("Combined data saved as 'combined_movement_data.csv'")


Number of channels: 132
File 'preprocessed/movement/001_ml.bin' contains 128832 data points.
Data for 001_ml.bin saved as 'preprocessed/movement/001_ml_movement.csv'
File 'preprocessed/movement/002_ml.bin' contains 128832 data points.
Data for 002_ml.bin saved as 'preprocessed/movement/002_ml_movement.csv'
File 'preprocessed/movement/003_ml.bin' contains 128832 data points.
Data for 003_ml.bin saved as 'preprocessed/movement/003_ml_movement.csv'
File 'preprocessed/movement/004_ml.bin' contains 128832 data points.
Data for 004_ml.bin saved as 'preprocessed/movement/004_ml_movement.csv'
File 'preprocessed/movement/005_ml.bin' contains 128832 data points.
Data for 005_ml.bin saved as 'preprocessed/movement/005_ml_movement.csv'
File 'preprocessed/movement/006_ml.bin' contains 128832 data points.
Data for 006_ml.bin saved as 'preprocessed/movement/006_ml_movement.csv'
File 'preprocessed/movement/007_ml.bin' contains 128832 data points.
Data for 007_ml.bin saved as 'preprocessed/movement/007

KeyboardInterrupt: 

In [1]:
# Directory where the binary files are stored
movement_dir = 'preprocessed/movement/'

def infer_channels_from_file(file_path, tasks, wrists, sensors, axes):
    """Infer the channel configuration and validate data points in the binary file."""
    # Generate channel names dynamically
    channels = [f"{task}_{wrist}_{sensor}_{axis}" for task, wrist, sensor, axis in 
                product(tasks, wrists, sensors, axes)]

    # Read binary data
    data = np.fromfile(file_path, dtype=np.float32)
    print(f"File '{file_path}' contains {len(data)} data points.")

    # Infer the expected number of data points per time step (channels)
    expected_data_points = len(channels)
    print(f"Expected data points per time step: {expected_data_points}")

    if len(data) % expected_data_points != 0:
        print(f"Warning: Data length in file '{file_path}' does not divide evenly by {expected_data_points}.")
        num_samples = len(data) // expected_data_points
        reshaped_data = data[:num_samples * expected_data_points].reshape((num_samples, expected_data_points))
    else:
        reshaped_data = data.reshape((-1, expected_data_points))

    # Create a DataFrame to validate channels
    data_df = pd.DataFrame(reshaped_data, columns=channels)
    print("Sample of inferred data:")
    print(data_df.head())

    return channels, reshaped_data

# Task, wrist, sensor, and axis options
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold",
         "HoldWeight", "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Select the first binary file to infer channels and data points
binary_files = [f for f in sorted(os.listdir(movement_dir)) if f.endswith('.bin')]
if not binary_files:
    raise FileNotFoundError("No .bin files found in the directory.")

# Analyze the first binary file
first_file_path = os.path.join(movement_dir, binary_files[0])
print(f"Analyzing first file: {first_file_path}")
channels, data = infer_channels_from_file(first_file_path, tasks, wrists, sensors, axes)

print(f"Inferred {len(channels)} channels:")
print(channels)


Analyzing first file: preprocessed/movement/001_ml.bin
File 'preprocessed/movement/001_ml.bin' contains 128832 data points.
Expected data points per time step: 132
Sample of inferred data:
   Relaxed1_Left_Accelerometer_X  Relaxed1_Left_Accelerometer_Y  \
0                      -0.007183                      -0.007979   
1                       0.001043                       0.001997   
2                      -0.001907                       0.000037   
3                       0.002346                       0.002297   
4                      -0.000287                      -0.000247   

   Relaxed1_Left_Accelerometer_Z  Relaxed1_Left_Gyroscope_X  \
0                      -0.007813                  -0.006712   
1                       0.002999                   0.003058   
2                       0.001010                   0.002977   
3                       0.002318                   0.003289   
4                      -0.001204                  -0.003151   

   Relaxed1_Left_Gyroscope_Y 

In [3]:
import numpy as np
import os
import pandas as pd
from itertools import product

# Directory where the binary files are stored
movement_dir = 'preprocessed/movement/'

# Task, wrist, sensor, and axis options
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold",
         "HoldWeight", "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Generate the channel names by combining task, wrist, sensor, and axis
channels = [f"{task}_{wrist}_{sensor}_{axis}" for task, wrist, sensor, axis in 
            product(tasks, wrists, sensors, axes)]
expected_data_points = len(channels)  # Number of channels

# Function to process and extract data from a binary file
def extract_data(file_path, channels, expected_data_points):
    data = np.fromfile(file_path, dtype=np.float32)
    total_data_points = len(data)
    print(f"Processing file '{file_path}' with {total_data_points} data points.")
    
    # Check if the total data points divide evenly by the number of channels
    if total_data_points % expected_data_points != 0:
        print(f"Warning: Data points ({total_data_points}) do not divide evenly by {expected_data_points}.")
        # Truncate to the largest divisible portion
        valid_length = (total_data_points // expected_data_points) * expected_data_points
        data = data[:valid_length]
        print(f"Data truncated to {valid_length} points.")
    
    # Calculate the number of time steps
    num_time_steps = len(data) // expected_data_points
    print(f"Number of time steps: {num_time_steps}")
    
    # Reshape the data into a 2D array
    reshaped_data = data.reshape((num_time_steps, expected_data_points))
    
    # Create a DataFrame with channel names as column headers
    data_df = pd.DataFrame(reshaped_data, columns=channels)
    
    return data_df

# Process each binary file in the directory
output_csv_files = []
for file_name in sorted(os.listdir(movement_dir)):
    if file_name.endswith('.bin'):
        file_path = os.path.join(movement_dir, file_name)
        result_df = extract_data(file_path, channels, expected_data_points)
        
        # Save the result as a CSV file
        output_csv_path = file_path.replace('.bin', '_movement.csv')
        result_df.to_csv(output_csv_path, index=False, sep=',')
        print(f"Data saved to '{output_csv_path}'.")
        output_csv_files.append(output_csv_path)

# Optional: Combine all CSVs into one file
'''combined_df = pd.concat([pd.read_csv(file) for file in output_csv_files], ignore_index=True)
combined_csv_path = 'combined_movement_data.csv'
combined_df.to_csv(combined_csv_path, index=False, sep=',')
print(f"Combined data saved as '{combined_csv_path}'.")'''


'combined_df = pd.concat([pd.read_csv(file) for file in output_csv_files], ignore_index=True)\ncombined_csv_path = \'combined_movement_data.csv\'\ncombined_df.to_csv(combined_csv_path, index=False, sep=\',\')\nprint(f"Combined data saved as \'{combined_csv_path}\'.")'

In [7]:
# Directory where the binary files are stored
movement_dir = 'preprocessed/movement/'

# Task, wrist, sensor, and axis options
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold",
         "HoldWeight", "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Generate the channel names by combining task, wrist, sensor, and axis
channels = [f"{task}_{wrist}_{sensor}_{axis}" for task, wrist, sensor, axis in 
            product(tasks, wrists, sensors, axes)]
expected_data_points = len(channels)  # Number of channels

# Function to process and extract data from a binary file
def extract_data(file_path, channels, expected_data_points):
    data = np.fromfile(file_path, dtype=np.float32)
    total_data_points = len(data)
    print(f"Processing file '{file_path}' with {total_data_points} data points.")
    
    # Check if the total data points divide evenly by the number of channels
    if total_data_points % expected_data_points != 0:
        print(f"Warning: Data points ({total_data_points}) do not divide evenly by {expected_data_points}.")
        # Truncate to the largest divisible portion
        valid_length = (total_data_points // expected_data_points) * expected_data_points
        data = data[:valid_length]
        print(f"Data truncated to {valid_length} points.")
    
    # Calculate the number of time steps
    num_time_steps = len(data) // expected_data_points
    print(f"Number of time steps: {num_time_steps}")
    
    # Reshape the data into a 2D array
    reshaped_data = data.reshape((num_time_steps, expected_data_points))
    
    # Create a DataFrame with channel names as column headers
    data_df = pd.DataFrame(reshaped_data, columns=channels)
    
    return data_df

# Process each binary file in the directory
output_csv_files = []
for file_name in sorted(os.listdir(movement_dir)):
    if file_name.endswith('.bin'):
        file_path = os.path.join(movement_dir, file_name)
        result_df = extract_data(file_path, channels, expected_data_points)
        
        # Save the result as a CSV file
        output_csv_path = file_path.replace('.bin', '_movement.csv')
        result_df.to_csv(output_csv_path, index=False, sep=',')
        print(f"Data saved to '{output_csv_path}'.")
        output_csv_files.append(output_csv_path)

# Optional: Combine all CSVs into one file
combined_df = pd.concat([pd.read_csv(file) for file in output_csv_files], ignore_index=True)
combined_csv_path = 'combined_movement_data.csv'
combined_df.to_csv(combined_csv_path, index=False, sep=',')
print(f"Combined data saved as '{combined_csv_path}'.")


Processing file 'preprocessed/movement/001_ml.bin' with 128832 data points.
Number of time steps: 976
Data saved to 'preprocessed/movement/001_ml_movement.csv'.
Processing file 'preprocessed/movement/002_ml.bin' with 128832 data points.
Number of time steps: 976
Data saved to 'preprocessed/movement/002_ml_movement.csv'.
Processing file 'preprocessed/movement/003_ml.bin' with 128832 data points.
Number of time steps: 976
Data saved to 'preprocessed/movement/003_ml_movement.csv'.
Processing file 'preprocessed/movement/004_ml.bin' with 128832 data points.
Number of time steps: 976
Data saved to 'preprocessed/movement/004_ml_movement.csv'.
Processing file 'preprocessed/movement/005_ml.bin' with 128832 data points.
Number of time steps: 976
Data saved to 'preprocessed/movement/005_ml_movement.csv'.
Processing file 'preprocessed/movement/006_ml.bin' with 128832 data points.
Number of time steps: 976
Data saved to 'preprocessed/movement/006_ml_movement.csv'.
Processing file 'preprocessed/move

## Final movement bin extract

In [11]:
# Directory where the binary files are stored
movement_dir = 'preprocessed/movement/'

# Task, wrist, sensor, and axis options
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold",
         "HoldWeight", "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Generate the channel names by combining task, wrist, sensor, and axis
channels = [f"{task}_{wrist}_{sensor}_{axis}" for task, wrist, sensor, axis in 
            product(tasks, wrists, sensors, axes)]
expected_data_points = len(channels)  # Number of channels

# Function to process and extract data from a binary file
def extract_data(file_path, channels, expected_data_points):
    data = np.fromfile(file_path, dtype=np.float32)
    total_data_points = len(data)
    print(f"Processing file '{file_path}' with {total_data_points} data points.")
    
    # Check if the total data points divide evenly by the number of channels
    if total_data_points % expected_data_points != 0:
        print(f"Warning: Data points ({total_data_points}) do not divide evenly by {expected_data_points}.")
        # Truncate to the largest divisible portion
        valid_length = (total_data_points // expected_data_points) * expected_data_points
        data = data[:valid_length]
        print(f"Data truncated to {valid_length} points.")
    
    # Calculate the number of time steps
    num_time_steps = len(data) // expected_data_points
    print(f"Number of time steps: {num_time_steps}")
    
    # Reshape the data into a 2D array
    reshaped_data = data.reshape((num_time_steps, expected_data_points))
    
    # Create a DataFrame with channel names as column headers
    data_df = pd.DataFrame(reshaped_data, columns=channels)
    
    return data_df

# Process each binary file in the directory and store results in a dictionary
processed_data = {}
for file_name in sorted(os.listdir(movement_dir)):
    if file_name.endswith('.bin'):
        file_path = os.path.join(movement_dir, file_name)
        result_df = extract_data(file_path, channels, expected_data_points)
        
        # Store the DataFrame in the dictionary
        processed_data[file_name] = result_df
        print(f"Data for '{file_name}' processed and stored in memory.")

# Optional: Combine all data into a single DataFrame
combined_df = pd.concat(processed_data.values(), ignore_index=True)
print("Combined data stored in memory as a single DataFrame.")


Processing file 'preprocessed/movement/001_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '001_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/002_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '002_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/003_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '003_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/004_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '004_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/005_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '005_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/006_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '006_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/007_ml.bin' with 128832 d

In [13]:
combined_df 

Unnamed: 0,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,Relaxed1_Right_Accelerometer_Y,Relaxed1_Right_Accelerometer_Z,Relaxed1_Right_Gyroscope_X,...,Entrainment2_Left_Accelerometer_Z,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z
0,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,-0.005024,-0.005753,-0.005496,...,0.000559,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819
1,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,-0.000653,0.000377,-0.000588,...,0.001436,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884
2,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,-0.000555,-0.001443,0.000603,...,-0.004281,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356
3,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,0.002417,0.002434,0.003462,...,-0.001967,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652
4,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,0.001448,0.003319,0.002320,...,-0.002292,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457739,-0.028380,-0.030496,-0.027287,-0.025132,-0.021923,-0.016598,-0.013408,-0.014465,-0.010212,0.003600,...,0.088472,0.085307,0.081067,0.074700,0.060897,0.048153,0.032222,0.020531,0.006726,-0.028336
457740,-0.051722,-0.078292,-0.091057,-0.111241,-0.126104,-0.137775,-0.145190,-0.150486,-0.145166,-0.141974,...,0.036328,0.026759,0.029935,0.047994,0.048008,0.053333,0.065027,0.088408,0.117115,0.141585
457741,0.157558,0.154404,0.144858,0.127850,0.102335,0.085313,0.083160,0.077828,0.067208,0.040647,...,-0.014567,-0.005010,-0.000765,0.002415,0.017275,0.030004,0.044854,0.055456,0.059699,0.054413
457742,0.064008,0.081033,0.104426,0.105513,0.105531,0.098107,0.095981,0.098097,0.086409,0.066226,...,0.025794,0.035347,0.048082,0.056569,0.062938,0.062925,0.072472,0.067174,0.055518,0.059804


In [15]:
# Directory where the binary files are stored
movement_dir = 'preprocessed/movement/'

# Task, wrist, sensor, and axis options
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold",
         "HoldWeight", "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Generate the channel names by combining task, wrist, sensor, and axis
channels = [f"{task}_{wrist}_{sensor}_{axis}" for task, wrist, sensor, axis in 
            product(tasks, wrists, sensors, axes)]
expected_data_points = len(channels)  # Number of channels

# Function to process and extract data from a binary file
def extract_data(file_path, channels, expected_data_points):
    data = np.fromfile(file_path, dtype=np.float32)
    total_data_points = len(data)
    print(f"Processing file '{file_path}' with {total_data_points} data points.")
    
    # Check if the total data points divide evenly by the number of channels
    if total_data_points % expected_data_points != 0:
        print(f"Warning: Data points ({total_data_points}) do not divide evenly by {expected_data_points}.")
        # Truncate to the largest divisible portion
        valid_length = (total_data_points // expected_data_points) * expected_data_points
        data = data[:valid_length]
        print(f"Data truncated to {valid_length} points.")
    
    # Calculate the number of time steps
    num_time_steps = len(data) // expected_data_points
    print(f"Number of time steps: {num_time_steps}")
    
    # Reshape the data into a 2D array
    reshaped_data = data.reshape((num_time_steps, expected_data_points))
    
    # Create a DataFrame with channel names as column headers
    data_df = pd.DataFrame(reshaped_data, columns=channels)
    
    # Add an ID column to identify the file's data points
    data_df.insert(0, 'ID', range(1, num_time_steps + 1))
    
    return data_df

# Process each binary file in the directory and store results in a dictionary
processed_data = {}
for file_name in sorted(os.listdir(movement_dir)):
    if file_name.endswith('.bin'):
        file_path = os.path.join(movement_dir, file_name)
        result_df = extract_data(file_path, channels, expected_data_points)
        
        # Add a column to specify the file name as an identifier
        result_df['File'] = file_name
        
        # Store the DataFrame in the dictionary
        processed_data[file_name] = result_df
        print(f"Data for '{file_name}' processed and stored in memory.")

# Optional: Combine all data into a single DataFrame
combined_df = pd.concat(processed_data.values(), ignore_index=True)
print("Combined data stored in memory as a single DataFrame.")


Processing file 'preprocessed/movement/001_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '001_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/002_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '002_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/003_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '003_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/004_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '004_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/005_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '005_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/006_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '006_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/007_ml.bin' with 128832 d

In [17]:
combined_df

Unnamed: 0,ID,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,Relaxed1_Right_Accelerometer_Y,Relaxed1_Right_Accelerometer_Z,...,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z,File
0,1,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,-0.005024,-0.005753,...,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819,001_ml.bin
1,2,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,-0.000653,0.000377,...,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884,001_ml.bin
2,3,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,-0.000555,-0.001443,...,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356,001_ml.bin
3,4,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,0.002417,0.002434,...,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652,001_ml.bin
4,5,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,0.001448,0.003319,...,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238,001_ml.bin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457739,972,-0.028380,-0.030496,-0.027287,-0.025132,-0.021923,-0.016598,-0.013408,-0.014465,-0.010212,...,0.085307,0.081067,0.074700,0.060897,0.048153,0.032222,0.020531,0.006726,-0.028336,469_ml.bin
457740,973,-0.051722,-0.078292,-0.091057,-0.111241,-0.126104,-0.137775,-0.145190,-0.150486,-0.145166,...,0.026759,0.029935,0.047994,0.048008,0.053333,0.065027,0.088408,0.117115,0.141585,469_ml.bin
457741,974,0.157558,0.154404,0.144858,0.127850,0.102335,0.085313,0.083160,0.077828,0.067208,...,-0.005010,-0.000765,0.002415,0.017275,0.030004,0.044854,0.055456,0.059699,0.054413,469_ml.bin
457742,975,0.064008,0.081033,0.104426,0.105513,0.105531,0.098107,0.095981,0.098097,0.086409,...,0.035347,0.048082,0.056569,0.062938,0.062925,0.072472,0.067174,0.055518,0.059804,469_ml.bin


In [40]:
# Directory where the binary files are stored
movement_dir = 'preprocessed/movement/'

# Task, wrist, sensor, and axis options
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold",
         "HoldWeight", "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Generate the channel names by combining task, wrist, sensor, and axis
channels = [f"{task}_{wrist}_{sensor}_{axis}" for task, wrist, sensor, axis in 
            product(tasks, wrists, sensors, axes)]
expected_data_points = len(channels)  # Number of channels

# Function to process and extract data from a binary file
def extract_data(file_path, channels, expected_data_points):
    data = np.fromfile(file_path, dtype=np.float32)
    total_data_points = len(data)
    print(f"Processing file '{file_path}' with {total_data_points} data points.")
    
    # Check if the total data points divide evenly by the number of channels
    if total_data_points % expected_data_points != 0:
        print(f"Warning: Data points ({total_data_points}) do not divide evenly by {expected_data_points}.")
        # Truncate to the largest divisible portion
        valid_length = (total_data_points // expected_data_points) * expected_data_points
        data = data[:valid_length]
        print(f"Data truncated to {valid_length} points.")
    
    # Calculate the number of time steps
    num_time_steps = len(data) // expected_data_points
    print(f"Number of time steps: {num_time_steps}")
    
    # Reshape the data into a 2D array
    reshaped_data = data.reshape((num_time_steps, expected_data_points))
    
    # Create a DataFrame with channel names as column headers
    data_df = pd.DataFrame(reshaped_data, columns=channels)
    
    # Add an ID column to identify the file's data points
    data_df.insert(0, 'ID', range(1, num_time_steps + 1))

    # Add a column to specify the file name for identification
    data_df['File'] = os.path.basename(file_path)
    
    return data_df

# Process each binary file in the directory and store results in a dictionary
processed_data = {}
for file_name in sorted(os.listdir(movement_dir)):
    if file_name.endswith('.bin'):
        file_path = os.path.join(movement_dir, file_name)
        result_df = extract_data(file_path, channels, expected_data_points)
        
        # Store the DataFrame in the dictionary
        processed_data[file_name] = result_df
        print(f"Data for '{file_name}' processed and stored in memory.")

# Optional: Combine all data into a single DataFrame
combined_df = pd.concat(processed_data.values(), ignore_index=True)
print("Combined data stored in memory as a single DataFrame.")


Processing file 'preprocessed/movement/001_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '001_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/002_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '002_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/003_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '003_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/004_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '004_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/005_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '005_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/006_ml.bin' with 128832 data points.
Number of time steps: 976
Data for '006_ml.bin' processed and stored in memory.
Processing file 'preprocessed/movement/007_ml.bin' with 128832 d

In [42]:
combined_df

Unnamed: 0,ID,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,Relaxed1_Right_Accelerometer_Y,Relaxed1_Right_Accelerometer_Z,...,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z,File
0,1,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,-0.005024,-0.005753,...,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819,001_ml.bin
1,2,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,-0.000653,0.000377,...,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884,001_ml.bin
2,3,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,-0.000555,-0.001443,...,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356,001_ml.bin
3,4,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,0.002417,0.002434,...,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652,001_ml.bin
4,5,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,0.001448,0.003319,...,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238,001_ml.bin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457739,972,-0.028380,-0.030496,-0.027287,-0.025132,-0.021923,-0.016598,-0.013408,-0.014465,-0.010212,...,0.085307,0.081067,0.074700,0.060897,0.048153,0.032222,0.020531,0.006726,-0.028336,469_ml.bin
457740,973,-0.051722,-0.078292,-0.091057,-0.111241,-0.126104,-0.137775,-0.145190,-0.150486,-0.145166,...,0.026759,0.029935,0.047994,0.048008,0.053333,0.065027,0.088408,0.117115,0.141585,469_ml.bin
457741,974,0.157558,0.154404,0.144858,0.127850,0.102335,0.085313,0.083160,0.077828,0.067208,...,-0.005010,-0.000765,0.002415,0.017275,0.030004,0.044854,0.055456,0.059699,0.054413,469_ml.bin
457742,975,0.064008,0.081033,0.104426,0.105513,0.105531,0.098107,0.095981,0.098097,0.086409,...,0.035347,0.048082,0.056569,0.062938,0.062925,0.072472,0.067174,0.055518,0.059804,469_ml.bin


In [44]:
# Step 1: Rename the 'Id' column to 'Number of time steps'
combined_df.rename(columns={'ID': 'Number of time steps'}, inplace=True)

# Step 2: Move the 'file' column to the first position
# First, drop the 'file' column and store it in a temporary variable
file_column = combined_df.pop('File')

# Step 3: Insert the 'file' column at the first position
combined_df.insert(0, 'file', file_column)

# Now, 'file' will be the first column and 'Id' will be renamed to 'Number of time steps'


In [46]:
combined_df

Unnamed: 0,file,Number of time steps,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,Relaxed1_Right_Accelerometer_Y,...,Entrainment2_Left_Accelerometer_Z,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z
0,001_ml.bin,1,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,-0.005024,...,0.000559,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819
1,001_ml.bin,2,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,-0.000653,...,0.001436,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884
2,001_ml.bin,3,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,-0.000555,...,-0.004281,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356
3,001_ml.bin,4,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,0.002417,...,-0.001967,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652
4,001_ml.bin,5,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,0.001448,...,-0.002292,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457739,469_ml.bin,972,-0.028380,-0.030496,-0.027287,-0.025132,-0.021923,-0.016598,-0.013408,-0.014465,...,0.088472,0.085307,0.081067,0.074700,0.060897,0.048153,0.032222,0.020531,0.006726,-0.028336
457740,469_ml.bin,973,-0.051722,-0.078292,-0.091057,-0.111241,-0.126104,-0.137775,-0.145190,-0.150486,...,0.036328,0.026759,0.029935,0.047994,0.048008,0.053333,0.065027,0.088408,0.117115,0.141585
457741,469_ml.bin,974,0.157558,0.154404,0.144858,0.127850,0.102335,0.085313,0.083160,0.077828,...,-0.014567,-0.005010,-0.000765,0.002415,0.017275,0.030004,0.044854,0.055456,0.059699,0.054413
457742,469_ml.bin,975,0.064008,0.081033,0.104426,0.105513,0.105531,0.098107,0.095981,0.098097,...,0.025794,0.035347,0.048082,0.056569,0.062938,0.062925,0.072472,0.067174,0.055518,0.059804


In [50]:

# Assuming combined_df is already defined
combined_df['id'] = combined_df['file'].apply(lambda x: f"{int(x.split('_')[0]):03}_ml.bin")


In [52]:
combined_df

Unnamed: 0,file,Number of time steps,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,Relaxed1_Right_Accelerometer_Y,...,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z,id
0,001_ml.bin,1,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,-0.005024,...,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819,001_ml.bin
1,001_ml.bin,2,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,-0.000653,...,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884,001_ml.bin
2,001_ml.bin,3,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,-0.000555,...,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356,001_ml.bin
3,001_ml.bin,4,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,0.002417,...,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652,001_ml.bin
4,001_ml.bin,5,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,0.001448,...,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238,001_ml.bin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457739,469_ml.bin,972,-0.028380,-0.030496,-0.027287,-0.025132,-0.021923,-0.016598,-0.013408,-0.014465,...,0.085307,0.081067,0.074700,0.060897,0.048153,0.032222,0.020531,0.006726,-0.028336,469_ml.bin
457740,469_ml.bin,973,-0.051722,-0.078292,-0.091057,-0.111241,-0.126104,-0.137775,-0.145190,-0.150486,...,0.026759,0.029935,0.047994,0.048008,0.053333,0.065027,0.088408,0.117115,0.141585,469_ml.bin
457741,469_ml.bin,974,0.157558,0.154404,0.144858,0.127850,0.102335,0.085313,0.083160,0.077828,...,-0.005010,-0.000765,0.002415,0.017275,0.030004,0.044854,0.055456,0.059699,0.054413,469_ml.bin
457742,469_ml.bin,975,0.064008,0.081033,0.104426,0.105513,0.105531,0.098107,0.095981,0.098097,...,0.035347,0.048082,0.056569,0.062938,0.062925,0.072472,0.067174,0.055518,0.059804,469_ml.bin


In [54]:
# Drop the old 'id' column if it exists
combined_df = combined_df.drop(columns=['id'], errors='ignore')

# Extract the new 'id' from the 'file' column
combined_df['id'] = combined_df['file'].apply(lambda x: f"{int(x.split('_')[0]):d}_ml.bin")

# Reorder columns to place 'id' first
combined_df = combined_df[['id'] + [col for col in combined_df.columns if col != 'id']]

# View the updated DataFrame
combined_df


Unnamed: 0,id,file,Number of time steps,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,...,Entrainment2_Left_Accelerometer_Z,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z
0,1_ml.bin,001_ml.bin,1,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,...,0.000559,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819
1,1_ml.bin,001_ml.bin,2,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,...,0.001436,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884
2,1_ml.bin,001_ml.bin,3,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,...,-0.004281,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356
3,1_ml.bin,001_ml.bin,4,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,...,-0.001967,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652
4,1_ml.bin,001_ml.bin,5,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,...,-0.002292,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457739,469_ml.bin,469_ml.bin,972,-0.028380,-0.030496,-0.027287,-0.025132,-0.021923,-0.016598,-0.013408,...,0.088472,0.085307,0.081067,0.074700,0.060897,0.048153,0.032222,0.020531,0.006726,-0.028336
457740,469_ml.bin,469_ml.bin,973,-0.051722,-0.078292,-0.091057,-0.111241,-0.126104,-0.137775,-0.145190,...,0.036328,0.026759,0.029935,0.047994,0.048008,0.053333,0.065027,0.088408,0.117115,0.141585
457741,469_ml.bin,469_ml.bin,974,0.157558,0.154404,0.144858,0.127850,0.102335,0.085313,0.083160,...,-0.014567,-0.005010,-0.000765,0.002415,0.017275,0.030004,0.044854,0.055456,0.059699,0.054413
457742,469_ml.bin,469_ml.bin,975,0.064008,0.081033,0.104426,0.105513,0.105531,0.098107,0.095981,...,0.025794,0.035347,0.048082,0.056569,0.062938,0.062925,0.072472,0.067174,0.055518,0.059804


In [56]:
# Drop the old 'id' column if it exists
combined_df = combined_df.drop(columns=['id'], errors='ignore')

# Extract the new 'id' as a number from the 'file' column
combined_df['id'] = combined_df['file'].apply(lambda x: int(x.split('_')[0]))

# Reorder columns to place 'id' first
combined_df = combined_df[['id'] + [col for col in combined_df.columns if col != 'id']]

# View the updated DataFrame
combined_df


Unnamed: 0,id,file,Number of time steps,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,...,Entrainment2_Left_Accelerometer_Z,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z
0,1,001_ml.bin,1,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,...,0.000559,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819
1,1,001_ml.bin,2,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,...,0.001436,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884
2,1,001_ml.bin,3,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,...,-0.004281,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356
3,1,001_ml.bin,4,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,...,-0.001967,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652
4,1,001_ml.bin,5,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,...,-0.002292,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457739,469,469_ml.bin,972,-0.028380,-0.030496,-0.027287,-0.025132,-0.021923,-0.016598,-0.013408,...,0.088472,0.085307,0.081067,0.074700,0.060897,0.048153,0.032222,0.020531,0.006726,-0.028336
457740,469,469_ml.bin,973,-0.051722,-0.078292,-0.091057,-0.111241,-0.126104,-0.137775,-0.145190,...,0.036328,0.026759,0.029935,0.047994,0.048008,0.053333,0.065027,0.088408,0.117115,0.141585
457741,469,469_ml.bin,974,0.157558,0.154404,0.144858,0.127850,0.102335,0.085313,0.083160,...,-0.014567,-0.005010,-0.000765,0.002415,0.017275,0.030004,0.044854,0.055456,0.059699,0.054413
457742,469,469_ml.bin,975,0.064008,0.081033,0.104426,0.105513,0.105531,0.098107,0.095981,...,0.025794,0.035347,0.048082,0.056569,0.062938,0.062925,0.072472,0.067174,0.055518,0.059804


In [58]:
combined_df[combined_df['id'] == 1]



Unnamed: 0,id,file,Number of time steps,Relaxed1_Left_Accelerometer_X,Relaxed1_Left_Accelerometer_Y,Relaxed1_Left_Accelerometer_Z,Relaxed1_Left_Gyroscope_X,Relaxed1_Left_Gyroscope_Y,Relaxed1_Left_Gyroscope_Z,Relaxed1_Right_Accelerometer_X,...,Entrainment2_Left_Accelerometer_Z,Entrainment2_Left_Gyroscope_X,Entrainment2_Left_Gyroscope_Y,Entrainment2_Left_Gyroscope_Z,Entrainment2_Right_Accelerometer_X,Entrainment2_Right_Accelerometer_Y,Entrainment2_Right_Accelerometer_Z,Entrainment2_Right_Gyroscope_X,Entrainment2_Right_Gyroscope_Y,Entrainment2_Right_Gyroscope_Z
0,1,001_ml.bin,1,-0.007183,-0.007979,-0.007813,-0.006712,-0.007526,-0.006415,-0.008140,...,0.000559,0.001522,0.000559,0.001519,-0.002423,-0.002487,-0.000593,0.000311,-0.001722,-0.000819
1,1,001_ml.bin,2,0.001043,0.001997,0.002999,0.003058,0.001165,-0.000729,0.001272,...,0.001436,0.004418,0.001537,0.000647,0.000701,-0.000202,-0.000140,-0.002978,-0.003918,-0.003884
2,1,001_ml.bin,3,-0.001907,0.000037,0.001010,0.002977,0.002096,0.000236,0.000322,...,-0.004281,-0.006298,-0.006403,-0.004579,-0.000845,0.000921,0.000764,0.003550,0.003449,0.003356
3,1,001_ml.bin,4,0.002346,0.002297,0.002318,0.003289,-0.000583,0.001389,0.003343,...,-0.001967,-0.000019,-0.001957,-0.001020,-0.003008,-0.002099,-0.001175,-0.000244,-0.000305,0.000652
4,1,001_ml.bin,5,-0.000287,-0.000247,-0.001204,-0.003151,-0.003194,-0.000343,-0.000421,...,-0.002292,-0.000363,-0.000385,0.001572,0.003541,0.004588,0.005647,0.004855,0.003061,0.001238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971,1,001_ml.bin,972,-0.106946,-0.089964,-0.071920,-0.058122,-0.049630,-0.039021,-0.034776,...,0.037450,0.026832,0.004533,-0.011401,-0.023084,-0.031575,-0.034748,-0.028368,-0.027295,-0.019846
972,1,001_ml.bin,973,-0.010281,-0.009212,-0.008148,-0.008143,-0.014517,-0.006031,-0.025151,...,-0.002938,0.004496,0.007683,0.010867,0.014054,0.022544,0.031032,0.037404,0.041654,0.037411
973,1,001_ml.bin,974,0.034229,0.032113,0.022564,0.010891,-0.002906,-0.015645,-0.017762,...,-0.007136,-0.005012,-0.003955,-0.010333,-0.012466,-0.019910,-0.024163,-0.029480,-0.030547,-0.030544
974,1,001_ml.bin,975,-0.019922,-0.009298,0.005573,0.019383,0.033189,0.043804,0.050172,...,-0.084654,-0.086779,-0.081468,-0.077212,-0.075080,-0.066580,-0.055959,-0.046401,-0.035790,-0.029426


In [60]:
# Specify the path to the 'pre_processed' directory
output_path = 'preprocessed/combined_df.csv'

# Export the DataFrame to CSV
combined_df.to_csv(output_path, index=False)

# Confirm the export
print(f"File exported successfully to {output_path}")


File exported successfully to preprocessed/combined_df.csv


In [62]:
combined_df.shape

(457744, 135)

In [64]:
combined_df.isna().sum()

id                                    0
file                                  0
Number of time steps                  0
Relaxed1_Left_Accelerometer_X         0
Relaxed1_Left_Accelerometer_Y         0
                                     ..
Entrainment2_Right_Accelerometer_Y    0
Entrainment2_Right_Accelerometer_Z    0
Entrainment2_Right_Gyroscope_X        0
Entrainment2_Right_Gyroscope_Y        0
Entrainment2_Right_Gyroscope_Z        0
Length: 135, dtype: int64