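Combine the per-recording sensor CSV files (accelerometer, gyroscope and linear acceleration) into a single summary CSV. Each recording folder becomes one row containing the subject, the activity, and the mean, standard deviation, minimum and maximum of every sensor axis.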

In [None]:
import os
import pandas as pd

# Define the path to the root folder containing the action folders
root_folder = "datasets/testSet"

# Create an empty list to hold the summarized data
summary_data = []

# Iterate through all folders in the root folder
for folder_name in os.listdir(root_folder):
    folder_path = os.path.join(root_folder, folder_name)

    # Check if the folder_path is a directory
    if os.path.isdir(folder_path):
        # Extract person and action from the folder name
        parts = folder_name.split("-")
        if len(parts) >= 2:
            person = parts[0]
            action_with_index = parts[1].rsplit(" ", 1)[0]
            action = ''.join([i for i in action_with_index if not i.isdigit()])

            summary_row = {
                'subject': person,
                'Activity': action,
                
            }

            # Iterate through all CSV files in the folder
            for file_name in os.listdir(folder_path):
                if file_name.endswith(".csv"):
                    file_path = os.path.join(folder_path, file_name)
                    try:
                        # Read the CSV file
                        data = pd.read_csv(file_path)

                        # Skip empty files
                        if data.empty:
                            print(f"Skipping empty file: {file_path}")
                            continue
                        
                        
                        if {"X (m/s^2)", "Y (m/s^2)", "Z (m/s^2)"}.issubset(data.columns) and file_name == "Accelerometer.csv":
                            data.rename(columns={
                                "X (m/s^2)": "Acceleration x (m/s^2)",
                                "Y (m/s^2)": "Acceleration y (m/s^2)",
                                "Z (m/s^2)": "Acceleration z (m/s^2)"
                            }, inplace=True)
                        elif {"X (rad/s)", "Y (rad/s)", "Z (rad/s)"}.issubset(data.columns):
                            data.rename(columns={
                                "X (rad/s)": 'Gyroscope x (rad/s)',
                                "Y (rad/s)": 'Gyroscope y (rad/s)',
                                "Z (rad/s)": 'Gyroscope z (rad/s)'
                            }, inplace=True)
                        elif {"X (m/s^2)","Y (m/s^2)","Z (m/s^2)"}.issubset(data.columns):
                            data.rename(columns={
                                "X (m/s^2)": 'Linear Acceleration x (m/s^2)',
                                "Y (m/s^2)": 'Linear Acceleration y (m/s^2)',
                                "Z (m/s^2)": 'Linear Acceleration z (m/s^2)'
                            }, inplace=True)



                        # Determine which type of data (Accelerometer, Gyroscope, or Linear Acceleration)
                        if {'Acceleration x (m/s^2)', 'Acceleration y (m/s^2)', 'Acceleration z (m/s^2)'}.issubset(data.columns):
                            data_type = "Accelerometer"
                            columns = ['Acceleration x (m/s^2)', 'Acceleration y (m/s^2)', 'Acceleration z (m/s^2)']

                        elif {'Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)'}.issubset(data.columns):
                            data_type = "Gyroscope"
                            columns = ['Gyroscope x (rad/s)', 'Gyroscope y (rad/s)', 'Gyroscope z (rad/s)']

                        elif {'Linear Acceleration x (m/s^2)', 'Linear Acceleration y (m/s^2)', 'Linear Acceleration z (m/s^2)'}.issubset(data.columns):
                            data_type = "Linear Acceleration"
                            columns = ['Linear Acceleration x (m/s^2)', 'Linear Acceleration y (m/s^2)', 'Linear Acceleration z (m/s^2)']

                        else:
                            print(f"File {file_path} does not contain recognized column names. Skipping.")
                            continue

                        # Calculate statistics for relevant columns
                        mean_values = data[columns].mean()
                        std_values = data[columns].std()
                        min_values = data[columns].min()
                        max_values = data[columns].max()

                        # add to the summary row
                        for col in columns:
                            summary_row[f'{col}_mean'] = mean_values[col]
                            summary_row[f'{col}_std'] = std_values[col]
                            summary_row[f'{col}_min'] = min_values[col]
                            summary_row[f'{col}_max'] = max_values[col]
                        
                    except Exception as e:
                        print(f"Error reading file {file_path}: {e}")
            summary_data.append(summary_row)
# Create a dataframe from the summary data
if not summary_data:
    print("No valid data found. Summary CSV will not be created.")
else:
    summary_df = pd.DataFrame(summary_data)

    # Save the summarized data to a single CSV file
    output_path = os.path.join(root_folder, "data_total.csv")
    summary_df.to_csv(output_path, index=False)

    print(f"Summary data saved to {output_path}")

Summary data saved to datasets/testSet\data_total.csv


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = 'datasets/testSet/data_total.csv'
data = pd.read_csv(file_path)

# Set your split ratio (e.g., 0.8 for 80% training, 20% testing)
split_ratio = 0.8  # Change this value as desired (0 < split_ratio < 1)

# Split the dataset into train and test sets
train_set, test_set = train_test_split(data, test_size=(1 - split_ratio), random_state=42)

# Save the splits to new CSV files (optional)
train_set.to_csv('datasets/train_set.csv', index=False)
test_set.to_csv('datasets/test_set.csv', index=False)

# Print the sizes of the splits
print(f"Train set size: {train_set.shape}")
print(f"Test set size: {test_set.shape}")


Train set size: (72, 38)
Test set size: (18, 38)
