In [1]:
import pandas as pd
import numpy as np
from typing import Literal

### Format the data into pandas.DataFrame
After this preprocessing step, the features can be loaded as a pandas DataFrame and the labels as a pandas Series.

In [2]:
def get_data(flag: Literal['train', 'test'],
             data_dir: str = 'UCI HAR Dataset') -> tuple[pd.DataFrame, pd.Series]:
    """
    Load one split of the UCI HAR Dataset from its whitespace-delimited text files.

    :param flag: Which split to load ('train' or 'test'); selects the files
        ``{flag}/X_{flag}.txt`` (features) and ``{flag}/y_{flag}.txt`` (labels).
    :param data_dir: Directory that contains ``features.txt`` and the split
        sub-directories. Defaults to 'UCI HAR Dataset', resolved against the
        current working directory.
    :return: A tuple ``(X, y)``. ``X`` is a float64 DataFrame with one row per
        line of the feature file and one column per line of ``features.txt``.
        ``y`` is an int8 Series named 'label' holding each label minus one
        (presumably to turn 1-based activity ids into 0-based class indices).
    :raises FileNotFoundError: If any of the three input files is missing.
    """

    # Column names are the second whitespace-separated token of every line in
    # features.txt. Names are used as-is; duplicates, if any, are kept.
    # Every file is read inside a ``with`` block so its handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(f'{data_dir}/features.txt', 'r') as features_file:
        feature_names = [line.split()[1]
                         for line in features_file if line.strip()]

    # One sample per line; blank lines (e.g. a trailing newline) are skipped.
    with open(f'{data_dir}/{flag}/X_{flag}.txt', 'r') as samples_file:
        rows = [line.split() for line in samples_file if line.strip()]

    with open(f'{data_dir}/{flag}/y_{flag}.txt', 'r') as labels_file:
        labels = [line.strip() for line in labels_file if line.strip()]

    # The values are still strings at this point; pandas casts them to the
    # requested numeric dtypes.
    features = pd.DataFrame(rows, columns=feature_names, dtype=np.float64)
    targets = pd.Series(labels, name='label').astype(np.int8) - 1
    return features, targets

### Describe the data and check invalid values
As we can see, there are no invalid values (no missing or infinite entries) and all features have already been normalized to $[-1,1]$.  
So no further normalization is needed.  

In [3]:
# Load both splits once; every check below reuses these objects.
X_train, y_train = get_data('train')
X_test, y_test = get_data('test')

_splits = ((X_train, y_train), (X_test, y_test))

# Sanity checks, each printed as "<features> <labels>" for train, then test:
# shapes first, then missing-value counts, then infinite-value counts.
for _X, _y in _splits:
    print(_X.shape, _y.shape)
for _X, _y in _splits:
    print(_X.isnull().sum().sum(), _y.isnull().sum())
for _X, _y in _splits:
    print(np.isinf(_X).sum().sum(), np.isinf(_y).sum())

# Per-feature summary statistics of the training features.
print(X_train.describe())

(7352, 561) (7352,)
(2947, 561) (2947,)
0 0
0 0
0 0
0 0
       tBodyAcc-mean()-X  tBodyAcc-mean()-Y  tBodyAcc-mean()-Z  \
count        7352.000000        7352.000000        7352.000000   
mean            0.274488          -0.017695          -0.109141   
std             0.070261           0.040811           0.056635   
min            -1.000000          -1.000000          -1.000000   
25%             0.262975          -0.024863          -0.120993   
50%             0.277193          -0.017219          -0.108676   
75%             0.288461          -0.010783          -0.097794   
max             1.000000           1.000000           1.000000   

       tBodyAcc-std()-X  tBodyAcc-std()-Y  tBodyAcc-std()-Z  tBodyAcc-mad()-X  \
count       7352.000000       7352.000000       7352.000000       7352.000000   
mean          -0.605438         -0.510938         -0.604754         -0.630512   
std            0.448734          0.502645          0.418687          0.424073   
min           -1.000000  

### Save the data into csv files

In [4]:
# Write each split to its own CSV file in the working directory, without the index.
for _csv_name, _frame in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    _frame.to_csv(_csv_name, index=False)