In [8]:
import pandas as pd
import numpy as np
import pickle
import bz2

In [2]:
path = "../capture24"

In [4]:
def create_windows_np(data, window_size, stride):
    num_samples, num_channels = data.shape
    num_windows = (num_samples - window_size) // stride + 1

    shape = (num_windows, window_size, num_channels)
    strides = (data.strides[0] * stride, data.strides[0], data.strides[1])

    windows = np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)

    # transpose the windows array to the desired shape
    windows = np.transpose(windows, axes=(0, 2, 1))

    return windows

In [4]:
def create_sample_datasets(lower_bound, upper_bound, dataset_name, path):
    df_all = pd.DataFrame({'x': [], 'y': [], 'z': [], 'user_id': []})  # add a 'user_id' column to the DataFrame
    for i in range(lower_bound, upper_bound):

        user_id = f'P{i+1:03d}'  # set the user ID for this iteration

        df = pd.read_csv(f'{path}/{user_id}.csv.gz', compression='gzip', low_memory=False)
        df = df.drop(columns=['annotation'])
        
        # convert the 'time' column to a datetime object
        df['time'] = pd.to_datetime(df['time'], origin='unix')
        
        # set the 'time' column as the DataFrame index
        df.set_index('time', inplace=True)
        
        # resample the DataFrame from 100 Hz to 50 Hz
        df = df.resample('20ms').mean()
        
        df = df.reset_index().reset_index(drop=True)
        df['user_id'] = i+1  # set the user ID for this DataFrame
        df_all = pd.concat([df_all, df.loc[:df.shape[0] - df.shape[0] % 100 - 1]])
        print(f'{user_id}: {df.shape[0]}')
        del df

    # reset the index of the df_all DataFrame
    df_all = df_all.reset_index(drop=True)

    # save the df_all DataFrame to a Feather file
    df_all.to_feather(f'{path}/{dataset_name}.feather')
    del df_all

In [5]:
create_sample_datasets(0, 20, 'training', path)
create_sample_datasets(20, 22, 'validation', path)
create_sample_datasets(22, 24, 'test', path)

P021: 4770001
P022: 5340001
P023: 5040001
P024: 5040001


In [3]:
from sklearn.preprocessing import StandardScaler

df_train = pd.read_feather(f'{path}/training.feather')

std_scaler = StandardScaler()
std_scaler.fit(df_train[['x', 'y', 'z']])

df_train[['x', 'y', 'z']] = std_scaler.transform(df_train[['x', 'y', 'z']])
df_train.to_feather(f'{path}/training.feather')
del df_train

df_validation = pd.read_feather(f'{path}/validation.feather')
df_test = pd.read_feather(f'{path}/test.feather')

df_validation[['x', 'y', 'z']] = std_scaler.transform(df_validation[['x', 'y', 'z']])
df_validation.to_feather(f'{path}/validation.feather')
df_test[['x', 'y', 'z']] = std_scaler.transform(df_test[['x', 'y', 'z']])
df_test.to_feather(f'{path}/test.feather')

del df_validation, df_test

In [4]:
df_train = pd.read_feather(f'{path}/training.feather')
df_train.head()

Unnamed: 0,x,y,z,user_id,time
0,-0.414665,-1.035181,0.823605,1.0,2016-11-13 02:18:00.000
1,-0.414665,-1.035181,0.823605,1.0,2016-11-13 02:18:00.020
2,-0.414665,-1.035181,0.823605,1.0,2016-11-13 02:18:00.040
3,-0.414665,-1.050719,0.823605,1.0,2016-11-13 02:18:00.060
4,-0.414665,-1.050719,0.823605,1.0,2016-11-13 02:18:00.080


In [5]:
def noise_transform_vectorized(X, sigma=0.05):
    """
    Adding random Gaussian noise with mean 0
    """
    noise = np.random.normal(loc=0, scale=sigma, size=X.shape)
    return X + noise

def scaling_transform_vectorized(X, sigma=0.1):
    """
    Scaling by a random factor
    """
    scaling_factor = np.random.normal(loc=1.0, scale=sigma, size=(X.shape[0], 1, X.shape[2]))
    return X * scaling_factor

def negate_transform_vectorized(X):
    """
    Inverting the signals
    """
    return X * -1

def time_flip_transform_vectorized(X):
    """
    Reversing the direction of time
    """
    return X[:, ::-1, :]

In [6]:
# Define the list of transformations to be applied
transformations = [
    lambda x: noise_transform_vectorized(x), 
    lambda x: scaling_transform_vectorized(x),
    lambda x: negate_transform_vectorized(x),
    lambda x: time_flip_transform_vectorized(x),
    ]

In [11]:
output_path = '/media/darshana/Software/dataset'
user_dataset = {}

df = pd.read_feather(f'{path}/training.feather', columns=['x', 'y', 'z', 'user_id'])

for i in range(1):
    
    user_data = create_windows_np(df.loc[df['user_id'] == i+1].values.astype(np.float32), 100, 50)

    # Get the number of windows and window size for the user's data
    num_windows, window_size, num_channels = user_data.shape

    # Apply the transformations to the user's data
    transformed_data = np.concatenate([transform_fn(user_data) for transform_fn in transformations], axis=0)
    transformed_data = np.concatenate([transformed_data, user_data], axis=0)
    transformed_data = np.array(transformed_data)

    # Create the labels for the transformed data
    transformed_labels = np.array([False for _ in range(4)])
    transformed_labels = np.append(transformed_labels, True)
    transformed_labels = np.repeat(transformed_labels, num_windows)

    print(transformed_data.shape)
    print(transformed_labels.shape)

    transformed_user_data = {
        'X': transformed_data,
        'y': transformed_labels
    }

    del transformed_data,  transformed_labels

    with bz2.BZ2File(f'{output_path}/P{i+1:03d}.pbz2', 'wb') as f:
        pickle.dump(transformed_user_data, f)

    del transformed_user_data


(500995, 4, 100)
(500995,)


In [8]:
p001file = open(f'/media/darshana/Software/dataset/P001.obj', 'rb')
p001Dataset = pickle.load(p001file)

In [10]:
print(p001Dataset)
del p001Dataset

{'X': array([[[-0.55305357, -0.51706906,  0.69314428],
        [-0.50472098, -0.56538143,  0.65664381],
        [-0.44077426, -0.53410961,  0.59784726],
        ...,
        [-0.47601569, -0.48092514,  0.68703596],
        [-0.46727538, -0.53516123,  0.74507716],
        [-0.43284447, -0.50125229,  0.6164534 ]],

       [[-0.47427553, -0.53181389,  0.56354433],
        [-0.43548332, -0.6421605 ,  0.69263543],
        [-0.45257494, -0.47719815,  0.6296225 ],
        ...,
        [-0.49897984, -0.51259347,  0.61318453],
        [-0.42932175, -0.53151222,  0.64826092],
        [-0.47932165, -0.53827612,  0.61613918]],

       [[-0.43956137, -0.50406777,  0.6121599 ],
        [-0.44457909, -0.50826508,  0.64317895],
        [-0.43738584, -0.51463917,  0.68245852],
        ...,
        [-0.42746603, -0.51584196,  0.71400161],
        [-0.48124537, -0.57680277,  0.71200579],
        [-0.42053982, -0.44312963,  0.61433704]],

       ...,

       [[ 0.0494156 , -0.79784578,  0.56569976],
     

In [None]:
df = pd.read_csv(f'{path}/P{i+1:03d}.csv.gz', compression='gzip', low_memory=False)