# Préparation du Jeu de Test

<br>

Le jeu de test est constitué de deux longs enregistrements de plusieurs heures non labellisés. Dans ce notebook, on échantillonne les signaux bruts sur des fenêtres de 4 secondes sans chevauchement. Le jeu de données obtenu est enregistré pour effectuer une classification des ADL à partir de réseaux de neurones.

In [1]:
import h5py
import pandas as pd
import numpy as np
from numpy import savetxt

## 1. Chargement du Jeu de Données

In [2]:
# accelerometry data, test set (HDF5 file, relative path)
filename = "../Accel_dataset/Accel_test.h5"

In [3]:
# Load every group of the HDF5 file into one long-format dataframe:
# one row per time step, tagged with the record label and record number.
frames = []

# read hdf file
with h5py.File(filename, "r") as f:

    # loop over groups / samples
    for key in f.keys():
        sample = f.get(key)

        # retrieve data
        data = pd.DataFrame(sample.get('data'))

        # retrieve the scalar label stored next to the data
        label = np.array(sample.get('label'))[()]

        # the label is expected to be a bytes scalar: decode it instead of
        # slicing the "b'...'" repr, which mangles any value that is not bytes
        if isinstance(label, bytes):
            label = label.decode()
        else:
            label = str(label)

        # add label to data
        data['label'] = label

        # add record number, e.g. "sample_1" -> "1" (kept as a string)
        data['sample'] = key.replace("sample_", "")

        frames.append(data)

# concatenate once at the end: growing the dataframe inside the loop copies
# all previously loaded rows again at every iteration
df = pd.concat(frames) if frames else pd.DataFrame()

In [4]:
# raw axis columns (0, 1, 2) are renamed to (x, y, z)
coor_mapping = dict(zip(range(3), 'xyz'))

# single rename pass: the former row index (position inside each record)
# becomes the time-step column, alongside the axis renaming
df = df.reset_index().rename(columns={'index': 'time_steps', **coor_mapping})

df

Unnamed: 0,time_steps,x,y,z,label,sample
0,0,-0.410481,-0.455467,-0.759411,n,1
1,1,-0.372263,-0.482616,-0.807557,n,1
2,2,-0.368343,-0.479600,-0.815418,n,1
3,3,-0.299745,-0.488650,-0.791836,n,1
4,4,-0.296805,-0.526860,-0.787906,n,1
...,...,...,...,...,...,...
2912995,646495,0.551844,0.320815,-0.687683,n,2
2912996,646496,0.567524,0.320815,-0.701439,n,2
2912997,646497,0.603783,0.323831,-0.737794,n,2
2912998,646498,0.639061,0.324837,-0.747620,n,2


In [5]:
# number of time steps per record (one row of df = one time step)
df['sample'].value_counts() 

1    2266500
2     646500
Name: sample, dtype: int64

*Nombre de time steps par enregistrement.*

In [6]:
# duration of each record in hours:
# time steps / 25 steps per second (a 100-step window lasts 4 s) / 60 / 60
steps_per_record = df['sample'].value_counts()
steps_per_record / 25 / 60 / 60

1    25.183333
2     7.183333
Name: sample, dtype: float64

*Durée en heures de chaque enregistrement.*

## 2. Échantillonnage

In [7]:
def addMagnitude(df):
    '''
    Add the Euclidean norm of the (x, y, z) vector to df as column 'r'.

    The dataframe is modified in place and also returned, so the call can be
    used either as a statement or as an expression.
    '''
    squared_norm = df['x']**2 + df['y']**2 + df['z']**2
    df['r'] = np.sqrt(squared_norm)
    return df

In [8]:
# add the norm r of the (x, y, z) vector; addMagnitude fills the column in
# place and returns the same dataframe, which is displayed below
df = addMagnitude(df)
df

Unnamed: 0,time_steps,x,y,z,label,sample,r
0,0,-0.410481,-0.455467,-0.759411,n,1,0.976038
1,1,-0.372263,-0.482616,-0.807557,n,1,1.011755
2,2,-0.368343,-0.479600,-0.815418,n,1,1.015184
3,3,-0.299745,-0.488650,-0.791836,n,1,0.977563
4,4,-0.296805,-0.526860,-0.787906,n,1,0.993212
...,...,...,...,...,...,...,...
2912995,646495,0.551844,0.320815,-0.687683,n,2,0.938276
2912996,646496,0.567524,0.320815,-0.701439,n,2,0.957613
2912997,646497,0.603783,0.323831,-0.737794,n,2,1.006857
2912998,646498,0.639061,0.324837,-0.747620,n,2,1.035787


*La norme r du vecteur accélération (x, y, z) est ajoutée au dataframe.*

In [9]:
# number of time steps per window
window_size = 100

# add window index per record: time_steps restarts at 0 for every record, so
# an integer division by the window size gives the window number directly
# (the window size is no longer hard-coded a second time)
window_idx = df['time_steps'] // window_size
df['window_idx'] = window_idx.astype(int)

# add a counter for time steps per window (0 .. window_size - 1), i.e. the
# position of each row inside its (record, window) group
windo_timestep = df.groupby(['sample', 'window_idx']).cumcount()
df['wndw_time_steps'] = windo_timestep
df

Unnamed: 0,time_steps,x,y,z,label,sample,r,window_idx,wndw_time_steps
0,0,-0.410481,-0.455467,-0.759411,n,1,0.976038,0,0
1,1,-0.372263,-0.482616,-0.807557,n,1,1.011755,0,1
2,2,-0.368343,-0.479600,-0.815418,n,1,1.015184,0,2
3,3,-0.299745,-0.488650,-0.791836,n,1,0.977563,0,3
4,4,-0.296805,-0.526860,-0.787906,n,1,0.993212,0,4
...,...,...,...,...,...,...,...,...,...
2912995,646495,0.551844,0.320815,-0.687683,n,2,0.938276,6464,95
2912996,646496,0.567524,0.320815,-0.701439,n,2,0.957613,6464,96
2912997,646497,0.603783,0.323831,-0.737794,n,2,1.006857,6464,97
2912998,646498,0.639061,0.324837,-0.747620,n,2,1.035787,6464,98


*L'indice de la fenêtre (par enregistrement) et la position de chaque time step dans sa fenêtre sont ajoutés au dataframe.*

In [10]:
# check if there is any nan
#print(pivoted.isnull().sum().sum())

In [11]:
directions = ['x', 'y', 'z', 'r']

# NOTE: np.savetxt does not create missing folders, 'mydata/NN/' must exist.

# loop over directions x, y ...
for direction in directions:

    # df : 1 row = 1 window
    #      1 column = 1 time step
    df_nn_w = df.pivot_table(index=['sample', 'window_idx'],
                             columns='wndw_time_steps',
                             values=direction)

    # one csv file per record and per direction; the records are read from the
    # pivot index instead of being hard-coded, so any number of records works
    for record in df_nn_w.index.get_level_values('sample').unique():

        # all windows of this record, as a plain numpy array
        windows = df_nn_w.loc[record].values

        # save data into csv file
        savetxt('mydata/NN/test' + str(record) + '_data_feature_nn_' + direction + '.csv',
                windows,
                delimiter=',')

*Enregistrement des signaux échantillonnés.*