In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
import math
import h5py

In [2]:
def read_data(file_path):
    print("reading data")
    column_names = ['user-id','activity','timestamp', 'x-axis', 'y-axis', 'z-axis']
    data = pd.read_csv(file_path,header = None, names = column_names)
    print("finished reading data")
    return data

In [4]:
#READING DATA
dataset = read_data('..//Datasets//WISDM_Dataset//WISDM_ar_v1.1_raw.txt')

reading data
finished reading data


In [5]:
dataset.isnull().sum()

user-id      0
activity     0
timestamp    0
x-axis       0
y-axis       0
z-axis       1
dtype: int64

In [6]:
dataset.dropna(axis=0, how='any', inplace= True)

In [7]:
dataset.isnull().sum()

user-id      0
activity     0
timestamp    0
x-axis       0
y-axis       0
z-axis       0
dtype: int64

In [8]:
['Walking','Jogging','Upstairs','Downstairs','Sitting','Standing']

['Walking', 'Jogging', 'Upstairs', 'Downstairs', 'Sitting', 'Standing']

In [9]:
def reset_label(dataCollection): 
    # Converting original labels {'Walking','Jogging','Upstairs','Downstairs','Sitting','Standing'} to new labels, to store it in a .h5 file format
    mapping = {'Walking':0,'Jogging':1,'Upstairs':2,'Downstairs':3,'Sitting':4,'Standing':5} # old activity Id to new activity Id 
    for i in  ['Walking','Jogging','Upstairs','Downstairs','Sitting','Standing']:
        dataCollection.loc[dataCollection.activity == i, 'activity'] = mapping[i]

    return dataCollection

In [10]:
dataset_reset=reset_label(dataset)

In [11]:
dataset_reset.head()

Unnamed: 0,user-id,activity,timestamp,x-axis,y-axis,z-axis
0,33,1,49105962326000,-0.694638,12.680544,0.503953
1,33,1,49106062271000,5.012288,11.264028,0.953424
2,33,1,49106112167000,4.903325,10.882658,-0.081722
3,33,1,49106222305000,-0.612916,18.496431,3.023717
4,33,1,49106332290000,-1.18497,12.108489,7.205164


In [12]:
def class_breakdown(data,col):
    # group data by the class value and calculate the number of rows
    counts = data.groupby(col).size()
    # retrieve raw rows
    counts = counts.values
    # summarize
    for i in range(len(counts)):
        percent = counts[i] / len(data) * 100
        print('Class=%d, total=%d, percentage=%.3f' % (i+1, counts[i], percent))

In [13]:
class_breakdown(dataset_reset,'activity')

Class=1, total=424397, percentage=38.645
Class=2, total=342176, percentage=31.158
Class=3, total=122869, percentage=11.188
Class=4, total=100427, percentage=9.145
Class=5, total=59939, percentage=5.458
Class=6, total=48395, percentage=4.407


In [14]:
X=dataset_reset.drop(['activity'],axis=1)
y=dataset_reset['activity']

#### code for remove class imbalance

In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
def class_balance(X,y):
    oversample = SMOTE()
    y = LabelEncoder().fit_transform(y)
    X, y = oversample.fit_resample(X, y)
    return X,y

#### code for data segmentation into windows

In [16]:
# ##code_1:
# def windows(data, size):
#     start = 0
#     while start < data.count():
#         yield int(start), int(start + size)
#         start += (size / 2)

# def segment_signal(data,window_size):
#     segments = np.empty((0,window_size,3))
#     labels = np.empty((0))
#     for (start, end) in windows(data["timestamp"], window_size):
#         x = data["x-axis"][start:end]
#         y = data["y-axis"][start:end]
#         z = data["z-axis"][start:end]
#         if(len(dataset["timestamp"][start:end]) == window_size and len(y)==window_size and len(x)==window_size and len(z)==window_size):
#             segments = np.vstack([segments,np.dstack([x,y,z])])
#             labels = np.append(labels,stats.mode(data["activity"][start:end])[0][0])
#     return {'input' : np.asarray(segments), 'label': labels}

In [17]:
# code_2
def segment_signal(data,window_size):
    segments = np.empty((0,window_size,3))
    labels = np.empty((0))
    start=0
    while start+window_size < data.shape[0]:
        end=start + window_size
        x = data["x-axis"][start:end]
        y = data["y-axis"][start:end]
        z = data["z-axis"][start:end]
        # if(len(dataset["timestamp"][start:end]) == window_size and len(y)==window_size and len(x)==window_size and len(z)==window_size):
        segments = np.vstack([segments,np.dstack([x,y,z])])
        labels = np.append(labels,stats.mode(data["activity"][start:end])[0][0])
        start+=int(window_size/2)
    return {'input' : np.asarray(segments), 'label': labels}

In [18]:
#SEGMENT DATA, LABELS INTO WINDOW_SIZE
window_size=128

### save the data in h5 format


In [20]:
def save_data(data,file_name): 
    f = h5py.File(file_name,'w')
    for key in data:
        print(key)
        f.create_dataset(key,data = data[key])       
    f.close()
    print('Done.')    

### sample the whole data first, then segmented & saved:

In [21]:
#data sampled
X_sampled,y_sampled=class_balance(X,y)

In [22]:
data_sampled=pd.concat([pd.DataFrame(X_sampled),pd.DataFrame(y_sampled,columns = ['activity'])],axis=1)

In [23]:
class_breakdown(data_sampled,'activity')

Class=1, total=424397, percentage=16.667
Class=2, total=424397, percentage=16.667
Class=3, total=424397, percentage=16.667
Class=4, total=424397, percentage=16.667
Class=5, total=424397, percentage=16.667
Class=6, total=424397, percentage=16.667


In [24]:
# data segmented
data_segmented=segment_signal(data_sampled,window_size)

In [25]:
file="wisdm_sampled_segmented.h5"
save_data(data_segmented,file)

input
label
Done.
