<div class="alert" style="background-color:#29C5F6; color:white; padding:0px 10px; border-radius:5px;">
    <h1 style='margin:15px 15px; color:#000000; font-size:32px'><b>Data Generation (Processing)</b></h1>
        <h2 style='margin:15px 15px; color:#000000; font-size:24px'>Human Activity Recognition Problem</h2>
            <div style='color:#000000'>
                <ul>
                  <li><b>WISDM - WIreless Sensor Data Mining</b></li>
                  <li>UCI HAR - Human Activity Recognition using Smartphones at UCI</li>
                  <li>MotionSense</li>
                </ul>
            </div>
</div>

The work is under the **"Master Thesis"** by **Chau Tran** with the supervision from **Prof. Roland Olsson**.

<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>6_1. WISDM v1.1 (2012)</h3></div>
<div>
    <p>
        Source: <a href="url">https://www.cis.fordham.edu/wisdm/dataset.php#activityprediction</a> <br>
        Raw's format: <b>[user],[activity],[timestamp],[x-acceleration],[y-accel],[z-accel]</b> <br>
        Number of samples for non-hand-oriented activities (6 activities): <b>1,098,207</b><br>
    </p> 
    <ul>
      <li>Walking:  424,400</li>
      <li>Jogging:  342,177</li>
      <li>UpStairs: 122,869</li>
      <li>Downstairs: 100,427</li>
      <li>Sitting:  59,939</li>
      <li><b>Standing: 48,395</b></li>
    </ul> 
    <p>Fields:<br></p>
    <ul>
      <li>user: 1..36</li>
      <li>activity: {Walking, Jogging, Sitting, Standing, UpDownstairs, <b>Downstairs</b>}</li>
      <li>timestamp: microsecond (Unix Time)</li>
      <li>x-accel: floating-point values between -20 .. 20</li>
      <li>y-accel: floating-point values between -20 .. 20</li>
      <li>z-accel: floating-point values between -20 .. 20</li>
    </ul>
    <p> The acceleration in the x direction as measured by the android phone's accelerometer. A value of 10 = 1g = 9.81 m/s^2, and 0 = no acceleration. The acceleration recorded includes gravitational acceleration toward the center of the Earth, so that when the phone is at rest on a flat surface the vertical axis will register +-10. <br></p>
</div>

In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import sys, os

TIME_STEPS_arr = [90, 60, 50, 40]
isSTEPS_arr = [True, False]
SPLIT = 0.5
NO_IN, NO_OUT = 3, 6
COLUMNS = ['x_axis', 'y_axis', 'z_axis']

def divideData_perUser(data, per=0.5):
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()
    X_df = pd.DataFrame()
    for user in np.unique(data['user']):
        dataPerUser = data[data['user']==user]
        for tag in np.unique(dataPerUser['activity']):
            dataPerActivity = dataPerUser[dataPerUser['activity']==tag]
            n = len(dataPerActivity)
            train_df = train_df.append(dataPerActivity[0:int(n*per)])
            val_df = val_df.append(dataPerActivity[int(n*per):int(n)])
            X_df = X_df.append(dataPerActivity)        
    return X_df, train_df, val_df

# Utils functions for segmenting windows
def windows(data,window_size,step):
    start = 0
    while start< data.count():
        yield int(start), int(start + window_size)
        start+= step
def segment_signal(data, window_size = 90, step=40,columns=[]):
    segments = np.empty((0,window_size,len(columns)))
    labels= np.empty((0))
    for user in np.unique(data['user']):
        userdata = data[(data.user == user)]
        for tag in np.unique(userdata['activity']):
            sub_class_data = userdata[(userdata.activity == tag)]
            for (start, end) in windows(pd.Series(sub_class_data.index.values),window_size,step):
                if end > sub_class_data.shape[0] - 1:
                    end = sub_class_data.shape[0]
                    true_length = end - start
                    remaining_data_length = window_size - true_length
                    start -= remaining_data_length
                if (sub_class_data[start:end].isnull().values.any()):
                    print(sub_class_data[start:end].isnull().sum())
                if(sub_class_data[start:end].shape[0] == window_size):
                    segments = np.vstack([segments,np.dstack([sub_class_data[column][start:end] for column in columns])])
                    labels = np.append(labels, tag) 
    return segments, labels.reshape(-1, 1)

wisdmdataset_path = '../../../../Datasets/6_har/0_WISDM/WISDM_ar_v1.1'
os.mkdir(f"{wisdmdataset_path}/WISDM_ar_v1.1_processed/WISDM_ar_v1.1_wt_overlap/") if os.path.isdir(f"{wisdmdataset_path}/WISDM_ar_v1.1_processed/WISDM_ar_v1.1_wt_overlap/") == False else None
os.mkdir(f"{wisdmdataset_path}/WISDM_ar_v1.1_processed/WISDM_ar_v1.1_w_overlap/") if os.path.isdir(f"{wisdmdataset_path}/WISDM_ar_v1.1_processed/WISDM_ar_v1.1_w_overlap/") == False  else None

print(f'{wisdmdataset_path}/WISDM_ar_v1.1_source/WISDM_ar_v1.1_raw.txt')
wisdm_phone_data = pd.read_csv(f'{wisdmdataset_path}/WISDM_ar_v1.1_source/WISDM_ar_v1.1_raw.txt', header=None, names=['user', 'activity', 'timestamp', 'x_axis', 'y_axis', 'z_axis'])
wisdm_phone_data.z_axis.replace(regex=True, inplace=True, to_replace=r';', value=r'')
wisdm_phone_data['x_axis'] = wisdm_phone_data.x_axis.astype(np.float64)
wisdm_phone_data['y_axis'] = wisdm_phone_data.y_axis.astype(np.float64)
wisdm_phone_data['z_axis'] = wisdm_phone_data.z_axis.astype(np.float64)
wisdm_phone_data['timestamp'].apply(lambda x: float(x))
wisdm_phone_data.dropna(axis=0, how='any', inplace=True)
# {Walking - A - 1, Jogging - B - 2, Up/DownStairs - C - 5, Sitting - D - 3, Standing - E - 4, LyingDown - ?? - 6} There is a typo mistake in Downstair so it is the number 5 in the current datasets (from 0 to 5)
mapping_dict = {'Walking': 1,'Jogging': 2, 'Sitting': 3, 'Standing': 4, 'Stairs': 5, 'Upstairs': 5, 'Downstairs ': 5, 'LyingDown': 6}
# ['Downstairs', 'Jogging', 'Sitting', 'Standing', 'Upstairs','Walking']
display(wisdm_phone_data[(wisdm_phone_data['user'] == 33) & (wisdm_phone_data['activity'] == 'Downstairs')])
display(np.unique(wisdm_phone_data['activity'].isnull()))
display(np.unique(wisdm_phone_data['activity']))
display(wisdm_phone_data.iloc[[1093538]])
wisdm_phone_data['activity'] = wisdm_phone_data.activity.map(mapping_dict)
display(np.unique(wisdm_phone_data['activity'].isnull()))
display(np.unique(wisdm_phone_data['activity']))
display(wisdm_phone_data[(wisdm_phone_data['user'] == 33) & (wisdm_phone_data['activity'] != 1.) 
                             & ( wisdm_phone_data['activity'] != 2. ) & ( wisdm_phone_data['activity'] != 3. ) 
                             & ( wisdm_phone_data['activity'] != 4. ) & ( wisdm_phone_data['activity'] != 5. )])
# display(wisdm_phone_data[(wisdm_phone_data['activity'] == NaN)])
display(np.isnan(wisdm_phone_data['activity']))
# display(wisdm_phone_data.loc[:, ~wisdm_phone_data.columns.isin(['timestamp'])].head(5))
sys.exit()

X_df, train_df, val_df = divideData_perUser(wisdm_phone_data, SPLIT)

for isSTEPS in isSTEPS_arr:
    for TIME_STEPS in TIME_STEPS_arr:
        STEP = int(round(TIME_STEPS/2,-1)) if isSTEPS else TIME_STEPS
        print(TIME_STEPS, STEP)

        X, y = segment_signal(X_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_train, y_train = segment_signal(train_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_val, y_val = segment_signal(val_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)

        enc = OneHotEncoder().fit(np.array(list(mapping_dict.values())).reshape(-1,1))
        y_train = enc.transform(y_train).toarray()
        y_val   = enc.transform(y_val).toarray()
        y       = enc.transform(y).toarray()

        y_train = np.tile(y_train, TIME_STEPS).reshape((y_train.shape[0], TIME_STEPS, y_train.shape[1]))
        y_val   = np.tile(y_val, TIME_STEPS).reshape((y_val.shape[0], TIME_STEPS, y_val.shape[1]))
        y       = np.tile(y, TIME_STEPS).reshape((y.shape[0], TIME_STEPS, y.shape[1]))

        df_train = np.concatenate((X_train, y_train), axis=2).reshape((X_train.shape[0], -1))
        df_val = np.concatenate((X_val, y_val), axis=2).reshape((X_val.shape[0], -1))
        df = np.concatenate((X,y), axis=2).reshape((X.shape[0], -1))
        
        print(X_train.shape, y_train.shape, df_train.shape)
        print(X_val.shape, y_val.shape, df_val.shape)
        print(X.shape, y.shape, df.shape)

        wisdm_phone_result_path = f"{wisdmdataset_path}/WISDM_ar_v1.1_processed/WISDM_ar_v1.1_wt_overlap" if TIME_STEPS==STEP else f"{wisdmdataset_path}/WISDM_ar_v1.1_processed/WISDM_ar_v1.1_w_overlap"
        
        with open(fr'{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv','w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv",'a') as csvfile:
            np.savetxt(csvfile, df, fmt='%.4f', delimiter=",")

        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv",'a') as csvfile:
            np.savetxt(csvfile, df_train, fmt='%.4f', delimiter=",")

        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv",'a') as csvfile:
            np.savetxt(csvfile, df_val, fmt='%.4f', delimiter=",")

../../../../Datasets/6_har/0_WISDM/WISDM_ar_v1.1/WISDM_ar_v1.1_source/WISDM_ar_v1.1_raw.txt


Unnamed: 0,user,activity,timestamp,x_axis,y_axis,z_axis
1769,33,Downstairs,49646322311000,-0.040861,4.985047,6.510526
1770,33,Downstairs,49646422317000,-0.463092,4.372132,7.436710
1771,33,Downstairs,49646522323000,-0.299648,4.603678,6.510526
1772,33,Downstairs,49646572281000,-0.272407,4.481094,6.360703
1773,33,Downstairs,49646672317000,-1.525479,5.175732,7.164303
...,...,...,...,...,...,...
52296,33,Downstairs,10455565318000,1.610000,9.000000,-1.525479
52297,33,Downstairs,10455565318000,1.610000,9.000000,-1.525479
52298,33,Downstairs,10455565318000,1.610000,9.000000,-1.525479
52299,33,Downstairs,10455622844000,0.990000,9.720000,-2.179256


array([False])

array(['Downstairs', 'Jogging', 'Sitting', 'Standing', 'Upstairs',
       'Walking'], dtype=object)

Unnamed: 0,user,activity,timestamp,x_axis,y_axis,z_axis
1093539,19,Downstairs,131357191438000,-7.01,9.43,0.91


array([False])

array([1, 2, 3, 4, 5], dtype=int64)

Unnamed: 0,user,activity,timestamp,x_axis,y_axis,z_axis


0          False
1          False
2          False
3          False
4          False
           ...  
1098204    False
1098205    False
1098206    False
1098207    False
1098208    False
Name: activity, Length: 1098208, dtype: bool

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>6_1. WISDM v2.0 (2013) - Drop -> Too much noises</h3></div>
<div>
    <p>
        Source: <a href="url">https://www.cis.fordham.edu/wisdm/dataset.php#actitracker</a> <br>
        Raw's format: <b>[user],[activity],[timestamp],[x-acceleration],[y-accel],[z-accel]</b> <br>
        Number of samples for non-hand-oriented activities (6 activities): <b>2.980.765</b><br>
    </p> 
    <ul>
      <li>Walking:    1,255,923</li>
      <li>Jogging:    438,871</li>
      <li>Stairs:     57,425</li>
      <li>Sitting:    663,706</li>
      <li>Standing:   288,873</li>
      <li><b>LyingDown</b>: 275,967</li>
    </ul> 
    <p>Fields:<br></p>
    <ul>
      <li>user: 1..225????</li>
      <li>activity: {Walking, Jogging, Sitting, Standing, Up/DownStairs, <b>LyingDown</b>}</li>
      <li>timestamp: microsecond (Unix Time)</li>
      <li>x-accel: floating-point values between -20 .. 20</li>
      <li>y-accel: floating-point values between -20 .. 20</li>
      <li>z-accel: floating-point values between -20 .. 20</li>
    </ul>
    <p> The acceleration in the x direction as measured by the android phone's accelerometer. A value of 10 = 1g = 9.81 m/s^2, and 0 = no acceleration. The acceleration recorded includes gravitational acceleration toward the center of the Earth, so that when the phone is at rest on a flat surface the vertical axis will register +-10. <br></p>
</div>

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import sys, os

TIME_STEPS_arr = [90, 60, 50, 40]
isSTEPS_arr = [True, False]
SPLIT = 0.5
NO_IN, NO_OUT = 3, 6
COLUMNS = ['x_axis', 'y_axis', 'z_axis']

def divideData_perUser(data, per=0.5):
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()
    X_df = pd.DataFrame()
    for user in np.unique(data['user']):
        dataPerUser = data[data['user']==user]
        for tag in np.unique(dataPerUser['activity']):
            dataPerActivity = dataPerUser[dataPerUser['activity']==tag]
            n = len(dataPerActivity)
            train_df = train_df.append(dataPerActivity[0:int(n*per)])
            val_df = val_df.append(dataPerActivity[int(n*per):int(n)])
            X_df = X_df.append(dataPerActivity)        
    return X_df, train_df, val_df

# Utils functions for segmenting windows
def windows(data,window_size,step):
    start = 0
    while start< data.count():
        yield int(start), int(start + window_size)
        start+= step
def segment_signal(data, window_size = 90, step=40,columns=[]):
    segments = np.empty((0,window_size,len(columns)))
    labels= np.empty((0))
    for user in np.unique(data['user']):
        userdata = data[(data.user == user)]
        for tag in np.unique(userdata['activity']):
            sub_class_data = userdata[(userdata.activity == tag)]
            for (start, end) in windows(pd.Series(sub_class_data.index.values),window_size,step):
                if end > sub_class_data.shape[0] - 1:
                    end = sub_class_data.shape[0]
                    true_length = end - start
                    remaining_data_length = window_size - true_length
                    start -= remaining_data_length
                if (sub_class_data[start:end].isnull().values.any()):
                    print(sub_class_data[start:end].isnull().sum())
                if(sub_class_data[start:end].shape[0] == window_size):
                    segments = np.vstack([segments,np.dstack([sub_class_data[column][start:end] for column in columns])])
                    labels = np.append(labels, tag) 
    return segments, labels.reshape(-1, 1)

wisdmdataset_path = '../../../../Datasets/6_har/0_WISDM/WISDM_ar_v2.0/'
os.mkdir(f"{wisdmdataset_path}/WISDM_ar_v2_processed/WISDM_ar_v2_wt_overlap/") if os.path.isdir(f"{wisdmdataset_path}/WISDM_ar_v2_processed/WISDM_ar_v2_wt_overlap/") == False else None
os.mkdir(f"{wisdmdataset_path}/WISDM_ar_v2_processed/WISDM_ar_v2_w_overlap/") if os.path.isdir(f"{wisdmdataset_path}/WISDM_ar_v2_processed/WISDM_ar_v2_w_overlap/") == False else None

wisdm_phone_data = pd.read_csv(f'{wisdmdataset_path}/WISDM_ar_v2_source/WISDM_at_v2.0_raw.txt', header=None, names=['user', 'activity', 'timestamp', 'x_axis', 'y_axis', 'z_axis'])
wisdm_phone_data.dropna(axis=0, how='any', inplace=True)
wisdm_phone_data.z_axis.replace(regex=True, inplace=True, to_replace=r';', value=r'')
wisdm_phone_data['x_axis'] = wisdm_phone_data.x_axis.astype(np.float64)
wisdm_phone_data['y_axis'] = wisdm_phone_data.y_axis.astype(np.float64)
wisdm_phone_data['z_axis'] = wisdm_phone_data.z_axis.astype(np.float64)
wisdm_phone_data['timestamp'].apply(lambda x: int(x))
# {Walking - A - 1, Jogging - B - 2, Up/DownStairs - C - 5, Sitting - D - 3, Standing - E - 4, LyingDown - ?? - 6}
mapping_dict = {'Walking': 1,'Jogging': 2, 'Sitting': 3, 'Standing': 4, 'Stairs': 5, 'Upstairs': 5, 'Downstairs ': 5, 'LyingDown': 6}
wisdm_phone_data['activity'] = wisdm_phone_data.activity.map(mapping_dict)

X_df, train_df, val_df = divideData_perUser(wisdm_phone_data, SPLIT)

for isSTEPS in isSTEPS_arr:
    for TIME_STEPS in TIME_STEPS_arr:
        STEP = int(round(TIME_STEPS/2,-1)) if isSTEPS else TIME_STEPS
        print(TIME_STEPS, STEP)

        X, y = segment_signal(X_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_train, y_train = segment_signal(train_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_val, y_val = segment_signal(val_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)

        enc = OneHotEncoder().fit(np.array(list(mapping_dict.values())).reshape(-1,1))
        y_train = enc.transform(y_train).toarray()
        y_val   = enc.transform(y_val).toarray()
        y       = enc.transform(y).toarray()

        y_train = np.tile(y_train, TIME_STEPS).reshape((y_train.shape[0], TIME_STEPS, y_train.shape[1]))
        y_val   = np.tile(y_val, TIME_STEPS).reshape((y_val.shape[0], TIME_STEPS, y_val.shape[1]))
        y       = np.tile(y, TIME_STEPS).reshape((y.shape[0], TIME_STEPS, y.shape[1]))

        df_train = np.concatenate((X_train, y_train), axis=2).reshape((X_train.shape[0], -1))
        df_val = np.concatenate((X_val, y_val), axis=2).reshape((X_val.shape[0], -1))
        df = np.concatenate((X,y), axis=2).reshape((X.shape[0], -1))
        
        print(X_train.shape, y_train.shape, df_train.shape)
        print(X_val.shape, y_val.shape, df_val.shape)
        print(X.shape, y.shape, df.shape)
        
        wisdm_phone_result_path = f"{wisdmdataset_path}WISDM_ar_v2_processed/WISDM_ar_v2_wt_overlap/" if TIME_STEPS==STEP else f"{wisdmdataset_path}WISDM_ar_v2_processed/WISDM_ar_v2_w_overlap/"

        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv",'a') as csvfile:
            np.savetxt(csvfile, df, fmt='%.4f', delimiter=",")

        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv",'a') as csvfile:
            np.savetxt(csvfile, df_train, fmt='%.4f', delimiter=",")

        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv",'a') as csvfile:
            np.savetxt(csvfile, df_val, fmt='%.4f', delimiter=",")

  exec(code_obj, self.user_global_ns, self.user_ns)


90 40
(37441, 90, 3) (37441, 90, 6) (37441, 810)
(37447, 90, 3) (37447, 90, 6) (37447, 810)
(74695, 90, 3) (74695, 90, 6) (74695, 810)
60 30
(49906, 60, 3) (49906, 60, 6) (49906, 540)
(49911, 60, 3) (49911, 60, 6) (49911, 540)
(99580, 60, 3) (99580, 60, 6) (99580, 540)
50 20
(74683, 50, 3) (74683, 50, 6) (74683, 450)
(74695, 50, 3) (74695, 50, 6) (74695, 450)
(149211, 50, 3) (149211, 50, 6) (149211, 450)
40 20
(74686, 40, 3) (74686, 40, 6) (74686, 360)
(74695, 40, 3) (74695, 40, 6) (74695, 360)
(149211, 40, 3) (149211, 40, 6) (149211, 360)
90 90
(16795, 90, 3) (16795, 90, 6) (16795, 810)
(16799, 90, 3) (16799, 90, 6) (16799, 810)
(33337, 90, 3) (33337, 90, 6) (33337, 810)
60 60
(25059, 60, 3) (25059, 60, 6) (25059, 540)
(25062, 60, 3) (25062, 60, 6) (25062, 540)
(49915, 60, 3) (49915, 60, 6) (49915, 540)
50 50
(29962, 50, 3) (29962, 50, 6) (29962, 450)
(29967, 50, 3) (29967, 50, 6) (29967, 450)
(59783, 50, 3) (59783, 50, 6) (59783, 450)
40 40
(37470, 40, 3) (37470, 40, 6) (37470, 360)


<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>6_1. WISDM v3.0</h3></div>
<div>
    <p>
        Source: <a href="url">https://archive.ics.uci.edu/ml/datasets/WISDM+Smartphone+and+Smartwatch+Activity+and+Biometrics+Dataset+</a> <br>
        Raw's format: <b>[subject-id],[activity],[timestamp],[x-accel],[y-accel],[z-accel]</b> <br>
        Number of samples for non-hand-oriented activities (5 activities): <b>4,347,890</b><br>
    </p> 
    <ul>
      <li>Phone acceleration: 1,338,067</li>
      <li>Watch acceleration: 1,053,141</li>
      <li>Phone gyroscope:    1,006,749</li>
      <li>Watch gyroscope:    0,949,933</li>
    </ul> 
    <p>Fields:<br></p>
    <ul>
      <li>subject-id: 1600..1650 (51 participants)</li>
      <li>activity: {Walking - <b>A</b>, Jogging - <b>B</b>, Stairs - <b>C</b>, Sitting - <b>D</b>, Standing - <b>E</b>}</li>
      <li>timestamp: microsecond (Unix Time)</li>
      <li>x-accel: floating-point (can be positive or negative)</li>
      <li>y-accel: floating-point (can be positive or negative)</li>
      <li>z-accel: floating-point (can be positive or negative)</li>
    </ul>
    <p> For the accelerometer sensor, the units are m/s2; while, for the gyroscope sensor, the units are radians/s. The force of gravity on Earth, which affects the accelerometer readings, is 9.8m/s2. <br></p>
</div>

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import sys, glob, os

TIME_STEPS_arr = [90, 60, 50, 40]
isSTEPS_arr = [True, False]
SPLIT = 0.5
NO_IN, NO_OUT = 6, 6
COLUMNS = ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']
activities_arr = ['A', 'B', 'C', 'D', 'E']

def divideData_perUser(data, per=0.5):
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()
    X_df = pd.DataFrame()
    for user in np.unique(data['user']):
        dataPerUser = data[data['user']==user]
        for tag in np.unique(dataPerUser['activity']):
            # if tag in activities_arr:
            dataPerActivity = dataPerUser[dataPerUser['activity']==tag]
            n = len(dataPerActivity)
            train_df = train_df.append(dataPerActivity[0:int(n*per)])
            val_df = val_df.append(dataPerActivity[int(n*per):int(n)])
            X_df = X_df.append(dataPerActivity)        
    return X_df, train_df, val_df

# Utils functions for segmenting windows
def windows(data,window_size,step):
    start = 0
    while start< data.count():
        yield int(start), int(start + window_size)
        start+= step
def segment_signal(data, window_size = 90, step=40,columns=[]):
    segments = np.empty((0,window_size,len(columns)))
    labels= np.empty((0))
    for user in np.unique(data['user']):
        userdata = data[(data.user == user)]
        for tag in np.unique(userdata['activity']):
            sub_class_data = userdata[(userdata.activity == tag)]
            for (start, end) in windows(pd.Series(sub_class_data.index.values),window_size,step):
                if end > sub_class_data.shape[0] - 1:
                    end = sub_class_data.shape[0]
                    true_length = end - start
                    remaining_data_length = window_size - true_length
                    start -= remaining_data_length
                if (sub_class_data[start:end].isnull().values.any()):
                    print(sub_class_data[start:end].isnull().sum())
                if(sub_class_data[start:end].shape[0] == window_size):
                    segments = np.vstack([segments,np.dstack([sub_class_data[column][start:end] for column in columns])])
                    labels = np.append(labels, tag) 
    return segments, labels.reshape(-1, 1)

wisdmdataset_path = '../../../../Datasets/6_har/0_WISDM/WISDM_ar_v3.0/'
os.mkdir(f"{wisdmdataset_path}/WISDM_ar_v3_processed/WISDM_ar_v3_wt_overlap/") if os.path.isdir(f"{wisdmdataset_path}/WISDM_ar_v3_processed/WISDM_ar_v3_wt_overlap/") == False else None
os.mkdir(f"{wisdmdataset_path}/WISDM_ar_v3_processed/WISDM_ar_v3_w_overlap/") if os.path.isdir(f"{wisdmdataset_path}/WISDM_ar_v3_processed/WISDM_ar_v3_w_overlap/") == False else None

wisdm_phone_accel_path = f'{wisdmdataset_path}/WISDM_ar_v3_source/raw/phone/accel'
wisdm_phone_accel_files_mask = os.path.join(wisdm_phone_accel_path, '*.txt')
wisdm_phone_accel_files = sorted(glob.glob(wisdm_phone_accel_files_mask))

wisdm_phone_gyro_path = f'{wisdmdataset_path}/WISDM_ar_v3_source/raw/phone/gyro'
wisdm_phone_gyro_files_mask = os.path.join(wisdm_phone_gyro_path, '*.txt')
wisdm_phone_gyro_files = sorted(glob.glob(wisdm_phone_gyro_files_mask))

wisdm_phone_data = pd.DataFrame()
count = 0
for accel_file, gyro_file in zip(wisdm_phone_accel_files, wisdm_phone_gyro_files):
    accel_data = pd.read_csv(accel_file, header=None, names=['user', 'activity', 'timestamp', 'x_accel', 'y_accel', 'z_accel'], index_col=['user', 'activity', 'timestamp'])
    accel_data.z_accel.replace(regex=True, inplace=True, to_replace=r';', value=r'')
    accel_data = accel_data.loc[~accel_data.index.duplicated(keep='first')]
    gyro_data = pd.read_csv(gyro_file, header=None, names=['user', 'activity', 'timestamp', 'x_gyro', 'y_gyro', 'z_gyro'], index_col=['user', 'activity', 'timestamp'])
    gyro_data.z_gyro.replace(regex=True, inplace=True, to_replace=r';', value=r'')
    gyro_data = gyro_data.loc[~gyro_data.index.duplicated(keep='first')]
    user_data = pd.concat([accel_data, gyro_data], axis=1).dropna()
    wisdm_phone_data = wisdm_phone_data.append(user_data)


wisdm_phone_data = wisdm_phone_data.reset_index()
wisdm_phone_data['x_accel'] = wisdm_phone_data.x_accel.astype(np.float64)
wisdm_phone_data['y_accel'] = wisdm_phone_data.y_accel.astype(np.float64)
wisdm_phone_data['z_accel'] = wisdm_phone_data.z_accel.astype(np.float64)
wisdm_phone_data['x_gyro'] = wisdm_phone_data.x_gyro.astype(np.float64)
wisdm_phone_data['y_gyro'] = wisdm_phone_data.y_gyro.astype(np.float64)
wisdm_phone_data['z_gyro'] = wisdm_phone_data.z_accel.astype(np.float64)
wisdm_phone_data['timestamp'].apply(lambda x: float(x))
wisdm_phone_data.dropna(axis=0, how='any', inplace=True)
wisdm_phone_data = wisdm_phone_data[wisdm_phone_data.activity.isin(activities_arr) == True].reset_index()
# {Walking - A - 1, Jogging - B - 2, Up/DownStairs - C - 5, Sitting - D - 3, Standing - E - 4, LyingDown - Z - 6}
mapping_dict = {'A': 1,'B': 2, 'D': 3, 'E': 4, 'C': 5 , 'Z': 6}
wisdm_phone_data['activity'] = wisdm_phone_data.activity.map(mapping_dict)

X_df, train_df, val_df = divideData_perUser(wisdm_phone_data, SPLIT)

for isSTEPS in isSTEPS_arr:
    for TIME_STEPS in TIME_STEPS_arr:
        STEP = int(round(TIME_STEPS/2,-1)) if isSTEPS else TIME_STEPS
        print(TIME_STEPS, STEP)

        X_train, y_train = segment_signal(train_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_val, y_val = segment_signal(val_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X, y = segment_signal(X_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        
        enc = OneHotEncoder().fit(np.array(list(mapping_dict.values())).reshape(-1,1))
        y_train = enc.transform(y_train).toarray()
        y_val   = enc.transform(y_val).toarray()
        y       = enc.transform(y).toarray()

        y_train = np.tile(y_train, TIME_STEPS).reshape((y_train.shape[0], TIME_STEPS, y_train.shape[1]))
        y_val   = np.tile(y_val, TIME_STEPS).reshape((y_val.shape[0], TIME_STEPS, y_val.shape[1]))
        y       = np.tile(y, TIME_STEPS).reshape((y.shape[0], TIME_STEPS, y.shape[1]))

        df_train = np.concatenate((X_train, y_train), axis=2).reshape((X_train.shape[0], -1))
        df_val = np.concatenate((X_val, y_val), axis=2).reshape((X_val.shape[0], -1))
        df = np.concatenate((X,y), axis=2).reshape((X.shape[0], -1))
        
        print(X_train.shape, y_train.shape, df_train.shape)
        print(X_val.shape, y_val.shape, df_val.shape)
        print(X.shape, y.shape, df.shape)
        
        wisdm_phone_result_path = f"{wisdmdataset_path}WISDM_ar_v3_processed/WISDM_ar_v3_wt_overlap/" if TIME_STEPS==STEP else f"{wisdmdataset_path}WISDM_ar_v3_processed/WISDM_ar_v3_w_overlap/"
        
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv",'a') as csvfile:
            np.savetxt(csvfile, df, fmt='%.4f', delimiter=",")

        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv",'a') as csvfile:
            np.savetxt(csvfile, df_train, fmt='%.4f', delimiter=",")

        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv",'w') as csvfile:
            np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]),fmt='%d', delimiter=",")
        with open(fr"{wisdm_phone_result_path}/wisdm.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv",'a') as csvfile:
            np.savetxt(csvfile, df_val, fmt='%.4f', delimiter=",")

90 40
(9561, 90, 6) (9561, 90, 6) (9561, 1080)
(9561, 90, 6) (9561, 90, 6) (9561, 1080)
(19064, 90, 6) (19064, 90, 6) (19064, 1080)
60 30
(12742, 60, 6) (12742, 60, 6) (12742, 720)
(12742, 60, 6) (12742, 60, 6) (12742, 720)
(25311, 60, 6) (25311, 60, 6) (25311, 720)
50 20
(19061, 50, 6) (19061, 50, 6) (19061, 600)
(19061, 50, 6) (19061, 50, 6) (19061, 600)
(38022, 50, 6) (38022, 50, 6) (38022, 600)
40 20
(19064, 40, 6) (19064, 40, 6) (19064, 480)
(19064, 40, 6) (19064, 40, 6) (19064, 480)
(38025, 40, 6) (38025, 40, 6) (38025, 480)
90 90
(4287, 90, 6) (4287, 90, 6) (4287, 1080)
(4287, 90, 6) (4287, 90, 6) (4287, 1080)
(8517, 90, 6) (8517, 90, 6) (8517, 1080)
60 60
(6380, 60, 6) (6380, 60, 6) (6380, 720)
(6380, 60, 6) (6380, 60, 6) (6380, 720)
(12744, 60, 6) (12744, 60, 6) (12744, 720)
50 50
(7671, 50, 6) (7671, 50, 6) (7671, 600)
(7671, 50, 6) (7671, 50, 6) (7671, 600)
(15300, 50, 6) (15300, 50, 6) (15300, 600)
40 40
(9565, 40, 6) (9565, 40, 6) (9565, 480)
(9565, 40, 6) (9565, 40, 6) (9