<div class="alert" style="background-color:#29C5F6; color:white; padding:0px 10px; border-radius:5px;">
    <h1 style='margin:15px 15px; color:#000000; font-size:32px'><b>Data Generation (Processing)</b></h1>
        <h2 style='margin:15px 15px; color:#000000; font-size:24px'>Human Activity Recognition Problem</h2>
            <div style='color:#000000'>
                <ul>
                  <li>WISDM - WIreless Sensor Data Mining</li>
                  <li><b>UCI HAR - Human Activity Recognition using Smartphones at UCI</b></li>
                  <li>MotionSense</li>
                </ul>
            </div>
</div>

This work is part of the **Master's thesis** by **Chau Tran**, supervised by **Prof. Roland Olsson**.

<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>6_2. UCI-HAR</h3></div>
<div>
    <p>
        Source: <a href="https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones">https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones</a> <br>
        Raw label file's format: <b>[experiment],[user],[activity],[start_point],[end_point]</b> <br>
        Number of samples for non-hand-oriented activities (6 activities): <b>747,550</b><br>
    </p> 
    <ul>
      <li>Walking (1):      121,964</li>
      <li>DownStairs (2):   116,524</li>
      <li>UpStairs (3):     107,775</li>
      <li>Sitting (4):      126,557</li>
      <li>Standing (5):     137,985</li>
      <li><b>LyingDown (6): 136,745</b></li>
    </ul> 
    <p>Fields:<br></p>
    <ul>
      <li>user: 1..30</li>
      <li>activity: {Walking, Upstairs, Sitting, Standing, Downstairs, <b>LyingDown</b>}</li>
      <li>x-accel: floating-point values between -20 .. 20</li>
      <li>y-accel: floating-point values between -20 .. 20</li>
      <li>z-accel: floating-point values between -20 .. 20</li>
      <li>x-gyro: floating-point values between -20 .. 20</li>
      <li>y-gyro: floating-point values between -20 .. 20</li>
      <li>z-gyro: floating-point values between -20 .. 20</li>
    </ul>
    <p> The acceleration in the x direction as measured by the android phone's accelerometer. A value of 10 = 1g = 9.81 m/s^2, and 0 = no acceleration. The acceleration recorded includes gravitational acceleration toward the center of the Earth, so that when the phone is at rest on a flat surface the vertical axis will register +-10. <br></p>
</div>

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import sys, os

# Window lengths (number of consecutive samples per segment) to generate datasets for.
TIME_STEPS_arr = [90, 60, 50, 40]
# True  -> overlapping windows (stride is roughly half the window length);
# False -> non-overlapping windows (stride equals the window length).
isSTEPS_arr = [True, False]
# Fraction of every (user, activity) group that goes into the training split.
SPLIT = 0.5
# Sensor channels stacked into each window, in this order.
COLUMNS = ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']
# Input / output counts; written as the first row of every CSV and embedded in the file names.
NO_IN = 6
# NOTE(review): the activity mapping used further down collapses two classes into one, so the
# one-hot labels end up with 5 columns while this constant says 6 — confirm which is intended.
NO_OUT = 6

def divideData_perUser(data, per=0.5):
    """Split every (user, activity) group, in row order, into train and validation parts.

    Groups are visited in sorted order of ``user`` and, within a user, sorted
    order of ``activity`` (``np.unique`` sorts its output). For each group the
    first ``int(len(group) * per)`` rows go to the training frame and the
    remaining rows go to the validation frame. Original index labels are kept.

    Args:
        data: DataFrame with at least the columns ``'user'`` and ``'activity'``.
        per: fraction of each group assigned to the training part.

    Returns:
        Tuple ``(X_df, train_df, val_df)``: all rows regrouped by
        (user, activity), the training rows, and the validation rows.
        If ``data`` has no rows, three empty frames with the columns of
        ``data`` are returned.
    """
    # Collect the slices and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and copied the whole accumulated frame on every call.
    all_parts = []
    train_parts = []
    val_parts = []
    for user in np.unique(data['user']):
        dataPerUser = data[data['user'] == user]
        for tag in np.unique(dataPerUser['activity']):
            group = dataPerUser[dataPerUser['activity'] == tag]
            cut = int(len(group) * per)
            train_parts.append(group.iloc[:cut])
            val_parts.append(group.iloc[cut:])
            all_parts.append(group)

    if not all_parts:
        # pd.concat rejects an empty list; hand back empty frames instead.
        empty = data.iloc[0:0]
        return empty.copy(), empty.copy(), empty.copy()

    return pd.concat(all_parts), pd.concat(train_parts), pd.concat(val_parts)

# Utils functions for segmenting windows
def windows(data, window_size, step):
    """Yield ``(start, end)`` bounds of consecutive windows over *data*.

    Args:
        data: object exposing ``count()`` (a pandas Series at the call site);
            the value it returns bounds the window start positions.
        window_size: length of every window.
        step: distance between two consecutive window starts; must be > 0.

    Yields:
        ``(int(start), int(start + window_size))``. ``end`` may run past the
        end of the data; the caller is responsible for clamping it.

    Raises:
        ValueError: if ``step`` is not positive (the loop would never end).
    """
    if step <= 0:
        raise ValueError("step must be a positive number")
    total = data.count()  # loop-invariant, evaluate once
    start = 0
    while start < total:
        yield int(start), int(start + window_size)
        start += step


def segment_signal(data, window_size=90, step=40, columns=()):
    """Cut every (user, activity) group of *data* into fixed-length windows.

    Groups are visited in sorted ``user`` / ``activity`` order. A window that
    would run past the end of its group is shifted back so that it covers the
    last ``window_size`` rows of the group; groups shorter than
    ``window_size`` produce no windows. If a window contains missing values,
    the per-column null counts are printed (the window is still kept).

    NOTE(review): when ``step < window_size`` several trailing windows can be
    shifted onto the same final span, so the last window of a group may be
    emitted more than once. This is kept as-is so generated datasets do not
    change.

    Args:
        data: DataFrame with ``'user'``, ``'activity'`` and every name in
            ``columns``.
        window_size: number of rows per window.
        step: distance between two consecutive window starts.
        columns: names of the signal columns to stack, in output order.

    Returns:
        Tuple ``(segments, labels)`` where ``segments`` has shape
        ``(n_windows, window_size, len(columns))`` and ``labels`` has shape
        ``(n_windows, 1)`` holding the activity of each window.
    """
    columns = list(columns)
    # The empty float seed keeps the result shape/dtype identical to stacking
    # onto np.empty(...) one window at a time, without the quadratic copying.
    stacked = [np.empty((0, window_size, len(columns)))]
    tags = []
    for user in np.unique(data['user']):
        userdata = data[data['user'] == user]
        for tag in np.unique(userdata['activity']):
            sub_class_data = userdata[userdata['activity'] == tag]
            n_rows = sub_class_data.shape[0]
            positions = pd.Series(sub_class_data.index.values)
            for start, end in windows(positions, window_size, step):
                if end > n_rows - 1:
                    # Shift the window back so it ends on the last row.
                    end = n_rows
                    start = end - window_size
                chunk = sub_class_data.iloc[start:end]
                if chunk.isnull().values.any():
                    print(chunk.isnull().sum())
                if chunk.shape[0] == window_size:
                    stacked.append(np.dstack([chunk[column] for column in columns]))
                    tags.append(tag)
    segments = np.vstack(stacked)
    labels = np.append(np.empty(0), tags)
    return segments, labels.reshape(-1, 1)

ucihar_phone_path = '../../../../Datasets/6_har/1_UCI_HAR'
# Output folders for the overlapping ("w_overlap") and non-overlapping ("wt_overlap") datasets.
# makedirs(exist_ok=True) replaces the mkdir-if-missing expression and also creates the
# "uci_har_v1_hapt_processed" parent when it does not exist yet.
os.makedirs(f"{ucihar_phone_path}/uci_har_v1_hapt_processed/uci_har_v1_hapt_w_overlap/", exist_ok=True)
os.makedirs(f"{ucihar_phone_path}/uci_har_v1_hapt_processed/uci_har_v1_hapt_wt_overlap/", exist_ok=True)

# Each label row is [experiment, user, activity, start_point, end_point].
rawlabels = np.genfromtxt(f'{ucihar_phone_path}/uci_har_v1_hapt/RawData/labels.txt', delimiter=' ', dtype=np.int32)
ucihar_columns = ['user', 'activity', 'x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']

# Many label rows point at the same (experiment, user) recording, so parse each file only once.
signal_cache = {}
label_frames = []
for rawlabel in rawlabels:
    # Keep activity ids 1..6 only; larger ids are skipped
    # (presumably the postural-transition labels of this release — confirm).
    if rawlabel[2] < 7:
        recording = (int(rawlabel[0]), int(rawlabel[1]))
        if recording not in signal_cache:
            signal_cache[recording] = (
                np.genfromtxt(f'{ucihar_phone_path}/uci_har_v1_hapt/RawData/acc_exp{rawlabel[0]:02d}_user{rawlabel[1]:02d}.txt', delimiter=' ', dtype=np.float32),
                np.genfromtxt(f'{ucihar_phone_path}/uci_har_v1_hapt/RawData/gyro_exp{rawlabel[0]:02d}_user{rawlabel[1]:02d}.txt', delimiter=' ', dtype=np.float32),
            )
        acc_rawfile, gyro_rawfile = signal_cache[recording]
        n_samples = rawlabel[4] - rawlabel[3]
        # Columns: user id, activity id, 3 accelerometer axes, 3 gyroscope axes.
        rawfile = np.concatenate([np.tile(rawlabel[1], (n_samples, 1)),
                                  np.tile(rawlabel[2], (n_samples, 1)),
                                  acc_rawfile[rawlabel[3]:rawlabel[4]],
                                  gyro_rawfile[rawlabel[3]:rawlabel[4]]],
                                 axis=1, dtype=object)
        label_frames.append(pd.DataFrame(rawfile, columns=ucihar_columns))

# One concat instead of DataFrame.append per label row (append was removed in pandas 2.0
# and re-copied the whole accumulated frame on every call).
if label_frames:
    ucihar_phone_data = pd.concat(label_frames, ignore_index=True)
else:
    ucihar_phone_data = pd.DataFrame(data=None, columns=ucihar_columns)

ucihar_phone_data.dropna(axis=0, how='any', inplace=True)

# {WALKING - 1, WALKING_UPSTAIRS - 2, WALKING_DOWNSTAIRS - 3, SITTING - 4, STANDING - 5, LAYING - 6}
# Remap the raw ids: both stair activities become class 5, sitting -> 3, standing -> 4.
# NOTE(review): this leaves 5 distinct classes (1, 3, 4, 5, 6), so the one-hot labels below
# have 5 columns while NO_OUT is written to the CSV header / file name — confirm intent.
mapping_dict = {1: 1, 4: 3, 5: 4, 2: 5, 3: 5, 6: 6}
ucihar_phone_data['activity'] = ucihar_phone_data.activity.map(mapping_dict)

X_df, train_df, val_df = divideData_perUser(ucihar_phone_data, SPLIT)

# The encoder only depends on the mapping, so fit it once outside the loops.
enc = OneHotEncoder().fit(np.array(list(mapping_dict.values())).reshape(-1,1))


def _save_with_header(path, rows):
    """Write the ``NO_IN,NO_OUT`` header row followed by *rows* to the CSV at *path*."""
    with open(path, 'w') as csvfile:
        np.savetxt(csvfile, np.array([[NO_IN, NO_OUT]]), fmt='%d', delimiter=",")
        np.savetxt(csvfile, rows, fmt='%.4f', delimiter=",")


for isSTEPS in isSTEPS_arr:
    for TIME_STEPS in TIME_STEPS_arr:
        # Overlapping runs use a stride of half the window rounded to a multiple of ten.
        STEP = int(round(TIME_STEPS/2,-1)) if isSTEPS else TIME_STEPS
        print(TIME_STEPS, STEP)

        X, y = segment_signal(X_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_train, y_train = segment_signal(train_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_val, y_val = segment_signal(val_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)

        y_train = enc.transform(y_train).toarray()
        y_val   = enc.transform(y_val).toarray()
        y       = enc.transform(y).toarray()

        # Repeat each window's one-hot label for every time step: (n, classes) -> (n, TIME_STEPS, classes).
        y_train = np.tile(y_train, TIME_STEPS).reshape((y_train.shape[0], TIME_STEPS, y_train.shape[1]))
        y_val   = np.tile(y_val, TIME_STEPS).reshape((y_val.shape[0], TIME_STEPS, y_val.shape[1]))
        y       = np.tile(y, TIME_STEPS).reshape((y.shape[0], TIME_STEPS, y.shape[1]))

        # Append the labels to the signals per time step, then flatten each window to one CSV row.
        df_train = np.concatenate((X_train, y_train), axis=2).reshape((X_train.shape[0], -1))
        df_val = np.concatenate((X_val, y_val), axis=2).reshape((X_val.shape[0], -1))
        df = np.concatenate((X,y), axis=2).reshape((X.shape[0], -1))

        print(X_train.shape, y_train.shape, df_train.shape)
        print(X_val.shape, y_val.shape, df_val.shape)
        print(X.shape, y.shape, df.shape)

        ucihar_phone_result_path = f"{ucihar_phone_path}/uci_har_v1_hapt_processed/uci_har_v1_hapt_wt_overlap/" if TIME_STEPS==STEP else f"{ucihar_phone_path}/uci_har_v1_hapt_processed/uci_har_v1_hapt_w_overlap/"

        # File names are kept exactly as before (including the "spit" spelling) so that
        # existing consumers of these files keep working.
        _save_with_header(fr"{ucihar_phone_result_path}/ucihar.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv", df)
        _save_with_header(fr"{ucihar_phone_result_path}/ucihar.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv", df_train)
        _save_with_header(fr"{ucihar_phone_result_path}/ucihar.ni={NO_IN}.no={NO_OUT}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv", df_val)

90 40
(9414, 90, 6) (9414, 90, 5) (9414, 990)
(9415, 90, 6) (9415, 90, 5) (9415, 990)
(18758, 90, 6) (18758, 90, 5) (18758, 990)
60 30
(12523, 60, 6) (12523, 60, 5) (12523, 660)
(12524, 60, 6) (12524, 60, 5) (12524, 660)
(24980, 60, 6) (24980, 60, 5) (24980, 660)
50 20
(18757, 50, 6) (18757, 50, 5) (18757, 550)
(18758, 50, 6) (18758, 50, 5) (18758, 550)
(37442, 50, 6) (37442, 50, 5) (37442, 550)
40 20
(18757, 40, 6) (18757, 40, 5) (18757, 440)
(18758, 40, 6) (18758, 40, 5) (18758, 440)
(37442, 40, 6) (37442, 40, 5) (37442, 440)
90 90
(4231, 90, 6) (4231, 90, 5) (4231, 990)
(4231, 90, 6) (4231, 90, 5) (4231, 990)
(8380, 90, 6) (8380, 90, 5) (8380, 990)
60 60
(6299, 60, 6) (6299, 60, 5) (6299, 660)
(6300, 60, 6) (6300, 60, 5) (6300, 660)
(12524, 60, 6) (12524, 60, 5) (12524, 660)
50 50
(7543, 50, 6) (7543, 50, 5) (7543, 550)
(7544, 50, 6) (7544, 50, 5) (7544, 550)
(15019, 50, 6) (15019, 50, 5) (15019, 550)
40 40
(9414, 40, 6) (9414, 40, 5) (9414, 440)
(9415, 40, 6) (9415, 40, 5) (9415, 4