In [2]:
import os
import argparse
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def load_raw_dataset(input_file_s):
    """Read a tab-separated, header-less event log into a DataFrame.

    Args:
        input_file_s: Path or file-like object handed straight to
            ``pandas.read_csv``.

    Returns:
        A DataFrame whose columns are named, in order, ``date``, ``time``,
        ``sensor``, ``value``, ``activity`` and ``log``.
    """
    column_names = ["date", "time", "sensor", "value", "activity", "log"]
    return pd.read_csv(input_file_s, sep="\t", header=None, names=column_names)


def clean_and_prepare(df_s):
    """Give every row an activity label, using the begin/end markers in ``log``.

    The frame is modified in place (its ``log`` and ``activity`` columns are
    overwritten) and the same object is returned.

    Steps:
        1. Forward-fill ``log`` so each row carries the most recent marker.
        2. Rows without an activity take that marker as a placeholder.
        3. Placeholder ``"end"`` rows, and rows that still have no value
           (nothing precedes them), become ``"Other"``.
        4. Placeholder ``"begin"`` rows are blanked and forward-filled, so
           they inherit the activity label of the row above.

    Args:
        df_s: DataFrame with at least the ``activity`` and ``log`` columns.

    Returns:
        ``df_s`` itself, with ``activity`` populated on every row that has a
        preceding value to inherit from.
    """
    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
    df_s['log'] = df_s['log'].ffill()

    activity = df_s['activity'].fillna(df_s['log'])
    activity = activity.replace("end", "Other").fillna("Other")

    # mask() turns the "begin" placeholders into missing values explicitly;
    # replace("begin", None) has meant different things across pandas versions.
    activity = activity.mask(activity == "begin").ffill()

    df_s['activity'] = activity
    return df_s


def save_activity_dict(df_s, input_file_s, filename="aruba_activity_list_step1.pickle"):
    """Pickle a mapping from each distinct activity label to an integer id.

    Ids are assigned in order of first appearance in ``df_s.activity``
    (the order returned by ``Series.unique()``), starting at 0.

    Args:
        df_s: DataFrame with an ``activity`` column.
        input_file_s: Not used by this function; kept so existing callers
            that pass it keep working.
        filename: Path of the pickle file to write. Defaults to the name
            this function has always written to.

    Returns:
        None. The mapping is only written to ``filename``.
    """
    activities = df_s.activity.unique()
    dictActivities = {activity: index for index, activity in enumerate(activities)}
    # The context manager closes the file even if pickling raises.
    with open(filename, "wb") as pickle_out:
        pickle.dump(dictActivities, pickle_out)


def generate_sentence(df2):
    """Render the rows of ``df2`` as one space-separated string of tokens.

    Each row contributes one token: its ``sensor`` value immediately followed
    by its ``value`` value (for example ``M003ON``). Tokens keep row order.

    Args:
        df2: DataFrame with ``sensor`` and ``value`` columns.

    Returns:
        The tokens joined by single spaces; an empty string when ``df2`` has
        no rows.
    """
    # join() builds the string in one pass and needs no last-element special
    # case to avoid a trailing space.
    return " ".join(
        "{}{}".format(sensor, val)
        for sensor, val in zip(df2.sensor.values, df2.value.values)
    )


def segment_activities(df_s):
    """Split ``df_s`` into consecutive runs of rows sharing one activity.

    A new run starts at every row whose ``activity`` differs from the row
    above it (the first row always starts a run). Every run is returned,
    including the final one that extends to the end of the frame.

    Args:
        df_s: DataFrame with an ``activity`` column.

    Returns:
        A list of DataFrame slices, one per run, in row order. Empty when
        ``df_s`` has no rows.
    """
    activity = df_s.activity
    is_run_start = activity.ne(activity.shift())
    run_starts = np.where(is_run_start.to_numpy())[0]

    # Append the frame length as a closing boundary so the last run is
    # emitted as well, instead of only runs followed by another change.
    boundaries = list(run_starts) + [len(df_s)]

    # iloc makes the slicing positional regardless of the frame's index.
    return [
        df_s.iloc[start:stop]
        for start, stop in zip(boundaries, boundaries[1:])
    ]


def sliding_window(sequence, win_size_s, step_s=1):
    """Generate fixed-size windows over ``sequence``.

    This is a generator, so nothing (including the iterability check) runs
    until the first item is requested.

    Args:
        sequence: Sliceable object supporting ``len()``.
        win_size_s: Number of elements per window.
        step_s: Offset between the starts of consecutive windows.

    Yields:
        If ``win_size_s`` exceeds the sequence length, the whole sequence
        once. Otherwise the slices ``sequence[k:k + win_size_s]`` for
        ``k = 0, step_s, 2 * step_s, ...`` as long as a full window fits;
        trailing elements that cannot fill a window are not emitted.

    Raises:
        Exception: If ``sequence`` is not iterable.
    """
    try:
        iter(sequence)
    except TypeError:
        raise Exception("**ERROR** sequence must be iterable.")

    total = len(sequence)
    # Computed before the short-sequence check so that an invalid step
    # (e.g. zero) fails in both branches.
    window_count = int(((total - win_size_s) / step_s) + 1)

    if win_size_s > total:
        yield sequence[0:total]
        return

    yield from (
        sequence[start:start + win_size_s]
        for start in range(0, window_count * step_s, step_s)
    )


def sequences_to_sentences(activity_sequences_s):
    """Convert activity segments into sentences and their labels.

    Args:
        activity_sequences_s: Sequence of DataFrames, each with ``sensor``,
            ``value`` and ``activity`` columns and at least one row.

    Returns:
        A pair ``(sentences, labels)`` of equal-length lists: the
        ``generate_sentence`` output for each segment, and the ``activity``
        value of that segment's first row.
    """
    sentences_s, label_sentences_s = [], []
    for segment in activity_sequences_s:
        sentences_s.append(generate_sentence(segment))
        # The label is read from the segment's first row.
        label_sentences_s.append(segment.activity.values[0])
    return sentences_s, label_sentences_s

# NOTE(review): input_file is only forwarded to save_activity_dict(), which
# does not use it; the dataset itself is read from "data" below. Confirm
# which path is intended.
input_file = r"no_D123"
win_size = 100  # window length, in tokens, used in STEP 6
step = 1        # offset between consecutive window starts

print("STEP 1: Load dataset")
# Reuse the loader defined in this file instead of repeating the read_csv
# call and its column list.
df = load_raw_dataset("data")

print("STEP 2: prepare dataset")
df = clean_and_prepare(df)
save_activity_dict(df, input_file)

print("STEP 3: segment dataset in sequence of activity")
activity_sequences = segment_activities(df)
print(activity_sequences[0:5])
# Every column except the last two (activity, log); not read again in this cell.
df_txt = df.iloc[:, :-2]

print("STEP 4: transform sequences of activity in sentences")
sentences, label_sentences = sequences_to_sentences(activity_sequences)
print(sentences[0:5])

print("STEP 5: sentences indexization")
# The filter list is customised; tokens are still split on spaces.
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
indexed_sentences = tokenizer.texts_to_sequences(sentences)

print("STEP 6: split indexed sentences in sliding windows")
X_windowed = []
Y_windowed = []
# Not filled in this cell; kept in case later cells reference them.
X_windowed_sen = []
Y_windowed_sen = []
for i, s in enumerate(indexed_sentences):
    chunks = sliding_window(s, win_size, step)
    for chunk in chunks:
        X_windowed.append(chunk)
        # Every window inherits the label of the sentence it was cut from.
        Y_windowed.append(label_sentences[i])

print("STEP 7: pad sliding windows")
# Pad at the end of each window so all rows share one length.
padded_windows = pad_sequences(X_windowed, padding='post')
Y_windowed = np.array(Y_windowed)
print("done")


STEP 1: Load dataset
STEP 2: prepare dataset
STEP 3: segment dataset in sequence of activity
[          date  time sensor value  activity    log
0   2010-11-04     0   M003    ON  Sleeping  begin
1   2010-11-04     0   M003   OFF  Sleeping  begin
2   2010-11-04     2   M003    ON  Sleeping  begin
3   2010-11-04     2   M003   OFF  Sleeping  begin
4   2010-11-04     3   M003    ON  Sleeping  begin
5   2010-11-04     3   M003   OFF  Sleeping  begin
6   2010-11-04     3   M003    ON  Sleeping  begin
7   2010-11-04     3   M003   OFF  Sleeping  begin
8   2010-11-04     4   M003    ON  Sleeping  begin
9   2010-11-04     4   M002    ON  Sleeping  begin
10  2010-11-04     4   M002   OFF  Sleeping  begin
11  2010-11-04     4   M003   OFF  Sleeping  begin
12  2010-11-04     4   M003    ON  Sleeping  begin
13  2010-11-04     4   M002    ON  Sleeping  begin
14  2010-11-04     4   M002   OFF  Sleeping  begin
15  2010-11-04     4   M003   OFF  Sleeping  begin
16  2010-11-04     5   M003    ON  Slee