# TSFRESH Activity Recognition on WISDM Data

In [1]:
%matplotlib inline
import logging
import math
import os

import matplotlib
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
# import seaborn as sns

In [2]:
# Set the root logger to ERROR level.
# This is not recommended for normal use, because important warning messages can be overlooked.
logging.basicConfig(level=logging.ERROR)

## Data Preprocessing for tsfresh
“for tsfresh”指的是：为了让tsfresh能够提取特征，需要先对数据做一些格式处理。
tsfresh能接受的pandas.DataFrame有三种，这里使用了Stack DataFrame形式，格式详情可以见[Data Formats](http://tsfresh.readthedocs.io/en/latest/text/data_formats.html)
下面，首先定义一些常量和函数：

In [3]:
# The six activity classes.
activity = [
    "Walking",
    "Sitting",
    "Jogging",
    "Standing",
    "LyingDown",
    "Stairs",
]
# Column names given to the raw CSV: user, activity, timestamp and the three accelerometer axes.
fields = [
    "user",
    "activity",
    "timestamp",
    "x-accel",
    "y-accel",
    "z-accel",
]
type(fields)
# Miscellaneous constants
file_name = 'WISDM_at_v2.0_my.txt'
sensornum = 3
# Window size (number of consecutive readings per sample)
win = 40
# Number of samples to use (one window of data is one sample); a multiple of 6 is recommended
use_num = 600
# Placeholder for the module-level scratch variable written by getSampleOneActivity.
datatmp = np.array

In [47]:
def getTimeColumn(win):
    """Build the ``time`` column of one sample in stacked (long) format.

    The sequence ``0 .. win-1`` is emitted once per sensor channel (module-level
    ``sensornum``), so every channel of a window shares the same time axis.

    :param win: window length, i.e. number of readings per channel
    :return: column vector (2-D array with a single column) of time indices
    """
    # The sequence is always emitted at least once, even if sensornum < 1.
    copies = max(sensornum, 1)
    return np.tile(np.arange(win), copies).reshape(-1, 1)

def getKindValueColumn(sd, win):
    """Build the ``kind`` and ``value`` columns of one sample in stacked format.

    :param sd: array indexed by sensor channel; ``sd[i]`` holds the ``win``
        readings of channel ``i`` for the first ``sensornum`` channels
    :param win: window length
    :return: 2-D array with two columns (channel index, reading); the blocks of
        the individual channels are stacked below each other
    """
    per_sensor = []
    for sensor_idx in range(sensornum):
        readings = sd[sensor_idx].reshape(win, 1)
        kind = np.full((win, 1), sensor_idx, dtype=int)
        per_sensor.append(np.hstack((kind, readings)))
    return np.vstack(per_sensor)

def getIdColumn(num, win, num_sensors=None):
    """Build the ``id`` column for ``num`` consecutive samples in stacked format.

    Sample ``i`` occupies ``num_sensors * win`` consecutive rows, all carrying
    the id ``i``.

    :param num: number of samples
    :param win: window length (readings per sensor channel and sample)
    :param num_sensors: number of sensor channels per sample; defaults to the
        module-level ``sensornum``
    :return: float array of shape ``(num * num_sensors * win, 1)``; an empty
        ``(0, 1)`` array when ``num`` is 0
    """
    if num_sensors is None:
        num_sensors = sensornum
    rows_per_sample = num_sensors * win
    # One vectorised repeat instead of growing the array sample by sample,
    # which copied the whole column on every iteration.
    return np.repeat(np.arange(num, dtype=float), rows_per_sample).reshape(-1, 1)

def getDataLabelColumn(dataarry, label, df):
    """Stack the per-user data and label arrays into one data array and one label Series.

    Users are visited in the order of ``df.user.value_counts()`` (most rows
    first); users missing from either dict are skipped.

    :param dataarry: dict mapping user id to a 2-D data array
    :param label: dict mapping user id to a single-column 2-D label array
    :param df: DataFrame with a ``user`` column
    :return: tuple ``(data, y)``; ``data`` is the row-wise stack of the data
        arrays, ``y`` a ``pandas.Series`` with the flattened labels. Both are
        empty when no user is present in both dicts.
    """
    # Take the ids from the index of value_counts(): the column name produced by
    # reset_index() differs between pandas versions.
    user_ids = pd.Series(df.user).value_counts().index
    label_parts = []
    data_parts = []
    for user_id in user_ids:
        # "in" replaces dict.has_key, which only exists on Python 2.
        if user_id in label and user_id in dataarry:
            label_parts.append(label[user_id])
            data_parts.append(dataarry[user_id])
    if not label_parts:
        return np.array([]), pd.Series(np.array([]))
    # Stack once at the end; this also leaves the caller's arrays untouched.
    stacked_labels = np.vstack(label_parts)
    y = pd.Series(stacked_labels.reshape(len(stacked_labels)))
    data = np.vstack(data_parts)
    return data, y

def getSampleOneActivity(win, sensordata):
    """Cut the readings of one activity into windows and stack them in long format.

    ``sensordata`` is split into consecutive, non-overlapping windows of
    ``win`` rows; trailing rows that do not fill a window are dropped. Each
    window is converted with ``getTimeColumn`` / ``getKindValueColumn``.

    :param win: window length
    :param sensordata: 2-D array, one row per reading, one column per sensor
    :return: 2-D array with the columns time, kind, value; an empty ``(0, 3)``
        array when there are fewer than ``win`` rows (previously the result of
        an earlier call was returned in that case)

    Side effect: the result is also stored in the module-level ``datatmp``,
    which is kept for backward compatibility.
    """
    global datatmp
    n_windows = len(sensordata) // win
    # The time column is identical for every window, so build it once.
    time = getTimeColumn(win)
    windows = []
    for j in range(n_windows):
        channels = sensordata[win * j:win * (j + 1), :].transpose()
        kindvalue = getKindValueColumn(channels, win)
        windows.append(np.column_stack((time, kindvalue)))
    if windows:
        # Single concatenation instead of re-copying the growing array per window.
        datatmp = np.concatenate(windows, axis=0)
    else:
        datatmp = np.empty((0, 3))
    return datatmp

def getSampleOneUser(win, df, user_id):
    """Collect all windowed samples and their class labels for one user.

    For every entry of the module-level ``activity`` list, the user's rows of
    that activity are windowed with ``getSampleOneActivity``; the class label
    is the index of the activity in that list.

    :param win: window length
    :param df: raw DataFrame with ``user`` and ``activity`` columns; the sensor
        readings are taken from the fourth column onwards
    :param user_id: id of the user to extract
    :return: tuple ``(label_user, dataarray)``: a single-column integer array
        with one label per window, and the stacked long-format data. Both are
        empty arrays when the user has no complete window.
    """
    # Filter the user once, then build each activity mask on the filtered frame,
    # so frame and mask always share the same index.
    user_rows = df[df.user == user_id]
    label_parts = []
    data_parts = []
    for class_idx, activity_name in enumerate(activity):
        sensordata = user_rows[user_rows.activity == activity_name].iloc[:, 3:].values
        # Integer division keeps the window count an int on Python 2 and 3.
        n_windows = len(sensordata) // win
        if n_windows == 0:
            continue
        data_parts.append(getSampleOneActivity(win, sensordata))
        label_parts.append(np.full((n_windows, 1), class_idx, dtype=int))
    if not label_parts:
        return np.array([]), np.array([])
    label_user = np.concatenate(label_parts, axis=0)
    dataarray = np.concatenate(data_parts, axis=0)
    return label_user, dataarray
        

In [48]:
def loadData(win, df):
    """Window the raw data of every user and collect the results per user.

    :param win: window length
    :param df: raw DataFrame with a ``user`` column
    :return: tuple ``(datadict, labeldict)`` of dicts keyed by user id.
        ``datadict`` holds the long-format array (columns time, kind, value),
        ``labeldict`` the matching single-column label array. Users without a
        complete window are left out.
    """
    labeldict = {}
    datadict = {}
    # Iterate the index of value_counts() directly: the column name produced by
    # reset_index() differs between pandas versions.
    for user_id in pd.Series(df.user).value_counts().index:
        label_user, dataarray = getSampleOneUser(win, df, user_id)
        if label_user.size > 0 and dataarray.size > 0:
            labeldict[user_id] = label_user
            datadict[user_id] = dataarray
    print("Done!")
    return datadict, labeldict

def genTrainSample(dataarry, label, df):
    """Turn the per-user dicts into a stacked DataFrame and a label Series.

    All samples found by ``getDataLabelColumn`` are used; an id column built
    with the module-level ``win`` is prepended to the data. The shapes of the
    id column and of the data are printed.

    :param dataarry: dict mapping user id to long-format data array
    :param label: dict mapping user id to label array
    :param df: raw DataFrame with a ``user`` column
    :return: tuple ``(dataframe, y)``; ``dataframe`` has the columns
        ``id``, ``time``, ``kind``, ``value``
    """
    data, y = getDataLabelColumn(dataarry, label, df)
    sample_ids = getIdColumn(len(y), win)
    print(sample_ids.shape)
    print(data.shape)
    table = np.column_stack((sample_ids, data))
    return DataFrame(table, columns=['id', 'time', 'kind', 'value']), y

## Load and visualize data
读取数据文件，先按测试者（user）把数据存在一个dict里面，然后根据tsfresh的格式要求转化成pandas.DataFrame形式。

In [39]:
# Read the raw CSV; it has no header row, so the column names come from fields.
df = pd.read_csv(
    '/home/hadoop/data/WISDM_ar_v2.0_my.txt',
    header=None,
    names=fields,
    skip_blank_lines=True,
)
df.shape
# Window the readings per user; both results are dicts keyed by user id.
data, label = loadData(win, df)



Done!


In [40]:
#print(data[0].shape)        # each matrix in data has three columns: time, kind, value
#print(label[0].shape)       # a single column; one sample spans sensornum*win readings, so label has 1/(sensornum*win) as many rows as data

In [50]:
# Convert data into the pandas.DataFrame layout tsfresh needs and store it as df (this rebinds df,
# which held the raw CSV until now); convert label into the pandas.Series y used for feature filtering.
# NOTE(review): use_num is not passed to genTrainSample, so all samples are converted -- confirm this is intended.
df, y=genTrainSample(data, label, df)


(7917720, 1)
(7917720, 3)


In [52]:
# Inspect the shapes of the stacked frame and of the label series.
df.shape
y.shape

(65981,)

In [49]:
def plotSample(data, kind, idx, win):
    """Plot the readings of every sensor channel for one sample.

    :param data: mapping whose values are long-format arrays; the readings are
        taken from the third column, and each sample occupies ``sensornum``
        consecutive blocks of ``win`` rows
    :param kind: key into ``data``
    :param idx: zero-based index of the sample to plot
    :param win: window length
    """
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(18.5, 10.5)
    readings = data[kind]
    # The subplot titles use the sensor column names from ``fields``; the name
    # ``sensor`` used before is not defined anywhere in this notebook.
    axis_names = fields[3:]
    for i in range(sensornum):
        plt.subplot(3, 2, i + 1)
        plt.title(axis_names[i] + ' readings')
        begin = idx * win * sensornum + win * i
        # Slice ends are exclusive, so begin + win covers all win readings.
        end = begin + win
        plt.plot(readings[begin:end, 2])
    plt.show()

In [None]:
# Show the sensor readings of sample idx for one key of data. The y-axis ranges are not
# unified yet, so plots of different samples cannot be compared directly.
# NOTE(review): "bus" looks like a leftover key; data returned by loadData is keyed by user id -- confirm.
kind="bus"; idx=0
plotSample(data, kind, idx, win)

## Extract Features

In [53]:
# Alias the stacked frame as master_df and print its shape and contents.
master_df = df

print(master_df.shape)
print(master_df)

(7917720, 4)
              id  time  kind      value
0            0.0   0.0   0.0  -2.683209
1            0.0   1.0   0.0  -2.315459
2            0.0   2.0   0.0  -3.023717
3            0.0   3.0   0.0  -3.541290
4            0.0   4.0   0.0  -2.601486
5            0.0   5.0   0.0  -0.858082
6            0.0   6.0   0.0  -1.661682
7            0.0   7.0   0.0  -2.315459
8            0.0   8.0   0.0  -2.152015
9            0.0   9.0   0.0  -2.247357
10           0.0  10.0   0.0  -2.451662
11           0.0  11.0   0.0  -3.445948
12           0.0  12.0   0.0  -1.797886
13           0.0  13.0   0.0  -2.860273
14           0.0  14.0   0.0  -2.152015
15           0.0  15.0   0.0  -3.309745
16           0.0  16.0   0.0  -1.253072
17           0.0  17.0   0.0  -1.212211
18           0.0  18.0   0.0  -1.293933
19           0.0  19.0   0.0   2.519764
20           0.0  20.0   0.0  -3.922660
21           0.0  21.0   0.0  -5.311935
22           0.0  22.0   0.0   0.013620
23           0.0  23.0   0.

In [54]:
# Pick the tsfresh feature-calculator set; the alternatives are left commented out.
# extraction_settings = ComprehensiveFCParameters()
extraction_settings = EfficientFCParameters()
# extraction_settings = MinimalFCParameters()

In [None]:
# Extract features from master_df with the chosen settings; %time reports how long the call takes.
%time X = extract_features(master_df, default_fc_parameters=extraction_settings, column_id='id', column_sort="time", column_kind="kind", column_value="value");
# %time X = extract_features(master_df, column_id='id', column_sort="time", column_kind="kind", column_value="value");

Feature Extraction:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# NOTE(review): an earlier comment quoted 222 features here; that number depends on
# extraction_settings, so read the actual count from X.shape below.
# impute's return value is discarded; presumably it cleans NaN/inf values of X in place -- see the tsfresh docs.
impute(X)
X.shape

In [None]:
# List the names of the extracted feature columns.
X.columns

## Train and evaluate classifier

In [None]:
# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
# X is a DataFrame, which exposes per-column ``dtypes`` (it has no ``dtype`` attribute); y is a Series.
print(X.dtypes)
print(y.dtype)

In [None]:
# Fit a decision tree on all extracted features and report per-class metrics on the test split.
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)
print(classification_report(y_test, cl.predict(X_test)))

# Multiclass feature selection

+ 若extraction_settings=EfficientFCParameters()
计算的是所有特征中计算速度较快，time cost较小的特征，考虑所有传感器序列，共得到1614个特征。
+ 若extraction_settings=MinimalFCParameters()
只计算最少的特征，共得到48个

考虑使用tsfresh的select_features来筛选，但是该函数只对二分类或者回归任务有作用。对于这个6分类问题，需要将其转化成6个二分类问题，然后对每个问题进行特征选择。

In [None]:
relevant_features = set()

# One-vs-rest selection: filter the features against a binary target per class
# and keep the union of everything that was relevant for at least one class.
# The loop variable is not called ``label`` so the label dict loaded earlier is not overwritten.
for class_label in y.unique():
    y_train_binary = y_train == class_label
    X_train_filtered = select_features(X_train, y_train_binary)
    print("Number of relevant features for class {}: {}/{}".format(class_label, X_train_filtered.shape[1], X_train.shape[1]))
    relevant_features.update(X_train_filtered.columns)

In [None]:
# Number of features kept for at least one class.
len(relevant_features)

于是特征数量从1614降到了878

In [None]:
# Retrain on the selected columns only; fitting on the full X_train would merely
# repeat the earlier experiment and never use relevant_features.
selected_columns = sorted(relevant_features)  # sorted for a reproducible column order
X_train_filtered = X_train[selected_columns]
X_test_filtered = X_test[selected_columns]
cl = DecisionTreeClassifier()
cl.fit(X_train_filtered, y_train)
print(classification_report(y_test, cl.predict(X_test_filtered)))

测试精度也有一定降低

## Compare against naive classification accuracy

### Data Preprocessing for Naive method
Naive method指不提取特征，直接用所有传感器读数进行训练

In [None]:
def loadNative(win):
    """Load the raw readings of every data file as flat per-window rows.

    Each file is split into consecutive, non-overlapping windows of ``win``
    rows; trailing rows that do not fill a window are dropped. A window becomes
    one row holding the readings of the first ``sensornum`` columns, one
    channel after the other.

    Relies on the module-level names ``filenum`` and ``travel``.
    NOTE(review): neither is defined in the visible part of this notebook --
    confirm where they come from.

    :param win: window length
    :return: tuple ``(datadict, labeldict)`` keyed by ``travel[i]``;
        ``datadict`` holds an array of shape ``(n_windows, win * sensornum)``,
        ``labeldict`` a single-column integer array filled with the file index
    """
    labeldict = {}
    datadict = {}
    # NOTE(review): os.listdir returns entries in arbitrary order, so the pairing
    # of files[i] with travel[i] depends on the file system -- confirm.
    files = os.listdir('/home/hadoop/data')
    row_width = win * sensornum
    for i in range(0, filenum):
        with open('/home/hadoop/data/%s' % files[i], 'r') as f:
            sensordata = np.loadtxt(f, delimiter=",")
        # Only complete windows: a shorter trailing window cannot be stacked
        # with the others and used to make the concatenation fail.
        n_windows = len(sensordata) // win
        rows = [
            sensordata[win * j:win * (j + 1), :sensornum].transpose().reshape(-1)
            for j in range(n_windows)
        ]
        dataarray = np.vstack(rows) if rows else np.empty((0, row_width))
        labeldict[travel[i]] = np.full((n_windows, 1), i, dtype=int)
        datadict[travel[i]] = dataarray
        print(files[i] + " loaded!")
    print("Finished!")
    return datadict, labeldict

def genNativeSample(dataary, label, num, win):
    """Assemble the raw-reading training table from the per-class dicts.

    From each of the ``filenum`` classes (keys taken from ``travel``) the first
    ``int(num / sensornum)`` rows of data and labels are used.

    :param dataary: dict mapping class key to a 2-D array, one row per sample
    :param label: dict mapping class key to a single-column label array
    :param num: sample budget; divided by ``sensornum`` to get the per-class count
    :param win: window length (not used by this function)
    :return: tuple ``(dataframe, y)`` with the stacked rows as a DataFrame and
        the flattened labels as a ``pandas.Series``
    """
    per_class = int(num / sensornum)
    label_parts = []
    data_parts = []
    for i in range(filenum):
        key = travel[i]
        label_parts.append(label[key][:per_class])
        data_parts.append(dataary[key][:per_class])
    dataframe = DataFrame(np.vstack(data_parts))
    y = np.vstack(label_parts)
    y.shape = (len(y),)
    return dataframe, pd.Series(y)

In [None]:
# Load the raw per-window readings and labels for the baseline without feature extraction.
data_1, label_1 = loadNative(win)

In [None]:
# print(data_1["bus"].shape)
# NOTE(review): the "bus" key only exists if travel contains it; travel is not defined in this notebook -- confirm.
print(data_1["bus"])

In [None]:
# Build the raw-reading feature table and its labels; note that this rebinds y.
X_1, y=genNativeSample(data_1, label_1, use_num, win)

In [None]:
# Inspect the shape of the raw-reading feature table.
X_1.shape

In [None]:
# Split the raw-reading table 80/20; this rebinds the split variables used by the tsfresh experiment.
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=.2)

# print(X_train.head())
# print(X_test.head())
# print(y_train.head())
# print(y_test.head())

In [None]:
# Fit a decision tree directly on the raw readings and report per-class metrics on the test split.
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)

# print(cl.predict(X_train))
# print(cl.predict(X_test))
print(classification_report(y_test, cl.predict(X_test)))