# TSFRESH Activity Recognition on WISDM Data

In [20]:
%matplotlib inline
import logging
import math
import os

import matplotlib
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
# import seaborn as sns

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the legacy module on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [21]:
# Set the root logger to ERROR level.
# This is not recommended for normal use, because important warning
# messages can be overlooked.
logging.basicConfig(level=logging.ERROR)

## Data Preprocessing for tsfresh
“for tsfresh”指的是：为了让tsfresh能够提取特征，需要先对数据做一些格式处理。
tsfresh能接受的pandas.DataFrame格式有三种，这里使用的是Stacked DataFrame形式，格式详情可以见[Data Formats](http://tsfresh.readthedocs.io/en/latest/text/data_formats.html)。
下面，首先定义一些常量和函数：

In [22]:
# The six activity classes
activity=["Walking", "Sitting", "Jogging", "Standing", "LyingDown", "Stairs"]
# Column names of the raw data file (six columns), and the three
# sensor-reading columns among them
fields=["user", "activity", "timestamp", "x-accel", "y-accel", "z-accel"]
sensor=["x-accel", "y-accel", "z-accel"]
type(fields)
# Some constants
# NOTE(review): file_name is not referenced by the code in this file, and it
# differs ("_at_" vs "_ar_") from the path passed to pd.read_csv further
# down - confirm which spelling is correct.
file_name='WISDM_at_v2.0_my.txt'
sensornum=3
# Window size and step size (counted in readings)
win_size=60
step_size=30
# Number of samples to use (one window of data is one sample);
# a multiple of 6 is recommended
use_num=600
# Placeholder for the module-level buffer that getSampleOneUser rebinds via
# `global datatmp`; note this binds the np.array function itself, not an array.
datatmp=np.array

In [23]:
# Build the "time" column of the stacked DataFrame for one window
def getTimeColumn(win):
    """Return the time indices 0..win-1 repeated once per sensor.

    The result is a column vector of shape (sensornum * win, 1), so that it
    lines up with the per-sensor blocks produced for the kind/value columns.
    """
    # The first copy is always present; further copies are added per sensor.
    copies = max(sensornum, 1)
    ticks = np.tile(np.arange(win), copies)
    return ticks.reshape(ticks.size, 1)

# Build the "kind" and "value" columns of the stacked DataFrame for one window
def getKindValueColumn(sd, win):
    """Stack the readings of every sensor into [kind, value] rows.

    Args:
        sd: 2-D array indexed as ``sd[sensor]``; each row holds the ``win``
            readings of one sensor for the current window.
        win: Number of readings per sensor in the window.

    Returns:
        Array of shape (sensornum * win, 2). Column 0 is the sensor index
        (the "kind"), column 1 the reading. Sensors appear one after another,
        each contributing ``win`` consecutive rows. With zero sensors an
        empty (0, 2) array is returned.

    Raises:
        ValueError: If a sensor row does not contain exactly ``win`` readings.
    """
    blocks = []
    for i in range(sensornum):
        kind = np.full((win, 1), i, dtype=int)
        # reshape returns a new view instead of mutating the shape in place
        values = sd[i].reshape(win, 1)
        blocks.append(np.column_stack((kind, values)))
    if not blocks:
        return np.empty((0, 2))
    # A single vstack avoids re-copying the accumulated rows on every
    # iteration and replaces the deprecated np.row_stack alias.
    return np.vstack(blocks)

def getIdColumn(num, win):
    """Build the "id" column of the stacked DataFrame.

    Args:
        num: Number of samples (windows).
        win: Number of readings per sensor in one window.

    Returns:
        Float column vector of shape (num * sensornum * win, 1) in which
        sample ``i`` occupies ``sensornum * win`` consecutive rows holding
        the value ``i``. For ``num == 0`` the result is an empty (0, 1)
        array (the previous loop-based version raised UnboundLocalError).
    """
    rows_per_sample = sensornum * win
    # np.repeat builds the whole column in one pass instead of re-stacking
    # the growing array once per sample.
    ids = np.repeat(np.arange(max(num, 0), dtype=float), rows_per_sample)
    return ids.reshape(ids.size, 1)

def getDataLabelColumn(dataary,label, num, win):
    """Take the first ``num`` samples of every activity and stack them.

    Args:
        dataary: Dict mapping activity name to its stacked readings; every
            sample occupies ``win * sensornum`` consecutive rows.
        label: Dict mapping activity name to a column of per-sample labels.
        num: Number of samples to take from each activity.
        win: Number of readings per sensor in one window.

    Returns:
        Tuple ``(data, y)``: ``data`` is the row-wise concatenation of the
        selected readings in the order of ``activity``; ``y`` is a
        pandas.Series with one label per selected sample.

    Raises:
        KeyError: If an activity listed in ``activity`` is missing from
            either dict.
    """
    rows_per_class = num * win * sensornum
    label_parts = [label[name][:num] for name in activity]
    data_parts = [dataary[name][:rows_per_class] for name in activity]
    # One vstack per output replaces the repeated (deprecated) np.row_stack
    # calls, which re-copied everything accumulated so far on each iteration.
    stacked_labels = np.vstack(label_parts)
    data = np.vstack(data_parts)
    # Flatten the (n, 1) label column without mutating .shape in place.
    y = pd.Series(stacked_labels.reshape(len(stacked_labels)))
    return data, y

def getSampleOneUser(win, sensordata, step):
    """Cut one user's readings into sliding windows in stacked format.

    Args:
        win: Number of readings per window.
        sensordata: 2-D array with one row per reading and one column per
            sensor.
        step: Offset between the starts of consecutive windows; must be > 0.

    Returns:
        Tuple ``(data, step_num)``. ``data`` holds, for every window,
        ``sensornum * win`` rows of ``[time, kind, value]``; ``step_num`` is
        the number of windows produced. When no window fits, ``data`` is an
        empty (0, 3) array and ``step_num`` is 0 (previously the stale
        contents of the global ``datatmp`` were returned in that case).

    Raises:
        ValueError: If ``step`` is not positive (the previous loop would
            never terminate).
    """
    global datatmp

    if step <= 0:
        raise ValueError("step must be a positive integer")

    dataLen = len(sensordata)
    time = getTimeColumn(win)  # identical for every window, so build it once
    windows = []
    # NOTE(review): the upper bound excludes the window that ends exactly on
    # the last reading (start == dataLen - win); kept as in the original so
    # the sample counts do not change - confirm whether that is intended.
    for start in range(0, dataLen - win, step):
        sd = sensordata[start:start + win, :].transpose()
        kindvalue = getKindValueColumn(sd, win)
        windows.append(np.column_stack((time, kindvalue)))

    if not windows:
        return np.empty((0, 3)), 0

    # Concatenate once instead of re-copying the growing array per window.
    # The module-level buffer is still updated, as before.
    datatmp = np.concatenate(windows, axis=0)
    return datatmp, len(windows)

def getSampleOneActivity(win, df, act, step):
    """Collect the windowed samples of every user for one activity.

    Args:
        win: Number of readings per window.
        df: Raw readings with ``user`` and ``activity`` columns; the sensor
            readings are taken from the fourth column onwards.
        act: Index into ``activity`` selecting the activity; also used as
            the integer class label.
        step: Offset between the starts of consecutive windows.

    Returns:
        Tuple ``(label_user, dataarray)``: a column of ``act`` labels (one
        per window) and the stacked ``[time, kind, value]`` rows of all
        windows. Both are empty arrays when no user has enough readings.
    """
    label_parts = []
    data_parts = []
    is_activity = df.activity == activity[act]
    # value_counts().index lists the users most-frequent first. Reading the
    # index directly avoids reset_index(), whose column is no longer called
    # "index" on pandas >= 2.0.
    for user in df.user.value_counts().index:
        # Combine both conditions in one mask; chaining df[a][b] makes
        # pandas reindex the second mask and emit a warning.
        sensordata = df[(df.user == user) & is_activity].iloc[:, 3:].values
        if sensordata.size <= win * sensornum:
            continue  # not enough readings to slide a window over
        data, step_num = getSampleOneUser(win, sensordata, step)
        label_act = act * np.ones((step_num, 1), dtype=int)
        if label_act.size > 0 and data.size > 0:
            label_parts.append(label_act)
            data_parts.append(data)

    if not label_parts:
        return np.array([]), np.array([])
    label_user = np.concatenate(label_parts, axis=0)
    dataarray = np.concatenate(data_parts, axis=0)
    return label_user,dataarray
        

In [24]:
# Load everything: the result is a pair of dicts keyed by activity name. Each
# data matrix has the three columns time, kind, value.
def loadData(win, df, step):
    """Window the readings of every activity and collect them in dicts.

    Returns ``(datadict, labeldict)``; activities for which no samples could
    be produced are left out of both dicts.
    """
    datadict, labeldict = {}, {}
    for act_idx, act_name in enumerate(activity):
        act_labels, act_samples = getSampleOneActivity(win, df, act_idx, step)
        # Skip activities that yielded no labels or no data.
        if act_labels.size == 0 or act_samples.size == 0:
            continue
        labeldict[act_name] = act_labels
        datadict[act_name] = act_samples
    print("Done!")
    return datadict, labeldict

# From all data `dataary` and its labels `label`, pick about `num` samples in
# total: int(num / len(activity)) samples from every class.
def genTrainSample(dataary, label, num, win):
    """Build the stacked tsfresh DataFrame and the matching label Series.

    Args:
        dataary: Dict mapping activity name to stacked [time, kind, value]
            rows.
        label: Dict mapping activity name to a column of per-sample labels.
        num: Requested total number of samples; each class contributes
            ``int(num / len(activity))`` of them.
        win: Number of readings per sensor in one window.

    Returns:
        Tuple ``(dataframe, y)``: ``dataframe`` has the columns ``id``,
        ``time``, ``kind`` and ``value``; ``y`` holds one label per sample id.
    """
    labelnum = int(num / len(activity))
    data, y = getDataLabelColumn(dataary, label, labelnum, win)
    # Size the id column from the samples actually selected. Using `num`
    # directly breaks column_stack whenever num is not a multiple of the
    # number of classes (e.g. num=100 selects only 96 samples).
    idary = getIdColumn(len(y), win)
    print(idary.shape)
    print(data.shape)
    data = np.column_stack((idary, data))
    dataframe = DataFrame(data, columns=['id', 'time', 'kind', 'value'])
    return dataframe, y

## Load and visualize data
读取数据文件，将6种行为的数据分别存在一个dict里面，然后根据tsfresh的格式要求转化成pandas.DataFrame形式。

In [25]:
# Read the raw CSV; it has no header row, so the columns are named by `fields`.
df=pd.read_csv('/home/hadoop/data/WISDM_ar_v2.0_my.txt', header=None, names=fields,skip_blank_lines=True)
# Row count per activity, echoed as the cell result below.
a=pd.Series(df.activity).value_counts().reset_index()

# Window every user's readings and group the samples per activity.
data, label = loadData(win_size, df, step_size)
a



Done!


Unnamed: 0,index,activity
0,Walking,1255923
1,Sitting,663706
2,Jogging,438871
3,Standing,288873
4,LyingDown,275967
5,Stairs,57425


In [26]:
# Each data[<activity>] matrix has three columns: time, kind, value.
# Each label[<activity>] has a single column: sensornum*win readings form one
# sample, so label has 1/(sensornum*win) as many rows as data.
# Print the number of samples per class label for the six activities.
for act_pos in range(6):
    act_labels = label[activity[act_pos]]
    print(pd.Series(act_labels.reshape(act_labels.size,)).value_counts())


0    41569
dtype: int64
1    21985
dtype: int64
2    14548
dtype: int64
3    9511
dtype: int64
4    9133
dtype: int64
5    1884
dtype: int64


In [27]:
# Take use_num samples, convert `data` into the pandas.DataFrame layout that
# tsfresh needs (stored as df), and convert `label` into the pandas.Series
# used for feature filtering (stored as y).
# NOTE: this rebinds `df`, replacing the raw readings loaded from the CSV.
df, y=genTrainSample(data, label, use_num, win_size)    


(108000, 1)
(108000, 3)


In [None]:
# Inspect the shapes of the stacked DataFrame and the label Series
# (a notebook cell echoes only its last expression, i.e. y.shape).
df.shape
y.shape

In [None]:
# Plot the readings of all sensors for the idx-th sample of one activity
def plotSample(data, kind, idx, win):
    """Draw one subplot per sensor for a single sample.

    Args:
        data: Dict mapping activity name to stacked [time, kind, value] rows.
        kind: Activity name selecting the matrix in ``data``.
        idx: Zero-based index of the sample within that activity.
        win: Number of readings per sensor in one window.
    """
    # plt.gcf() does not rely on matplotlib.pyplot having been loaded as a
    # side effect of another import.
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    samples = data[kind]
    rows_per_sample = win * sensornum
    for i in range(sensornum):
        plt.subplot(3, 2, i + 1)
        plt.title(sensor[i] + ' readings')
        begin = idx * rows_per_sample + win * i
        # Slice ends are exclusive, so begin + win covers all `win` readings
        # (the earlier begin + win - 1 dropped the last one).
        plt.plot(samples[begin:begin + win, 2])
    plt.show()

In [None]:
# Show the sensor readings of the idx-th "Walking" sample. The y-axis ranges
# are not unified yet, so plots cannot be compared across samples directly.
kind = "Walking"
idx = 0
# The window-size constant is named win_size; the bare name `win` only exists
# as a function parameter and raised NameError here.
plotSample(data, kind, idx, win_size)

## Extract Features

In [None]:
# Bind the stacked DataFrame to the name used by the extraction cell
# (plain assignment, no copy is made).
master_df = df

print(master_df.shape)
print(master_df)

In [None]:
# Choose the set of tsfresh feature calculators; uncomment one of the
# alternatives below to extract a smaller set instead.
extraction_settings = ComprehensiveFCParameters()
#extraction_settings = EfficientFCParameters()
#extraction_settings = MinimalFCParameters()
extraction_settings

In [None]:
# Extract features per sample id, timing the call with the IPython %time magic.
# The column_* arguments name the id, sort (time), kind and value columns of
# master_df.
%time X = extract_features(master_df, default_fc_parameters=extraction_settings, column_id='id', column_sort="time", column_kind="kind", column_value="value");
# %time X = extract_features(master_df, column_id='id', column_sort="time", column_kind="kind", column_value="value");

In [None]:
# Impute the extracted features (the return value is not used, so this relies
# on tsfresh's impute modifying X in place), then check the matrix shape.
# NOTE(review): an earlier comment here claimed 222 features in total; the
# count depends on extraction_settings - confirm against X.shape.
impute(X)
X.shape

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Preview the last rows of the feature matrix.
X.tail()

In [None]:
# Number of samples per class label.
y.value_counts()

## Train and evaluate classifier

In [None]:
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples for testing.
# NOTE(review): no random_state is passed, so the split (and the reported
# scores) differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
# Fit the scaler on the training split only and apply the same scaling to
# the test split.
ss=StandardScaler()
X_train_ss=ss.fit_transform(X_train)
X_test_ss=ss.transform(X_test)

# Sanity check for missing values in the scaled training features.
test=DataFrame(X_train_ss)
test[test.isnull().values==True]

In [None]:
# Baseline: decision tree trained on all (scaled) extracted features,
# evaluated on the held-out split.
cl = DecisionTreeClassifier()
cl.fit(X_train_ss, y_train)
print(classification_report(y_test, cl.predict(X_test_ss)))

# Multiclass feature selection

+ 若extraction_settings=EfficientFCParameters()
计算的是所有特征中计算速度较快，time cost较小的特征，考虑所有传感器序列，共得到1614个特征。
+ 若extraction_settings=MinimalFCParameters()
只计算最少的特征，共得到48个

考虑使用tsfresh的select_features来筛选，但是该函数只对二分类或者回归任务有作用。对于这个6分类问题，需要将其转化成6个二分类问题，然后对每个问题进行特征选择。

In [None]:
# Run select_features once per class against a one-vs-rest boolean target and
# keep the union of the columns selected for any class.
relevant_features = set()
# The loop variable is deliberately not named `label`: that name holds the
# per-activity label dict returned by loadData and must not be clobbered.
for class_label in y.unique():
    y_train_binary = y_train == class_label
    X_train_filtered = select_features(DataFrame(X_train), y_train_binary)
    print("Number of relevant features for class {}: {}/{}".format(class_label, X_train_filtered.shape[1], X_train.shape[1]))
    relevant_features = relevant_features.union(set(X_train_filtered.columns))

In [None]:
# Size of the union of features selected for any class.
len(relevant_features)

于是特征数量从807降到了469

In [None]:
# Retrain on the selected columns only. The previous version refit on
# X_train_ss, i.e. on every feature, so the selection above had no effect.
# Sorting gives a deterministic column order (set iteration order is not).
selected_columns = sorted(relevant_features)
X_train_relevant = X_train[selected_columns]
X_test_relevant = X_test[selected_columns]
# No scaling here: decision-tree splits are threshold based, so per-feature
# standardisation does not change which splits are available.
cl = DecisionTreeClassifier()
cl.fit(X_train_relevant, y_train)
print(classification_report(y_test, cl.predict(X_test_relevant)))

测试精度也有一定降低

## Compare against naive classification accuracy

### Data Preprocessing for Naive method
Naive method指不提取特征，直接用所有传感器读数进行训练

In [None]:
def loadNative(win):
    """Load raw readings from per-class files, one flattened row per window.

    Reads files listed from '/home/hadoop/data', cuts each file's readings
    into consecutive windows of ``win`` rows and flattens every window into
    one row made of the first ``sensornum`` columns placed one after another.

    Args:
        win: Number of readings per window.

    Returns:
        Tuple ``(datadict, labeldict)`` keyed by ``travel[i]``: the window
        matrix of file ``i`` and a column filled with the integer label ``i``.
    """
    labeldict={}
    datadict={}
    # NOTE(review): os.listdir returns entries in arbitrary order, so which
    # file receives which index i is not fixed - confirm.
    files = os.listdir('/home/hadoop/data')
    # NOTE(review): `filenum` and `travel` are not defined anywhere in the
    # visible part of this file; they presumably come from an earlier version
    # of this notebook - calling this function as-is would raise NameError.
    for i in range(0, filenum):
        with open('/home/hadoop/data/%s' % files[i], 'r') as f:
            sensordata=np.loadtxt(f, delimiter=",")
            # NOTE(review): with ceil, the last window is shorter whenever
            # len(sensordata) is not a multiple of win, which looks like it
            # would make the axis=1 concatenate below fail - confirm.
            for j in range(0, math.ceil(len(sensordata)/win)):
                sd=sensordata[win*j:win*(j+1),:]
                # After the transpose, sd[k] is the k-th column of the window.
                sd=sd.transpose()
                
                # Lay the first `sensornum` columns end to end in one vector.
                sdary=sd[0]
                for idx in range(1, sensornum):
                    sdary=np.concatenate((sdary, sd[idx]), axis=0)
                sdary.shape=(len(sdary), 1)
                # Collect the windows as columns ...
                if j == 0:
                    dataarray=sdary
                else:
                    dataarray=np.concatenate((dataarray, sdary), axis=1)
            
            # ... then transpose so that each row is one window.
            dataarray=np.transpose(dataarray)
            labeldict[travel[i]]=i*np.ones((math.ceil(len(sensordata)/win), 1), dtype=int)
            datadict[travel[i]]=dataarray
            print(files[i]+" loaded!")
    print("Finished!") 
    return datadict, labeldict

def genNativeSample(dataary, label, num, win): 
    """Stack the first rows of every class into one raw-reading data set.

    Args:
        dataary: Dict mapping class name to a 2-D array with one row per
            sample.
        label: Dict mapping class name to a column of integer labels.
        num: Requested sample count; it is divided by ``sensornum`` to get
            the number of rows taken from each class.
        win: Not used in the body.

    Returns:
        Tuple ``(dataframe, y)``: a DataFrame of the stacked rows and a
        Series with the matching labels.
    """
    # NOTE(review): the per-class count divides by sensornum rather than by
    # the number of classes - confirm this is intended.
    num=int(num/sensornum)
    # NOTE(review): `filenum` and `travel` are not defined anywhere in the
    # visible part of this file - calling this as-is would raise NameError.
    for i in range(filenum):
        if i==0:
#             idary=0*np.ones((num, 1))
            y=label[travel[i]][:num]
            data=dataary[travel[i]][:num]
        else:
            y=np.row_stack((y, label[travel[i]][:num]))
            data=np.row_stack((data, dataary[travel[i]][:num]))
#             idary=np.row_stack((idary, i*np.ones((num, 1))))
#     data=np.column_stack((idary, data))
    dataframe = DataFrame(data)
    # Flatten the (n, 1) label column before wrapping it in a Series.
    y.shape=(len(y),)
    y=pd.Series(y)
    return dataframe, y

In [None]:
# The window-size constant is named win_size; the bare name `win` is not
# defined at module level and raised NameError here.
data_1, label_1 = loadNative(win_size)

In [None]:
# print(data_1["bus"].shape)
# NOTE(review): "bus" is not one of the names in `activity`; it presumably is
# a key from the `travel` list that loadNative relies on - confirm.
print(data_1["bus"])

In [None]:
# Build the raw-reading data set; note this rebinds `y` to the labels of the
# naive data set. win_size replaces the undefined module-level name `win`.
X_1, y = genNativeSample(data_1, label_1, use_num, win_size)

In [None]:
# Shape of the raw-reading matrix.
X_1.shape

In [None]:
# Hold out 20% of the raw-reading samples for testing; this rebinds the
# X_train/X_test/y_train/y_test names used by the feature-based cells.
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=.2)

# print(X_train.head())
# print(X_test.head())
# print(y_train.head())
# print(y_test.head())

In [None]:
# Decision tree trained directly on the raw readings, for comparison with
# the feature-based classifiers.
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)

# print(cl.predict(X_train))
# print(cl.predict(X_test))
print(classification_report(y_test, cl.predict(X_test)))