In [40]:
%matplotlib inline
import matplotlib.pylab as plt
import matplotlib
from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
# import seaborn as sns
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from pandas import DataFrame
import pandas as pd
import numpy as np
import os
import math

import logging

#from SFFS import SFFS

# We set the logger to Error level
# This is not recommend for normal use as you can oversee important Warning messages
logging.basicConfig(level=logging.ERROR)

#根据Win，生成DataFrame格式中的Time列
def getTimeColumn(win):
    time=np.arange(win)
    for idx in range(1, sensornum):
        timetmp=np.arange(win)
        time=np.concatenate((time, timetmp), axis=0)
    time.shape=(len(time),1)
    return time

#根据Win和数据，生成DataFrame格式中的kind和value列
def getKindValueColumn(sd, win):
    for i in range(sensornum):
        kind=i*np.ones((win, 1),dtype=int)
        sensorcols=sd[i]
        sensorcols.shape=(win, 1)
        sdata=np.column_stack((kind, sensorcols))
        if i==0:
            sensorframe=sdata
        else:
            sensorframe=np.row_stack((sensorframe, sdata))
    return sensorframe

def getIdColumn(num, win):
    for i in range(num):
        if i==0:
            idary=0*np.ones((sensornum*win, 1))
        else:
            idary=np.row_stack((idary, i*np.ones((sensornum*win, 1))))
    return idary

def getDataLabelColumn(dataary,label, num, win):
    for i in range(filenum):
        if i==0:
            y=label[travel[i]][:num]
            data=dataary[travel[i]][:num*win*sensornum]
        else:
            y=np.row_stack((y, label[travel[i]][:num]))
            data=np.row_stack((data, dataary[travel[i]][:num*win*sensornum]))
    y.shape=(len(y),)
    y=pd.Series(y)
    return data, y

#读取所有数据，结果是字典，分别存储六种运动的传感器读数矩阵，每个矩阵的三列分别是time, kind, value
def loadData(win):
    labeldict={}
    datadict={}
    files = os.listdir('/home/hadoop/data')
    for i in range(0, filenum):
        with open('/home/hadoop/data/%s' % files[i], 'r') as f:
            sensordata=np.loadtxt(f, delimiter=",")
            for j in range(0, math.ceil(len(sensordata)/win)):
                sd=sensordata[win*j:win*(j+1),:]
                sd=sd.transpose()
                
                time=getTimeColumn(win)
                kindvalue=getKindValueColumn(sd, win)
                travary=np.column_stack((time, kindvalue ))
                
                if j == 0:
                    dataarray=travary
                else:
                    dataarray=np.concatenate((dataarray, travary), axis=0)  

            labeldict[travel[i]]=i*np.ones((math.ceil(len(sensordata)/win), 1), dtype=int)
            datadict[travel[i]]=dataarray
            print(files[i]+" loaded!")
            
    return datadict, labeldict

#从所有数据dataary和其标签label中，选出总数为num的样本，每个类别选num/sensornum个样本。
def genTrainSample(dataary, label, num, win):
    idary=getIdColumn(num, win)
    labelnum=int(num/sensornum) ;
    data, y=getDataLabelColumn(dataary, label, labelnum, win)
    data=np.column_stack((idary, data))
    dataframe = DataFrame(data, columns=['id', 'time', 'kind', 'value'])
    return dataframe, y

In [2]:
# 六种出行方式
travel=["bus", "car", "running", "stationary", "subway", "walking"]
# 六个传感器数据
sensor=["azimath", "pitch", "roll", "north", "east", "up"]
# 一些常量
filenum=6
sensornum=6
#窗口大小
win=150
#使用的样本数量(一个窗口的数据是一个样本)，建议设置陈6的倍数
use_num=4800

data, label = loadData(win)
print(data["bus"].shape)        #data["bus]矩阵三列分别是time, kind, value
print(label["bus"].shape)       #只有一列，因为sensornum*win条读数才是一个样本，所以label的行数是data的1/(sensornum*win)

#取出use_num个样本，并将data转化成tsfresh需要的pandas.DataFrame类型，存储为df；将label转化为特征过滤需要的pandas.Series类型，存储为y
df, y=genTrainSample(data, label, use_num, win)    

print(df.shape)
print(type(y))
print(y.shape)

bus_data_set_101 loaded!
car_data_set_103 loaded!
running_data_set_8 loaded!
stationary_data_set_3 loaded!
subway_data_set_102 loaded!
walking_data_set_7 loaded!
(729900, 3)
(811, 1)
(4320000, 4)
<class 'pandas.core.series.Series'>
(4800,)


In [3]:
master_df = df

print(master_df.shape)
print(master_df)

(4320000, 4)
             id   time  kind      value
0           0.0    0.0   0.0   1.202669
1           0.0    1.0   0.0   1.200916
2           0.0    2.0   0.0   1.191972
3           0.0    3.0   0.0   1.186100
4           0.0    4.0   0.0   1.181217
5           0.0    5.0   0.0   1.191511
6           0.0    6.0   0.0   1.197771
7           0.0    7.0   0.0   1.205506
8           0.0    8.0   0.0   1.201536
9           0.0    9.0   0.0   1.206752
10          0.0   10.0   0.0   1.189720
11          0.0   11.0   0.0   1.188346
12          0.0   12.0   0.0   1.182130
13          0.0   13.0   0.0   1.191688
14          0.0   14.0   0.0   1.189974
15          0.0   15.0   0.0   1.203741
16          0.0   16.0   0.0   1.204384
17          0.0   17.0   0.0   1.202886
18          0.0   18.0   0.0   1.189893
19          0.0   19.0   0.0   1.186932
20          0.0   20.0   0.0   1.184310
21          0.0   21.0   0.0   1.179736
22          0.0   22.0   0.0   1.178087
23          0.0   23.0   0.

In [4]:
# extraction_settings = ComprehensiveFCParameters()
extraction_settings = EfficientFCParameters()
# extraction_settings = MinimalFCParameters()

%time X = extract_features(master_df, default_fc_parameters=extraction_settings, column_id='id', column_sort="time", column_kind="kind", column_value="value");
# %time X = extract_features(master_df, column_id='id', column_sort="time", column_kind="kind", column_value="value");

# in total we have transformed the sensor data into 222 features
impute(X)
X.shape
X.columns

Feature Extraction: 100%|██████████| 6/6 [1:45:05<00:00, 1030.18s/it]


CPU times: user 2.95 s, sys: 908 ms, total: 3.86 s
Wall time: 1h 45min 8s


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
X_train_fs, X_val, y_train_fs, y_val = train_test_split(X_train, y_train, test_size=.25)

In [41]:
# coding: utf-8
#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd


#Sequential Floating Forward Selection, the evaluation is based on Random Forest
def SFFS(X_train_fs, y_train_fs, X_val, y_val):
    column = X_train.columns
    ftr = np.array([])
    ind = np.zeros(column.shape, dtype=bool)
    max_score = 0

    count = 0

    while ftr.size < column.shape[0] and max_score < 1:
        ## Forward
        max_score_forward = max_score
        select_ftr = -1
        for i, col in enumerate(ind):
            if col:
                continue
            else:
                tmp_ftr = np.append(ftr, column[i])
                max_score_forward, select_ftr = evaFtr(X_train, y_train, X_val, y_val,
                                                        tmp_ftr, i, max_score_forward, select_ftr)
        if select_ftr >= 0:
            ftr = np.append(ftr, column[select_ftr])
            ind[select_ftr] = True

        ## Backward
        max_score_backward = max_score_forward
        while ftr.size > 1:
            select_ftr = -1
            for i, n in enumerate(ftr):
                tmp_ftr = np.delete(ftr, i)
                max_score_backward, select_ftr = evaFtr(X_train, y_train, X_val, y_val,
                                                        tmp_ftr, i, max_score_backward, select_ftr)
            if select_ftr >= 0:
                ftr = np.delete(ftr, select_ftr)
                #ind[select_ftr] = False
            else:
                break
                
        count += 1
        print('Loop = %d, Maximum Score = %f, Feature Num = %s' % (count, max(max_score,max_score_backward), ftr.size))

        if max_score < max_score_backward:
            max_score = max_score_backward
        else:
            print('Feature Selection Completed!')
            break
        
    return ftr


def evaFtr(X_train, y_train, X_val, y_val, tmp_ftr, i, max_score, select_ftr):
        tmp_X_train = X_train.loc[:, tmp_ftr]
        tmp_X_val = X_val.loc[:, tmp_ftr]
        cl = RandomForestClassifier()
        cl.fit(tmp_X_train, y_train)
        score = accuracy_score(y_val, cl.predict(tmp_X_val))
        if score > max_score:
            max_score = score
            select_ftr = i
        return max_score, select_ftr

In [90]:
ftr = SFFS(X_train, y_train, X_val, y_val)

Loop = 1, Maximum Score = 0.621875, Feature Num = 1
Loop = 2, Maximum Score = 0.820833, Feature Num = 2
Loop = 3, Maximum Score = 0.903125, Feature Num = 3
Loop = 4, Maximum Score = 0.925000, Feature Num = 4
Loop = 5, Maximum Score = 0.938542, Feature Num = 5
Loop = 6, Maximum Score = 0.944792, Feature Num = 6
Loop = 7, Maximum Score = 0.957292, Feature Num = 7
Loop = 8, Maximum Score = 0.962500, Feature Num = 8
Loop = 9, Maximum Score = 0.962500, Feature Num = 8
Feature Selection Completed!


In [116]:
#Evaluate the selected features based on Random Forest
tmp_X_train = X_train.loc[:, ftr]
tmp_X_test = X_test.loc[:, ftr]
cl = RandomForestClassifier()
cl.fit(tmp_X_train, y_train)
score = accuracy_score(y_test, cl.predict(tmp_X_test))
print('Accuracy of test data is %f' % score)

Accuracy of test data is 0.948958


In [117]:
ftr

array([u'5.0__linear_trend__attr_"stderr"', u'0.0__median',
       u'2.0__sum_values', u'1.0__quantile__q_0.1',
       u'5.0__agg_linear_trend__f_agg_"mean"__chunk_len_5__attr_"intercept"',
       u'0.0__mean_abs_change_quantiles__qh_1.0__ql_0.2',
       u'5.0__autocorrelation__lag_1', u'5.0__median'], 
      dtype='<U66')

In [114]:
#Using all the features as contrast
cl = RandomForestClassifier()
cl.fit(X_train, y_train)
score = accuracy_score(y_test, cl.predict(X_test))
print(score)

0.933333333333
