# 用时序划分训练测试集的尝试

In [1]:
%matplotlib inline
import matplotlib.pylab as plt
import matplotlib
from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
# import seaborn as sns
from localtsfresh.tsfresh import extract_features, extract_relevant_features, select_features
from localtsfresh.tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from pandas import DataFrame
import pandas as pd
import numpy as np
import os
import math

import logging

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.signal import butter, lfilter, lfilter_zi
from numpy import linalg as LA
from xgboost import XGBClassifier
import copy
from collections import Counter

# We set the logger to Error level
# This is not recommend for normal use as you can oversee important Warning messages
logging.basicConfig(level=logging.ERROR)

#根据Win，生成DataFrame格式中的Time列
def getTimeColumn(win):
    time=np.arange(win)
    for idx in range(1, sensornum):
        timetmp=np.arange(win)
        time=np.concatenate((time, timetmp), axis=0)
    time.shape=(len(time),1)
    return time

#根据Win和数据，生成DataFrame格式中的kind和value列
def getKindValueColumn(sd, win):
    for i in range(sensornum):
        kind=i*np.ones((win, 1),dtype=int)
        sensorcols=sd[i]
        sensorcols.shape=(win, 1)
        sdata=np.column_stack((kind, sensorcols))
        if i==0:
            sensorframe=sdata
        else:
            sensorframe=np.row_stack((sensorframe, sdata))
    return sensorframe

def getIdColumn(num, win):
    for i in range(num):
        if i==0:
            idary=0*np.ones((sensornum*win, 1))
        else:
            idary=np.row_stack((idary, i*np.ones((sensornum*win, 1))))
    return idary

def getDataLabelColumn(dataary,label, num, win):
    for i in range(filenum):
        if i==0:
            y=label[travel[i]][:num]
            data=dataary[travel[i]][:num*win*sensornum]
        else:
            y=np.row_stack((y, label[travel[i]][:num]))
            data=np.row_stack((data, dataary[travel[i]][:num*win*sensornum]))
    y.shape=(len(y),)
    y=pd.Series(y)
    return data, y


#读取所有数据，结果是字典，分别存储六种运动的传感器读数矩阵，每个矩阵的三列分别是time, kind, value
def loadData(win, step, sampling, begin, end):
    labeldict={}
    datadict={}
    files = os.listdir('/home/hadoop/data')
    if step<=win:
        for i in range(0, filenum):
            with open('/home/hadoop/data/%s' % files[i], 'r') as f:
                sensordata=np.loadtxt(f, delimiter=",")[::2, :]
                
                ########## feature extraction ###########
                sensordata = np.delete(sensordata, np.s_[:3], 1)
                sensordata[:, 2] = sensordata[:, 2] - 9.8
                sensordata = sensordata[int(sensordata.shape[0]*begin): int(sensordata.shape[0]*end)]
                sensordata = resample(sensordata, sampling)
                
                #mi = LA.norm(sensordata, 2, axis=1)
                #sma = np.sum(np.abs(sensordata), 1)
                #velo = calcVelo(sensordata[:,:2], fs, win)
                #sensordata = np.c_[sensordata, mi, sma]
                
                #loadDataPlot(sensordata, i)
                
                ########## filter ############
                #fltr2(sensordata, cutOff=15, fs=fs, order=5)
                #########################################
                
                max_num=int((len(sensordata)-win)/step)+1
                for j in range(0, max_num):
                    start_idx=step*j
                    end_idx=step*j+win
                    sd=sensordata[start_idx:end_idx,:]
                    sd=sd.transpose()
                        
                    time=getTimeColumn(win)
                    kindvalue=getKindValueColumn(sd, win)
                    travary=np.column_stack((time, kindvalue ))

                    if j == 0:
                        dataarray=travary
                    else:
                        dataarray=np.concatenate((dataarray, travary), axis=0)  

                labeldict[travel[i]]=i*np.ones((max_num, 1), dtype=int)
                datadict[travel[i]]=dataarray
                print(files[i]+" loaded!")
        use_num_max=filenum*max_num
        return datadict, labeldict, use_num_max
    else:
        raise IOError('\'step\' of slide window shoud be less than \'win\'')

#从所有数据dataary和其标签label中，选出总数为num的样本，每个类别选num/sensornum个样本。
def genTrainSample(dataary, label, num, win):
    idary=getIdColumn(num, win)
    labelnum=int(num/filenum) ;
    data, y=getDataLabelColumn(dataary, label, labelnum, win)
    data=np.column_stack((idary, data))
    dataframe = DataFrame(data, columns=['id', 'time', 'kind', 'value'])
    return dataframe, y


#画出第idx个样本3个传感器的数据
def plotSample(data, kind, idx, win, use_num):
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(18.5, 10.5)
    tmp=data.iloc[:, -1].values
    for i in range(sensornum):
        plt.subplot(3,2,i+1)
        plt.title(sensor[i] + ' readings')
        begin=kind*win*use_num + idx*win*sensornum + win*i
        end=begin+win-1
        plt.plot(tmp[begin:end])
    plt.show()
    
def loadDataPlot(data, ind):
    plt.figure(ind)
    for i in xrange(0, data.shape[1]):
        fig = matplotlib.pyplot.gcf()
        fig.set_size_inches(18.5, 10.5)
        plt.subplot(3,2,i+1)
        plt.plot(data[:, i])
    return

################ Resampling ##################

def resample(data, sampling):
    tmp = []
    for i in xrange(0, 1):
        ind = np.arange(i, data.shape[0], sampling)
        tmp.append(data[ind, :])
    return np.concatenate(tmp, axis=0)

################ Calculate Velocity ###################

def calcVelo(acc, fs, win):
    velo = np.zeros(acc.shape)
    for i in xrange(0, acc.shape[0]):
        if i % win: 
            velo[i] = velo[i-1] + (acc[i-1]+acc[i])/(2*fs)
    return LA.norm(velo, 2, axis=1)
        

################ Butterworth 滤波 ###################

def butter_lowpass(cutOff, fs, order=5):
    nyq = 0.5 * fs
    normalCutoff = cutOff / nyq
    b, a = butter(order, normalCutoff, btype='low', analog = False)
    return b, a

def fltr(data, win, sensornum, cutOff, fs, order=5): #这个是在滑窗之后滤波的函数
    v = data.iloc[:, -1].values
    b, a = butter_lowpass(cutOff, fs, order)
    zi = np.tile(lfilter_zi(b, a), (sensornum,1))
    for i in xrange(0, v.size, win*sensornum):
        for j in xrange(0, sensornum):
            x = v[i+j*win: i+(j+1)*win]
            y, zi[j]= lfilter(b, a, x, zi=zi[j])
            data.iloc[i+j*win: i+(j+1)*win, -1] = y
    return

def fltr2(sensordata, cutOff, fs, order):  #这个是在数据读取后马上就滤波的函数
    b, a = butter_lowpass(cutOff, fs, order)
    for n in xrange(sensordata.shape[1]):
        zi = lfilter_zi(b, a)
        sensordata[:,n], zi = lfilter(b, a, sensordata[:,n], zi=zi)


localtsfresh_init!
feature_extraction_init
tsfresh_init!


In [4]:
#可修改常量
# 六种出行方式
travel=["bus", "car", "running", "stationary", "subway", "walking"]
# 六个传感器数据
#sensor=["azimath", "pitch", "roll", "mi", "sma"]
sensor=["azimath", "pitch", "roll"]
# 一些常量
filenum=6
#窗口步长大小
sampling=1         #采样间隔，注意，win是在采样间隔基础上算的
win=200/sampling
step=win/2           #步长应该小于等于win
#使用的样本数量(一个窗口的数据是一个样本)，建议设置成sensornum的倍数
fs = 100/sampling

sensornum=len(sensor)

################# 划分数据集 ########################

lst_X = []
lst_y = []
begin = [0, 0.7, 0.8]
end = [0.7, 0.8, 1]
for i in xrange(0,3):
    data, label, use_num_max = loadData(win, step, sampling, begin[i], end[i])
    
    use_num = use_num_max
    df, y=genTrainSample(data, label, use_num, win)

    #fltr(df, win, sensornum, cutOff=4, fs=fs, order=5) #filter

    master_df = df
    print(master_df.shape)
    
    # extraction_settings = ComprehensiveFCParameters()
    # extraction_settings = EfficientFCParameters()
    # extraction_settings = MinimalFCParameters()
    #extraction_settings.update({"fft_dc":None,"fft_mean":None,"fft_var":None,"fft_std":None,"fft_kurt":None,"fft_shape_mean":None,"fft_shape_std":None,"fft_shape_skew":None,"fft_shape_kurt":None})
    #"""
    extraction_settings = {'ar_coefficient': [{'coeff': 0, 'k': 10},
        {'coeff': 1, 'k': 10},
        {'coeff': 2, 'k': 10},
        {'coeff': 3, 'k': 10},
        {'coeff': 4, 'k': 10}],
              'longest_strike_above_mean': None,
              'longest_strike_below_mean': None,
            'mean_abs_change_quantiles': [{'qh': 0.2, 'ql': 0.0},
                                          {'qh': 0.4, 'ql': 0.0},
                                          {'qh': 0.6, 'ql': 0.0},
                                          {'qh': 0.8, 'ql': 0.0},
                                          {'qh': 1.0, 'ql': 0.0},
                                          {'qh': 0.4, 'ql': 0.2},                                       
                                          {'qh': 0.6, 'ql': 0.2},
                                          {'qh': 0.8, 'ql': 0.2},
                                          {'qh': 1.0, 'ql': 0.2},
                                          {'qh': 0.6, 'ql': 0.4},
                                          {'qh': 0.8, 'ql': 0.4},
                                          {'qh': 1.0, 'ql': 0.4},
                                          {'qh': 0.8, 'ql': 0.6},
                                          {'qh': 1.0, 'ql': 0.6},
                                          {'qh': 1.0, 'ql': 0.8}],
              'autocorrelation': [{'lag': 0},{'lag': 1},{'lag': 2},{'lag': 3},{'lag': 4},
                                  {'lag': 5},{'lag': 6},{'lag': 7},{'lag': 8},{'lag': 9}],
              'time_reversal_asymmetry_statistic': [{'lag': 1}, {'lag': 2}, {'lag': 3}],
              'quantile': [{'q': 0.1},{'q': 0.2},{'q': 0.3},{'q': 0.4},
                           {'q': 0.6},{'q': 0.7},{'q': 0.8},{'q': 0.9}],
              'number_peaks': [{'n': 1}, {'n': 3}, {'n': 5}],
              'minimum': None,
              'maximum': None,
              'median': None,
              'sum_values': None,
              'spkt_welch_density': [{'coeff': 2}, {'coeff': 5}, {'coeff': 8}],
    }
    #"""
    X = extract_features(master_df, default_fc_parameters=extraction_settings, column_id='id', 
                         column_sort="time", column_kind="kind", column_value="value")

    impute(X)
    print(X.shape)
    for n in X.columns:
        std = np.std(X[n])
        X[n] = (X[n]-np.mean(X[n]))/std if std>0 else X[n]-np.mean(X[n])
    lst_X.append(copy.deepcopy(X))
    lst_y.append(copy.deepcopy(y))

X_train = lst_X[0]
y_train = lst_y[0]
X_val = lst_X[1]
y_val = lst_y[1]
X_test = lst_X[2]
y_test = lst_y[2]


bus_data_set_101 loaded!
car_data_set_103 loaded!
running_data_set_8 loaded!
stationary_data_set_3 loaded!
subway_data_set_102 loaded!
walking_data_set_7 loaded!
(1526400, 4)
It is ok!


Feature Extraction: 100%|██████████| 7632/7632 [01:29<00:00, 85.58it/s]


(2544, 159)
bus_data_set_101 loaded!
car_data_set_103 loaded!
running_data_set_8 loaded!
stationary_data_set_3 loaded!
subway_data_set_102 loaded!
walking_data_set_7 loaded!
(212400, 4)
It is ok!


Feature Extraction: 100%|██████████| 1062/1062 [00:12<00:00, 83.41it/s]


(354, 159)
bus_data_set_101 loaded!
car_data_set_103 loaded!
running_data_set_8 loaded!
stationary_data_set_3 loaded!
subway_data_set_102 loaded!
walking_data_set_7 loaded!
(432000, 4)
It is ok!


Feature Extraction: 100%|██████████| 2160/2160 [00:26<00:00, 82.37it/s]


(720, 159)


# 训练分类器

In [35]:
#Using all the features as contrast
eval_set = [(X_val, y_val)]
cl = XGBClassifier(max_depth=5, n_estimators=200, objective='multi:softmax', reg_alpha=0, reg_lambda=0,
                  subsample=0.75, gamma=5)
cl.fit(X_train, y_train, eval_metric='merror', eval_set=eval_set, early_stopping_rounds=30, verbose=False)
print('Train Accuracy: %f, Validation Accuracy: %f, Test Accuracy: %f' % 
      (accuracy_score(y_train, cl.predict(X_train)), accuracy_score(y_val, cl.predict(X_val)), accuracy_score(y_test, cl.predict(X_test))))


Train Accuracy: 0.979167, Validation Accuracy: 0.539548, Test Accuracy: 0.740278
