In [1]:
import os
path = os.getcwd()

In [2]:
import numpy as np
import pandas as pd

10-1 数据探索：数据预处理

In [3]:
# 划分一次用水事件
threshold = pd.Timedelta('4 min')    # 阈值为4分钟
inputfile = path + '/data/water_heater.xls'
outputfile = path + '/tmp/dividsequence.xls'
data = pd.read_excel(inputfile)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18840 entries, 0 to 18839
Data columns (total 9 columns):
发生时间      18840 non-null int64
开关机状态     18840 non-null object
加热中       18840 non-null object
保温中       18840 non-null object
实际温度      18840 non-null object
热水量       18840 non-null object
水流量       18840 non-null int64
加热剩余时间    18840 non-null object
当前设置温度    18840 non-null object
dtypes: int64(2), object(7)
memory usage: 1.3+ MB


In [4]:
data.head()

Unnamed: 0,发生时间,开关机状态,加热中,保温中,实际温度,热水量,水流量,加热剩余时间,当前设置温度
0,20141019063917,关,关,关,30°C,0%,0,0分钟,50°C
1,20141019070154,关,关,关,30°C,0%,0,0分钟,50°C
2,20141019070156,关,关,关,30°C,0%,8,0分钟,50°C
3,20141019071230,关,关,关,30°C,0%,0,0分钟,50°C
4,20141019071236,关,关,关,29°C,0%,0,0分钟,50°C


In [5]:
# dataframe处理
data[u'发生时间'] = pd.to_datetime(data[u'发生时间'], format = '%Y%m%d%H%M%S')
data = data[data[u'水流量'] > 0]    # 取流量大于0的数据
d = data[u'发生时间'].diff() > threshold    # 相邻时间作差分，大于threshold
data[u'事件编号'] = d.cumsum() + 1    # 通过累积求和的方式为事件编号
data.to_excel(outputfile)
data.head()

Unnamed: 0,发生时间,开关机状态,加热中,保温中,实际温度,热水量,水流量,加热剩余时间,当前设置温度,事件编号
2,2014-10-19 07:01:56,关,关,关,30°C,0%,8,0分钟,50°C,1
56,2014-10-19 07:38:16,关,关,关,30°C,0%,8,0分钟,50°C,2
381,2014-10-19 09:46:38,关,关,关,29°C,0%,16,0分钟,50°C,3
382,2014-10-19 09:46:40,关,关,关,29°C,0%,13,0分钟,50°C,3
384,2014-10-19 09:47:15,关,关,关,29°C,0%,20,0分钟,50°C,3


10-2 阈值寻优：划分用水事件的最优时间间隔 

In [6]:
n = 4    # 使用之后四个点的平均斜率

threshold = pd.Timedelta(minutes = 5)    # 专家阈值
data = pd.read_excel(inputfile)
data[u'发生时间'] = pd.to_datetime(data[u'发生时间'], format = '%Y%m%d%H%M%S')
data = data[data[u'水流量'] > 0]    # 只需流量大于0的记录

def event_num(ts):
    d = data[u'发生时间'].diff() > ts    # 相邻时间差分 > 阈值
    return d.sum() + 1    # 返回事件数

# 定义阈值列
dt = [pd.Timedelta(minutes = i) for i in np.arange(1, 9, 0.25)]
h = pd.DataFrame(dt, columns = [u'阈值'])    # 定义阈值列
h[u'事件数'] = h[u'阈值'].apply(event_num)   # 计算每个阈值对应的事件数
h[u'斜率'] = h[u'事件数'].diff() / 0.25      # 计算每两个相邻点对应的斜率
h[u'斜率指标'] = pd.Series.rolling(h[u'斜率'].abs(), n).mean()  # 采用后n个的斜率绝对值平均作为斜率指标
ts = h[u'阈值'][h[u'斜率指标'].idxmin() - n] # 返回斜率指标最小值对应的阈值
h

Unnamed: 0,阈值,事件数,斜率,斜率指标
0,00:01:00,232,,
1,00:01:15,227,-20.0,
2,00:01:30,218,-36.0,
3,00:01:45,207,-44.0,
4,00:02:00,201,-24.0,31.0
5,00:02:15,197,-16.0,30.0
6,00:02:30,194,-12.0,24.0
7,00:02:45,191,-12.0,16.0
8,00:03:00,186,-20.0,15.0
9,00:03:15,181,-20.0,16.0


In [7]:
ts

Timedelta('0 days 00:04:00')

In [8]:
if ts > threshold:
    ts = pd.Timedelta(minutes = 4)
print(ts)

0 days 00:04:00


10-3 训练多层神经网络模型

In [9]:
from keras.models import Sequential
from keras.layers.core import Activation, Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
inputfile1 = path + '/data/train_neural_network_data.xls'  # 训练数据
inputfile2 = path + '/data/test_neural_network_data.xls'   # 测试数据
testoutputfile = path + '/tmp/test_output_data.xls'        # 测试数据输出

# 读取训练集和测试集
data_train = pd.read_excel(inputfile1)
data_test = pd.read_excel(inputfile2)
data_train.head()

Unnamed: 0,热水事件,起始数据编号,终止数据编号,开始时间（begin_time）,根据日志判断是否为洗浴（1表示是，0表示否）,洗浴时间点,总用水时长（w_time）,总停顿时长（w_pause_time）,平均停顿时长（avg_pause_time）,停顿次数（pause）,用水时长（use_water_time）,用水/总时长（use_water_rate）,总用水量（w_water）,平均水流量（water_rate）,水流量波动（flow_volatility）,停顿时长波动（pause_volatility）
0,1,218,344,2014-10-19 08:51:30',0,8,591.999998,303.500013,50.583336,6,288.499985,0.487331,12.998333,2.703293,0.870856,650.106848
1,2,569,965,2014-10-19 15:55:23',1,15,1008.000005,46.499999,46.499999,1,961.500006,0.953869,50.626667,3.15923,0.2023,0.0
2,3,1077,1128,2014-10-19 18:21:40',0,18,467.999997,269.499987,53.899997,5,198.50001,0.424145,7.0875,2.142317,0.40496,531.384976
3,4,1973,2236,2014-10-20 16:42:41',1,16,660.999996,23.499993,23.499993,1,637.500003,0.964448,32.193333,3.029961,0.291306,0.0
4,5,2320,2435,2014-10-20 18:05:28',1,18,550.000005,164.500013,32.900003,5,385.499992,0.700909,13.459167,2.094812,0.3952,180.384977


In [11]:
data_test.head()

Unnamed: 0,热水事件,起始数据编号,终止数据编号,开始时间（begin_time）,根据日志判断是否为洗浴（1表示是，0表示否）,洗浴时间点,总用水时长（w_time）,总停顿时长（w_pause_time）,平均停顿时长（avg_pause_time）,停顿次数（pause）,用水时长（use_water_time）,用水/总时长（use_water_rate）,总用水量（w_water）,平均水流量（water_rate）,水流量波动（flow_volatility）,停顿时长波动（pause_volatility）
0,1,73,336,2015-01-05 9:42:41',1,9,660.999996,23.499993,23.499993,1,637.500003,0.964448,32.193333,3.029961,0.291306,0.0
1,2,420,535,'2015-01-05 18:05:28',1,18,550.000005,164.500013,32.900003,5,385.499992,0.700909,13.459167,2.094812,0.3952,180.385
2,3,538,706,'2015-01-05 18:25:24',1,18,649.000001,201.000006,201.000006,1,447.999995,0.690293,22.6225,3.029799,0.643431,0.0
3,4,793,910,'2015-01-05 20:00:42',1,20,297.999994,7.999977,1.999994,4,290.000017,0.973154,15.088333,3.121724,1.075976,2.529227e-11
4,5,935,1133,'2015-01-05 20:15:13',1,20,623.999996,4.999993,4.999993,1,619.000003,0.991987,41.015,3.975606,0.20262,0.0


In [12]:
# 划分样本特征和标签
y_train = data_train.iloc[:, 4].as_matrix()
x_train = data_train.iloc[:, 5:17].as_matrix()
y_test = data_test.iloc[:, 4].as_matrix()
x_test = data_test.iloc[:, 5:17].as_matrix()

# 建立神经网络模型
model = Sequential()
model.add(Dense(input_dim = 11, units = 17))
model.add(Activation('relu'))
model.add(Dense(input_dim = 17, units = 10))
model.add(Activation('relu'))
model.add(Dense(input_dim = 10, units = 1))
model.add(Activation('sigmoid'))
# 编译模型
model.compile(
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    sample_weight_mode = 'binary'
)
# 训练模型
model.fit(x_train, y_train, nb_epoch = 500, batch_size = 1)
# 保存模型
model.save_weights(path + '/tmp/net.model')

# 进行预测
r = pd.DataFrame(model.predict_classes(x_test), columns = [u'预测结果'])
pd.concat([data_test.iloc[:, :5], r], axis = 1).to_excel(testoutputfile)
y_model = model.predict(x_test)
y_model

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500

  % delta_t_median)
  % delta_t_median)


Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 

  % delta_t_median)


Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 239/500
Epoch 240/500
Epoch 241/500
Epoch 242/500
Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500
Epoch 249/500
Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 258/500
Epoch 259/500
Epoch 260/500
Epoch 261/500
Epoch 262/500
Epoch 263/500
Epoch 264/500
Epoch 265/500
Epoch 266/500
Epoch 267/500
Epoch 268/500
Epoch 269/500
Epoch 270/500
Epoch 271/500
Epoch 272/500
Epoch 273/500
Epoch 274/500
Epoch 275/500
Epoch 276/500
Epoch 277/500
Epoch 278/500
Epoch 279/500
Epoch 280/500
Epoch 281/500
Epoch 282/500
Epoch 283/500
Epoch 284/500
Epoch 285/500
Epoch 286/500
Epoch 287/500
Epoch 288/500
Epoch 289/500
Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 

Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 

Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 435/500
Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 474/500
Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


array([[1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [9.9999845e-01],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00],
       [6.7116205e-35],
       [1.0000000e+00],
       [1.0000000e+00],
       [1.0000000e+00]], dtype=float32)

In [13]:
r

Unnamed: 0,预测结果
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


10-4 模型检验

In [14]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, r))

0.7619047619047619


10-5 连续洗浴事件识别

In [15]:
outputfile = path + '/tmp/attribute_extract.xls'
data = pd.read_excel(inputfile)

data[u'发生时间'] = pd.to_datetime(data[u'发生时间'], format = '%Y%m%d%H%M%S')
data = data[data[u'水流量'] > 0]
d = data[u'发生时间'].diff() > threshold
data[u'事件编号'] = d.cumsum() + 1
data.head()

Unnamed: 0,发生时间,开关机状态,加热中,保温中,实际温度,热水量,水流量,加热剩余时间,当前设置温度,事件编号
2,2014-10-19 07:01:56,关,关,关,30°C,0%,8,0分钟,50°C,1
56,2014-10-19 07:38:16,关,关,关,30°C,0%,8,0分钟,50°C,2
381,2014-10-19 09:46:38,关,关,关,29°C,0%,16,0分钟,50°C,3
382,2014-10-19 09:46:40,关,关,关,29°C,0%,13,0分钟,50°C,3
384,2014-10-19 09:47:15,关,关,关,29°C,0%,20,0分钟,50°C,3


In [16]:
data_g = data.groupby(u'事件编号')
result = pd.DataFrame()
dt = pd.Timedelta(seconds = 2)
data_g.head()

Unnamed: 0,发生时间,开关机状态,加热中,保温中,实际温度,热水量,水流量,加热剩余时间,当前设置温度,事件编号
2,2014-10-19 07:01:56,关,关,关,30°C,0%,8,0分钟,50°C,1
56,2014-10-19 07:38:16,关,关,关,30°C,0%,8,0分钟,50°C,2
381,2014-10-19 09:46:38,关,关,关,29°C,0%,16,0分钟,50°C,3
382,2014-10-19 09:46:40,关,关,关,29°C,0%,13,0分钟,50°C,3
384,2014-10-19 09:47:15,关,关,关,29°C,0%,20,0分钟,50°C,3
404,2014-10-19 11:50:17,关,关,关,29°C,0%,22,0分钟,50°C,4
407,2014-10-19 13:56:21,关,关,关,29°C,0%,8,0分钟,50°C,5
410,2014-10-19 15:34:39,关,关,关,29°C,0%,36,0分钟,50°C,6
411,2014-10-19 15:34:40,关,关,关,29°C,0%,32,0分钟,50°C,6
412,2014-10-19 15:34:45,关,关,关,29°C,0%,31,0分钟,50°C,6


In [17]:
for _, g in data_g:
    temp = pd.DataFrame(index = [0])
    # 根据用水时长、开关机切换次数、总用水量推出是否是洗澡
    tstart = g[u'发生时间'].min()
    tend = g[u'发生时间'].max()
    temp[u'用水事件时长（M）'] = (dt + tend - tstart).total_seconds() / 60
    temp[u'开关机切换次数'] = (pd.Series.rolling(g[u'开关机状态'] == u'关', 2).sum() == 1).sum()
    temp[u'总用水量（L）'] = g[u'水流量'].sum()
    tdiff = g[u'发生时间'].diff()
    if len(g[u'发生时间']) == 1:
        temp[u'总用水时长（Min）'] = dt.total_seconds() / 60
    else:
        temp[u'总用水时长（Min）'] = (
            tdiff.sum() - tdiff.iloc[1] / 2 -
            tdiff.iloc[len(tdiff) - 1] / 2
        ).total_seconds() / 60
    temp[u'平均水流量（L/min）'] = temp[u'总用水量（L）'] / temp[u'总用水时长（Min）']
    result = result.append(temp, ignore_index = True)
result.to_excel(outputfile)

In [18]:
result.head()

Unnamed: 0,用水事件时长（M）,开关机切换次数,总用水量（L）,总用水时长（Min）,平均水流量（L/min）
0,0.033333,0,8,0.033333,240.0
1,0.033333,0,8,0.033333,240.0
2,0.65,0,49,0.308333,158.918919
3,0.033333,0,22,0.033333,660.0
4,0.033333,0,8,0.033333,240.0
