# 事件预测 - 分类问题

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## 导入并处理数据

### 导入

In [2]:
df = pd.read_csv("building_event_binary.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5039 entries, 0 to 5038
Data columns (total 6 columns):
Sunday      5039 non-null object
07/24/05    5039 non-null object
00:00:00    5039 non-null object
0           5039 non-null int64
0.1         5039 non-null int64
noevent     5039 non-null object
dtypes: int64(2), object(4)
memory usage: 236.3+ KB


In [3]:
df.columns = ['week', 'date', 'time', 'enterCnt', 'leaveCnt', 'type']
df.head()

Unnamed: 0,week,date,time,enterCnt,leaveCnt,type
0,Sunday,07/24/05,00:30:00,1,0,noevent
1,Sunday,07/24/05,01:00:00,0,0,noevent
2,Sunday,07/24/05,01:30:00,0,0,noevent
3,Sunday,07/24/05,02:00:00,0,0,noevent
4,Sunday,07/24/05,02:30:00,2,0,noevent


### 处理星期数据

In [4]:
from sklearn.preprocessing import LabelEncoder
weeklbe = LabelEncoder()
weeklbe.fit(df.week.unique())

LabelEncoder()

In [5]:
def encode(x):
    global weeklbe
    newx = weeklbe.transform([x])
    return newx[0]

In [6]:
df['weekEncode'] = df.week.apply(encode)
df.head()

Unnamed: 0,week,date,time,enterCnt,leaveCnt,type,weekEncode
0,Sunday,07/24/05,00:30:00,1,0,noevent,3
1,Sunday,07/24/05,01:00:00,0,0,noevent,3
2,Sunday,07/24/05,01:30:00,0,0,noevent,3
3,Sunday,07/24/05,02:00:00,0,0,noevent,3
4,Sunday,07/24/05,02:30:00,2,0,noevent,3


### 处理时间数据

In [7]:
timelbe = LabelEncoder()
timelbe.fit(df.time.unique())

LabelEncoder()

In [8]:
def encode2(x):
    global timelbe
    newx = timelbe.transform([x])
    return newx[0]

In [9]:
df['timeEncode'] = df.time.apply(encode2)
df.head()

Unnamed: 0,week,date,time,enterCnt,leaveCnt,type,weekEncode,timeEncode
0,Sunday,07/24/05,00:30:00,1,0,noevent,3,1
1,Sunday,07/24/05,01:00:00,0,0,noevent,3,2
2,Sunday,07/24/05,01:30:00,0,0,noevent,3,3
3,Sunday,07/24/05,02:00:00,0,0,noevent,3,4
4,Sunday,07/24/05,02:30:00,2,0,noevent,3,5


### 处理事件类型数据

In [10]:
def encode3(x):
    if x == 'noevent':
        return 0
    else:
        return 1

In [11]:
df['typeEncode'] = df.type.apply(encode3)
df.head()

Unnamed: 0,week,date,time,enterCnt,leaveCnt,type,weekEncode,timeEncode,typeEncode
0,Sunday,07/24/05,00:30:00,1,0,noevent,3,1,0
1,Sunday,07/24/05,01:00:00,0,0,noevent,3,2,0
2,Sunday,07/24/05,01:30:00,0,0,noevent,3,3,0
3,Sunday,07/24/05,02:00:00,0,0,noevent,3,4,0
4,Sunday,07/24/05,02:30:00,2,0,noevent,3,5,0


### 提取x_data, y_data数据

In [12]:
x_data = df[ ['weekEncode', 'timeEncode', 'enterCnt' , 'leaveCnt'] ].values
x_data

array([[ 3,  1,  1,  0],
       [ 3,  2,  0,  0],
       [ 3,  3,  0,  0],
       ...,
       [ 2, 45,  0,  0],
       [ 2, 46,  0,  0],
       [ 2, 47,  1,  0]], dtype=int64)

### 对y_data进行OneHot编码

In [13]:
y_data = df.typeEncode.values
print(y_data.shape)
y_data.shape = (5039,1)
temp = np.zeros((5039,2)).astype(int)
for i in range(len(y_data)):
    if y_data[i][0] == 1:
        temp[i][0] = 1
    else:
        temp[i][1] = 0
y_data = temp
y_data[:3]

(5039,)


array([[0, 0],
       [0, 0],
       [0, 0]])

上面转换可以使用one-hot编码实现

In [14]:
from sklearn.preprocessing import OneHotEncoder

y_data = df.typeEncode.values
print(y_data)
for i in range(len(y_data)):
    if y_data[i][0] == 1:
        break
print(y_data[i])
encoder = OneHotEncoder()
y_data = encoder.fit_transform(y_data).toarray()
y_data[i]

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]
[1]


array([0., 1.])

## 模型创建

### 根据样本样式定义创建占位符

In [15]:
x = tf.placeholder(tf.float32,[None,4]) # 4个特征
y = tf.placeholder(tf.float32,[None,2]) # 2个类别
keep_prob = tf.placeholder(tf.float32)
lr =  tf.Variable(0.001, dtype = tf.float32) # 学习率变量

### 定义神经网络

In [16]:
# （输入层4个数据，中间层500个神经元）
W1 = tf.Variable(tf.truncated_normal([4,500], stddev=0.1))
b1 =  tf.Variable(tf.zeros([500]) + 0.1)
L1 = tf.nn.tanh(tf.matmul(x, W1) + b1) # 激活函数：tanh
L1_drop = tf.nn.dropout(L1, keep_prob)

# 中间层
W2 = tf.Variable(tf.truncated_normal([500,300], stddev=0.1)) 
b2 =  tf.Variable(tf.zeros([300]) + 0.1)
L2 = tf.nn.tanh(tf.matmul(L1_drop, W2) + b2)
L2_drop =  tf.nn.dropout(L2, keep_prob)

# 输出层
W3 = tf.Variable(tf.truncated_normal([300,2], stddev=0.1)) 
b3 =  tf.Variable(tf.zeros([2]) + 0.1)
prediction = tf.nn.softmax(tf.matmul(L2_drop, W3) + b3) # 输出层激活函数采用softmax()

W0722 10:35:30.503003 11128 deprecation.py:506] From <ipython-input-16-38be1a7a6944>:5: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


### 损失函数 分类问题用交叉熵

In [17]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels = y, logits = prediction))

### 定义一个梯度下降法来进行训练的优化器

In [23]:
# train =  tf.train.AdamOptimizer(lr).minimize(loss)
train = tf.train.GradientDescentOptimizer(lr).minimize(loss)

### 结果存放在一个布尔型的列表中

In [19]:
correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1)) # argmax 返回一维张量中最大值(one-hot编码后，每行只有0和1)的所以在位置

### 求准确率

In [20]:
# cast 把布尔型列表转换为float32， 如[true.true.false] = [1,1,0] ，那么准确率的值即为66.6%
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))

### 训练模型

In [25]:
with tf.Session() as sess: 
    # 初始化全部变量
    sess.run(tf.global_variables_initializer())
    # 训练51个周期
    for epoch in range(101):
        # 每个周期都改变学习率（学习越久学习率越低）
        sess.run(tf.assign(lr, 0.001*(0.95**epoch)))
        # 保存batch_size张图片的数据与标签
        sess.run(train, feed_dict={x: x_data, y: y_data, keep_prob: 1.0})
        
        # 用测试集的图片及标签求得准确率
        acc = sess.run(accuracy, feed_dict={x: x_data, y: y_data, keep_prob: 1.0})
        print("Iter "+ str(epoch) + ", Acc " + str(acc))

Iter 0, Acc 0.5356221
Iter 1, Acc 0.60904944
Iter 2, Acc 0.7033141
Iter 3, Acc 0.83131576
Iter 4, Acc 0.91744393
Iter 5, Acc 0.9474102
Iter 6, Acc 0.96546936
Iter 7, Acc 0.96507245
Iter 8, Acc 0.9652709
Iter 9, Acc 0.96507245
Iter 10, Acc 0.96507245
Iter 11, Acc 0.96487397
Iter 12, Acc 0.96487397
Iter 13, Acc 0.96487397
Iter 14, Acc 0.96487397
Iter 15, Acc 0.96487397
Iter 16, Acc 0.96487397
Iter 17, Acc 0.96487397
Iter 18, Acc 0.96487397
Iter 19, Acc 0.96487397
Iter 20, Acc 0.96487397
Iter 21, Acc 0.96487397
Iter 22, Acc 0.96487397
Iter 23, Acc 0.96507245
Iter 24, Acc 0.96507245
Iter 25, Acc 0.96507245
Iter 26, Acc 0.96507245
Iter 27, Acc 0.96507245
Iter 28, Acc 0.96507245
Iter 29, Acc 0.96507245
Iter 30, Acc 0.96507245
Iter 31, Acc 0.96507245
Iter 32, Acc 0.96507245
Iter 33, Acc 0.96507245
Iter 34, Acc 0.96507245
Iter 35, Acc 0.96507245
Iter 36, Acc 0.96507245
Iter 37, Acc 0.96507245
Iter 38, Acc 0.96507245
Iter 39, Acc 0.96507245
Iter 40, Acc 0.96507245
Iter 41, Acc 0.96507245
Iter 4