### CAICT 風機結冰預測 

2017/11/02  
http://www.industrial-bigdata.com/competition/competitionAction!showDetail.action?competition.competitionId=1

** Fixed Issues **
- 抓不到libcusolver.so.8.0
    - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
    - export CUDA_HOME=/usr/local/cuda
    

In [1]:

import numpy as np
import pandas as pd
import datetime
import os
import sys
import lzma
import pickle
import csv

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import confusion_matrix
import tensorflow as tf

from IPython.core.interactiveshell import InteractiveShell

pd.set_option('display.max_columns', None)
#InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [2]:
def updateProgress(msg):
    sys.stdout.write('\r')
    sys.stdout.write(msg)
    sys.stdout.flush()

def myscore(true_y, pred_y):
    n,p =confusion_matrix(true_y, pred_y)
    tn,fp,fn,tp =n[0], n[1], p[0], p[1]
    score = 1- 0.5*(fp/(tn+fp))- 0.5*(fn/(fn+tp)) 
    return score, {'tn':tn,'fp':fp,'fn':fn,'tp':tp}

save_dir = 'checkpoints/'
def get_save_path(net_number):
    return save_dir + 'network' + str(net_number)

<a id='rawdata'></a>
** Load raw data ** 

In [3]:
data_m15 = pd.read_csv('data/ice1/train/15/15_data.csv')
oktime_m15 = pd.read_csv('data/ice1/train/15/15_normalInfo.csv')
ngtime_m15 = pd.read_csv('data/ice1/train/15/15_failureInfo.csv')

data_m21 = pd.read_csv('data/ice1/train/21/21_data.csv')
oktime_m21 = pd.read_csv('data/ice1/train/21/21_normalInfo.csv')
ngtime_m21 = pd.read_csv('data/ice1/train/21/21_failureInfo.csv')

#data_m08 = pd.read_csv('data/ice1/test/08/08_data.csv')


** 根據OK/NG的時間範圍, 對每一筆數據標記"是否結冰" ** 
- 欄位名稱：label

In [4]:
def preprocess_labeling(data, oktime, ngtime):
    for index, row in ngtime.iterrows():
        data.loc[(data['time']>=row[0]) & (data['time']<=row[1]),'label']=1

    for index, row in oktime.iterrows():
        data.loc[(data['time']>=row[0]) & (data['time']<=row[1]),'label']=0
        
preprocess_labeling(data_m15,oktime_m15,ngtime_m15)
preprocess_labeling(data_m21,oktime_m21,ngtime_m21)

#backup label
label_m15 = data_m15['label']
label_m21 = data_m21['label']

** 根據NG的時間, 對每一筆數據標記"事件ID" **
- 欄位名稱：event

In [5]:
def preprocess_event(data, ngtime):
    data['event']=None
    for index, row in ngtime.iterrows():
        data.loc[(data['time']<=row[1]) & ( pd.isnull(data['event'])) ,'event']=index
    
preprocess_event(data_m15,ngtime_m15)
preprocess_event(data_m15,ngtime_m15)

** save read-to-analysis data to pickle **

In [6]:
f_m15 = open('data/pickle/data_m15', 'wb')
pickle.dump(data_m15, f_m15)

f_m21 = open('data/pickle/data_m21', 'wb')
pickle.dump(data_m21, f_m21)

** Data resample **

In [7]:
#TODO




** Pretrain - label unkonwn data **
- unknown data in training set
- validataion set

In [8]:
data_m15['label'] = label_m15
data_m21['label'] = label_m21 

data_notnull_15 = data_m15[pd.notnull(data_m15['label'])]
data_pretrain = data_notnull_15

data_null_15 = data_m15[pd.isnull(data_m15['label'])]
data_pretest = pd.concat([data_null_15,data_m21])

features = ['power','wind_speed','wind_direction','wind_direction_mean','yaw_position','environment_tmp']
pipeline = Pipeline([('forest', RandomForestClassifier())])

X = data_pretrain[features]
y = data_pretrain['label']
model = pipeline.fit(X = X, y = y)

prediction = model.predict(data_pretest[features])

data_m15.loc[pd.isnull(data_m15['label']),'label'] = prediction[:data_null_15.shape[0]]
data_m21['label'] = prediction[data_null_15.shape[0]:]
#data_m08['label'] = prediction[data_null_15.shape[0]+data_null_21.shape[0]:]


** Build Prediction Model **

In [9]:
train_X = pd.concat([data_m15[features],data_m21[features]])
train_y = pd.concat([data_m15['label'],data_m21['label']]).values.astype(int)
train_y = np.eye(2)[train_y]


valid_X = data_m21[features]
valid_y = label_m21.values.astype(int)
valid_y = np.eye(2)[valid_y]

In [10]:
features = ['power','wind_speed','wind_direction','wind_direction_mean','yaw_position','environment_tmp']

n_inputs = len(features)
n_classes = 2 

tf.reset_default_graph()
X =tf.placeholder(tf.float32, [None, n_inputs], name='X')
Y_GT =tf.placeholder(tf.float32, [None, n_classes], name='Y_')

W = {
    'w1': tf.Variable(tf.random_normal([n_inputs,15], stddev=0.01), name='w1'),
    'w2': tf.Variable(tf.random_normal([15,7], stddev=0.01), name='w2'),
    'w3': tf.Variable(tf.random_normal([7, n_classes]), name='w3')
}

B = {
    'b1': tf.Variable(tf.random_normal([15]), name='b1'),
    'b2': tf.Variable(tf.random_normal([7]), name='b2'),
    'b3': tf.Variable(tf.random_normal([n_classes]), name='b3'),
}


with tf.name_scope("L1"):
    H1 = tf.matmul(X, W['w1']) + B['b1'] 
    H1 = tf.nn.relu(H1)

with tf.name_scope("L2"):
    H2 = tf.matmul(H1, W['w2']) + B['b2'] #(-1, 28) matmul (28, hidden_units) => (-1, hidden_units)
    H2 = tf.nn.relu(H2)
    
with tf.name_scope('L3'):
    pred = tf.matmul(H2, W['w3']) + B['b3']
    pred_sfmx = tf.nn.softmax(pred, name="pred_sfmx")

with tf.name_scope('loss_and_acc'):
    correct_pred = tf.equal(tf.argmax(pred_sfmx, 1), tf.argmax(Y_GT, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=Y_GT))

lr = 0.01 # learning rate
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

sess = tf.InteractiveSession()
saver = tf.train.Saver(max_to_keep=100)

In [11]:
def optimize(epoch, train_X, train_y):
    batch_size = 1500
    total_batch= len(train_X) / batch_size
    for ep in range(epoch+1):
        for i in range(int(total_batch)+1):
            rnd_idx = np.random.choice(train_X.shape[0], batch_size, replace=False)
            batch_x = train_X.iloc[rnd_idx]
            batch_y = train_y[rnd_idx]
            _, acc_v1, loss_v1= sess.run([optimizer, accuracy,loss], feed_dict={X: batch_x, Y_GT:batch_y})

        acc_v2, loss_v2= sess.run([accuracy,loss], feed_dict={X: valid_X , Y_GT: valid_y})
        updateProgress('epoch:{x0}, batch:{x4}, train loss:{x1:.3f} acc:{x5:.3f}, valid loss:{x3:.3f} acc:{x2:.3f}'.format(x0=ep,x2=round(acc_v2,3),x3=round(loss_v2,3),x4=i,x1=round(loss_v1,3), x5=round(acc_v1,3)))
        print()
        
num_networks = 5
epoch = 20
for i in range(num_networks):
    print("Neural network: {0}".format(i))
    rnd_idx = np.random.choice(train_X.shape[0], int(train_X.shape[0]*0.6), replace=False)
    sess.run(tf.global_variables_initializer())
    optimize(epoch, train_X.iloc[rnd_idx], train_y[rnd_idx])
    saver.save(sess=sess, save_path=get_save_path(i))
    print()

Neural network: 0
epoch:0, batch:233, train loss:0.105 acc:0.965, valid loss:0.039 acc:0.988
epoch:1, batch:233, train loss:0.106 acc:0.963, valid loss:0.040 acc:0.988
epoch:2, batch:233, train loss:0.128 acc:0.958, valid loss:0.040 acc:0.989
epoch:3, batch:233, train loss:0.134 acc:0.953, valid loss:0.038 acc:0.989
epoch:4, batch:233, train loss:0.124 acc:0.960, valid loss:0.039 acc:0.989
epoch:5, batch:233, train loss:0.129 acc:0.952, valid loss:0.039 acc:0.989
epoch:6, batch:233, train loss:0.117 acc:0.955, valid loss:0.040 acc:0.989
epoch:7, batch:233, train loss:0.090 acc:0.967, valid loss:0.038 acc:0.989
epoch:8, batch:233, train loss:0.118 acc:0.960, valid loss:0.039 acc:0.988
epoch:9, batch:233, train loss:0.118 acc:0.959, valid loss:0.040 acc:0.989
epoch:10, batch:233, train loss:0.118 acc:0.958, valid loss:0.040 acc:0.989
epoch:11, batch:233, train loss:0.116 acc:0.961, valid loss:0.040 acc:0.988
epoch:12, batch:233, train loss:0.128 acc:0.951, valid loss:0.039 acc:0.989
epoc

*** Prediction ***

In [14]:
def ensemble_predictions():
    pred_labels = []
    for i in range(num_networks):
        saver.restore(sess=sess, save_path=get_save_path(i))
        pred_t = pd.DataFrame(pred_sfmx.eval({X: valid_X}))
        pred_t_argm = pred_t.apply(np.argmax,axis=1)
        pred_labels.append(pred_t_argm)
        s, d=myscore(np.argmax(valid_y, axis=1), pred_t_argm)
        print('network {i} score {s}, {d}'.format(i=i,s=s,d=d))
    return pred_labels

pred_labels = ensemble_predictions()

INFO:tensorflow:Restoring parameters from checkpoints/network0
network 0 score 0.6436225441734982, {'tp': 533, 'fp': 817, 'tn': 187849, 'fn': 1295}
INFO:tensorflow:Restoring parameters from checkpoints/network1
network 1 score 0.7654754279505345, {'tp': 978, 'fp': 766, 'tn': 187900, 'fn': 850}
INFO:tensorflow:Restoring parameters from checkpoints/network2
network 2 score 0.713460336666181, {'tp': 782, 'fp': 164, 'tn': 188502, 'fn': 1046}
INFO:tensorflow:Restoring parameters from checkpoints/network3
network 3 score 0.5, {'tp': 0, 'fp': 0, 'tn': 188666, 'fn': 1828}
INFO:tensorflow:Restoring parameters from checkpoints/network4
network 4 score 0.7543056041680735, {'tp': 941, 'fp': 1162, 'tn': 187504, 'fn': 887}


In [15]:
ensemble_pred_labels = np.mean(pred_labels, axis=0)
ensemble_pred_labels = np.array(list(map(lambda x: int(1) if x>=0.5 else int(0), ensemble_pred_labels)))
s, d=myscore(np.argmax(valid_y, axis=1), ensemble_pred_labels)
print('ensemble score {s}, {d}'.format(s=s,d=d))

ensemble score 0.6792784690465576, {'tp': 658, 'fp': 264, 'tn': 188402, 'fn': 1170}


***
** Preformance **

- num_networks = 5,  epoch = 20
    - Training Set:
        - original score: score: ~ 0.71
        - data augmentation score: 0.73
        - ensamble score: ?
    - Valid Set:
        - original score: 0.54
        - data augmentation score: 0.668
        - ensamble score: 0.679

***