In [2]:
import os

# Input/output CSV file names for this notebook (relative to the working directory).
train_data_path = "train.csv"
validate_data_path =  "valid.csv"
test_data_path = "testa.csv"
test_data_predict_output_path = "testa_predict.csv"

In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import jieba
import pandas as pd
from sklearn.metrics import f1_score

def load_data_from_csv(file_name, header=0, encoding="UTF-8"):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_name: Path or file-like object handed to ``pd.read_csv``.
        header: Row number to use as the column names (``None`` for no header).
        encoding: Text encoding used to decode the file.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_name, encoding=encoding, header=header)

def seg_words(contents):
    """Segment each document with jieba and re-join its tokens with spaces.

    Args:
        contents: Iterable of documents. Entries are normally strings; missing
            values (``None`` / NaN, which pandas produces for empty CSV cells)
            are treated as empty text and any other non-string is converted
            with ``str`` so one bad row cannot abort the whole pass.

    Returns:
        A list with one space-joined token string per input document, in the
        same order and of the same length as ``contents``.
    """
    contents_segs = []
    for content in contents:
        if not isinstance(content, str):
            # Keep the output aligned with the input rows instead of crashing
            # on `.replace` for float NaN / None entries.
            content = "" if pd.isna(content) else str(content)
        # Line breaks would otherwise survive as tokens; flatten them to spaces.
        rcontent = content.replace("\r\n", " ").replace("\n", " ")
        contents_segs.append(" ".join(jieba.cut(rcontent)))
    return contents_segs

def get_f1_scoreE(y_true, y_pred):
    """Return the macro-averaged F1 score over the label set [1, 0, -1, -2].

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Whatever ``f1_score`` returns for ``average='macro'`` on those labels.
    """
    label_set = [1, 0, -1, -2]
    macro_f1 = f1_score(y_true, y_pred, labels=label_set, average='macro')
    return macro_f1

In [4]:
import logging
import os
import sys
import numpy as np
import pandas as pd

# Configure the root logger once (INFO level, timestamp/process/thread prefix)
# and keep a handle to it for the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

In [5]:
# Load the training and validation splits from the paths declared in the
# configuration cell, so the file names are defined in exactly one place.
logger.info("start load data")
train_data_df = load_data_from_csv(train_data_path)
validate_data_df = load_data_from_csv(validate_data_path)

2018-10-24 10:04:02,763 [INFO] <MainProcess> (MainThread) start load data


In [6]:
# Segment the text held in the second column of the train and validation frames.
logger.info("start seg train data")
content_train = seg_words(train_data_df.iloc[:, 1])
content_validata = seg_words(validate_data_df.iloc[:, 1])
logger.info("complete seg train data")

# Column names of the training frame as a plain list.
columns = train_data_df.columns.values.tolist()

2018-10-24 10:04:04,882 [INFO] <MainProcess> (MainThread) start seg train data
Building prefix dict from the default dictionary ...
2018-10-24 10:04:04,885 [DEBUG] <MainProcess> (MainThread) Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\dqsy_\AppData\Local\Temp\jieba.cache
2018-10-24 10:04:04,889 [DEBUG] <MainProcess> (MainThread) Loading model from cache C:\Users\dqsy_\AppData\Local\Temp\jieba.cache
Loading model cost 0.578 seconds.
2018-10-24 10:04:05,465 [DEBUG] <MainProcess> (MainThread) Loading model cost 0.578 seconds.
Prefix dict has been built succesfully.
2018-10-24 10:04:05,465 [DEBUG] <MainProcess> (MainThread) Prefix dict has been built succesfully.
2018-10-24 10:06:33,445 [INFO] <MainProcess> (MainThread) complete seg train data


In [8]:
# Drop the first and last character of every segmented document.
# NOTE(review): presumably these are quote characters wrapping each raw review — confirm.
contents_train = [segmented[1:-1] for segmented in content_train]
contents_valid = [segmented[1:-1] for segmented in content_validata]

In [12]:
# Imported in this cell because it runs before the model-definition cell that
# also imports these names; without it a fresh kernel fails with NameError on `text`.
from keras.preprocessing import text, sequence

max_features = 50000  # vocabulary cap: Tokenizer num_words and Embedding input_dim
maxlen = 280          # fixed sequence length used for padding and the model input
embed_size = 200      # NOTE(review): not read by the visible model code (Embedding uses 256)

# Build the vocabulary from the stripped, segmented train + validation texts.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(contents_train+contents_valid)

In [13]:
# Imported here so the cell does not depend on a later cell having been run first.
from keras.preprocessing import sequence

# Encode the same stripped texts the tokenizer was fitted on (contents_*), which
# is also how the test set is encoded further down; the original encoded the
# unstripped content_* lists, so train and test went through different preprocessing.
X_train = tokenizer.texts_to_sequences(contents_train)
X_valid = tokenizer.texts_to_sequences(contents_valid)
# Pad every sequence to exactly maxlen entries.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_valid = sequence.pad_sequences(X_valid, maxlen=maxlen)

In [14]:
# Inspect the first padded training sequence.
x_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [15]:
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division

import sys
from os.path import dirname
from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K


class AttentionWeightedAverage(Layer):
    """
    Attention pooling over the time axis of a 3D tensor.

    A score is computed for every timestep as the dot product of that
    timestep's channel vector with a learned weight vector ``W`` (one weight
    per channel). The scores are normalised into a distribution over timesteps
    and the layer returns the correspondingly weighted sum of its input.
    With ``return_attention=True`` the distribution is returned as well.
    """

    def __init__(self, return_attention=False, **kwargs):
        """Create the layer.

        Args:
            return_attention: When true, ``call`` returns
                ``[weighted_sum, attention_weights]`` instead of only the sum.
            **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
        """
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def get_config(self):
        """Return the base layer config extended with ``return_attention``."""
        config = {
            'return_attention': self.return_attention,
        }
        base_config = super(AttentionWeightedAverage, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        """Create the attention weight vector; the input must be rank 3."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # One weight per channel (last input axis). `add_weight` already
        # registers W with the layer, so the trainable-weights list is not
        # reassigned by hand: that assignment was redundant and fails on Keras
        # versions where `trainable_weights` is a read-only property.
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Compute the attention-weighted sum of ``x`` over axis 1.

        Args:
            x: Rank-3 input tensor; axis 1 is summed over, axis 2 holds channels.
            mask: Optional mask over the first two axes; masked timesteps
                receive zero attention weight.

        Returns:
            The weighted sum, or ``[weighted_sum, attention_weights]`` when
            ``return_attention`` is set.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon keeps the division defined when every timestep is masked out
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that delegates to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return the output shape(s): the time axis is reduced away.

        With ``return_attention`` a second shape ``(batch, timesteps)`` for the
        attention weights is included.
        """
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Stop mask propagation: the outputs of this layer carry no mask."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

In [16]:
from keras.models import Model
from keras.layers import Input, Embedding, Dense, MaxPool2D, LSTM,Bidirectional
from keras.layers import Reshape, Flatten, concatenate, Dropout, Activation
from keras.preprocessing import text, sequence
from keras.callbacks import Callback  

def SimpleNN(return_probabilities=False):
    """Build the embedding -> tanh -> BiLSTM -> attention -> softmax classifier.

    The network reads integer sequences of length ``maxlen`` (vocabulary size
    ``max_features``) and predicts a distribution over 4 classes.

    Args:
        return_probabilities: When true, the attention weights over the
            timesteps are exposed as a second model output.

    Returns:
        An uncompiled Keras ``Model``.
    """
    model_input = Input(shape=(maxlen,), dtype='int32')

    # mask_zero=True makes index 0 a masked position for the layers that follow.
    embedding_layer = Embedding(input_dim=max_features,
                                output_dim=256,
                                mask_zero=True,
                                input_length=maxlen,
                                name='embedding')
    embedded = embedding_layer(model_input)
    embedded = Activation('tanh')(embedded)

    encoded = Bidirectional(LSTM(32, return_sequences=True), name="bi_lstm_0")(embedded)

    # The attention layer always returns its weights; they only become a model
    # output when the caller asks for them.
    attention_layer = AttentionWeightedAverage(name='attlayer', return_attention=True)
    pooled, attention_weights = attention_layer(encoded)

    pooled = Dropout(0.2)(pooled)
    class_probs = Dense(4, activation='softmax', name='softmax')(pooled)

    if return_probabilities:
        outputs = [class_probs, attention_weights]
    else:
        outputs = [class_probs]

    return Model(inputs=model_input, outputs=outputs)

In [17]:
# Build the classifier, print its layer table, then compile it for
# 4-way one-hot classification.
model = SimpleNN()
model.summary()
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 280)               0         
_________________________________________________________________
embedding (Embedding)        (None, 280, 256)          12800000  
_________________________________________________________________
activation_1 (Activation)    (None, 280, 256)          0         
_________________________________________________________________
bi_lstm_0 (Bidirectional)    (None, 280, 64)           73984     
_________________________________________________________________
attlayer (AttentionWeightedA [(None, 64), (None, 280)] 64        
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
softmax (Dense)              (None, 4)                 260       
Total para

In [18]:
# Column names of the training frame as a plain Python list
# (the segmentation cell above performs the same assignment).
columns = train_data_df.columns.values.tolist()

In [19]:
from keras.utils import np_utils
from keras.utils import to_categorical
# model train
logger.info("start train model")

# Shift the raw labels by +2 so they can serve as class indices for to_categorical.
# NOTE(review): assumes the column only holds -2, -1, 0 and 1 (num_classes=4 below) — confirm.
train_label = train_data_df['location_traffic_convenience']+2
valid_label = validate_data_df['location_traffic_convenience']+2

# NOTE(review): these two placeholders are not read anywhere in the visible notebook.
new_y_val_pred = 0
new_y_test_pred = 0
logger.info("start train  model")

# One-hot targets with 4 columns, matching the 4-unit softmax output of the model.
y_train_onehot = np_utils.to_categorical(train_label,num_classes=4)
y_valid_onehot = np_utils.to_categorical(valid_label,num_classes=4)

batch_size = 128
epochs = 1

# Plain aliases for the padded inputs and one-hot targets; no new split is made here.
X_tra, X_val, y_tra, y_val = x_train, x_valid,y_train_onehot,y_valid_onehot

model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),verbose=2)



# Persist only the weights; the prediction cell rebuilds the architecture and reloads this file.
model.save_weights('2.h5')
logger.info("complete train %s model" % 2)


logger.info("complete train model")



2018-10-24 10:08:07,987 [INFO] <MainProcess> (MainThread) start train model
2018-10-24 10:08:07,997 [INFO] <MainProcess> (MainThread) start train  model


Train on 105000 samples, validate on 15000 samples
Epoch 1/1
 - 744s - loss: 0.3660 - acc: 0.8935 - val_loss: 0.2773 - val_acc: 0.9213


2018-10-24 10:20:34,228 [INFO] <MainProcess> (MainThread) complete train 2 model
2018-10-24 10:20:34,228 [INFO] <MainProcess> (MainThread) complete train model


In [20]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse

import logging

import numpy as np

from sklearn.externals import joblib

from keras.models import load_model

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s')


# Load the test split from the path declared in the configuration cell.
logger.info("start load test data")
test_data_df = load_data_from_csv(test_data_path)

# Segment the review text; the log messages name the test data so the
# notebook log is not confused with the training-data segmentation step.
content_test = test_data_df['content']
logger.info("start seg test data")
content_test = seg_words(content_test)
logger.info("complete seg test data")

# Same first/last character stripping that is applied to the train/validation texts.
contents_test = [segmented[1:-1] for segmented in content_test]

2018-10-24 10:20:34,233 [INFO] <MainProcess> (MainThread) start load load
2018-10-24 10:20:34,481 [INFO] <MainProcess> (MainThread) start seg train data
2018-10-24 10:20:55,725 [INFO] <MainProcess> (MainThread) complete seg train data


In [21]:
# Encode the stripped, segmented test texts with the fitted tokenizer,
# then pad every sequence to maxlen.
X_test = tokenizer.texts_to_sequences(contents_test)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


In [22]:
# model predict
from keras.models import load_model
logger.info("start predict test data")

# Rebuild the network with the attention weights exposed as a second output,
# then restore the weights saved by the training cell.
pred_model = SimpleNN(return_probabilities=True)
pred_model.load_weights('2.h5')  # load the saved weights directly into the model

# Despite its name, y_val_pred holds predictions for the test set; later cells
# index it as [0] for the class probabilities and [1] for the attention weights.
y_val_pred = pred_model.predict(x_test)

2018-10-24 10:20:58,069 [INFO] <MainProcess> (MainThread) start predict test data


In [1]:
#tokenizer.index_word

In [122]:
# Spot-check one test document and its padded index sequence.
print(contents_test[281])
print(x_test[281])
# Predicted class index for every test row. The argmax runs over the class axis
# of the whole probability matrix, so no test-set size is hard-coded.
addresses = np.argmax(y_val_pred[0], axis=1).tolist()
print(addresses)
# Rows whose predicted class index is above 0.
print(np.where(np.array(addresses)>0))
    

 不太 喜欢 主流 咖啡厅 ， 太 商业 太 聒噪 ， 其实 工作 了 一天 就 想 找个 地方 静静 。 豆 老板 的 猫 咖啡 真的 是 绝佳 的 心情 治愈 地 ， 每次 来 就 找个 安静 的 小 角落 ， 和 你 脾气 投缘 的 萌仔们 就 会 偎 在 身边 求 抱抱 求挠 挠 ， 那种 被 猫 依偎 的 感觉 真的 很 好 ， 很 享受 这样 安安静静 望 着 它们 。   我 不是 称职 的 点评 人 ， 每次 来 店里 只点 柚子茶 和 提拉 米苏 ， 其他 的 都 没尝过 ， 因为 思维 已经 随着 这些 萌仔们 慵懒 了 ， 吃 什么 甜品 都 不 重要 。   【 此店 为 绝佳 发呆地 ， 别 捧 着 手机 连 WiFi 了 】 
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     

In [152]:
# Indices of the test rows whose predicted class index is above 0, as a list.
subs = list(np.where(np.array(addresses) > 0)[0])

In [153]:
# Print every selected row index followed by an 's' marker line.
for sub in subs:
    print(sub, 's', sep='\n')

2
s
3
s
20
s
26
s
32
s
33
s
35
s
60
s
81
s
82
s
83
s
85
s
87
s
92
s
94
s
103
s
104
s
108
s
110
s
111
s
113
s
115
s
116
s
129
s
142
s
147
s
149
s
153
s
155
s
169
s
172
s
173
s
183
s
186
s
192
s
198
s
212
s
216
s
217
s
220
s
221
s
223
s
224
s
230
s
235
s
236
s
237
s
238
s
242
s
244
s
247
s
252
s
254
s
255
s
256
s
268
s
272
s
273
s
275
s
284
s
285
s
290
s
311
s
320
s
321
s
324
s
325
s
334
s
337
s
344
s
351
s
361
s
369
s
378
s
387
s
395
s
410
s
415
s
418
s
428
s
429
s
437
s
445
s
450
s
451
s
456
s
469
s
473
s
477
s
479
s
484
s
487
s
494
s
498
s
499
s
510
s
512
s
513
s
515
s
518
s
520
s
522
s
530
s
533
s
534
s
536
s
541
s
544
s
545
s
548
s
551
s
561
s
563
s
575
s
578
s
586
s
590
s
593
s
594
s
596
s
598
s
605
s
608
s
615
s
628
s
629
s
630
s
637
s
645
s
646
s
654
s
659
s
671
s
672
s
673
s
691
s
695
s
698
s
702
s
707
s
714
s
723
s
725
s
732
s
742
s
747
s
748
s
751
s
756
s
763
s
776
s
778
s
781
s
789
s
791
s
822
s
823
s
825
s
830
s
833
s
838
s
852
s
853
s
856
s
859
s
861
s
868
s
873
s
879
s
886

s
11410
s
11412
s
11415
s
11421
s
11426
s
11434
s
11435
s
11437
s
11452
s
11455
s
11457
s
11463
s
11470
s
11481
s
11483
s
11484
s
11488
s
11492
s
11496
s
11498
s
11499
s
11502
s
11504
s
11512
s
11517
s
11525
s
11535
s
11536
s
11548
s
11552
s
11553
s
11554
s
11556
s
11557
s
11558
s
11569
s
11571
s
11572
s
11573
s
11577
s
11584
s
11587
s
11590
s
11602
s
11604
s
11606
s
11624
s
11631
s
11633
s
11635
s
11638
s
11641
s
11645
s
11657
s
11659
s
11661
s
11663
s
11673
s
11677
s
11679
s
11680
s
11682
s
11684
s
11685
s
11690
s
11693
s
11694
s
11698
s
11700
s
11702
s
11717
s
11728
s
11740
s
11751
s
11755
s
11763
s
11765
s
11768
s
11770
s
11771
s
11773
s
11774
s
11775
s
11793
s
11804
s
11805
s
11813
s
11831
s
11840
s
11841
s
11846
s
11852
s
11854
s
11864
s
11866
s
11870
s
11877
s
11880
s
11882
s
11885
s
11887
s
11897
s
11917
s
11919
s
11926
s
11937
s
11938
s
11941
s
11943
s
11946
s
11947
s
11948
s
11950
s
11963
s
11967
s
11974
s
11983
s
12004
s
12005
s
12019
s
12029
s
12030
s
12039
s
12055
s
12062


In [158]:
# For each selected test row, print the 10 tokens with the largest attention weights.
for sub in subs:
    attention_row = y_val_pred[1][sub]
    # Negating the weights makes argsort list positions from most to least attended.
    top_positions = np.argsort(-attention_row)[:10]
    indexes = x_test[sub][top_positions]
    # `.get` yields None for ids missing from the vocabulary (e.g. padding id 0).
    print([tokenizer.index_word.get(index) for index in indexes])

['交通', '交通', '车站', '地理位置', '好', '方便', '对面', '，', '方便', '124']
['交通', '号线', '便利', '，', '上', '4', '轨道交通', '位置', '，', '在']
['交通', '方便', '市区', '腾冲', '，', '成', '周日', '拼盘', '上座率', '好']
['公交车', '车站', '沟', '普通', '大寨', '附近', '分量', '，', '很大', '位于']
['地铁', '地理位置', '方便', '总结', '的', '上面', '很', '回家路上', '就', '，']
['线', '站', '四号线', '站', '十号', '临平路', '新村', '或者', '邮电', '】']
['地理位置', '珠江路', '在', '路口', '店', '新世界', '再', '往东', '百货', '一点']
['地铁站', '安亭', '位置', '附近', '在', '就', '环境', '城市', '，', '一般']
['公交', '交通', '地铁站', '方便', '，', '有', '就', '算', '还', '位置']
['地铁站', '车站', '听说', '附近', '，', '旁边', '一直', '哈哈哈', '材料', '有']
['交通', '便利', '地点', '打车', '附近', '，', '，', '洗浴', '好', '有']
['交通', '便利', '👍', '，', '位置', '，', '位置', '方便', '能', '看到']
['交通', '便利', '方便', 'shopping', '到', '摆盘', '牛', '肋骨', '边', '考虑']
['公交车', '方便', '坐', '，', '地点', '起来', '位置', '也', '大都会', '站']
['公交车站', '301', '位置', '旁', '对面', '在', '，', '的', '就', '很']
['地铁站', '😊', '南浦', '位置', '附近', '位于', '：', '旁边', '的', '环境']
['交通', '便利', '周边', '不错', '位置', '花园里', '好', '环境',

['火车站', '火车站', '火车站', '火车站', '正是', '，', '附近', '。', '靠近', '给']
['火车站', '火车站', '位置', '火车站', '修路', '】', '【', '出来', '附近', '路']
['车站', '附近', '位置', '街', '二经', '新天地', '走', '旁边', '，', '胡同']
['交通', '火车站', '便利', '，', '很', '也', '昆明', '。', '靠近', '而且']
['地铁站', '号口', '出来', '12', '公园', '路口', '走', '，', '交流', '后']
['交通', '便利', '。', '，', '很大', '也', '爽口', '既', '必备', '停车场']
['交通', '地铁', '便利', '出口', '好', '，', 'k', '找到', '，', '很']
['交通', '公交车站', '方便', '鱼皮', '苑', '￥', '￥', '球', '28', '灌汤']
['交通', '站', '方便', '，', '公园', '正门', '长风', '位置', '位于', '，']
['交通', '公交车站', '便利', '车站', '在', '直达', '就', '，', '，', '靠近']
['地铁站', '大道', '世纪', '附近', '方便', '比较', '近', '在', '上班', '所以']
['交通', '位置', '一流', '方便', '粒', '，', '粒', '團購', '可以', '由']
['地铁', '号线', '中兴路', '号口', '下来', '8', '，', '3', '出', '鸳鸯锅']
['交通', '方便', '位置', '很', '方便', '，', '，', '虽然', '比较', '空']
['地铁站', '湾', '出来', '返回', '后', '博物馆', '附近', '，', '不失', '劲']
['交通', '便利', '好', '😄', '，', '很', '道菜', '地点', '😄', '贝类']
['火车站', '周围', '位置', '的', '，', '在', '就', '不错', '很', '，']
['下车', 

In [None]:
# Display the test frame.
test_data_df

In [None]:
# Store the predicted label before saving: the original wrote the test frame
# back out without ever adding the predictions to it. Training shifted the
# labels by +2 to obtain class indices, so the argmax is shifted back by -2.
test_data_df['location_traffic_convenience'] = np.argmax(y_val_pred[0], axis=1) - 2
# Write to the output path declared in the configuration cell.
test_data_df.to_csv(test_data_predict_output_path,
                    encoding="utf-8", index=False)
logger.info("complete predict test data")

In [46]:
import nltk
def filt_sent(X, max_senten_num):
    """Split paragraphs into sentences and keep a fixed number of long ones.

    Each paragraph is tokenized with NLTK's English punkt model. A sentence is
    kept when it has at least 5 whitespace-separated words (at least 10 when
    the paragraph has more than 10 sentences). At most ``max_senten_num``
    sentences are kept per paragraph, in their original order; shorter results
    are padded with the placeholder ``'nosentence'``.

    Args:
        X: Iterable of paragraph strings.
        max_senten_num: Number of sentence slots per paragraph (an integer).

    Returns:
        A list with one list of sentences per paragraph.
    """
    splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    slots = max(max_senten_num, 0)
    padded_paragraphs = []
    for paragraph in X:
        sentences = splitter.tokenize(paragraph)
        threshold = 10 if len(sentences) > 10 else 5
        long_enough = [s for s in sentences if len(s.split()) >= threshold]
        kept = long_enough[:slots]
        kept.extend(['nosentence'] * (slots - len(kept)))
        padded_paragraphs.append(kept)
    return padded_paragraphs

In [48]:
# Read the news file line by line. The `with` block closes the file handle, and
# the lines get their own name so they no longer overwrite the `text` module
# imported from keras.preprocessing, which the tokenizer cell relies on.
with open('news.txt') as news_file:
    news_lines = list(news_file)
# Up to 5 sentences per line, padded with 'nosentence'.
sens = filt_sent(news_lines,5)
sens

[['Obama family spokespersons Eric Schultz and Katie Hill tell CNN they are not commenting on reports that a suspicious package addressed to the former president was intercepted by the US Secret Service.',
  'nosentence',
  'nosentence',
  'nosentence',
  'nosentence'],
 ['nosentence', 'nosentence', 'nosentence', 'nosentence', 'nosentence'],
 ["Neither would discuss the former first family's current location and are directing reporters to the US Secret Service.",
  'CNN confirmed separately that the Obama family is in Washington, DC and that Mr. Obama had no public events planned for the day but was keeping to his normal schedule.',
  'nosentence',
  'nosentence',
  'nosentence'],
 ['nosentence', 'nosentence', 'nosentence', 'nosentence', 'nosentence'],
 ["Outside the Obama home in Washington, where there is a permanent security barricade, CNN's Suzanne Malveaux reports seeing an increase in the number of DC Metropolitan Police Officers on the scene, who are making sure the media and on

In [86]:
# Scratch array for a NumPy experiment.
# NOTE(review): the recorded output array([5, 4, 3]) is not produced by this line alone;
# the expression that generated it appears to be missing from the cell.
arr = np.array([1, 3, 2, 4, 5])



array([5, 4, 3])

In [82]:
# Scratch check of NumPy integer-array indexing: positions may repeat and
# appear in any order.
x = np.arange(10,1,-1)
print(x)
x[[3, 3, 1, 8]]

[10  9  8  7  6  5  4  3  2]


array([7, 7, 9, 2])