In [1]:
import numpy as np
import pandas as pd
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

Using TensorFlow backend.


In [2]:
# Load the training split of the AI Challenger sentiment-analysis dataset.
# The path is relative to the notebook's working directory.
train = pd.read_csv('./comment-classification/ai_challenger_sentiment_analysis_trainingset_20180816/sentiment_analysis_trainingset.csv')

In [3]:
# Peek at the first row to check which columns were loaded.
train.head(1)

Unnamed: 0,id,content,location_traffic_convenience,location_distance_from_business_district,location_easy_to_find,service_wait_time,service_waiters_attitude,service_parking_convenience,service_serving_speed,price_level,...,environment_decoration,environment_noise,environment_space,environment_cleaness,dish_portion,dish_taste,dish_look,dish_recommendation,others_overall_experience,others_willing_to_consume_again
0,0,"""吼吼吼，萌死人的棒棒糖，中了大众点评的霸王餐，太可爱了。一直就好奇这个棒棒糖是怎么个东西，...",-2,-2,-2,-2,1,-2,-2,-2,...,-2,-2,-2,-2,-2,-2,1,-2,1,-2


In [4]:
def get_dummies(data, name):
    """One-hot encode a single column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to encode.
    name : str
        Column to encode; also used as the prefix of the indicator columns,
        which are therefore named ``<name>_<value>``.

    Returns
    -------
    pandas.DataFrame
        One indicator column per distinct value found in ``data[name]``.
    """
    column = data[name]
    return pd.get_dummies(column, prefix=name)

In [5]:
def handle_dummies(data, label_start=2):
    """Append one-hot indicator columns for every label column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns from position ``label_start`` onward are the
        categorical label columns to encode.
    label_start : int, optional
        Positional index of the first label column. Defaults to 2, which
        leaves the first two columns (presumably ``id`` and ``content`` in
        this notebook) un-encoded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all original columns followed by the indicator
        columns of each label column, named ``<column>_<value>`` and emitted
        in the original column order. ``data`` itself is not modified.
    """
    label_columns = data.columns[label_start:]

    # One indicator frame per label column, prefixed with the column name so
    # the generated column names stay unique across labels.
    dummy_frames = [
        pd.get_dummies(data[name], prefix=name) for name in label_columns
    ]

    return pd.concat([data] + dummy_frames, axis=1)

In [6]:
# One-hot encode the label columns, then keep only the first 40000 rows.
# NOTE: this rebinds `train`, so run it once per fresh load -- running the cell
# again would feed the already-encoded frame back into handle_dummies.
train = handle_dummies(train).head(40000)

In [7]:
# Confirm the row/column counts after encoding and truncation.
train.shape

(40000, 102)

In [8]:
# Pull the raw review text out of the frame as a plain Python list.
all_contents = train['content'].tolist()

In [9]:
def cut(string):
    """Segment ``string`` with jieba and return the tokens as a list."""
    tokens = jieba.cut(string)
    return list(tokens)

In [10]:
# Segment every review with jieba and re-join the tokens with single spaces so
# that downstream whitespace-based tokenization sees word boundaries.
# NOTE: this rebinds `all_contents`; re-running the cell would segment the
# already space-joined strings a second time.
all_contents = [' '.join(cut(s)) for s in all_contents]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_8/f14fxxnn7w13cd0l5x9hl4bm0000gn/T/jieba.cache
Loading model cost 0.790 seconds.
Prefix dict has been built succesfully.


In [11]:
# Inspect the first segmented review (tokens separated by spaces).
all_contents[0]

'" 吼吼 吼 ， 萌死 人 的 棒棒糖 ， 中 了 大众 点评 的 霸王餐 ， 太 可爱 了 。 一直 就 好奇 这个 棒棒糖 是 怎么 个 东西 ， 大众 点评 给 了 我 这个 土老冒 一个 见识 的 机会 。 看 介绍 棒棒糖 是 用 德国 糖 做 的 ， 不会 很甜 ， 中间 的 照片 是 糯米 的 ， 能 食用 ， 真是太 高端 大气 上档次 了 ， 还 可以 买 蝴蝶结 扎口 ， 送人 可以 买 礼盒 。 我 是 先 打 的 卖家 电话 ， 加 了 微信 ， 给 卖家 传 的 照片 。 等 了 几天 ， 卖家 就 告诉 我 可以 取货 了 ， 去 大官 屯 那取 的 。 虽然 连 卖家 的 面 都 没 见到 ， 但是 还是 谢谢 卖家 送 我 这么 可爱 的 东西 ， 太 喜欢 了 ， 这 哪 舍得吃 啊 。 "'

In [12]:
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input size.
max_features = 5000
# Fixed sequence length: every review is padded/truncated to this many tokens.
maxlen = 100
# Intended embedding dimension (matches the 300-wide Embedding layer in get_model).
embed_size = 300

In [13]:
# Word-level tokenizer whose vocabulary is capped via num_words=max_features.
tokenizer = Tokenizer(num_words=max_features)

In [14]:
# Build the word index from the space-joined, jieba-segmented reviews.
tokenizer.fit_on_texts(all_contents)

In [15]:
# Convert each review into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(all_contents)

In [16]:
# Force every sequence to exactly `maxlen` entries so they stack into one 2-D array.
# NOTE(review): Keras' defaults are padding='pre' and truncating='pre', so long
# reviews presumably keep only their LAST `maxlen` tokens -- confirm this is intended.
X_train = pad_sequences(sequences, maxlen=maxlen)

In [17]:
# Targets are the last 80 columns of `train`, i.e. the indicator columns that
# handle_dummies appended after the original columns.
# NOTE(review): the literal 80 presumably equals 20 label columns x 4 observed
# values each -- confirm; the slice silently picks the wrong columns if any
# label column has a different number of distinct values.
y_train = train[train.columns[-80:]].values

In [27]:
def get_model(n_outputs=80):
    """Build and compile the bidirectional-GRU multi-output classifier.

    Architecture: Embedding -> SpatialDropout1D -> Bidirectional GRU (full
    sequence output) -> concatenated global average and global max pooling ->
    Dense sigmoid layer.

    The input length, vocabulary size and embedding width are read from the
    notebook-level constants ``maxlen``, ``max_features`` and ``embed_size``.

    Parameters
    ----------
    n_outputs : int, optional
        Number of sigmoid output units. Defaults to 80, matching the 80 target
        columns selected for ``y_train``.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss and the RMSprop optimizer.
    """
    inp = Input(shape=(maxlen, ))
    # Use the shared `embed_size` constant instead of repeating the literal 300,
    # so the configuration cell stays the single source of truth.
    x = Embedding(max_features, embed_size, input_length=maxlen)(inp)
    # Drops whole embedding channels rather than individual elements.
    x = SpatialDropout1D(0.2)(x)
    # return_sequences=True keeps every time step so both poolings can see them.
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(n_outputs, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    # NOTE(review): with binary_crossentropy, 'accuracy' is presumably computed
    # element-wise over all output units; since most one-hot targets are 0 the
    # reported figure may look high even for a weak model -- confirm before
    # using it for model selection.
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

In [28]:
# Instantiate a fresh, untrained model.
model = get_model()

In [29]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 100, 300)     1500000     input_3[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 100, 300)     0           embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 100, 160)     182880      spatial_dropout1d_3[0][0]        
__________________________________________________________________________________________________
global_ave

In [30]:
# Mini-batch size passed to model.fit.
batch_size = 32
# Number of full passes over the training split.
epochs = 10

In [31]:
# Hold out 10% of the rows for validation; random_state=0 makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, train_size=0.9, random_state=0)

In [32]:
# Train on the 90% split and evaluate on the held-out 10% after every epoch.
# No callbacks are passed, so the model keeps the weights of the final epoch.
# NOTE(review): in the recorded run val_loss bottoms out at epoch 5 (0.2779) and
# rises afterwards while the training loss keeps falling, i.e. later epochs
# overfit -- consider fewer epochs or early stopping.
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=2)

Train on 36000 samples, validate on 4000 samples
Epoch 1/10
 - 248s - loss: 0.3281 - acc: 0.8642 - val_loss: 0.3019 - val_acc: 0.8785
Epoch 2/10
 - 251s - loss: 0.2957 - acc: 0.8810 - val_loss: 0.2896 - val_acc: 0.8844
Epoch 3/10
 - 282s - loss: 0.2827 - acc: 0.8868 - val_loss: 0.2825 - val_acc: 0.8877
Epoch 4/10
 - 279s - loss: 0.2741 - acc: 0.8906 - val_loss: 0.2803 - val_acc: 0.8893
Epoch 5/10
 - 261s - loss: 0.2671 - acc: 0.8934 - val_loss: 0.2779 - val_acc: 0.8900
Epoch 6/10
 - 245s - loss: 0.2610 - acc: 0.8958 - val_loss: 0.2797 - val_acc: 0.8901
Epoch 7/10
 - 246s - loss: 0.2553 - acc: 0.8982 - val_loss: 0.2801 - val_acc: 0.8897
Epoch 8/10
 - 489s - loss: 0.2496 - acc: 0.9006 - val_loss: 0.2823 - val_acc: 0.8897
Epoch 9/10
 - 246s - loss: 0.2441 - acc: 0.9029 - val_loss: 0.2847 - val_acc: 0.8882
Epoch 10/10
 - 252s - loss: 0.2385 - acc: 0.9052 - val_loss: 0.2897 - val_acc: 0.8851


In [33]:
# Score the held-out split. Reuse the configured `batch_size` instead of
# repeating the literal 32 so inference stays in sync with the config cell.
y_pred = model.predict(X_val, batch_size=batch_size)

In [34]:
# Sanity check: one row per validation sample, one column per output unit.
y_pred.shape

(4000, 80)

In [35]:
# Inspect the sigmoid scores predicted for the first validation sample.
y_pred[0]

array([7.88027346e-01, 2.42944304e-02, 1.55577660e-02, 1.53742746e-01,
       8.35472703e-01, 2.48388061e-03, 4.91097430e-03, 1.63792148e-01,
       9.17777538e-01, 5.00423610e-02, 3.65001783e-02, 3.51194032e-02,
       3.06623697e-01, 2.80836254e-01, 2.90861756e-01, 1.94561064e-01,
       8.26987147e-01, 9.07066744e-03, 9.53930244e-02, 1.54305175e-01,
       9.85910416e-01, 1.07142124e-02, 4.68213344e-03, 8.28887336e-03,
       9.68940794e-01, 2.92383116e-02, 2.14965045e-02, 1.17693329e-02,
       7.65157461e-01, 2.12375093e-02, 1.77859485e-01, 6.48354515e-02,
       7.19269514e-01, 6.71135448e-03, 3.43208984e-02, 2.06883788e-01,
       9.01179790e-01, 6.06599683e-03, 9.37790424e-02, 4.20628898e-02,
       3.80705923e-01, 5.89910187e-02, 1.22798108e-01, 3.44277114e-01,
       3.90029073e-01, 1.29778907e-01, 1.35557950e-01, 2.62189239e-01,
       4.30190235e-01, 1.59292072e-01, 1.15669645e-01, 2.18376130e-01,
       6.58969700e-01, 2.58027278e-02, 7.70675391e-02, 1.86372355e-01,
      