In [1]:
import re
import jieba
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.layers.merge import Concatenate
from keras.models import load_model
from keras.utils.np_utils import to_categorical
from keras.callbacks import BaseLogger, TensorBoard

#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import word2vec
from gensim import models
from langconv import *

%matplotlib inline
pd.options.display.max_columns = None
plt.rcParams['font.sans-serif']=['SimHei']

Using TensorFlow backend.


In [3]:
#Load datasets
train = pd.read_csv('./data/train.csv')
val = pd.read_csv('./data/val.csv')

In [4]:
train.columns.values.tolist()

['id',
 'content',
 'location_traffic_convenience',
 'location_distance_from_business_district',
 'location_easy_to_find',
 'service_wait_time',
 'service_waiters_attitude',
 'service_parking_convenience',
 'service_serving_speed',
 'price_level',
 'price_cost_effective',
 'price_discount',
 'environment_decoration',
 'environment_noise',
 'environment_space',
 'environment_cleaness',
 'dish_portion',
 'dish_taste',
 'dish_look',
 'dish_recommendation',
 'others_overall_experience',
 'others_willing_to_consume_again']

In [5]:
print(train.shape, val.shape)

(120000, 22) (15000, 22)


In [6]:
#Get labels
train_labels = train.iloc[:,2:]

In [7]:
val_labels = val.iloc[:,2:]

In [8]:
train_labels.head(1)

Unnamed: 0,location_traffic_convenience,location_distance_from_business_district,location_easy_to_find,service_wait_time,service_waiters_attitude,service_parking_convenience,service_serving_speed,price_level,price_cost_effective,price_discount,environment_decoration,environment_noise,environment_space,environment_cleaness,dish_portion,dish_taste,dish_look,dish_recommendation,others_overall_experience,others_willing_to_consume_again
0,-2,-2,-2,-2,1,-2,-2,-2,-2,1,-2,-2,-2,-2,-2,-2,1,-2,1,-2


In [9]:
train['content'][0]

'" 吼吼 吼 ， 萌死 人 的 棒棒糖 ， 中 了 大众 点评 的 霸王餐 ， 太 可爱 了 。 一直 就 好奇 这个 棒棒糖 是 怎么 个 东西 ， 大众 点评 给 了 我 这个 土老冒 一个 见识 的 机会 。 看 介绍 棒棒糖 是 用 德国 糖 做 的 ， 不会 很甜 ， 中间 的 照片 是 糯米 的 ， 能 食用 ， 真是太 高端 大气 上档次 了 ， 还 可以 买 蝴蝶结 扎口 ， 送人 可以 买 礼盒 。 我 是 先 打 的 卖家 电话 ， 加 了 微信 ， 给 卖家 传 的 照片 。 等 了 几天 ， 卖家 就 告诉 我 可以 取货 了 ， 去 大官 屯 那取 的 。 虽然 连 卖家 的 面 都 没 见到 ， 但是 还是 谢谢 卖家 送 我 这么 可爱 的 东西 ， 太 喜欢 了 ， 这 哪 舍得吃 啊 。 "'

In [10]:
X_train = train['content'].tolist()

In [11]:
X_val = val['content'].tolist()

In [85]:
#y_train = to_categorical(train_labels + 2, num_classes=4)

In [86]:
#y_val = to_categorical(val_labels + 2, num_classes=4)

In [12]:
#Get all categories
labels_length = len(val_labels.columns.values.tolist())

In [112]:
#y = [to_categorical(val_labels.iloc[:,i] + 2, num_classes=4) for i in range(labels_length)]

In [13]:
y_train = [to_categorical(train_labels.iloc[:,i] + 2, num_classes=4) for i in range(labels_length)]

In [14]:
y_val = [to_categorical(val_labels.iloc[:,i] + 2, num_classes=4) for i in range(labels_length)]

In [15]:
#Tokenize
max_features = 20000
maxlen = 512

tokenizer = Tokenizer(max_features)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)

In [16]:
X_train.shape, X_val.shape

((120000, 512), (15000, 512))

In [17]:
#Load pre_trained model
w2v_model = word2vec.Word2Vec.load("train.model")

In [18]:
w2v_model.wv['测试'].shape

(300,)

In [19]:
#Thanks to the author
#https://eliyar.biz/using-pre-trained-gensim-word2vector-in-a-keras-model-and-visualizing/
num_words = 50000
embedding_dim = 300

vocab_list = [(k, w2v_model.wv[k]) for k, v in w2v_model.wv.vocab.items()]

#embeddings_matrix = np.zeros((len(w2v_model.wv.vocab.items()) + 1, w2v_model.vector_size))
embeddings_matrix = np.zeros((num_words, embedding_dim))
for i in range(num_words):
    embeddings_matrix[i] = vocab_list[i][1]

In [27]:
#Log
logdir = "./logs/"
tensorBoard = TensorBoard(
    log_dir=logdir,
    histogram_freq=1,
    embeddings_data=X_train)

In [22]:
#Create model
EMBEDDING_DIM = 300

input_layer = Input(shape=(maxlen,))
embedding_layer = Embedding(len(embeddings_matrix),
                            EMBEDDING_DIM,
                            weights=[embeddings_matrix],
                            trainable=False)(input_layer)
lstm_layer = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
d1 = Dense(64, activation='relu')(lstm_layer)
layers = []
for i in range(labels_length):
    d2 = Dense(4, activation='sigmoid')(d1)
    layers.append(d2)

model = Model(inputs=input_layer, outputs=layers)
model.compile(loss='categorical_crossentropy', optimizer='Adamax', metrics=['accuracy'])

In [23]:
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 512)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 512, 300)     15000000    input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 128)          219648      embedding_2[0][0]                
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 64)           8256        lstm_2[0][0]                     
____________________________________________________________________________________________

In [28]:
#Train
history = model.fit(X_train, 
                    y_train, 
                    batch_size=128,
                    epochs=5, 
                    validation_data=(X_val, y_val),
                    callbacks=[tensorBoard])

Train on 120000 samples, validate on 15000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


Epoch 5/5


In [29]:
#Predict

In [154]:
test = pd.read_csv('./data/test.csv')

In [51]:
X_test = test['content'].tolist()
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [32]:
test_pred = model.predict(X_test)

In [52]:
np.array(test_pred).shape

(20, 15000, 4)

In [53]:
result = np.argmax(test_pred, axis=2) - 2

In [54]:
np.array(result), np.array(result).shape

(array([[-2, -2,  1, ..., -2, -2,  1],
        [-2, -2, -2, ..., -2, -2, -2],
        [-2, -2,  1, ...,  1, -2,  1],
        ...,
        [-2, -2, -2, ..., -2, -2, -2],
        [ 1,  0,  0, ...,  1,  1,  1],
        [ 1, -2, -2, ..., -2,  1,  1]], dtype=int64), (20, 15000))

In [126]:
test.iloc[:, 2:].fillna(0, inplace=True)

In [130]:
test.iloc[:, 2:] = test.iloc[:, 2:].astype('int64')

In [160]:
test.shape

(15000, 22)

In [105]:
df = pd.DataFrame(result.T)

In [106]:
df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-2,-2,-2,-2,1,-2,-2,1,1,-2,1,-2,-2,-2,1,1,-2,-2,1,1


In [107]:
df.shape

(15000, 20)

In [157]:
for i in range(labels_length):
    test.iloc[:, i + 2] = df.iloc[:, i]

In [159]:
test.to_csv('./data/submission.csv', index=False, encoding='utf_8_sig')

In [142]:
model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
del model  # deletes the existing model

In [2]:
#Do comment sentiment analysis
#Load model
model = load_model('my_model.h5')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [3]:
text = '家庭聚餐去的，水煮鱼比较香，没什么辣度，基本鱼肉都是尾巴什么的。\
夫妻肺片不推荐。晾杆毛肚还是挺不错的，毛肚比较嫩，蘸酱虽然还是寡淡了但调味很鲜明，十分搭百叶的味道~服务蛮不错的，环境也好。\
总体除了川菜之外的菜品都还比较有水准，价格偏高。\
川菜嘛，北师大学五食堂的川菜都秒这家八条街真的。。。\
看着名字对这家的川菜湘菜寄予了很高期望，结果虽说不上大大失望也是丝毫没有惊喜的，可能渝湘的同时还要顾及江南二字吧。。。'

In [4]:
def cut(string):
    line = Converter('zh-hans').convert(string)
    string = line.replace('\n', '')
    w_list = jieba.cut(string)
    return [' '.join(w for w in w_list)]

In [5]:
c_text = cut(text)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Dracu\AppData\Local\Temp\jieba.cache
Loading model cost 0.637 seconds.
Prefix dict has been built succesfully.


In [6]:
max_features = 20000
maxlen = 512

train = pd.read_csv('./data/train.csv')
X_train = train['content'].tolist()
tokenizer = Tokenizer(max_features)
tokenizer.fit_on_texts(X_train)

In [7]:
c_text = tokenizer.texts_to_sequences(c_text)
c_text = pad_sequences(c_text, padding='post', maxlen=maxlen)

In [8]:
r_text = model.predict(c_text)

In [9]:
np.array(r_text).shape

(20, 1, 4)

In [10]:
r_text = np.argmax(r_text, axis=2) - 2

In [11]:
r_text.T

array([[-2, -2, -2, -2,  1, -2, -2, -2, -2, -2,  1,  1,  1,  1, -2,  0,
        -2, -2,  1, -2]], dtype=int64)

In [12]:
r_text.T[0]

array([-2, -2, -2, -2,  1, -2, -2, -2, -2, -2,  1,  1,  1,  1, -2,  0, -2,
       -2,  1, -2], dtype=int64)

In [13]:
labels = pd.read_csv('./data/labels.csv', header=None)

In [14]:
labels.rename(columns={0:'Category'}, inplace=True)
labels['Values'] = r_text.T[0]

In [15]:
labels

Unnamed: 0,Category,Values
0,location_traffic_convenience,-2
1,location_distance_from_business_district,-2
2,location_easy_to_find,-2
3,service_wait_time,-2
4,service_waiters_attitude,1
5,service_parking_convenience,-2
6,service_serving_speed,-2
7,price_level,-2
8,price_cost_effective,-2
9,price_discount,-2
