In [1]:
import re

import pandas as pd
import numpy as np
import jieba
# The star import stays ahead of the explicit imports below so that the
# explicitly imported names win if commentRnn exports the same identifiers.
from commentRnn import *
from sklearn.utils import shuffle

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import load_model
from keras.models import model_from_json

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the labelled comments, put 0 into every missing cell (all columns,
# not only the label columns), and rewrite the -1 entries of `is_pos` as 0.
data = (
    pd.read_csv('training_data/train_1.7W.csv')
    .fillna(0)
    .assign(is_pos=lambda frame: frame['is_pos'].replace(-1, 0))
)

In [3]:
jieba.load_userdict('user_dict.txt')

# Brand spellings collapsed onto one canonical token. Order matters: the
# replacements are applied sequentially, exactly in this order.
BRAND_ALIASES = (
    ('kfc', '肯德基'), ('KFC', '肯德基'), ('K家', '肯德基'),
    ('mc', '麦当劳'), ('MC', '麦当劳'), ('M家', '麦当劳'),
    ('麦记', '麦当劳'), ('Burger king', 'BK'), ('bk', 'BK'),
    ('McDonalds', '麦当劳'), ('汉堡王', 'BK'),
)


def clean_comment(text):
    """Normalise one raw comment before tokenisation.

    Strips the '团购点评' marker, newlines and '&nbsp', replaces any
    '#...#' span with the placeholder token '蔡明', and maps brand aliases
    to a canonical spelling.

    Args:
        text: A cell value from the comment column.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a ``str``
        (for example the 0 that ``fillna(0)`` puts into empty cells).
    """
    # Explicit guard instead of a blanket `except Exception: pass`: only
    # non-string cells are skipped, so genuine errors are no longer hidden.
    if not isinstance(text, str):
        return text
    text = text.replace('团购点评', '').replace('\n', '').replace('&nbsp', '')
    # NOTE(review): the pattern is greedy, so everything between the first
    # and the last '#' of a comment collapses into one placeholder. Kept
    # as-is so the training text stays consistent with any other
    # preprocessing code; confirm whether a non-greedy match was intended.
    text = re.sub(r"\#.*\#", "蔡明", text)
    for alias, canonical in BRAND_ALIASES:
        text = text.replace(alias, canonical)
    return text


# One column-wise pass instead of several .iloc cell writes per row.
# Column 0 is addressed by position, as before -- presumably 'Content';
# verify against the CSV header.
data.iloc[:, 0] = data.iloc[:, 0].map(clean_comment)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/sr/3dsk00110ggdmrc_hw67twp40000gn/T/jieba.cache
Loading model cost 0.682 seconds.
Prefix dict has been built succesfully.


In [4]:
comment_df = data

# Read the word lists through context managers so the file handles are
# closed deterministically. The encoding is pinned to UTF-8 because the
# files hold Chinese text and the platform default differs between systems.
# NOTE(review): assumes both files are UTF-8 encoded -- confirm.
with open('menu.txt', encoding='utf-8') as menu_file:
    menu = [line.rstrip() for line in menu_file]
with open('chineseStopWords.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.rstrip() for line in stopwords_file]

# Register the placeholder token that the cleaning step substitutes for
# '#...#' spans, so jieba keeps it as a single word.
jieba.add_word('蔡明', freq=100, tag='n')
vocab_int,vocab,int_to_vocab = get_vocab(comment_df['Content'],stopwords,menu)

In [6]:
import json

# Bundle the three vocabulary objects and persist them as vocab.json.
# Note that JSON object keys are always strings, so any non-string keys in
# these mappings come back as strings when the file is loaded again.
vocab_json = dict(vocab_to_int=vocab_int, vocab=vocab, int_to_vocab=int_to_vocab)
serialized_vocab = json.dumps(vocab_json)
with open("vocab.json", "w") as json_file:
    json_file.write(serialized_vocab)

In [7]:
def get_model(max_features,out='tanh'):
    """Build and compile the Embedding -> LSTM -> Dense(1) classifier.

    Args:
        max_features: Input dimension of the embedding layer (the notebook
            passes ``len(vocab) + 1``).
        out: Activation of the single output unit. ``'tanh'`` is compiled
            with the ``'logcosh'`` loss, ``'sigmoid'`` with
            ``'binary_crossentropy'``.

    Returns:
        A compiled Keras ``Sequential`` model (adam optimizer, accuracy metric).

    Raises:
        ValueError: If ``out`` is neither ``'tanh'`` nor ``'sigmoid'``.
            Previously such a value silently produced an uncompiled model.
    """
    loss_by_activation = {
        'tanh': 'logcosh',
        'sigmoid': 'binary_crossentropy',
    }
    # Validate before building anything so a typo fails fast and clearly.
    if out not in loss_by_activation:
        raise ValueError(
            "unsupported output activation {!r}; expected one of {}".format(
                out, sorted(loss_by_activation)))

    model = Sequential()
    model.add(Embedding(max_features, 128))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation=out))

    # try using different optimizers and different optimizer configs
    model.compile(loss=loss_by_activation[out],
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [8]:
def save_model(model,save_path):
    """Write a model to disk as two files sharing the prefix ``save_path``.

    ``<save_path>.json`` receives the output of ``model.to_json()`` and
    ``<save_path>.h5`` is written by ``model.save_weights``.

    Args:
        model: Object exposing ``to_json()`` and ``save_weights(path)``.
        save_path: Path prefix, without extension.
    """
    # Architecture description as JSON text.
    architecture = model.to_json()
    architecture_path = "{}.json".format(save_path)
    with open(architecture_path, "w") as architecture_file:
        architecture_file.write(architecture)

    # Weights go to a sibling HDF5 file.
    weights_path = "{}.h5".format(save_path)
    model.save_weights(weights_path)
    print("Saved model to disk")

In [9]:
def load_mymodel(save_path):
    """Rebuild a model from the ``<save_path>.json`` / ``<save_path>.h5`` pair.

    Args:
        save_path: Path prefix, without extension, as passed to ``save_model``.

    Returns:
        The object returned by ``model_from_json`` after its weights have
        been loaded from ``<save_path>.h5``.
    """
    # load json and create model
    # `with` guarantees the handle is closed even if reading fails.
    with open('{}.json'.format(save_path), 'r') as json_file:
        loaded_model_json = json_file.read()
    # model_from_json comes from keras.models (imported in the first cell).
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("{}.h5".format(save_path))
    print("Loaded model from disk")
    return loaded_model

In [10]:
# Build the padded sequences and labels for the `is_serv` target.
comment_df = data[['Content','is_serv']]
X,y,f_lens = get_train_set(comment_df,stopwords,menu,vocab_int,vocab,seq_len=100)
# Seed the shuffle so the train/test split is reproducible across runs.
X,y = shuffle(X,y,random_state=42)

# The first n_train shuffled rows train the model; the rest is held out.
n_train = 16000
train_X, test_X = X[:n_train], X[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

print('train_X shape:{}, train_y shape:{}, test_X shape {},test_y shape {}'.format(train_X.shape,train_y.shape,test_X.shape,test_y.shape))

max comment length: 672
train set prepared!
train_X shape:(16000, 100), train_y shape:(16000, 1), test_X shape (1005, 100),test_y shape (1005, 1)


In [11]:
max_features = len(vocab)+1
batch_size = 100

print('Build model...')
# Reuse the shared builder instead of repeating its layers inline; with
# out='tanh' it produces the same Embedding -> LSTM -> Dense(1, tanh)
# network compiled with the logcosh loss and the adam optimizer.
model = get_model(max_features, 'tanh')

print('Train...')
# NOTE(review): the held-out split doubles as validation data here, so the
# final evaluate() below reports on data that was monitored during training.
model.fit(train_X, train_y,
          batch_size=batch_size,
          epochs=13,
          validation_data=(test_X, test_y))
score, acc = model.evaluate(test_X, test_y,batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


Build model...
Train...
Train on 16000 samples, validate on 1005 samples
Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13
Test score: 0.09117565923069247
Test accuracy: 0.8348258721887769


In [12]:
# Persist the is_serv model trained above; the reload cell reads this path.
is_serv_model_path = 'models/is_serv1.7.h5'
model.save(is_serv_model_path)

In [13]:
# Round-trip check: reload the saved is_serv model and score it on the first
# 2000 rows of the current *training* split (so this is not a held-out score).
is_serv_model = load_model('models/is_serv1.7.h5')
sanity_X, sanity_y = train_X[:2000], train_y[:2000]
score, acc = is_serv_model.evaluate(sanity_X, sanity_y)
print(acc)

0.9775


In [14]:
# Build the padded sequences and labels for the `is_pos` target.
comment_df = data[['Content','is_pos']]
X,y,f_lens = get_train_set(comment_df,stopwords,menu,vocab_int,vocab,seq_len=100)
# Seed the shuffle so the train/test split is reproducible across runs.
X,y = shuffle(X,y,random_state=42)

# The first n_train shuffled rows train the model; the rest is held out.
n_train = 16000
train_X, test_X = X[:n_train], X[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

print('train_X shape:{}, train_y shape:{}, test_X shape {},test_y shape {}'.format(train_X.shape,train_y.shape,test_X.shape,test_y.shape))

max comment length: 672
train set prepared!
train_X shape:(16000, 100), train_y shape:(16000, 1), test_X shape (1005, 100),test_y shape (1005, 1)


In [15]:
# is_pos classifier: sigmoid output, which get_model compiles with
# binary cross-entropy. Trained for 7 epochs, scored on the held-out split,
# then saved.
is_pos_model = get_model(len(vocab) + 1, 'sigmoid')
is_pos_model.fit(
    train_X,
    train_y,
    batch_size=100,
    epochs=7,
    validation_data=(test_X, test_y),
)
score, acc = is_pos_model.evaluate(test_X, test_y, batch_size=100)
print('Test score:', score)
print('Test accuracy:', acc)
is_pos_model.save('models/is_pos1.7.h5')

Train on 16000 samples, validate on 1005 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Test score: 0.4398767167554047
Test accuracy: 0.8646766154920283


In [16]:
# Build the padded sequences and labels for the `is_product` target.
comment_df = data[['Content','is_product']]
X,y,f_lens = get_train_set(comment_df,stopwords,menu,vocab_int,vocab,seq_len=100)
# Seed the shuffle so the train/test split is reproducible across runs.
X,y = shuffle(X,y,random_state=42)

# The first n_train shuffled rows train the model; the rest is held out.
n_train = 16000
train_X, test_X = X[:n_train], X[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

print('train_X shape:{}, train_y shape:{}, test_X shape {},test_y shape {}'.format(train_X.shape,train_y.shape,test_X.shape,test_y.shape))

max comment length: 672
train set prepared!
train_X shape:(16000, 100), train_y shape:(16000, 1), test_X shape (1005, 100),test_y shape (1005, 1)


In [17]:
# is_product classifier: get_model's default tanh output (logcosh loss).
# Trained for 12 epochs, scored on the held-out split, then saved.
is_product_model = get_model(len(vocab) + 1)
is_product_model.fit(
    train_X,
    train_y,
    batch_size=100,
    epochs=12,
    validation_data=(test_X, test_y),
)
score, acc = is_product_model.evaluate(test_X, test_y, batch_size=100)
print('Test score:', score)
print('Test accuracy:', acc)
is_product_model.save('models/is_product1.7.h5')

Train on 16000 samples, validate on 1005 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Test score: 0.13739466088921276
Test accuracy: 0.738308457296286


In [18]:
# Build the padded sequences and labels for the `is_price` target.
comment_df = data[['Content','is_price']]
X,y,f_lens = get_train_set(comment_df,stopwords,menu,vocab_int,vocab,seq_len=100)
# Seed the shuffle so the train/test split is reproducible across runs.
X,y = shuffle(X,y,random_state=42)

# The first n_train shuffled rows train the model; the rest is held out.
n_train = 16000
train_X, test_X = X[:n_train], X[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

print('train_X shape:{}, train_y shape:{}, test_X shape {},test_y shape {}'.format(train_X.shape,train_y.shape,test_X.shape,test_y.shape))

max comment length: 672
train set prepared!
train_X shape:(16000, 100), train_y shape:(16000, 1), test_X shape (1005, 100),test_y shape (1005, 1)


In [19]:
# is_price classifier: get_model's default tanh output (logcosh loss).
# Trained for 11 epochs, scored on the held-out split, then saved.
is_price_model = get_model(len(vocab) + 1)
is_price_model.fit(
    train_X,
    train_y,
    batch_size=100,
    epochs=11,
    validation_data=(test_X, test_y),
)
score, acc = is_price_model.evaluate(test_X, test_y, batch_size=100)
print('Test score:', score)
print('Test accuracy:', acc)
is_price_model.save('models/is_price1.7.h5')

Train on 16000 samples, validate on 1005 samples
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
Test score: 0.03186171852510923
Test accuracy: 0.9263681582550505


In [20]:
# Build the padded sequences and labels for the `is_env` target.
comment_df = data[['Content','is_env']]
X,y,f_lens = get_train_set(comment_df,stopwords,menu,vocab_int,vocab,seq_len=100)
# Seed the shuffle so the train/test split is reproducible across runs.
X,y = shuffle(X,y,random_state=42)

# The first n_train shuffled rows train the model; the rest is held out.
n_train = 16000
train_X, test_X = X[:n_train], X[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

print('train_X shape:{}, train_y shape:{}, test_X shape {},test_y shape {}'.format(train_X.shape,train_y.shape,test_X.shape,test_y.shape))

max comment length: 672
train set prepared!
train_X shape:(16000, 100), train_y shape:(16000, 1), test_X shape (1005, 100),test_y shape (1005, 1)


In [21]:
# is_env classifier: get_model's default tanh output (logcosh loss).
# Trained for 8 epochs, scored on the held-out split, then saved.
is_env_model = get_model(len(vocab) + 1)
is_env_model.fit(
    train_X,
    train_y,
    batch_size=100,
    epochs=8,
    validation_data=(test_X, test_y),
)
score, acc = is_env_model.evaluate(test_X, test_y, batch_size=100)
print('Test score:', score)
print('Test accuracy:', acc)
is_env_model.save('models/is_env1.7.h5')

Train on 16000 samples, validate on 1005 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Test score: 0.03439055231228397
Test accuracy: 0.9283582061084349
