In [2]:
import numpy as np
import pandas as pd
import jieba
import re
from gensim.models import Word2Vec
from keras.preprocessing import sequence
from gensim.corpora.dictionary import Dictionary
import multiprocessing
from sklearn.model_selection import train_test_split
import yaml
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout, Activation
from keras.models import model_from_yaml

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
np.random.seed(1337) # For Reproducibility
# the dimension of word vector
vocab_dim = 300
# sentence length
maxlen = 100
# iter num
n_iterations = 1
# the number of words appearing
n_exposures = 10
# the maximum distance
window_size = 7
# batch size
batch_size = 32
# epoch num
n_epoch = 20
# input length
input_length = 100
# multi processing cpu number
cpu_count = multiprocessing.cpu_count()

In [4]:
# loading data
def loadfile():
    neg = pd.read_excel('./neg.xls', header=None, index=None)
    pos = pd.read_excel('./pos.xls', header=None, index=None)
    #merge all data
    neg = np.array(neg[0])
    pos = np.array(pos[0])
    return neg,pos

In [None]:
neg, pos = loadfile()

In [5]:
#generating set of disused words
def getstopword(stopwordPath):
    stoplist = set()
    for line in stopwordPath:
        stoplist.add(line.strip())
        # print line.strip()
    return stoplist

In [6]:
def wordsege(text):
    stoplist = set()
    stopwordPath = open('./stopwords(ch).txt', 'r')
    for line in stopwordPath:
        stoplist.add(line.strip())
    stopwordPath.close()
    
    text_list = []
    for document in text:
        seg_list = jieba.cut(document.strip())
        fenci = []
        
        for item in seg_list:
            if item not in stoplist and re.match(r'-?\d+\.?\d*', item)==None and len(item.strip())>0:
                fenci.append(item)
        # if the word segmentation of the sentence is null, the label of 
        # the sentence should be deleted accordingly
        if len(fenci)>0:
            text_list.append(fenci)
    return text_list

In [7]:
def tokenizer(neg, pos):
    neg_sege = wordsege(neg)
    pos_sege = wordsege(pos)
    combined = np.concatenate((pos_sege,neg_sege))
    # generating label and meging label data
    y = np.concatenate((np.ones(len(pos_sege), dtype=int), np.zeros(len(neg_sege), dtype=int)))
    return combined,y

In [8]:
combined,y = tokenizer(neg, pos)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\chenx\AppData\Local\Temp\jieba.cache
Loading model cost 0.703 seconds.
Prefix dict has been built succesfully.


In [9]:
# create a dictionary of words and phrases,return the index of each word,vector of words,and index of words corresponding to each sentence
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does are number of Jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        # the index of a word which have word vector is not 0
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # integrate all the corresponding word vectors into the word vector matrix
        w2vec = {word: model[word] for word in w2indx.keys()}

        # a word without a word vector is indexed 0,return the index of word
        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # unify the length of the sentence with the pad_sequences function of keras
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # return index, word vector matrix and the sentence with an unifying length and indexed
        return w2indx, w2vec, combined
    else:
        print('No data provided...')

In [10]:
# the training of the word vector
def word2vec_train(combined):
    model = Word2Vec(size=vocab_dim,
                     min_count=n_exposures,
                     window=window_size,
                     workers=cpu_count,
                     iter=n_iterations)
    # build the vocabulary dictionary
    model.build_vocab(combined)
    # train the word vector model
    model.train(combined, total_examples=model.corpus_count, epochs=50)
    # save the trained model
    model.save('./Word2vec_model.pkl')
    # index, word vector matrix and the sentence with an unifying length and indexed based on the trained model
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)

    return index_dict, word_vectors, combined

In [11]:
##定义网络结构
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    print('Defining a Simple Keras Model...')
    model = Sequential()  # or Graph or whatever
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    model.add(LSTM(output_dim=50, activation='sigmoid', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    print ('Compiling the Model...')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    model.summary()
    print ("Train...")
    model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=n_epoch, verbose=1)

    print ("Evaluate...")
    score = model.evaluate(x_test, y_test,
                           batch_size=batch_size)
    # save the trained lstm model
    yaml_string = model.to_yaml()
    with open('./lstm.yml', 'w') as outfile:
        outfile.write(yaml.dump(yaml_string, default_flow_style=True))
    model.save_weights('./lstm.h5')
    print ('Test score:', score)

In [12]:
# 训练模型，并保存
def train():
    print ('Loading Data...')
    neg, post = loadfile()

    print('Tokenising...')
    combined,y = tokenizer(neg, post)
    print(len(combined), len(y))
    print('Training a Word2vec model...')
    index_dict, word_vectors, combined = word2vec_train(combined)
    print('Setting up Arrays for Keras Embedding Layer...')
    n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
    print(x_train.shape, y_train.shape)
    train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)

# building the input format data
def input_transform(string):
    words = jieba.cut(string)
    # reshape the list to bilayer list
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('./Word2vec_model.pkl')
    # create a dictionary of words and phrases,return the index of each word,vector of words,and index of words corresponding to each senten
    _, _, combined = create_dictionaries(model, words)
    return combined


def lstm_predict(string):
    print('loading model......')
    with open('./lstm.yml', 'r') as f:
        yaml_string = yaml.load(f)
    model = model_from_yaml(yaml_string)

    print('loading weights......')
    model.load_weights('./lstm.h5')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    data = input_transform(string)
    data.reshape(1, -1)
    # predict the new data
    result = model.predict_classes(data)
    if result[0][0] == 1:
        print(string, ' positive')
    else:
        print(string, ' negative')

In [13]:
if __name__ == '__main__':
    train()
    # string='电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    # string='牛逼的手机，从3米高的地方摔下去都没坏，质量非常好'
    # string='酒店的环境非常好，价格也便宜，值得推荐'
    string='屏幕较差，拍照也很粗糙。'
    # string='我是傻逼'
    # string='你是傻逼'
    # string = '屏幕较差，拍照也很粗糙。'
    # string='质量不错，是正品 ，安装师傅也很好，才要了83元材料费'
    # string='东西非常不错，安装师傅很负责人，装的也很漂亮，精致，谢谢安装师傅！'

    lstm_predict(string)

Loading Data...
Tokenising...
21105 21105
Training a Word2vec model...


  app.launch_new_instance()


Setting up Arrays for Keras Embedding Layer...
(16884, 100) (16884,)
(16884, 100) (16884,)
Defining a Simple Keras Model...


  # Remove the CWD from sys.path while we load stuff.


Compiling the Model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          2381700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                70200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 2,451,951
Trainable params: 2,451,951
Non-trainable params: 0
_________________________________________________________________
Train...




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Evaluate...
Test score: [0.4241661518367369, 0.9121061361420637]
loading model......
loading weights......
屏幕较差，拍照也很粗糙。  negative
