In [1]:
# --*-- encoding:utf-8 --*--
import pandas as pd
import numpy as np
import jieba
import re

In [2]:
# loading data
def loadfile(neg_path='./neg1.xls', pos_path='./pos1.xls'):
    """Load the negative and positive review texts from two Excel workbooks.

    Both workbooks are read without a header row and only their first
    column (label ``0``) is kept.

    Parameters
    ----------
    neg_path : str
        Workbook holding the negative samples (defaults to ``./neg1.xls``).
    pos_path : str
        Workbook holding the positive samples (defaults to ``./pos1.xls``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(neg, pos)`` -- the first column of each workbook as an array.
    """
    # `read_excel` has no `index` parameter: old pandas ignored the stray
    # keyword, newer releases reject it with a TypeError, so it is dropped.
    neg = pd.read_excel(neg_path, header=None)
    pos = pd.read_excel(pos_path, header=None)
    # Keep only the first column of each sheet.
    return np.array(neg[0]), np.array(pos[0])

In [3]:
# Read the first column of ./neg1.xls and ./pos1.xls into two arrays.
neg, pos = loadfile()

In [4]:
# Build the set of stop words
def getstopword(stopwordPath):
    """Return the set of whitespace-stripped entries of an iterable of lines.

    Despite its name, ``stopwordPath`` is not a path: it is iterated
    directly, so it is expected to be an open file object (or any other
    iterable of strings). Duplicate entries collapse into one.
    """
    return {entry.strip() for entry in stopwordPath}

In [5]:
# Segment each sentence with jieba and remove the stop words
def wordsege(text, stopword_file='./stopwords(ch).txt'):
    """Tokenise every document in ``text`` and filter the tokens.

    Parameters
    ----------
    text : iterable of str
        Documents to segment; each one is stripped before being cut.
    stopword_file : str
        File with one stop word per line (defaults to the path the
        notebook has always used).

    Returns
    -------
    list of list of str
        One token list per document. A token is kept only if it is not a
        stop word, does not start with a number (optional sign, digits,
        optional fraction) and is not blank. Documents left without any
        token are dropped, so the result can be shorter than ``text``;
        labels must therefore be derived from the returned list.
    """
    # `with` closes the handle even if reading the stop words fails.
    # NOTE(review): the file is opened with the platform default encoding,
    # exactly as before -- confirm that matches how the file was saved.
    with open(stopword_file, 'r') as stopword_handle:
        stoplist = getstopword(stopword_handle)

    # Compile once instead of re-parsing the pattern for every token.
    # `match` anchors at the start only, so tokens *beginning* with a
    # number are rejected, not just purely numeric ones.
    starts_with_number = re.compile(r'-?\d+\.?\d*').match

    text_list = []
    for document in text:
        fenci = [
            item
            for item in jieba.cut(document.strip())
            if item not in stoplist
            and starts_with_number(item) is None
            and len(item.strip()) > 0
        ]
        # Skip documents whose segmentation ended up empty.
        if fenci:
            text_list.append(fenci)
    return text_list

In [6]:
def tokenizer(neg, pos):
    """Segment both corpora and build the matching label vector.

    Parameters
    ----------
    neg, pos : iterable of str
        Negative and positive documents, passed to ``wordsege``.

    Returns
    -------
    combined : numpy.ndarray
        1-D object array holding one token list per surviving document,
        positives first, then negatives.
    y : numpy.ndarray
        Integer labels aligned with ``combined``: 1 for positive, 0 for
        negative. Lengths come from the segmented lists, so documents that
        ``wordsege`` dropped get no label.
    """
    neg_sege = wordsege(neg)
    pos_sege = wordsege(pos)

    # Token lists have different lengths. Letting numpy infer an array from
    # such ragged input is deprecated and an error in recent releases, so
    # the object array is filled slot by slot instead.
    documents = pos_sege + neg_sege
    combined = np.empty(len(documents), dtype=object)
    for idx, tokens in enumerate(documents):
        combined[idx] = tokens

    # Generate the labels and merge them in the same order as the documents.
    y = np.concatenate((np.ones(len(pos_sege), dtype=int),
                        np.zeros(len(neg_sege), dtype=int)))
    return combined, y

In [7]:
# Segment both corpora; `combined` holds the token lists (positives first)
# and `y` the matching 1/0 labels.
combined,y = tokenizer(neg, pos)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\chenx\AppData\Local\Temp\jieba.cache
Loading model cost 1.830 seconds.
Prefix dict has been built succesfully.


In [8]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`. Fall back only for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the documents for testing. The fixed seed makes the split
# (and everything trained on it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    combined, y, test_size=0.3, random_state=42)



In [9]:
from gensim.models import Word2Vec
# Train 128-dimensional word vectors on `combined`, i.e. on every document,
# including those that the split above placed in x_test.
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword (later renamed
# `vector_size`) -- confirm against the installed gensim version.
# NOTE(review): transform_to_matrix reads this object through the global name
# `model`, which a later cell rebinds to the Keras network.
model = Word2Vec(combined, size=128, window=5, min_count=5, workers=4)



In [10]:
# vec_size is the dimensionality of the word vectors themselves
def transform_to_matrix(x, padding_size=256, vec_size=128):
    """Turn token lists into fixed-size matrices of word vectors.

    Parameters
    ----------
    x : iterable of sequence of str
        Tokenised sentences.
    padding_size : int
        Number of rows per sentence. Longer sentences are truncated and
        shorter ones are padded with zero rows.
    vec_size : int
        Length of the zero row used for padding and unknown words; it
        should equal the dimensionality of the trained vectors.

    Returns
    -------
    list
        One ``padding_size`` x ``vec_size`` nested list per sentence.

    Notes
    -----
    Vectors are read from the global Word2Vec ``model`` via ``model.wv``.
    """
    # Resolve the vector table once. If `model` is not a Word2Vec model
    # (e.g. the name was rebound), this fails loudly instead of yielding
    # all-zero matrices.
    vectors = model.wv

    res = []
    for sen in x:
        matrix = []
        for i in range(padding_size):
            try:
                matrix.append(vectors[sen[i]].tolist())
            except (IndexError, KeyError):
                # Two expected cases, both mapped to an all-zero vector:
                # 1. IndexError: the sentence is shorter than padding_size
                # 2. KeyError: the word is not in the vocabulary
                # Any other exception is a real bug and is left to propagate.
                matrix.append([0] * vec_size)
        res.append(matrix)
    return res

In [11]:
# Replace the token lists with padded word-vector matrices (defaults:
# 256 rows x 128 columns per sentence).
# NOTE: x_train / x_test are rebound in place, so this cell cannot be re-run
# without first re-running the train/test split.
x_train = transform_to_matrix(x_train)
x_test = transform_to_matrix(x_test)

  


In [12]:
# Convert to numpy arrays for easier handling
x_train = np.array(x_train)
x_test = np.array(x_test)

# Inspect the array shapes
print(x_train.shape)
print(x_test.shape)

(5467, 256, 128)
(2344, 256, 128)


In [13]:
# Insert a singleton axis after the sample axis: (N, rows, cols) -> (N, 1, rows, cols)
x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1], x_train.shape[2])
x_test = x_test.reshape(x_test.shape[0], 1, x_test.shape[1], x_test.shape[2])
# Printing x_test shows the only difference from before: one extra level of brackets
print(x_train.shape)
print(x_test.shape)

(5467, 1, 256, 128)
(2344, 1, 256, 128)


In [14]:
from keras import backend as K
# Channels-first layout, i.e. inputs shaped (batch, channels, rows, cols).
# `set_image_dim_ordering('th')` is the Keras 1 spelling of this setting and
# is no longer available in later Keras 2 releases; `set_image_data_format`
# is the supported equivalent.
K.set_image_data_format('channels_first')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [15]:
from keras import backend as K
# Use the channels-first layout, matching the (N, 1, 256, 128) arrays built above.
K.set_image_data_format('channels_first')

In [19]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding, Reshape
from keras.layers import LSTM
from keras.layers import TimeDistributed 
from keras.layers import Bidirectional, BatchNormalization

# set parameters:
batch_size = 64      # used by both fit() and evaluate() below
n_filter = 16        # feature maps per convolution layer
filter_length = 4    # side length of the square convolution kernel
nb_epoch = 10        # NOTE(review): not read by this cell; fit() below trains for 3 epochs
n_pool = 2           # side length of the max-pooling window

# Create a new Sequential model.
# NOTE(review): this rebinds the global name `model`, which until now referred
# to the Word2Vec model that transform_to_matrix reads.
model = Sequential()
# The kernel size is passed as one tuple. The Keras 1 positional form
# (n_filter, rows, cols) is read as kernel_size=rows, strides=cols by APIs
# without the legacy-argument shim, which silently changes the network.
model.add(Convolution2D(n_filter, (filter_length, filter_length),
                        input_shape=(1, 256, 128)))
model.add(Activation('relu'))
model.add(Convolution2D(n_filter, (filter_length, filter_length)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(n_pool, n_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Present the 128 dense features to the LSTM as a sequence of length 1.
model.add(Reshape((1, 128)))
model.add(Bidirectional(LSTM(units=20, return_sequences=False)))
model.add(Dropout(0.5))
model.add(BatchNormalization())
# model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.add(Dense(1, activation='sigmoid'))
# LSTM parameter count: (h_{t-1} concatenated with x_t) * hidden units, plus
# the bias, for each of the four gates.
# Original note: (20+40)*units*4+20*4
# NOTE(review): here the LSTM input is 128-dimensional with units=20, so that
# figure appears to come from a different configuration -- check model.summary().
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data, so the score
# printed below does not come from an untouched hold-out set.
model.fit(x_train, y_train, batch_size=batch_size, epochs=3,
          validation_data=(x_test, y_test), verbose=1)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print(score, acc)



Train on 5467 samples, validate on 2344 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.2776657472088068 0.9159556316027462


In [20]:
# Print the layer-by-layer output shapes and parameter counts of the network.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 16, 253, 125)      272       
_________________________________________________________________
activation_10 (Activation)   (None, 16, 253, 125)      0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 16, 250, 122)      4112      
_________________________________________________________________
activation_11 (Activation)   (None, 16, 250, 122)      0         
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 16, 125, 61)       0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 16, 125, 61)       0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 122000)            0         
__________