In [2]:
from urllib.request import urlretrieve
import os
import sys
import zipfile 

from sklearn.utils import shuffle
from sklearn.datasets import fetch_20newsgroups
from gensim.models import *

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### 下载额外训练语料text8

In [3]:
#下载数据集，仅需执行一次
def data_download(url, filename):
    
    #os.chdir('datasets/')
    if not os.path.exists(filename):
        filename,_ = urlretrieve(url+filename,filename)
    else:
        if filename == 'text8.zip':
            statinfo = os.stat(filename)
            print("{} has already existed,file size are {}".format(filename, statinfo.st_size))
        else:
            raise Exception("Please check your origin dataset.")
    return filename
text8 = data_download('http://mattmahoney.net/dc/', 'text8.zip')

In [4]:
# 解压text8词汇语料
#完成一次后，下次不需要再解压
def load_txts(fpath):
    fz = zipfile.ZipFile(fpath,'r')
    for file in fz.namelist():
        fz.extract(file)
    fz.close()
text8 = load_txts('text8.zip')

### 使用Text8 语料训练 word2vec模型

In [None]:
# 使用gensim训练词向量模型
dim = 128
text =  word2vec.Text8Corpus('text8')   #train on the pre-build text8 corpus
w2v_model = Word2Vec(text, size = dim, min_count=1,iter = 10) #get the 128 dimensions word vector，使用CBOW模式
w2v_model.save('word2vec_model')

#### 对word2vec词向量模型进行测试

In [6]:
#检测词向量模型：
w2v_model=Word2Vec.load('word2vec_model')
w2v_model.wv.save_word2vec_format('text8Vec.txt', binary=False)

In [7]:
print(w2v_model.wv['deep'])

[ 0.03512759 -0.00424395 -0.2512604  -0.88085592  0.03835738  0.54644412
  1.61797595 -0.0516028  -0.43942273  1.07317543 -0.06711469 -1.15136158
 -1.24386573 -0.35904369  0.20977189 -0.17898694  0.21471363 -1.29866695
  0.35908294  0.43134731  0.76700592 -3.45465779 -0.53539431 -2.40553665
 -0.4698481  -2.31175327 -1.20812631 -0.45274791  1.35530138  3.15746403
  0.39651179 -0.2127454  -0.80603361 -0.60038775 -0.11123826  0.81275958
  0.22245947 -0.5842241   0.88349861  0.38352695 -0.68967813  1.69892669
 -1.95197868  0.08851481  1.53950238 -1.62948072  0.91869867  2.31755233
  1.58886206  1.05564499 -1.67616487  0.31222013  1.23614001 -1.41685343
 -0.33679372 -1.86449111 -0.09396074  2.12119722  1.88810432  2.69032288
  1.26136065  0.44184107  0.58816493  1.27321422 -0.01046323  2.08304119
  0.02146814 -1.14498115 -0.17239916 -0.33008128  1.2449472   0.98431587
  1.2882812   0.7654416   0.43462437  0.97446895  0.57563263  0.35506377
  0.50279886 -0.26539755  0.19651802  2.14171195  0

In [8]:
word_example =['he his she','big bigger heavy','shanghai china paris']
for example in word_example:
    a,b,x = example.split()
    pred = w2v_model.wv.most_similar(positive=[x,b], negative=[a])[0][0]
    print('{} is to {} as {} is to {}'.format(a,b,x,pred))
print(w2v_model.wv.doesnt_match("breakfast cereal dinner lunch".split()))

he is to his as she is to her
big is to bigger as heavy is to lighter
shanghai is to china as paris is to france
cereal


In [9]:
embeddings_index = {}
f = open(os.path.join('./', 'text8Vec.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

Found 253855 word vectors.
Processing text dataset


### 预处理数据，使用Keras API 获得数据的word_index

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D,Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model
from keras.callbacks import EarlyStopping

# 文本序列长度
MAX_SEQUENCE_LENGTH = 1000
# 单词最大数量
MAX_NUM_WORDS = 12000
# 词向量长度
EMBEDDING_DIM = w2v_model.wv.syn0.shape[1]

Using TensorFlow backend.


In [11]:
newsgroups = fetch_20newsgroups(subset='all')
targets = newsgroups.target

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


In [12]:
from sklearn.model_selection import train_test_split

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(newsgroups.data)
sequences = tokenizer.texts_to_sequences(newsgroups.data)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(targets))

Found 179209 unique tokens.


In [13]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(0.2 * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

### 神经网络构建

#### 1. 构建embedding矩阵，对存在于字表中的单词，使用已经训练好的w2v_model模型

In [14]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

#### 2. embedding_layer 层，该层设置为trainable=False 

In [16]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

#### 3. 1 神经网络搭建，  Embedding --> Dropout -->  LSTM --> Dense -->Output

In [17]:
from keras.callbacks import EarlyStopping,TensorBoard

# 通过early_stop在精度不再提高的时候停止
#early_stop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, verbose=0)
tensorBoard = TensorBoard(log_dir = './output/logs')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Dropout(0.2)(embedded_sequences)
x = LSTM(256,recurrent_dropout=0.2)(x)

preds = Dense(y_train.shape[1], activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [19]:
model.fit(x_train, y_train,
          batch_size=256,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks=[tensorBoard])

Train on 15077 samples, validate on 3769 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f49f812bd68>

In [20]:
model.save('./20news_lstm')

In [21]:
result = model.evaluate(x_val, y_val)
print('Test score is: {}'.format(result[0]))
print('Test accuray is: {}'.format(result[1]))

Test score is: 0.5723360699015101
Test accuray is: 0.8214380471957524


#### 3. 2 神经网络搭建，  Embedding --> Dropout -->  GRU --> Dense -->Output

In [23]:
from keras.layers import GRU

tensorBoard = TensorBoard(log_dir = './output/logs')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
x = Dropout(0.2)(embedded_sequences)
x = GRU(256,recurrent_dropout=0.2)(x)

preds = Dense(y_train.shape[1], activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [24]:
model.fit(x_train, y_train,
          batch_size=256,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks=[tensorBoard])

Train on 15077 samples, validate on 3769 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f49bf96ffd0>

In [25]:
model.save('./20news_GRU')

In [26]:
result = model.evaluate(x_val, y_val)
print('Test score is: {}'.format(result[0]))
print('Test accuray is: {}'.format(result[1]))

Test score is: 0.48670502125220577
Test accuray is: 0.8538073759301646


#### 3. 3 神经网络搭建，  Embedding --> Conv1D --> MaxPooling --> Dropout -->  LSTM --> Dense -->Output

In [27]:
tensorBoard = TensorBoard(log_dir = './output/logs')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(32, 3, activation='relu')(embedded_sequences)
x = MaxPooling1D(3)(x)
x = Dropout(0.2)(x)
x = LSTM(256,recurrent_dropout=0.2)(x)

preds = Dense(y_train.shape[1], activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [28]:
model.fit(x_train, y_train,
          batch_size=256,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks=[tensorBoard])

Train on 15077 samples, validate on 3769 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f49be5a09b0>

In [29]:
model.save('./20news_CNN_LSTM')

In [30]:
result = model.evaluate(x_val, y_val)
print('Test score is: {}'.format(result[0]))
print('Test accuray is: {}'.format(result[1]))

Test score is: 0.8217184117884748
Test accuray is: 0.7280445738413126


#### 3. 4 神经网络搭建，  Embedding --> Conv1D --> MaxPooling --> Dropout -->  GRU --> Dense -->Output

In [31]:
tensorBoard = TensorBoard(log_dir = './output/logs')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(32, 3, activation='relu')(embedded_sequences)
x = MaxPooling1D(3)(x)
x = Dropout(0.2)(x)
x = GRU(256,recurrent_dropout=0.2)(x)

preds = Dense(y_train.shape[1], activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [32]:
model.fit(x_train, y_train,
          batch_size=256,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks=[tensorBoard])

Train on 15077 samples, validate on 3769 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f49b277da90>

In [33]:
model.save('./20news_CNN_GRU')

In [34]:
result = model.evaluate(x_val, y_val)
print('Test score is: {}'.format(result[0]))
print('Test accuray is: {}'.format(result[1]))

Test score is: 0.7877520742989498
Test accuray is: 0.7341469882748494
