In [1]:
from urllib.request import urlretrieve
import os
import codecs
import pickle
import sys
import tarfile
import zipfile 

from sklearn.datasets import load_files
from sklearn.utils import shuffle

from gensim.models import *

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### 引入需要的基本库
#### 定义文件名称

In [2]:
# File name of the zlib-compressed pickle cache that load_categories writes
# and read_categories reads back.
CACHE_NAME = "20news-bydate.pkz"
# Folder names handed to sklearn's load_files for the train and test splits.
TRAIN_FOLDER = "20news-bydate-train"
TEST_FOLDER = "20news-bydate-test"

### 下载数据集

In [3]:
# Download a dataset archive; this only needs to run once.
def data_download(url, filename):
    """Fetch ``url + filename`` into ``filename`` unless that file already exists.

    Args:
        url: Base URL; ``filename`` is appended to it to form the download URL.
        filename: Local path to check and, when absent, to download into.

    Returns:
        The local file name (as reported by ``urlretrieve`` after a download,
        or ``filename`` unchanged when the file was already present).

    Raises:
        Exception: if ``filename`` already exists but is not one of the two
            known archives ('text8.zip', '20news-bydate.tar.gz').
    """
    known_archives = ('text8.zip', '20news-bydate.tar.gz')

    if os.path.exists(filename):
        # An existing file is only accepted when it is one of the expected archives.
        if filename not in known_archives:
            raise Exception("Please check your origin dataset.")
        size_in_bytes = os.stat(filename).st_size
        print("{} has already existed,file size are {}".format(filename, size_in_bytes))
        return filename

    local_path, _headers = urlretrieve(url + filename, filename)
    return local_path

In [4]:
# Fetch both archives by relative file name; data_download skips the download
# and only prints the file size when the archive is already on disk.
news = data_download('http://www.qwone.com/~jason/20Newsgroups/', '20news-bydate.tar.gz')
text8 = data_download('http://mattmahoney.net/dc/', 'text8.zip')

20news-bydate.tar.gz has already existed,file size are 14464277
text8.zip has already existed,file size are 31344016


In [5]:
# Unpack the text8 corpus archive.
# Only needs to run once; later runs can skip this step.
def load_txts(fpath):
    """Extract every member of the zip archive at ``fpath`` into the current directory.

    Args:
        fpath: Path to a zip archive.

    Returns:
        The list of member names that were extracted, in archive order.
    """
    # The context manager closes the archive even when extraction raises;
    # a manual close() after the loop would be skipped in that case.
    with zipfile.ZipFile(fpath, 'r') as archive:
        members = archive.namelist()
        archive.extractall(members=members)
    return members
# NOTE(review): this rebinds `text8` (previously the downloaded archive's file
# name) to whatever load_txts returns; later cells open the corpus by the
# literal path 'text8' rather than through this variable.
text8 = load_txts('text8.zip')

In [6]:
# Unpack the 20 Newsgroups archive and cache both splits.
# Only needs to run once; later runs can skip this step.


def load_categories(fpath):
    """Extract the tar archive at ``fpath`` and write a compressed cache of its splits.

    The archive is extracted into the current directory, the TRAIN_FOLDER and
    TEST_FOLDER trees are read with ``load_files`` (latin1 decoding), and the
    resulting ``{'train': ..., 'test': ...}`` dict is pickled, zlib-compressed
    and written to CACHE_NAME.

    Args:
        fpath: Path to the tar archive.

    Returns:
        None. The result is only persisted to disk.
    """
    # The with-statement closes the archive on exit; no explicit close() needed.
    with tarfile.open(fpath) as archive:
        archive.extractall()

    splits = {
        'train': load_files(TRAIN_FOLDER, encoding='latin1'),
        'test': load_files(TEST_FOLDER, encoding='latin1'),
    }
    payload = codecs.encode(pickle.dumps(splits), 'zlib_codec')

    with open(CACHE_NAME, 'wb') as cache_file:
        cache_file.write(payload)  # write the .pkz file
        
# Extract the downloaded archive and (re)write the CACHE_NAME cache file.
load_categories('20news-bydate.tar.gz')

### 读取数据集

In [7]:
# Read one split of the cached dataset.
def read_categories(subset,random_state=42):
    """Load one split from the CACHE_NAME cache file and shuffle it.

    Args:
        subset: Key into the cached dict (load_categories stores 'train' and 'test').
        random_state: Seed forwarded to ``shuffle`` so the ordering is reproducible.

    Returns:
        The cached object for ``subset`` with its ``data`` and ``target``
        shuffled in unison, or None when the cache file is empty.

    Raises:
        OSError: from ``os.path.getsize`` if the cache file cannot be stat'ed.
        Exception: whatever reading, decompressing or unpickling raised; it is
            re-raised after the diagnostic is printed.
        KeyError: if ``subset`` is not a key of the cached dict.
    """
    if not os.path.getsize(CACHE_NAME):
        # Empty cache file: nothing to load.
        return None

    try:
        with open(CACHE_NAME, 'rb') as f:
            compressed_content = f.read()
        uncompressed_content = codecs.decode(
            compressed_content, 'zlib_codec')
        # NOTE(security): unpickling runs arbitrary code from the file; only
        # load a cache that this notebook wrote itself.
        cache = pickle.loads(uncompressed_content)
    except Exception as e:
        print(80 * '_')
        print('Cache loading failed')
        print(80 * '_')
        print(e)
        # Re-raise: continuing would only fail later with an unrelated
        # "cache referenced before assignment" error.
        raise

    data = cache[subset]
    data.data, data.target = shuffle(data.data, data.target,random_state=random_state)

    return data

In [8]:
# Load both splits from the cache, then show the first training document and
# the first five training labels as a quick sanity check.
news_train = read_categories(subset='train')  # training split
news_test = read_categories(subset='test')   # test split
print(news_train.data[0])
print(news_train.target[:5])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





[ 7  4  4  1 14]


### 使用Text8 语料训练 word2vec模型

In [11]:
# Train the word-vector model with gensim.
# NOTE(review): `size=` / `iter=` are the keyword names of older gensim
# releases (newer ones use `vector_size=` / `epochs=`) — confirm the pinned
# gensim version. No seed is passed, so retraining may not be reproducible.
dim = 128

text =  word2vec.Text8Corpus('text8')   # train on the pre-built text8 corpus
w2v_model = Word2Vec(text, size = dim, min_count=1,iter = 10) # 128-dimensional word vectors; CBOW mode per the original note (no `sg` argument is passed)
w2v_model.save('word2vec_model')

#### 对word2vec词向量模型进行测试

In [1]:
# Sanity-check the word-vector model: reload it from disk and look up one vector.
# NOTE(review): the recorded output of this cell is a NameError for `Word2Vec`,
# which suggests it was last run in a fresh kernel before the import cell.
w2v_model=Word2Vec.load('word2vec_model')

w2v_model.wv['deep']

NameError: name 'Word2Vec' is not defined

In [10]:
# Analogy check: each entry is an "a b x" triple. The model is asked for the
# word closest to x + b - a (positive=[x, b], negative=[a]) and the top hit is
# printed as "a is to b as x is to <prediction>".
word_example =['he his she','big bigger heavy','shanghai china paris']
for example in word_example:
    a,b,x = example.split()
    pred = w2v_model.wv.most_similar(positive=[x,b], negative=[a])[0][0]
    print('{} is to {} as {} is to {}'.format(a,b,x,pred))
# Odd-one-out check on a list of meal-related words.
print(w2v_model.wv.doesnt_match("breakfast cereal dinner lunch".split()))

he is to his as she is to her
big is to bigger as heavy is to heavier
shanghai is to china as paris is to france
cereal


#### 构建基于训练的 text8 模型 的字典序列word_index

In [11]:
# Map every vocabulary word to the `.index` of its vocab entry (per the original
# note these indices follow word frequency),
# e.g. model_index_word['you'] = 206, model_index_word['beauty'] = 3714
def index_word(model):
    """Build a ``word -> index`` dict from ``(word, entry)`` pairs.

    Args:
        model: Iterable of ``(word, entry)`` pairs where each ``entry`` exposes
            an ``index`` attribute (the caller passes ``wv.vocab.items()``).

    Returns:
        dict mapping each word to ``entry.index``; for a repeated word the
        last pair wins.
    """
    return {token: entry.index for token, entry in model}
# word -> index lookup built from the trained model's vocabulary entries.
model_index_word = index_word(w2v_model.wv.vocab.items())

In [12]:
# Invert a word -> index mapping into an index -> word lookup,
# e.g. word_index[206] = 'you', word_index[3714] = 'beauty'
def word_index(vocab_dict):
    """Return the ``index -> word`` inverse of a ``word -> index`` dict.

    The stored index values are used as keys. Enumerating the dict instead
    would number the words by iteration order, which need not match the
    indices held in the dict.

    Args:
        vocab_dict: Mapping from word to its integer index.

    Returns:
        dict mapping each index to its word. If two words share an index,
        the one iterated last wins.
    """
    return {idx: word for word, idx in vocab_dict.items()}
# NOTE(review): this rebinds the name `word_index` from the function to its
# result, so the function can no longer be called after this line runs.
# Later cells only use len(word_index).
word_index = word_index(model_index_word)

### 清洗数据
#### 并获得20news 基于text8 词向量模型 的word to index 字典

In [13]:
#去除各种标点符号和标签，获得清洗过的数据
import re

def preprocessor(text):
    """Lower-case ``text`` and replace every run of non-word characters with one space.

    Punctuation, markup symbols and whitespace runs all collapse to a single
    space; leading and trailing whitespace is stripped. No stemming is applied.

    Args:
        text: Raw document string.

    Returns:
        The cleaned, lower-cased string with tokens separated by single spaces.
    """
    # Raw string on purpose: '\W' inside a normal literal is an invalid escape
    # sequence that Python warns about. The pattern itself is unchanged.
    return re.sub(r'\W+', ' ', text.lower()).strip()
# Clean every document of both splits with preprocessor.
train_data = list(map(preprocessor,news_train.data))
test_data = list(map(preprocessor,news_test.data))

# Quick check: number of documents per split and the tokens of the first one.
print(len(train_data),len(test_data))
print(train_data[0].split())

11314 7532
['from', 'lerxst', 'wam', 'umd', 'edu', 'where', 's', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac3', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', 'it', 'was', 'a', '2', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', '60s', 'early', '70s', 'it', 'was', 'called', 'a', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'i', 'know', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'e', 'mail', 'tha

In [14]:
def _encode_sentences(sentences, vocab_index):
    """Turn whitespace-separated sentences into lists of vocabulary indices.

    Args:
        sentences: Iterable of cleaned document strings.
        vocab_index: Mapping from word to integer index.

    Returns:
        One list of indices per sentence. Words missing from ``vocab_index``
        all share the index ``len(vocab_index)``, one past the last real entry.
    """
    oov_index = len(vocab_index)
    return [
        [vocab_index.get(word, oov_index) for word in sentence.split()]
        for sentence in sentences
    ]


# Same encoding for both splits, so the logic lives in one helper instead of
# two copy-pasted loops.
trainX_raw = _encode_sentences(train_data, model_index_word)
testX_raw = _encode_sentences(test_data, model_index_word)

### 预处理数据，使用Keras API 获得数据的word_index


In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D,Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model
from keras.callbacks import EarlyStopping

# Text sequence length (passed as `maxlen` to pad_sequences and as the model's input length)
MAX_SEQUENCE_LENGTH = 300
# Maximum number of words
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
MAX_NUM_WORDS = 12000
# Word-vector dimensionality, read from the trained model's weight matrix
EMBEDDING_DIM = w2v_model.wv.syn0.shape[1]

Using TensorFlow backend.


In [16]:
# Bring every index sequence to MAX_SEQUENCE_LENGTH.
# NOTE(review): pad_sequences pads with 0 unless told otherwise, and 0 also
# looks like a real vocabulary index here — confirm this is intended.
x_train = pad_sequences(trainX_raw, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(testX_raw, maxlen=MAX_SEQUENCE_LENGTH)
# Converting labels to binary vectors
y_train = to_categorical(np.asarray(news_train.target))
y_test = to_categorical(np.asarray(news_test.target))

### 神经网络构建

#### 1. 构建embedding矩阵，对存在于字表中的单词，使用已经训练好的w2v_model模型

In [17]:
# One row per entry of word_index plus one trailing row that stays all-zero.
# NOTE(review): the trailing row lines up with the out-of-vocabulary index
# len(model_index_word) used when encoding sentences — this assumes
# len(word_index) == len(model_index_word).
embedding_matrix = np.zeros([(len(word_index)+1), EMBEDDING_DIM])
# Every row except the last is copied from the trained word2vec weights.
embedding_matrix[:-1] = w2v_model.wv.syn0

#### 2. embedding_layer 层，该层设置为trainable=False 

In [18]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
# The input dimension len(word_index)+1 matches the row count of embedding_matrix.
embedding_layer = Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

#### 3. 神经网络搭建，  Embedding --> Dropout -->  LSTM --> Dense -->Output

In [2]:

from keras.callbacks import TensorBoard

# Early stopping is currently disabled; only the TensorBoard callback is used.
#early_stop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=25)
tensorBoard = TensorBoard(log_dir = './output/output/logs')

# Architecture: Embedding (frozen) -> Dropout -> LSTM -> Dense -> softmax output.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Dropout(0.5)(embedded_sequences)
# Alternative Conv1D/MaxPooling stack, left disabled in favour of the LSTM below.
#x = Conv1D(128, 5, activation='relu')(x)
#x = MaxPooling1D(5)(x)
#x = Conv1D(128, 5, activation='relu')(x)
#x = MaxPooling1D(5)(x)
#x = Conv1D(128, 5, activation='relu')(x)
#x = MaxPooling1D(5)(x)
#x = GlobalMaxPooling1D()(x)
x = LSTM(128)(x)
x = Dense(128, activation='relu')(x)
# One output unit per column of the one-hot label matrix.
preds = Dense(y_train.shape[1], activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

Using TensorFlow backend.


NameError: name 'Input' is not defined

In [25]:
# Train for 50 epochs, logging to TensorBoard.
# NOTE(review): the test split is used as validation data here, so the
# validation metrics are not independent of the final test evaluation.
model.fit(x_train, y_train,
          batch_size=256,
          epochs=50,
          validation_data=(x_test, y_test),
          callbacks=[tensorBoard])

Train on 11314 samples, validate on 7532 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f923432f6d8>

In [26]:
# Persist the trained model under ./output (the directory is not created by this cell).
model.save('./output/20news_lstm')

In [28]:
# Evaluate on the test split; result[0] is the loss and result[1] the 'acc'
# metric requested in model.compile.
# NOTE(review): 'accuray' in the message below is a typo for 'accuracy'.
result = model.evaluate(x_test, y_test)
print('Test score is: {}'.format(result[0]))
print('Test accuray is: {}'.format(result[1]))

Test score is: 1.0922839744763366
Test accuray is: 0.7032660648218825
