In [1]:
import pandas as pd

train = pd.read_csv('task1_train.csv')
test = pd.read_csv('task1_test.csv')

In [2]:
import jieba
import numpy as np

train['sen_cut'] = train['joke'].apply(jieba.lcut)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.627 seconds.
Prefix dict has been built succesfully.


In [3]:
X_train = train['sen_cut'].apply(lambda x: ' '.join(x)).tolist()
y_train = pd.get_dummies((np.asarray(train["label"])))
text = np.array(X_train)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import time

vocab_size = 30000
maxlen = 25

print("开始统计语料的词频信息...")
t = Tokenizer(vocab_size)
t.fit_on_texts(text)
word_index = t.word_index
print('完整的字典大小：', len(word_index))

print("开始序列化句子...")
X_train = t.texts_to_sequences(X_train)
print("开始对齐句子序列...")
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
print("完成！")

Using TensorFlow backend.


开始统计语料的词频信息...
完整的字典大小： 29702
开始序列化句子...
开始对齐句子序列...
完成！


In [5]:
import copy

small_word_index = copy.deepcopy(word_index) # 防止原来的字典被改变
x = list(t.word_counts.items())
s = sorted(x, key=lambda p:p[1], reverse=True)
print("移除word_index字典中的低频词...")
for item in s[20000:]:
    small_word_index.pop(item[0]) # 对字典pop
print("完成！")
print(len(small_word_index))
print(len(word_index))

移除word_index字典中的低频词...
完成！
20000
29702


In [6]:
print(type(X_train))
print(type(y_train))

print(X_train.shape)
print(y_train.shape)

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
(16420, 25)
(16420, 2)


In [7]:
import gensim

model_file = 'E:/embedding/wiki.zh.vec' # input your file path
print("加载Word2Vec模型...")
wv_model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=False)

加载Word2Vec模型...


In [8]:
embedding_matrix = np.random.uniform(size=(vocab_size+1,300))
print("构建embedding_matrix...")
for word, index in small_word_index.items():
    try:
        word_vector = wv_model[word]
        embedding_matrix[index] = word_vector
    except:
        print("Word: [",word,"] not in wvmodel! Use random embedding instead.")
print("完成！")
print("Embedding matrix shape:\n",embedding_matrix.shape)

构建embedding_matrix...
Word: [ “ ] not in wvmodel! Use random embedding instead.
Word: [ ” ] not in wvmodel! Use random embedding instead.
Word: [ 两个 ] not in wvmodel! Use random embedding instead.
Word: [ 地说 ] not in wvmodel! Use random embedding instead.
Word: [ 这是 ] not in wvmodel! Use random embedding instead.
Word: [ 每个 ] not in wvmodel! Use random embedding instead.
Word: [ 我会 ] not in wvmodel! Use random embedding instead.
Word: [ ’ ] not in wvmodel! Use random embedding instead.
Word: [ 服务员 ] not in wvmodel! Use random embedding instead.
Word: [ 有个 ] not in wvmodel! Use random embedding instead.
Word: [ 哥们 ] not in wvmodel! Use random embedding instead.
Word: [ 阿凡提 ] not in wvmodel! Use random embedding instead.
Word: [ 5 ] not in wvmodel! Use random embedding instead.
Word: [ 3 ] not in wvmodel! Use random embedding instead.
Word: [ 10 ] not in wvmodel! Use random embedding instead.
Word: [ 有没有 ] not in wvmodel! Use random embedding instead.
Word: [ 三个 ] not in wvmodel! Use ran

Word: [ 常说 ] not in wvmodel! Use random embedding instead.
Word: [ 达尔文 ] not in wvmodel! Use random embedding instead.
Word: [ 一百元 ] not in wvmodel! Use random embedding instead.
Word: [ 哭闹 ] not in wvmodel! Use random embedding instead.
Word: [ 一切都在 ] not in wvmodel! Use random embedding instead.
Word: [ 近视眼 ] not in wvmodel! Use random embedding instead.
Word: [ 听过 ] not in wvmodel! Use random embedding instead.
Word: [ 自动扶梯 ] not in wvmodel! Use random embedding instead.
Word: [ 每天晚上 ] not in wvmodel! Use random embedding instead.
Word: [ 没有勇气 ] not in wvmodel! Use random embedding instead.
Word: [ 门上 ] not in wvmodel! Use random embedding instead.
Word: [ 买菜 ] not in wvmodel! Use random embedding instead.
Word: [ 他爸 ] not in wvmodel! Use random embedding instead.
Word: [ 伪装成 ] not in wvmodel! Use random embedding instead.
Word: [ 发挥作用 ] not in wvmodel! Use random embedding instead.
Word: [ 绕过 ] not in wvmodel! Use random embedding instead.
Word: [ 坐车 ] not in wvmodel! Use random em

Word: [ 法盲 ] not in wvmodel! Use random embedding instead.
Word: [ 一款 ] not in wvmodel! Use random embedding instead.
Word: [ 遇到困难 ] not in wvmodel! Use random embedding instead.
Word: [ 地喊 ] not in wvmodel! Use random embedding instead.
Word: [ 我读 ] not in wvmodel! Use random embedding instead.
Word: [ 凉皮 ] not in wvmodel! Use random embedding instead.
Word: [ 这时候 ] not in wvmodel! Use random embedding instead.
Word: [ 长命百岁 ] not in wvmodel! Use random embedding instead.
Word: [ 闪光灯 ] not in wvmodel! Use random embedding instead.
Word: [ 憋住 ] not in wvmodel! Use random embedding instead.
Word: [ 付帐 ] not in wvmodel! Use random embedding instead.
Word: [ 王小二 ] not in wvmodel! Use random embedding instead.
Word: [ 不怎么样 ] not in wvmodel! Use random embedding instead.
Word: [ 酷毙了 ] not in wvmodel! Use random embedding instead.
Word: [ 黑匣子 ] not in wvmodel! Use random embedding instead.
Word: [ 综合症 ] not in wvmodel! Use random embedding instead.
Word: [ 大不了 ] not in wvmodel! Use random emb

Word: [ 上见 ] not in wvmodel! Use random embedding instead.
Word: [ 实习生 ] not in wvmodel! Use random embedding instead.
Word: [ 药剂师 ] not in wvmodel! Use random embedding instead.
Word: [ 小箱子 ] not in wvmodel! Use random embedding instead.
Word: [ 里取 ] not in wvmodel! Use random embedding instead.
Word: [ 字迹 ] not in wvmodel! Use random embedding instead.
Word: [ 翻车 ] not in wvmodel! Use random embedding instead.
Word: [ 基因突变 ] not in wvmodel! Use random embedding instead.
Word: [ 毫不迟疑 ] not in wvmodel! Use random embedding instead.
Word: [ 经常出现 ] not in wvmodel! Use random embedding instead.
Word: [ 考是 ] not in wvmodel! Use random embedding instead.
Word: [ 定睛一看 ] not in wvmodel! Use random embedding instead.
Word: [ 想买个 ] not in wvmodel! Use random embedding instead.
Word: [ 一个七八岁 ] not in wvmodel! Use random embedding instead.
Word: [ 未干 ] not in wvmodel! Use random embedding instead.
Word: [ 金银花 ] not in wvmodel! Use random embedding instead.
Word: [ 准星 ] not in wvmodel! Use random 

Word: [ 百货商店 ] not in wvmodel! Use random embedding instead.
Word: [ 一呼 ] not in wvmodel! Use random embedding instead.
Word: [ 新月 ] not in wvmodel! Use random embedding instead.
Word: [ 会到 ] not in wvmodel! Use random embedding instead.
Word: [ 内心深处 ] not in wvmodel! Use random embedding instead.
Word: [ 猫扑 ] not in wvmodel! Use random embedding instead.
Word: [ 怒视 ] not in wvmodel! Use random embedding instead.
Word: [ 朱哈 ] not in wvmodel! Use random embedding instead.
Word: [ 我忘关 ] not in wvmodel! Use random embedding instead.
Word: [ 或否 ] not in wvmodel! Use random embedding instead.
Word: [ 杰瑞 ] not in wvmodel! Use random embedding instead.
Word: [ 大声疾呼 ] not in wvmodel! Use random embedding instead.
Word: [ 一头雾水 ] not in wvmodel! Use random embedding instead.
Word: [ 诺基亚 ] not in wvmodel! Use random embedding instead.
Word: [ 传教士 ] not in wvmodel! Use random embedding instead.
Word: [ 摔下来 ] not in wvmodel! Use random embedding instead.
Word: [ 赛球 ] not in wvmodel! Use random embe

Word: [ 嬉皮笑脸 ] not in wvmodel! Use random embedding instead.
Word: [ 宠物商店 ] not in wvmodel! Use random embedding instead.
Word: [ 整日 ] not in wvmodel! Use random embedding instead.
Word: [ 哥伦比亚大学 ] not in wvmodel! Use random embedding instead.
Word: [ 纽约市 ] not in wvmodel! Use random embedding instead.
Word: [ 沉船 ] not in wvmodel! Use random embedding instead.
Word: [ 无拘无束 ] not in wvmodel! Use random embedding instead.
Word: [ 自赏 ] not in wvmodel! Use random embedding instead.
Word: [ 自恋 ] not in wvmodel! Use random embedding instead.
Word: [ 鸡毛掸子 ] not in wvmodel! Use random embedding instead.
Word: [ 回吻 ] not in wvmodel! Use random embedding instead.
Word: [ 商务旅行 ] not in wvmodel! Use random embedding instead.
Word: [ 蛋白质 ] not in wvmodel! Use random embedding instead.
Word: [ 凝胶 ] not in wvmodel! Use random embedding instead.
Word: [ 有鱼 ] not in wvmodel! Use random embedding instead.
Word: [ 死鱼 ] not in wvmodel! Use random embedding instead.
Word: [ 上出 ] not in wvmodel! Use random 

Word: [ 滑雪者 ] not in wvmodel! Use random embedding instead.
Word: [ 体瘦且 ] not in wvmodel! Use random embedding instead.
Word: [ 奇长 ] not in wvmodel! Use random embedding instead.
Word: [ 其长 ] not in wvmodel! Use random embedding instead.
Word: [ 卡布奇诺 ] not in wvmodel! Use random embedding instead.
Word: [ 李有 ] not in wvmodel! Use random embedding instead.
Word: [ 李太太 ] not in wvmodel! Use random embedding instead.
Word: [ 惊道 ] not in wvmodel! Use random embedding instead.
Word: [ 百慕大 ] not in wvmodel! Use random embedding instead.
Word: [ 动作片 ] not in wvmodel! Use random embedding instead.
Word: [ 罗马不是 ] not in wvmodel! Use random embedding instead.
Word: [ 有玉 ] not in wvmodel! Use random embedding instead.
Word: [ 呐喊助威 ] not in wvmodel! Use random embedding instead.
Word: [ 好客 ] not in wvmodel! Use random embedding instead.
Word: [ 上大吼 ] not in wvmodel! Use random embedding instead.
Word: [ 回头率 ] not in wvmodel! Use random embedding instead.
Word: [ 女追男 ] not in wvmodel! Use random em

Word: [ 狂吞 ] not in wvmodel! Use random embedding instead.
Word: [ 五十张 ] not in wvmodel! Use random embedding instead.
Word: [ 特别感谢 ] not in wvmodel! Use random embedding instead.
Word: [ 杰森 ] not in wvmodel! Use random embedding instead.
Word: [ 每学期 ] not in wvmodel! Use random embedding instead.
Word: [ 一牛人 ] not in wvmodel! Use random embedding instead.
Word: [ 高一上 ] not in wvmodel! Use random embedding instead.
Word: [ 高三上 ] not in wvmodel! Use random embedding instead.
Word: [ 下学期 ] not in wvmodel! Use random embedding instead.
Word: [ 高考落榜 ] not in wvmodel! Use random embedding instead.
Word: [ 名曰 ] not in wvmodel! Use random embedding instead.
Word: [ 华清 ] not in wvmodel! Use random embedding instead.
Word: [ 台湾人 ] not in wvmodel! Use random embedding instead.
Word: [ 左读 ] not in wvmodel! Use random embedding instead.
Word: [ 恰为 ] not in wvmodel! Use random embedding instead.
Word: [ 池浴 ] not in wvmodel! Use random embedding instead.
Word: [ 学完 ] not in wvmodel! Use random embed

In [9]:
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, GRU, LSTM, Activation, Dropout, Embedding
from keras.layers import Multiply, Concatenate, Dot

from sklearn.metrics import f1_score

# lstm
wv_dim = 300
n_timesteps = maxlen
inputs = Input(shape=(maxlen,))
embedding_sequences = Embedding(vocab_size+1, wv_dim, input_length=maxlen, weights=[embedding_matrix])(inputs)
lstm = LSTM(128, return_sequences= False)(embedding_sequences)
l = Dense(128, activation="tanh")(lstm)
l = Dropout(0.5)(l)
l = Dense(2, activation="softmax")(l)
m = Model(inputs, l)
m.summary()
m.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
m.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

W0901 18:21:27.335547 144828 deprecation_wrapper.py:119] From D:\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0901 18:21:27.347527 144828 deprecation_wrapper.py:119] From D:\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0901 18:21:27.349545 144828 deprecation_wrapper.py:119] From D:\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0901 18:21:27.355536 144828 deprecation_wrapper.py:119] From D:\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0901 18:21:27.356534 144828 deprecation_wrapper.py:119] From D:\Anaconda3\lib\site-packages\keras\b

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 25)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 25, 300)           9000300   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 9,236,718
Trainable params: 9,236,718
Non-trainable params: 0
_________________________________________________________________


KeyboardInterrupt: 