In [1]:
# 笔记本目的:
# 笔记本的目的是查看数据集中提供的不同的预先训练的嵌入，并了解它们在模型构建过程中是如何有用的。
# 首先，让我们导入必要的模块并读取输入数据。

In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
# Read the training and test CSV files, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Report the dimensions of each frame.
print('Train shape : ', train_df.shape)
print('Test shape : ', test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [3]:
# 下一步工作如下:
# 将训练数据集分为训练样本和val样本。交叉验证是一个耗时的过程，所以让我们来做简单的val分割。
# 用“_na_”填充文本列中的缺失值
# 标记文本列并将其转换为向量序列
# 根据需要填充序列——如果文本中的单词数量大于'maxlen'，则将其截断为'maxlen'；如果单词数量小于'maxlen'，则用0补齐其余位置。

In [3]:
## some config values
embed_size = 300      # how big is each word vector
max_features = 50000  # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100          # max number of words in a question to use

## hold out 10% of the training rows as a validation set
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

## raw question texts, with missing values replaced by the '_na_' placeholder
train_X, val_X, test_X = (
    frame['question_text'].fillna('_na_').values
    for frame in (train_df, val_df, test_df)
)

## fit the tokenizer vocabulary on the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

## convert every text to a sequence of word ids, then pad/truncate it to maxlen
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (train_X, val_X, test_X)
)

## target labels for the two labelled splits
train_y, val_y = train_df['target'].values, val_df['target'].values

In [4]:
# 没有Pretrained Embeddings:
# 现在我们已经完成了所有必要的预处理步骤，我们可以首先训练一个双向GRU模型。
# 我们不会在这个模型中使用任何预先训练好的单词嵌入，这些嵌入将会从头开始学习。
# 请查看模型摘要以了解所使用的层的详细信息。

In [6]:
# Baseline classifier: the embedding layer is created without pre-trained
# weights, so it is learned from scratch together with the rest of the network.
# Input: one padded sequence of `maxlen` word ids per question.
inp = Input(shape=(maxlen,))
# Look up a trainable `embed_size`-dimensional vector for each of the `max_features` ids.
x = Embedding(max_features, embed_size)(inp)
# Bidirectional GRU (64 units per direction) that returns the full output sequence.
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
# Collapse the time axis by taking the maximum of every feature over the sequence.
x = GlobalMaxPool1D()(x)
x = Dense(16, activation='relu')(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: one probability-like score per question.
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the layer-by-layer summary of the compiled model.
print(model.summary())

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0      

In [7]:
# 使用train samples对模型进行训练，并在验证集上监控。
# 这只是一个只训练了两个epoch的示例模型。调整epochs、batch_size和模型参数可能会得到更好的模型。

In [8]:
## Train the model for two epochs, evaluating on the validation split
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2




















Epoch 2/2






















<keras.callbacks.History at 0x1be54abf048>

In [9]:
#现在让我们得到验证样本预测和F1得分的最佳阈值。

In [16]:
# Validation-set scores from the model trained without pre-trained embeddings.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Try every cut-off from 0.10 to 0.50 in steps of 0.01 and report the F1 score.
for raw_cutoff in np.arange(0.1, 0.501, 0.01):
    cutoff = np.round(raw_cutoff, 2)  # remove floating-point noise from arange
    hard_labels = (pred_noemb_val_y > cutoff).astype(int)
    f1_at_cutoff = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(cutoff, f1_at_cutoff))

F1 score at threshold 0.1 is 0.588994053103275
F1 score at threshold 0.11 is 0.5972192513368983
F1 score at threshold 0.12 is 0.6050332051730164
F1 score at threshold 0.13 is 0.611808709591237
F1 score at threshold 0.14 is 0.6171118153010413
F1 score at threshold 0.15 is 0.6231563421828908
F1 score at threshold 0.16 is 0.6261529097804205
F1 score at threshold 0.17 is 0.6307838479809976
F1 score at threshold 0.18 is 0.6351396998504077
F1 score at threshold 0.19 is 0.639045149929071
F1 score at threshold 0.2 is 0.6426340060494867
F1 score at threshold 0.21 is 0.6454723519662499
F1 score at threshold 0.22 is 0.6485964198535396
F1 score at threshold 0.23 is 0.650110545529333
F1 score at threshold 0.24 is 0.6524999999999999
F1 score at threshold 0.25 is 0.6537711484741475
F1 score at threshold 0.26 is 0.6550916063059226
F1 score at threshold 0.27 is 0.6565107390859665
F1 score at threshold 0.28 is 0.6564777548134451
F1 score at threshold 0.29 is 0.6565917458921801
F1 score at threshold 0.3 

In [18]:
#现在让我们也得到测试集预测并保存它们

In [20]:
# Score the test questions with the current model and keep the result for later.
pred_noemb_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)



In [21]:
#现在我们的模型构建已经完成，在进行下一步之前清理一些内存可能是一个好主意。

In [22]:
import gc

# Drop the references to the model and its tensors, then force a collection pass.
del model
del inp, x
gc.collect()
# Pause for ten seconds (presumably to let memory be released; TODO confirm it is needed).
time.sleep(10)

In [23]:
#所以我们得到了一些没有经过预先训练的嵌入的基线GRU模型。
#现在让我们使用所提供的嵌入并重新构建模型来查看性能

In [25]:
import os

# Show the entries of the embeddings input directory.
embedding_dir_entries = os.listdir('../input/embeddings')
print(embedding_dir_entries)

['glove.840B.300d', 'GoogleNews-vectors-negative300', 'paragram_300_sl999', 'ReadMe.txt', 'wiki-news-300d-1M']


We have four different types of embeddings.
 * GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
 * glove.840B.300d - https://nlp.stanford.edu/projects/glove/
 * paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
 * wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html
 
 A very good explanation of the different types of embeddings is given in this [kernel](https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge). Please refer to it for more details.

**Glove Embeddings:**

In this section, let us use the Glove embeddings and rebuild the GRU model.

In [9]:
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Convert one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Build the word -> vector lookup. The file is opened in a `with` block so the
# handle is closed once parsing is done. Lines of 100 characters or fewer are
# skipped (presumably headers or malformed rows; TODO confirm).
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100)

# np.stack needs a real sequence; passing the dict view directly is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is reserved), so the table needs
# len(word_index) + 1 rows to hold every id, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pre-trained vector keep a random row drawn from the same
# mean / standard deviation as the pre-trained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same architecture as the baseline, but the embedding layer starts from the
# pre-trained matrix. Its row count matches the matrix exactly.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          140544    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_______________________________________________________

In [None]:
# Train the current model for two epochs, evaluating on the validation split.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

In [None]:
# Validation-set scores from the current model, kept for the final blend.
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
# Try every cut-off from 0.10 to 0.50 in steps of 0.01 and report the F1 score.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)  # remove floating-point noise from arange
    # Fixed: the scikit-learn function is `f1_score`; `fi_score` raised AttributeError.
    print('F1 score at threshold {0} is {1}'.format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

结果似乎比没有预训练Embedding的模型要好。

In [None]:
# Score the test questions with the current model and keep the result for the blend.
pred_glove_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [10]:
import gc

# Drop the embedding lookup structures first, then the model and its tensors.
del word_index, embeddings_index, all_embs, embedding_matrix
del model, inp, x
gc.collect()
# Pause for ten seconds (presumably to let memory be released; TODO confirm it is needed).
time.sleep(10)

Wiki News FastText Embeddings:
现在，让我们使用在Wiki新闻语料库上训练的FastText嵌入来代替GloVe嵌入，并重新构建模型。

In [None]:
# Fixed: the constant was misspelled `EMBDDING_FILE`, so the reader below
# silently re-used the previous cell's `EMBEDDING_FILE` value.
EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'


def get_coefs(word, *arr):
    """Convert one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Build the word -> vector lookup. The file is opened in a `with` block so the
# handle is closed once parsing is done. Lines of 100 characters or fewer are
# skipped (presumably the short header line of the .vec file; TODO confirm).
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100)

# np.stack needs a real sequence; passing the dict view directly is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is reserved), so the table needs
# len(word_index) + 1 rows to hold every id, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pre-trained vector keep a random row drawn from the same
# mean / standard deviation as the pre-trained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Bidirectional GRU classifier whose embedding layer starts from the
# pre-trained matrix. Its row count matches the matrix exactly.
inp = Input(shape=(maxlen, ))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(inputs=inp, outputs=x)
# Fixed: the keyword is `metrics`; it was misspelled `metircs`.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the current model for two epochs, evaluating on the validation split.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

In [None]:
# Validation-set scores from the current model, kept for the final blend.
pred_fasttext_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Try every cut-off from 0.10 to 0.50 in steps of 0.01 and report the F1 score.
for raw_cutoff in np.arange(0.1, 0.501, 0.01):
    cutoff = np.round(raw_cutoff, 2)  # remove floating-point noise from arange
    hard_labels = (pred_fasttext_val_y > cutoff).astype(int)
    f1_at_cutoff = metrics.f1_score(val_y, hard_labels)
    print('F1 score at threshold {0} is {1}'.format(cutoff, f1_at_cutoff))

In [None]:
# Score the test questions with the current model and keep the result for the blend.
pred_fasttext_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
import gc

# Drop the embedding lookup structures and the model before the next section.
# Fixed: `embedding_matirx` was a misspelling of `embedding_matrix` (NameError).
del word_index, embeddings_index, all_embs, embedding_matrix, model, inp, x
# Fixed: the module is `gc`; `fc.collect()` raised NameError.
gc.collect()
# Pause for ten seconds (presumably to let memory be released; TODO confirm it is needed).
time.sleep(10)

Paragram嵌入:
    
在本节中，我们使用Paragram嵌入来构建模型并进行预测。

In [None]:
# Fixed: the directory is `embeddings` (it was misspelled `embedings`).
EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'


def get_coefs(word, *arr):
    """Convert one split embedding line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')


# Build the word -> vector lookup. Undecodable bytes are dropped
# (errors='ignore'), and the `with` block closes the handle once parsing is
# done. Lines of 100 characters or fewer are skipped (presumably headers or
# malformed rows; TODO confirm).
with open(EMBEDDING_FILE, encoding='utf-8', errors='ignore') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100)

# np.stack needs a real sequence; passing the dict view directly is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
# Fixed: missing attribute dot (`all_embsshape`).
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras Tokenizer ids start at 1 (0 is reserved), so the table needs
# len(word_index) + 1 rows to hold every id, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Fixed: the matrix was assigned to the misspelled name `embedding_matirx`
# while the loop and the Embedding layer read `embedding_matrix`.
# Words without a pre-trained vector keep a random row drawn from the same
# mean / standard deviation as the pre-trained vectors.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Bidirectional GRU classifier whose embedding layer starts from the
# pre-trained matrix. Its row count matches the matrix exactly.
inp = Input(shape=(maxlen, ))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
# Fixed: the wrapper class is `Bidirectional` (it was typed `BIdirectional`).
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the current model for two epochs, evaluating on the validation split.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

In [None]:
# Validation-set scores from the current model, kept for the final blend.
pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)
# Try every cut-off from 0.10 to 0.50 in steps of 0.01 and report the F1 score.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)  # round to two decimal places
    # Fixed: the scikit-learn function is `f1_score`; `fi_score` raised AttributeError.
    print('F1 score at threshold {0} is {1}'.format(thresh, metrics.f1_score(val_y, (pred_paragram_val_y>thresh).astype(int))))

In [None]:
# Score the test questions with the current model and keep the result for the blend.
pred_paragram_test_y = model.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
import gc

# Drop the embedding lookup structures first, then the model and its tensors.
del word_index, embeddings_index, all_embs, embedding_matrix
del model, inp, x
gc.collect()
# Pause for ten seconds (presumably to let memory be released; TODO confirm it is needed).
time.sleep(10)

观察:<br>
整体的预训练Embedding似乎比非预训练模型得到更好的结果。<br>
不同的预训练Embedding的性能几乎是相似的。<br>
最后的融合:<br>
尽管具有不同预训练嵌入的模型的结果是相似的，但是它们很可能从数据中捕获不同类型的信息。让我们把这三种模型的预测平均起来。

In [None]:
# Weighted average of the three models' validation scores (weights 0.33 / 0.33 / 0.34).
pred_val_y = 0.33*pred_glove_val_y + 0.33*pred_fasttext_val_y + 0.34*pred_paragram_val_y
# Try every cut-off from 0.10 to 0.50 in steps of 0.01 and report the F1 score of the blend.
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)  # remove floating-point noise from arange
    # Fixed: score the blended `pred_val_y`; the name `pred_cal_y` was undefined.
    print('F1 score at threshold {0} is {1}'.format(thresh, metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))))

结果似乎比单独的预训练模型要好，因此让我们使用这个混合模型来创建提交文件。

In [None]:
# Blend the three models' test scores with the same weights used on the validation set.
pred_test_y = 0.33*pred_glove_test_y + 0.33*pred_fasttext_test_y + 0.34*pred_paragram_test_y
# Turn the blended scores into 0/1 labels using a cut-off of 0.35.
pred_test_y = (pred_test_y>0.35).astype(int)
# Fixed: the pandas class is `DataFrame`; `pd.Dataframe` raised AttributeError.
out_df = pd.DataFrame({'qid':test_df['qid'].values})
out_df['prediction'] = pred_test_y
# Write the submission file without the index column.
out_df.to_csv('submission.csv', index=False)


**References:**

Thanks to the below kernels which helped me with this one. 
1. https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
2. https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge