# PPDAI Text Mining

In [1]:
%load_ext autoreload
%autoreload 2
from ppdaiutil import *

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
config = {
    'TRAIN_PATH':'data/train.csv',
    'TEST_PATH':'data/test.csv',
    'QUESTION_PATH' : 'data/question.csv',   
}

** read data **

In [3]:
print('Load files...')
data={
    'qes' : pd.read_csv(config['QUESTION_PATH']),
    'tr' : pd.read_csv(config['TRAIN_PATH']),
    'te' : pd.read_csv(config['TEST_PATH']),
    #'co' : questions['words'],
}
data['co']=data['qes']['words']

Load files...


In [4]:
if False:
    display(data['qes'].head())
    display(data['tr'].head())
    display(data['te'].head())

**1. ID轉成詞語序列or單字序列**

In [5]:
def get_ids(qids):
    ids = []
    for t_ in qids:
        ids.append(int(t_[1:]))
    return np.asarray(ids)

def get_textschars(d):
    all_words = data['qes']['words']
    all_chars = data['qes']['chars']
    q1id, q2id = d['q1'], d['q2']
    id1s, id2s = get_ids(q1id), get_ids(q2id)
    q1_texts = []
    q2_texts = []
    for t_ in zip(id1s, id2s):
        q1_texts.append(all_words[t_[0]])
        q2_texts.append(all_words[t_[1]])
    d['q1_texts'] = q1_texts
    d['q2_texts'] = q2_texts
    
    q1_chars = []
    q2_chars = []
    for t_ in zip(id1s, id2s):
        q1_chars.append(all_chars[t_[0]])
        q2_chars.append(all_chars[t_[1]])
    d['q1_chars'] = q1_chars
    d['q2_chars'] = q2_chars
    

print('Get texts/chars...')
get_textschars(data['tr'])
get_textschars(data['te'])

Get texts/chars...


In [6]:
data['tr'].head()

Unnamed: 0,label,q1,q2,q1_texts,q2_texts,q1_chars,q2_chars
0,1,Q397345,Q538594,W04465 W04058 W05284 W02916,W18238 W18843 W01490 W09905,L2218 L2568 L0360 L0242 L2218 L0741,L3019 L0104 L0582 L2218 L1861 L1556 L0242
1,0,Q193805,Q699273,W10054 W04476 W09996 W12244 W18103,W18439 W00863 W04259 W00740 W16070,L2376 L2168 L0050 L1187 L0104 L2432 L0902 L014...,L0156 L2452 L1187 L0104 L2459 L2979 L2613 L0449
2,0,Q085471,Q676160,W04346 W17378 W19355 W17926 W14185 W11567 W07863,W14586 W09745 W06017 W09067 W16319,L2323 L1526 L2214 L1132 L2723 L1861 L2249 L050...,L2568 L0971 L1291 L0358 L0037 L2582
3,0,Q189314,Q438123,W17508 W09996 W19662 W17534 W11399 W17057 W182...,W18238 W02357 W06606,L0018 L2321 L1346 L2432 L0902 L1149 L1980 L187...,L3019 L0104 L1104 L1935 L1683 L2495 L2812
4,0,Q267714,Q290126,W13157 W03390 W01952 W05789 W17378 W08714 W13157,W04476 W06606 W00316 W13157,L2271 L1346 L1389 L2932 L0466 L2218 L1971 L221...,L0050 L1187 L0104 L1683 L2495 L2812 L1588 L255...


**2. 序列化**
- tokenizer

In [7]:

with open('data/word_embed.txt') as f:
    MAX_NB_WORDS = (len(list(f)))

trq1_text=data['tr']['q1_texts'].values
trq2_text=data['tr']['q2_texts'].values
teq1_text=data['te']['q1_texts'].values
teq2_text=data['te']['q2_texts'].values
alltext=np.concatenate([trq1_text, trq2_text, teq1_text, teq2_text])
MAX_SEQUENCE_LENGTH = max(list(map(lambda x: len(x), alltext))) 

tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 
tokenizer.fit_on_texts(alltext) 

In [8]:
data['tr']['q1_sequences'] = tokenizer.texts_to_sequences(trq1_text) 
data['tr']['q2_sequences'] = tokenizer.texts_to_sequences(trq2_text) 
data['te']['q1_sequences'] = tokenizer.texts_to_sequences(teq1_text) 
data['te']['q2_sequences'] = tokenizer.texts_to_sequences(teq2_text) 
#data['tr'].head()

In [9]:
word_index = tokenizer.word_index 
print('Found %s unique tokens' % len(word_index)) 

Found 15880 unique tokens


** 3. pad_sequences **

In [10]:
data['trq1_padseq'] = pad_sequences(data['tr']['q1_sequences'], maxlen=MAX_SEQUENCE_LENGTH) 
data['trq2_padseq'] = pad_sequences(data['tr']['q2_sequences'], maxlen=MAX_SEQUENCE_LENGTH) 
data['teq1_padseq'] = pad_sequences(data['te']['q1_sequences'], maxlen=MAX_SEQUENCE_LENGTH) 
data['teq2_padseq'] = pad_sequences(data['te']['q2_sequences'], maxlen=MAX_SEQUENCE_LENGTH) 

** 4. prepare embeddings**

In [51]:

EMBEDDING_FILE='data/word_embed.txt'
EMBEDDING_DIM = 300

embeddings_index = {} 
f = open(EMBEDDING_FILE,"rb") 
for line in f: 
    values = line.split() 
    word = values[0] 
    coefs = np.asarray(values[1:], dtype='float32') 
    embeddings_index[word] = coefs 
f.close() 

nb_words = len(word_index)+1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM)) 
for word, i in word_index.items(): 
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector 

    

** prepare training data**

In [55]:
## sample train/validation data

VALIDATION_SPLIT = 0.1
trlen = len(data['trq1_padseq'])
perm = np.random.permutation(trlen)
idx_train = perm[:int(trlen*(1-VALIDATION_SPLIT))] 
idx_val = perm[int(trlen*(1-VALIDATION_SPLIT)):] 

data_trainq1=data['trq1_padseq'][idx_train] 
data_trainq2=data['trq2_padseq'][idx_train] 
data_valq1=data['trq1_padseq'][idx_val] 
data_valq2=data['trq2_padseq'][idx_val] 

labels_train = data['tr']['label'][idx_train] 
labels_val = data['tr']['label'][idx_val] 


date_testq1 = data['teq1_padseq']
date_testq2 = data['teq2_padseq']


In [53]:
embedding_layer = Embedding(input_dim=nb_words, output_dim=300, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False) 
num_lstm = 300 
num_dense = 256 
rate_drop_lstm = 0.25 
rate_drop_dense = 0.25 
act = 'relu' 


q1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='q1_input') 
q2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='q2_input') 
q1_embseq= embedding_layer(q1_input) 
q2_embseq= embedding_layer(q2_input) 

lstm_layerq1 = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,return_sequences=True, name='q1_lstm') 
q1_lstm = lstm_layerq1(q1_embseq) 
q1_drop = Dropout(rate_drop_dense, name='q1_drop')(q1_lstm) 
q1_att = Attention(MAX_SEQUENCE_LENGTH)(q1_drop)

lstm_layerq2 = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,return_sequences=True, name='q2_lstm') 
q2_lstm = lstm_layerq2(q2_embseq) 
q2_drop = Dropout(rate_drop_dense, name='q2_drop')(q2_lstm) 
q2_att = Attention(MAX_SEQUENCE_LENGTH)(q2_drop)

q1q2_concat = Concatenate(axis=-1,name='q1q2concat')([q1_att,q2_att])
q1q2_concat = Dense(num_dense, activation=act, name='Q_dense')(q1q2_concat) 
q1q2_concat = Dropout(rate_drop_dense, name='Q_drop')(q1q2_concat) 
q1q2_concat = BatchNormalization(name='Q_batchnorm')(q1q2_concat) 
preds = Dense(1, activation='sigmoid', name='Q_output')(q1q2_concat)

model = Model(inputs=[q1_input, q2_input],  outputs=preds) 
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy']) 
print(model.summary()) 
plot_model(model, to_file='model.png')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
q1_input (InputLayer)           (None, 272)          0                                            
__________________________________________________________________________________________________
q2_input (InputLayer)           (None, 272)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 272, 300)     4764300     q1_input[0][0]                   
                                                                 q2_input[0][0]                   
__________________________________________________________________________________________________
q1_lstm (LSTM)                  (None, 272, 300)     721200      embedding_2[0][0]                
__________

** training **

In [57]:

STAMP = 'model/simple_lstm_glove_vectors_%.2f_%.2f'%(rate_drop_lstm,rate_drop_dense) 
print('STAMP',STAMP)
bst_model_path = STAMP + '.h5' 
print('bst_model_path',bst_model_path) 

early_stopping =EarlyStopping(monitor='val_loss', patience=5) 
#model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True) 

hist = model.fit([data_trainq1, data_trainq2], labels_train, validation_data=([data_valq1,data_valq2], labels_val), epochs=50, batch_size=256, shuffle=True, callbacks=[early_stopping]) 

#model.load_weights(bst_model_path) 
bst_val_score = min(hist.history['val_loss']) 

y_test = model.predict([date_testq1, date_testq2], batch_size=1024, verbose=1) 

#data['sam'][list_classes] = y_test 
#data['sam'].to_csv('%.4f_'%(bst_val_score) + STAMP + '.csv', index=False)

    
    
    

STAMP model/simple_lstm_glove_vectors_0.25_0.25
bst_model_path model/simple_lstm_glove_vectors_0.25_0.25.h5
Train on 228947 samples, validate on 25439 samples
Epoch 1/50
  1024/228947 [..............................] - ETA: 9:00:30 - loss: 0.6942 - acc: 0.5244

KeyboardInterrupt: 

In [None]:
def make_submission(predict_prob):
    with open('submission.csv', 'w') as file:
        file.write(str('y_pre') + '\n')
        for line in predict_prob:
            file.write(str(line) + '\n')
    file.close()


In [None]:
testpred = model.predict([date_testq1, date_testq2], batch_size=1024, verbose=1) 

  2048/172956 [..............................] - ETA: 1:37:49

In [None]:

make_submission(testpred[:, 1])
