In [1]:
import os
os.environ['CUDA_DEVICE_ORDER'] = '1'

In [2]:
%run init.ipynb

Using TensorFlow backend.


matchzoo version 2.1.0

data loading ...
data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`
`ranking_task` initialized with metrics [normalized_discounted_cumulative_gain@3(0.0), normalized_discounted_cumulative_gain@5(0.0), mean_average_precision(0.0)]
loading embedding ...
embedding loaded as `glove_embedding`


In [3]:
preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=10, fixed_length_right=100, remove_stop_words=False)
train_pack_processed = preprocessor.fit_transform(train_pack_raw)
dev_pack_processed = preprocessor.transform(dev_pack_raw)
test_pack_processed = preprocessor.transform(test_pack_raw)

Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2118/2118 [00:00<00:00, 11358.96it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 18841/18841 [00:02<00:00, 7577.41it/s]
Processing text_right with append: 100%|██████████| 18841/18841 [00:00<00:00, 1524429.13it/s]
Building FrequencyFilter from a datapack.: 100%|██████████| 18841/18841 [00:00<00:00, 224973.54it/s]
Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 228433.24it/s]
Processing text_left with extend: 100%|██████████| 2118/2118 [00:00<00:00, 1033931.08it/s]
Processing text_right with extend: 100%|██████████| 18841/18841 [00:00<00:00, 1283517.38it/s]
Building Vocabulary from a datapack.: 100%|██████████| 404432/404432 [00:00<00:00, 4278999.05it/s]
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2118/2118 [00:00<00:00, 15384.85it/s]
Processin

In [4]:
preprocessor.context

{'filter_unit': <matchzoo.preprocessors.units.frequency_filter.FrequencyFilter at 0x7f98fb53bbe0>,
 'vocab_unit': <matchzoo.preprocessors.units.vocabulary.Vocabulary at 0x7f9910122320>,
 'vocab_size': 16674,
 'embedding_input_dim': 16674,
 'input_shapes': [(10,), (100,)]}

In [4]:
ranking_task = mz.tasks.Ranking(loss=mz.losses.RankCrossEntropyLoss(num_neg=10))
ranking_task.metrics = [
    mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
    mz.metrics.MeanAveragePrecision()
]

In [19]:
bin_size = 30
model = mz.models.DRMM()
model.params.update(preprocessor.context)
model.params['input_shapes'] = [[10,], [10, bin_size,]]
model.params['task'] = ranking_task
model.params['mask_value'] = 0
model.params['embedding_output_dim'] = glove_embedding.output_dim
model.params['mlp_num_layers'] = 1
model.params['mlp_num_units'] = 10
model.params['mlp_num_fan_out'] = 1
model.params['mlp_activation_func'] = 'tanh'
model.params['optimizer'] = 'adadelta'
model.build()
model.compile()
model.backend.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_left (InputLayer)          (None, 10)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 300)      5002200     text_left[0][0]                  
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 10, 1)        300         embedding[0][0]                  
__________________________________________________________________________________________________
match_histogram (InputLayer)    (None, 10, 30)       0                                            
__________________________________________________________________________________________________
attention_

In [6]:
embedding_matrix = glove_embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
# normalize the word embedding for fast histogram generating.
# l2_norm = np.sqrt((embedding_matrix*embedding_matrix).sum(axis=1))
# embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]


In [20]:
model.load_embedding_matrix(embedding_matrix)

In [7]:
hist_callback = mz.data_generator.callbacks.Histogram(embedding_matrix, bin_size=30, hist_mode='CH')

In [8]:
train_generator = mz.DataGenerator(train_pack_processed, mode='pair', num_dup=5, num_neg=10, batch_size=20, 
                                   callbacks=[hist_callback])
print('num batches:', len(train_generator))

num batches: 255


In [9]:
pred_generator = mz.DataGenerator(test_pack_processed, mode='point', callbacks=[hist_callback])
pred_x, pred_y = pred_generator[:]
evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                           x=pred_x,
                                           y=pred_y,
                                           once_every=1, 
                                           batch_size=len(pred_y),
                                           model_save_path='./drmm_pretrained_model/'
                                          )

In [9]:
train_x, train_y = train_generator[:]
evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                           x=train_x,
                                           y=train_y,
                                           once_every=1, 
                                           batch_size=len(train_y))

num batches: 255


In [30]:
for x, y in train_generator:
    if 'Q1' in x['id_left']:
        print(x)
        print(y)

{'id_left': array(['Q2403', 'Q2403', 'Q2403', 'Q2403', 'Q2403', 'Q2403', 'Q2403',
       'Q2403', 'Q2403', 'Q2403', 'Q2403', 'Q1612', 'Q1612', 'Q1612',
       'Q1612', 'Q1612', 'Q1612', 'Q1612', 'Q1612', 'Q1612', 'Q1612',
       'Q1612', 'Q2397', 'Q2397', 'Q2397', 'Q2397', 'Q2397', 'Q2397',
       'Q2397', 'Q2397', 'Q2397', 'Q2397', 'Q2397', 'Q734', 'Q734',
       'Q734', 'Q734', 'Q734', 'Q734', 'Q734', 'Q734', 'Q734', 'Q734',
       'Q734', 'Q2392', 'Q2392', 'Q2392', 'Q2392', 'Q2392', 'Q2392',
       'Q2392', 'Q2392', 'Q2392', 'Q2392', 'Q2392', 'Q556', 'Q556',
       'Q556', 'Q556', 'Q556', 'Q556', 'Q556', 'Q556', 'Q556', 'Q556',
       'Q556', 'Q1424', 'Q1424', 'Q1424', 'Q1424', 'Q1424', 'Q1424',
       'Q1424', 'Q1424', 'Q1424', 'Q1424', 'Q1424', 'Q1', 'Q1', 'Q1',
       'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q2207', 'Q2207',
       'Q2207', 'Q2207', 'Q2207', 'Q2207', 'Q2207', 'Q2207', 'Q2207',
       'Q2207', 'Q2207', 'Q210', 'Q210', 'Q210', 'Q210', 'Q210', 'Q210',
      

{'id_left': array(['Q2669', 'Q2669', 'Q2669', 'Q2669', 'Q2669', 'Q2669', 'Q2669',
       'Q2669', 'Q2669', 'Q2669', 'Q2669', 'Q178', 'Q178', 'Q178', 'Q178',
       'Q178', 'Q178', 'Q178', 'Q178', 'Q178', 'Q178', 'Q178', 'Q2905',
       'Q2905', 'Q2905', 'Q2905', 'Q2905', 'Q2905', 'Q2905', 'Q2905',
       'Q2905', 'Q2905', 'Q2905', 'Q737', 'Q737', 'Q737', 'Q737', 'Q737',
       'Q737', 'Q737', 'Q737', 'Q737', 'Q737', 'Q737', 'Q2818', 'Q2818',
       'Q2818', 'Q2818', 'Q2818', 'Q2818', 'Q2818', 'Q2818', 'Q2818',
       'Q2818', 'Q2818', 'Q1043', 'Q1043', 'Q1043', 'Q1043', 'Q1043',
       'Q1043', 'Q1043', 'Q1043', 'Q1043', 'Q1043', 'Q1043', 'Q1348',
       'Q1348', 'Q1348', 'Q1348', 'Q1348', 'Q1348', 'Q1348', 'Q1348',
       'Q1348', 'Q1348', 'Q1348', 'Q1831', 'Q1831', 'Q1831', 'Q1831',
       'Q1831', 'Q1831', 'Q1831', 'Q1831', 'Q1831', 'Q1831', 'Q1831',
       'Q1024', 'Q1024', 'Q1024', 'Q1024', 'Q1024', 'Q1024', 'Q1024',
       'Q1024', 'Q1024', 'Q1024', 'Q1024', 'Q1347', 'Q1347', 'Q1

{'id_left': array(['Q1639', 'Q1639', 'Q1639', 'Q1639', 'Q1639', 'Q1639', 'Q1639',
       'Q1639', 'Q1639', 'Q1639', 'Q1639', 'Q1257', 'Q1257', 'Q1257',
       'Q1257', 'Q1257', 'Q1257', 'Q1257', 'Q1257', 'Q1257', 'Q1257',
       'Q1257', 'Q2202', 'Q2202', 'Q2202', 'Q2202', 'Q2202', 'Q2202',
       'Q2202', 'Q2202', 'Q2202', 'Q2202', 'Q2202', 'Q1452', 'Q1452',
       'Q1452', 'Q1452', 'Q1452', 'Q1452', 'Q1452', 'Q1452', 'Q1452',
       'Q1452', 'Q1452', 'Q2338', 'Q2338', 'Q2338', 'Q2338', 'Q2338',
       'Q2338', 'Q2338', 'Q2338', 'Q2338', 'Q2338', 'Q2338', 'Q700',
       'Q700', 'Q700', 'Q700', 'Q700', 'Q700', 'Q700', 'Q700', 'Q700',
       'Q700', 'Q700', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1',
       'Q1', 'Q1', 'Q1', 'Q1822', 'Q1822', 'Q1822', 'Q1822', 'Q1822',
       'Q1822', 'Q1822', 'Q1822', 'Q1822', 'Q1822', 'Q1822', 'Q564',
       'Q564', 'Q564', 'Q564', 'Q564', 'Q564', 'Q564', 'Q564', 'Q564',
       'Q564', 'Q564', 'Q1611', 'Q1611', 'Q1611', 'Q1611', 'Q1611',
       'Q

{'id_left': array(['Q2249', 'Q2249', 'Q2249', 'Q2249', 'Q2249', 'Q2249', 'Q2249',
       'Q2249', 'Q2249', 'Q2249', 'Q2249', 'Q2844', 'Q2844', 'Q2844',
       'Q2844', 'Q2844', 'Q2844', 'Q2844', 'Q2844', 'Q2844', 'Q2844',
       'Q2844', 'Q1954', 'Q1954', 'Q1954', 'Q1954', 'Q1954', 'Q1954',
       'Q1954', 'Q1954', 'Q1954', 'Q1954', 'Q1954', 'Q1', 'Q1', 'Q1',
       'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1', 'Q1130', 'Q1130',
       'Q1130', 'Q1130', 'Q1130', 'Q1130', 'Q1130', 'Q1130', 'Q1130',
       'Q1130', 'Q1130', 'Q1366', 'Q1366', 'Q1366', 'Q1366', 'Q1366',
       'Q1366', 'Q1366', 'Q1366', 'Q1366', 'Q1366', 'Q1366', 'Q1741',
       'Q1741', 'Q1741', 'Q1741', 'Q1741', 'Q1741', 'Q1741', 'Q1741',
       'Q1741', 'Q1741', 'Q1741', 'Q46', 'Q46', 'Q46', 'Q46', 'Q46',
       'Q46', 'Q46', 'Q46', 'Q46', 'Q46', 'Q46', 'Q94', 'Q94', 'Q94',
       'Q94', 'Q94', 'Q94', 'Q94', 'Q94', 'Q94', 'Q94', 'Q94', 'Q1183',
       'Q1183', 'Q1183', 'Q1183', 'Q1183', 'Q1183', 'Q1183', 'Q1183',
    

{'id_left': array(['Q1253', 'Q1253', 'Q1253', 'Q1253', 'Q1253', 'Q1253', 'Q1253',
       'Q1253', 'Q1253', 'Q1253', 'Q1253', 'Q441', 'Q441', 'Q441', 'Q441',
       'Q441', 'Q441', 'Q441', 'Q441', 'Q441', 'Q441', 'Q441', 'Q1574',
       'Q1574', 'Q1574', 'Q1574', 'Q1574', 'Q1574', 'Q1574', 'Q1574',
       'Q1574', 'Q1574', 'Q1574', 'Q1527', 'Q1527', 'Q1527', 'Q1527',
       'Q1527', 'Q1527', 'Q1527', 'Q1527', 'Q1527', 'Q1527', 'Q1527',
       'Q398', 'Q398', 'Q398', 'Q398', 'Q398', 'Q398', 'Q398', 'Q398',
       'Q398', 'Q398', 'Q398', 'Q2458', 'Q2458', 'Q2458', 'Q2458',
       'Q2458', 'Q2458', 'Q2458', 'Q2458', 'Q2458', 'Q2458', 'Q2458',
       'Q178', 'Q178', 'Q178', 'Q178', 'Q178', 'Q178', 'Q178', 'Q178',
       'Q178', 'Q178', 'Q178', 'Q620', 'Q620', 'Q620', 'Q620', 'Q620',
       'Q620', 'Q620', 'Q620', 'Q620', 'Q620', 'Q620', 'Q1591', 'Q1591',
       'Q1591', 'Q1591', 'Q1591', 'Q1591', 'Q1591', 'Q1591', 'Q1591',
       'Q1591', 'Q1591', 'Q651', 'Q651', 'Q651', 'Q651', 'Q651', 'Q6

array([[ 0.01048858, -0.11956459,  0.13691483, ...,  0.0247253 ,
         0.01884524,  0.06992381],
       [ 0.030294  ,  0.48109   ,  0.048802  , ..., -0.45234   ,
        -0.070569  ,  0.14183   ],
       [ 0.67931   ,  0.4693    ,  0.11554   , ..., -0.061515  ,
         0.37124   , -0.37292   ],
       ...,
       [-0.032296  ,  0.37312   , -0.50619   , ..., -0.68138   ,
         0.50282   , -0.24503   ],
       [-0.23365   , -0.48598   ,  0.016299  , ..., -0.63779   ,
         0.67902   , -0.3734    ],
       [ 0.078074  ,  0.25521   ,  0.55932   , ...,  0.17318   ,
         0.17503   ,  0.019776  ]])

In [11]:
x, y = data

In [11]:
idx2term = {}
for term, index in preprocessor.context['vocab_unit'].state['term_index'].items():
    idx2term[index] = term

In [12]:
q_text = [idx2term[i] for i in x['text_left'][0] if i > 0]
print(q_text)
d_text = [idx2term[i] for i in x['text_right'][0] if i > 0]
print(d_text)

NameError: name 'x' is not defined

In [10]:
term_index = preprocessor.context['vocab_unit'].state['term_index']
for term in q_text:
    idx = term_index[term]
    print("term: ", term)
    print("term vec: ", embedding_matrix[idx])

NameError: name 'q_text' is not defined

In [14]:
print("TEXT_LEFT")
print(x['text_left'][:3])

print("TEXT_RIGHT")
print(x['text_right'][:3])

print("HISTOGRAM")
print(x['match_histogram'][:3])

TEXT_LEFT
[[ 7765 11451 15011  6552  3706 11400     0     0     0     0]
 [ 7765 11451 15011  6552  3706 11400     0     0     0     0]
 [ 7765 11451 15011  6552  3706 11400     0     0     0     0]]
TEXT_RIGHT
[[10437  3101  1128  5379 13595 16147 15011  5664  3744 12202  6552 16147
    922 13595  3706 11400     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]
 [12202  5664 15011  6057 15558  9121  8044   922 11770     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0   

In [None]:
history = model.fit_generator(train_generator, epochs=30, callbacks=[evaluate], workers=30, use_multiprocessing=True)

Epoch 1/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.3218515789789301 - normalized_discounted_cumulative_gain@5(0.0): 0.3305217802704022 - mean_average_precision(0.0): 0.4136121287208221
Epoch 2/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.4452650686370265 - normalized_discounted_cumulative_gain@5(0.0): 0.4526996833224589 - mean_average_precision(0.0): 0.5200881280018711
Epoch 3/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5029171528588098 - normalized_discounted_cumulative_gain@5(0.0): 0.5160088422383353 - mean_average_precision(0.0): 0.5790185708168546
Epoch 4/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5382684736417338 - normalized_discounted_cumulative_gain@5(0.0): 0.5505652594997511 - mean_average_precision(0.0): 0.6100740662079234
Epoch 5/30
Validation: normalized_discounted_cumulative_gain@3(0.0): 0.5751321420453881 - normalized_discounted_cumulative_gain@5(0.0): 0.5856890150213557 - mean_average_precision(

In [21]:
# drmm_model = mz.load_model('./drmm_pretrained_model/16')
test_generator = mz.DataGenerator(data_pack=dev_pack_processed[:10], mode='point', callbacks=[hist_callback])
test_x, test_y = test_generator[:]
prediction = model.predict(test_x)

prediction

array([[-0.14671361],
       [-0.18790512],
       [-0.16797136],
       [-0.17602642],
       [-0.15680443],
       [-0.16919973],
       [-0.19999962],
       [-0.17057955],
       [-0.11827587],
       [-0.14870849]], dtype=float32)

In [22]:
model.dense_input

AttributeError: 'DRMM' object has no attribute 'dense_input'

In [14]:
print(list(test_x["text_left"][0]))
print(list(test_x["text_right"][0]))
print(list(test_x["match_histogram"][0]))

[13455, 4260, 6442, 16447, 10900, 7359, 12794, 12435, 8425, 0]
[16447, 10900, 7359, 16315, 10128, 16654, 2952, 11716, 5246, 13704, 14897, 1770, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 3., 3.,
       1., 3., 3., 1., 4., 1., 2., 1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 4.,
       2., 3., 2., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       1., 4., 3., 1., 5., 1., 1., 1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 4.,
       2., 2., 3., 2., 2., 2., 1., 1., 1., 1., 1., 2., 1.]), array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 

In [12]:
import shutil
shutil.rmtree('./drmm_pretrained_model/')

In [15]:
term_index = preprocessor.context['vocab_unit'].state['term_index']
index_term = {}
for term, index in term_index.items():
    index_term[index] = term

In [18]:
text_left_i = list(test_x["text_left"][0])
text_right_i = list(test_x["text_right"][0])

print(text_left_i)
print(text_right_i)

[13455, 4260, 6442, 16447, 10900, 7359, 12794, 12435, 8425, 0]
[16447, 10900, 7359, 16315, 10128, 16654, 2952, 11716, 5246, 13704, 14897, 1770, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [17]:
text_left = [index_term[i] for i in text_left_i if i > 0]
text_right = [index_term[i] for i in text_right_i if i > 0]

match_histogram = list(test_x["match_histogram"][0])
# match_histogram = [index_term[i] for i in list(test_x["match_histogram"][0]) if i > 0]

In [36]:
print(text_left)
print(text_right)
print(match_histogram)

['how', 'big', 'is', 'software', 'in', 'houston', 'tx']
['for', '2011', 'the', 'company', 'recorded', 'an', 'annual', 'revenue', 'of', 'billion', 'making', 'it', 'the', '20', 'largest', 'software', 'company', 'in', 'terms', 'of', 'revenue', 'for', 'that', 'year']
[array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 3.,
       5., 3., 4., 8., 2., 3., 2., 2., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       2., 6., 9., 8., 4., 1., 1., 1., 1., 1., 1., 1., 1.]), array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       5., 4., 5., 3., 6., 6., 1., 2., 1., 1., 1., 1., 1.]), array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
       11., 15.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.]), array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  4., 10.,  8.,  3.,  3.,  1.,  1.,  1.,  1.,  1.,
        1.

In [None]:
text_left = [index_term[i] for i in list(test_x["text_left"][0]) if i > 0]
