In [11]:
import tensorflow as tf

import read
import prepare_data
import input_builder

import importlib
import model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import *
from IPython.display import display
from IPython.html import widgets

BATCH_SIZE = 1
LEARNING_RATE = 2e-5
MAX_SEQ_LENGTH = 128

importlib.reload(input_builder)
import os
os.environ['TFHUB_CACHE_DIR'] = '/home/djjindal/bert/script-learning'
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"
model_dir = 'output_sentence1'


# DATA

In [12]:
dataset = 'dataset/gw_extractions_no_rep_no_fin.pickle'
train_dataset = read.read_data_iterator(dataset)
features = list(prepare_data.tokenize_if_small_enough(train_dataset,sentences=True,no_context=True))
train_set = features[:int(0.8 * len(features))]
val_set = features[int(0.8 * len(features)):int(0.9*len(features))]
print(train_set[0])
len(train_set)

<input_builder.InputFeatures object at 0x7f9a2dfc0550>


8000

# Model Objects

In [13]:
run_config = tf.estimator.RunConfig(
    model_dir=model_dir,
    save_summary_steps=0,
    save_checkpoints_steps=0,
    log_step_count_steps=100)

model_fn = model.model_fn_builder(
  num_labels=5,
  learning_rate=LEARNING_RATE,
  num_train_steps=1,
  num_warmup_steps=1)

train_test_input_fn = input_builder.input_fn_builder(
    features=train_set,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False,
    candidates=5)

val_test_input_fn = input_builder.input_fn_builder(
    features=val_set,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False,
    candidates=5)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': 'output_sentence1', '_tf_random_seed': None, '_save_summary_steps': 0, '_save_checkpoints_steps': 0, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9a051c4668>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0504 17:10:04.461709 140302407182080 tf_logging.py:115] Using config: {'_model_dir': 'output_sentence1', '_tf_random_seed': None, '_save_summary_steps': 0, '_save_checkpoints_steps': 0, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9a051c4668>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


# PREDICT FUNCTION

In [6]:

def predict(sentenecs, triples, candidates, entity):
    e_dict = dict()
    check_dataset = []
    e_dict['sentences'] = sentenecs
    e_dict['triples'] = triples
    e_dict['candidates'] = candidates
    e_dict['correct'] = 4
    e_dict['entity'] = entity
    check_dataset = [e_dict]
    predict_set = list(prepare_data.tokenize_if_small_enough(check_dataset, sentences=True,no_context=True))

    predict_input_fn = input_builder.input_fn_builder(
        features=predict_set,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False,
        candidates=5)
    predictions = estimator.predict(input_fn=predict_input_fn,yield_single_examples=True,no_context=True)

    for (i, prediction) in enumerate(predictions):
        arr = prediction['probabilities'][0]
        prob = [np.exp(arr[0]),np.exp(arr[1]),np.exp(arr[2]),np.exp(arr[3]),np.exp(arr[4])]
        total = np.sum(prob)
        max= np.max(prob)
        ec_dict = check_dataset[i]
        print("SENTENCE EVENT CHAIN", ec_dict['sentences'],"\n")
        print("TRIPLE EVENT CHAIN", ec_dict['triples'],"\n")
        print("CANDIDATES", ec_dict['candidates'],"\n")
        print("ENTITY", ec_dict['entity'],"\n")
        print("PREDICTION", prediction['labels'], "with Probability", np.max(prob),"\n","\n")
        return prediction['labels']

def on_button_clicked(b):
    print(text1.value, text2.value, text3.value)
    print(predict([], text1.value.split('), ('), text2.value.split('), ('), text3.value))    

# DEMO

In [7]:

text1 = widgets.Text(description="Event Chain", width=200)
text2 = widgets.Text(description="Candidates", width=200)
text3 = widgets.Text(description="Entity", width=200)
button = widgets.Button(description="Predict")
button.on_click(on_button_clicked)
display(text1)
display(text2)
display(text3)
display(button)

# [('john','ordered',None),('john','ate','food')]
# [('john','left',None),('john','stays',None)]

# [('john','ordered',None),('john','paid',None),('john','ate','food')]
# [('john','left',None),('john','stays',None)]

# [(None, 'took', 'phone call'), (None, 'had', 'something'), (None, 'began', 'to squint')] 
# [(None, 'broadcast', '(the two previous record-holders'), ('the pennant', 'slammed', None), (None, "n't imagine", 'Maris'), (None, 'expressed', 'interest'), (None, 'turned', '68 _')] 



Text(value='', description='Event Chain')

Text(value='', description='Candidates')

Text(value='', description='Entity')

Button(description='Predict', style=ButtonStyle())

# Some Examples

In [14]:
dataset = list(read.read_data_iterator('dataset/gw_extractions_no_rep_no_fin.pickle'))
train_data = dataset[:int(0.8 * len(features))]
val_data = dataset[int(0.8 * len(features)):int(0.9*len(features))]

check_dataset = []

for i, ec_dict in zip(range(1000), val_data):
    check_dataset.append(ec_dict)
    
predict_set = list(prepare_data.tokenize_if_small_enough(check_dataset, sentences=True))
predict_input_fn = input_builder.input_fn_builder(
    features=predict_set,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False,
    candidates=5)

predictions = estimator.predict(input_fn=predict_input_fn,yield_single_examples=False)
predictions_list = list(predictions)
# for (i, prediction) in enumerate(predictions):
#     arr = prediction['probabilities'][0]
#     prob = [np.exp(arr[0]),np.exp(arr[1]),np.exp(arr[2]),np.exp(arr[3]),np.exp(arr[4])]
#     total = np.sum(prob)
#     max= np.max(prob)
#     ec_dict = check_dataset[i]
#     print("SENTENCE EVENT CHAIN", ec_dict['sentences'],"\n")
#     print("TRIPLE EVENT CHAIN", ec_dict['triples'],"\n")
#     print("CANDIDATES", ec_dict['candidates'],"\n")
#     print("ENTITY", ec_dict['entity'],"\n")
#     print("CORRECT", ec_dict['correct'] + 1,"\n")
#     print("PREDICTION", prediction['labels'] + 1, "with Probability", np.max(prob),"\n","\n")



  """Entry point for launching an IPython kernel.


INFO:tensorflow:Could not find trained model in model_dir: output_sentence1, running initialization to predict.


I0504 17:10:13.447347 140302407182080 tf_logging.py:115] Could not find trained model in model_dir: output_sentence1, running initialization to predict.


INFO:tensorflow:Calling model_fn.


I0504 17:10:15.896109 140302407182080 tf_logging.py:115] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0504 17:10:19.374439 140302407182080 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0504 17:10:20.065944 140302407182080 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0504 17:10:21.163154 140302407182080 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0504 17:10:21.920489 140302407182080 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0504 17:10:23.110321 140302407182080 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


I0504 17:10:23.302025 140302407182080 tf_logging.py:115] Done calling model_fn.


INFO:tensorflow:Graph was finalized.


I0504 17:10:23.827108 140302407182080 tf_logging.py:115] Graph was finalized.


INFO:tensorflow:Running local_init_op.


I0504 17:10:25.317260 140302407182080 tf_logging.py:115] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0504 17:10:25.474417 140302407182080 tf_logging.py:115] Done running local_init_op.


In [15]:
bad = []
for i, c in enumerate(check_dataset):
    r = list(prepare_data.tokenize_if_small_enough([c],sentences=True,no_context=True))
    if not r:
#         print(i)
        bad.append(i)
good_gt = [c for i, c in enumerate(check_dataset) if i not in bad]
print(len(good_gt))
print(len(predictions_list))
pred_df = pd.DataFrame.from_dict({'predictions':predictions_list, 'dataset':good_gt})

1000
1000


In [16]:
pred_df['pred_label'] = pred_df.predictions.apply(lambda x: x['labels'] - 1)
pred_df['gt_label'] = pred_df.dataset.apply(lambda x: x['correct'])
pred_df['correct_pred'] = pred_df.apply(lambda s: s.pred_label == s.gt_label, axis=1)
print(pred_df.correct_pred.sum()/len(pred_df.index))

0.17
