# Introduction
Train a bert classifier using tensorflow hub. Adapted from:

[1]TensorFlow code and pre-trained models for BERT. Contribute to google-research/bert development by creating an account on GitHub. Google AI Research, 2019.


In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import json
from pprint import pprint
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import importlib

import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from imblearn.over_sampling import SMOTE

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

# custom data loading functions
import load_data
import clean_data
import jupyter_bert

W0402 13:29:45.530753 140736101036928 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# test = '../data/data_turk/dummy_data.json'
annotations = '../data/data_turk/Annotations04-02-19.json'

### Load Data

In [5]:
importlib.reload(load_data)
df = load_data.getJSONData(annotations)
df.head()

Unnamed: 0,annotation,fileID,text
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:"""
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu..."
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i..."
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...


### Reformat Data for Bert Model

In [6]:
to  = 'label'
field = 'annotation'

df[to] = df.apply(lambda row:clean_data.convertAnnotationtoBinary(row,field), axis =1) 
df.head()

Unnamed: 0,annotation,fileID,text,label
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0


In [7]:
to  = 'text_a'
field = 'text'
    
df[to] = df.apply(lambda row:clean_data.cleanSents(row, field), axis=1)
df.head()

Unnamed: 0,annotation,fileID,text,label,text_a
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,yes i consent parent legal guardian signature
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,prior to administering the vaccine s the nurse...
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,within 2 business days of immunisation i under...
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,mobile email pre vaccination checklist please ...


In [8]:
df['text_b'] = 'a'
df.head()

Unnamed: 0,annotation,fileID,text,label,text_a,text_b
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,yes i consent parent legal guardian signature,a
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...,a
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,prior to administering the vaccine s the nurse...,a
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,within 2 business days of immunisation i under...,a
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,mobile email pre vaccination checklist please ...,a


In [9]:
df = df.rename(index=str, columns={"fileID": "guid"})

### Explore Some Positive Cases

In [None]:
positives = df.loc[df['label'] == 1]
positives["guid"] = pd.to_numeric(positives["guid"])
positives = positives.rename(index=str, columns={"guid": "fileID"})
positives.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,annotation,fileID,text,label,text_a,text_b
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,yes i consent parent legal guardian signature,a
7,permission_statement,490,"""if you sign your name below, it means that yo...",1,if you sign your name below it means that you ...,a
15,permission_statement,565,i also understand that i should not consume al...,1,i also understand that i should not consume al...,a
19,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...,a
36,permission_statement,515,we may need to re- contact you if we are study...,1,we may need to re contact you if we are studyi...,a


In [None]:
filenames = pd.read_csv('../data/outputs/file_ids-2019-03-26.csv')
filenames = filenames.drop(['path'], axis=1)
filenames.head()

Unnamed: 0,fileID,name
0,1,TAMU - HRPP Informed consent.txt
1,2,Potomac Primary Care_flu-consent-form.txt
2,3,OSU_Scheduled_Delivery_Consent.txt
3,4,consent_biorepository_12-19-14.txt
4,5,Cambridge_Consent_endodontics2.txt


In [None]:
joined = pd.merge(positives, filenames, on='fileID', how='outer')
joined.rename(index=str, columns={"name": "file_name"})
joined = joined.drop(['text', 'label', 'text_b', 'fileID'], axis=1)
joined.head(10)

Unnamed: 0,annotation,text_a,name
0,permission_statement,yes i consent parent legal guardian signature,Queensland_AUSTRALIA_ Immunization Consent For...
1,permission_statement,if you sign your name below it means that you ...,MinorAssentForm030708 (1).txt
2,permission_statement,i also understand that i should not consume al...,Cambridge_Consent_endodontics11.txt
3,permission_statement,if you agree to being audiotaped but feel unco...,CF-Sample_Interview_Audiotape.txt
4,permission_statement,we may need to re contact you if we are studyi...,Marshfield-Consent-Form-10.28.13.txt
5,permission_statement,if you become ill or injured from this study m...,Marshfield-Consent-Form-10.28.13.txt
6,permission_statement,in order to minimize the risk of unintended re...,Marshfield-Consent-Form-10.28.13.txt
7,permission_statement,while we work with other researchers that are ...,Marshfield-Consent-Form-10.28.13.txt
8,permission_statement,your relatives will not be contacted because o...,Marshfield-Consent-Form-10.28.13.txt
9,permission_statement,we will code all information entered into the ...,Marshfield-Consent-Form-10.28.13.txt


In [None]:
# joined.to_csv('../data/outputs/positive_class_examples.csv', index=False)

### Sample to Speed it up.

In [None]:
# n_samples = 500
# df = df.sample(n=n_samples, random_state=1729)

In [None]:
cols = ['guid', 'text_a', 'text_b', 'label', ] 
df = df[cols]
df.head()

Unnamed: 0,guid,text_a,text_b,label
0,370,yes i consent parent legal guardian signature,a,1
1,370,my child has already had dtpa vaccination i do...,a,0
2,370,prior to administering the vaccine s the nurse...,a,0
3,370,within 2 business days of immunisation i under...,a,0
4,370,mobile email pre vaccination checklist please ...,a,0


### Train - Test split

In [None]:
train, test = train_test_split(df,stratify=df['label'],test_size=0.3)

print(type(train), type(test))
print(train.dtypes)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
guid      object
text_a    object
text_b    object
label      int64
dtype: object


### SMOTE IT
Unfortunatelty SMOTE does not work for this kind of data as implemented. Need to kleep thinking.

In [None]:
# # very annoyingly, SMOTE implementation does not support passing text. 
# # here's a dumb workaround.

# sm = SMOTE(random_state=12, ratio = 1.0)

# train['index1'] = train.index
# train_prep = train[['guid', 'index1']]

# sm_train_idx, sm_train_label = sm.fit_sample(train_prep,train['label'])

# print(len(train_prep), len(train['label']))
# print(len(sm_train_idx), len(sm_train_label))

In [None]:
# # combine the smote results with the text data

# smote_results = pd.DataFrame({'guid':sm_train_idx[:,0],
#                                   'index1':sm_train_idx[:,1]})

# smote_results['label'] = sm_train_label
# smote_results['guid'] = smote_results['guid'].astype(object)

# rejoin_from_train_df = train[['text_a', 'text_b', 'index1']]
# rejoin_from_train_df['index1'] = rejoin_from_train_df['index1'].astype(float)

# rejoin = pd.merge(smote_results, rejoin_from_train_df, on='index1', how='outer')
# rejoin = rejoin.set_index('index1')
# rejoin['text_b'] = 'a'
# cols = ['guid', 'text_a', 'text_b', 'label'] 
# rejoin = rejoin[cols]
# train = rejoin

In [None]:
# for idx, row in train.iterrows():
#     print(row, '\n')

In [None]:
DATA_COLUMN = 'text_a'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

In [None]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

### BERT config

In [None]:
today = datetime.today().strftime('%Y-%m-%d')
OUTPUT_DIR = '../bert_output' + today
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Model output directory: ../bert_output2019-04-02 *****


In [None]:
tokenizer = jupyter_bert.create_tokenizer_from_hub_module()

Instructions for updating:
Colocations handled automatically by placer.


W0402 13:29:56.319947 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0402 13:29:58.477382 140736101036928 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [None]:
# # test
# tokenizer.tokenize("This here's an example of using the BERT tokenizer")

In [None]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

INFO:tensorflow:Writing example 0 of 1942


I0402 13:29:58.950724 140736101036928 run_classifier.py:774] Writing example 0 of 1942


INFO:tensorflow:*** Example ***


I0402 13:29:58.953509 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:29:58.956137 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] although the language is optional for new studies approved before that date sponsor may request the language [SEP]


I0402 13:29:58.958472 140736101036928 run_classifier.py:464] tokens: [CLS] although the language is optional for new studies approved before that date sponsor may request the language [SEP]


INFO:tensorflow:input_ids: 101 2348 1996 2653 2003 11887 2005 2047 2913 4844 2077 2008 3058 10460 2089 5227 1996 2653 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.959857 140736101036928 run_classifier.py:465] input_ids: 101 2348 1996 2653 2003 11887 2005 2047 2913 4844 2077 2008 3058 10460 2089 5227 1996 2653 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.961694 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.963238 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:29:58.964838 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 13:29:58.969518 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:29:58.971181 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] genetic code the sequence of nu ##cle ##otide ##s coded in triple ##ts cod ##ons along the mrna that determines the sequence of amino acids in protein synthesis [SEP]


I0402 13:29:58.972646 140736101036928 run_classifier.py:464] tokens: [CLS] genetic code the sequence of nu ##cle ##otide ##s coded in triple ##ts cod ##ons along the mrna that determines the sequence of amino acids in protein synthesis [SEP]


INFO:tensorflow:input_ids: 101 7403 3642 1996 5537 1997 16371 14321 26601 2015 22402 1999 6420 3215 19429 5644 2247 1996 28848 2008 16463 1996 5537 1997 13096 12737 1999 5250 10752 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.974307 140736101036928 run_classifier.py:465] input_ids: 101 7403 3642 1996 5537 1997 16371 14321 26601 2015 22402 1999 6420 3215 19429 5644 2247 1996 28848 2008 16463 1996 5537 1997 13096 12737 1999 5250 10752 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.976008 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.977520 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:29:58.978821 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 13:29:58.981089 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:29:58.982758 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] del ##ete last sentence if this study provides a direct benefit to the child for which a comparable benefit is not available through non research alternatives [SEP]


I0402 13:29:58.984453 140736101036928 run_classifier.py:464] tokens: [CLS] del ##ete last sentence if this study provides a direct benefit to the child for which a comparable benefit is not available through non research alternatives [SEP]


INFO:tensorflow:input_ids: 101 3972 12870 2197 6251 2065 2023 2817 3640 1037 3622 5770 2000 1996 2775 2005 2029 1037 12435 5770 2003 2025 2800 2083 2512 2470 15955 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.986180 140736101036928 run_classifier.py:465] input_ids: 101 3972 12870 2197 6251 2065 2023 2817 3640 1037 3622 5770 2000 1996 2775 2005 2029 1037 12435 5770 2003 2025 2800 2083 2512 2470 15955 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.987864 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.990199 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:29:58.991477 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 13:29:58.993234 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:29:58.994364 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] you can ask all the questions you want before you decide [SEP]


I0402 13:29:58.995536 140736101036928 run_classifier.py:464] tokens: [CLS] you can ask all the questions you want before you decide [SEP]


INFO:tensorflow:input_ids: 101 2017 2064 3198 2035 1996 3980 2017 2215 2077 2017 5630 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.996581 140736101036928 run_classifier.py:465] input_ids: 101 2017 2064 3198 2035 1996 3980 2017 2215 2077 2017 5630 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.997750 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:58.999207 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:29:59.000517 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 13:29:59.003732 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:29:59.005039 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] if subjects are not patients at the mount sinai hospital and the information being gathered remains solely in the research record the need to give out a notice of privacy practices is eliminated and the following sentence can be removed when in doubt the sentence in and give out the notice of privacy practices to those who have not received it during the course of clinical care if you have not already received it you will also be given the mount sinai hospital notice of privacy practices that contains more information about how mount sinai uses and disclose ##s your protected health information [SEP]


I0402 13:29:59.006628 140736101036928 run_classifier.py:464] tokens: [CLS] if subjects are not patients at the mount sinai hospital and the information being gathered remains solely in the research record the need to give out a notice of privacy practices is eliminated and the following sentence can be removed when in doubt the sentence in and give out the notice of privacy practices to those who have not received it during the course of clinical care if you have not already received it you will also be given the mount sinai hospital notice of privacy practices that contains more information about how mount sinai uses and disclose ##s your protected health information [SEP]


INFO:tensorflow:input_ids: 101 2065 5739 2024 2025 5022 2012 1996 4057 20837 2902 1998 1996 2592 2108 5935 3464 9578 1999 1996 2470 2501 1996 2342 2000 2507 2041 1037 5060 1997 9394 6078 2003 5892 1998 1996 2206 6251 2064 2022 3718 2043 1999 4797 1996 6251 1999 1998 2507 2041 1996 5060 1997 9394 6078 2000 2216 2040 2031 2025 2363 2009 2076 1996 2607 1997 6612 2729 2065 2017 2031 2025 2525 2363 2009 2017 2097 2036 2022 2445 1996 4057 20837 2902 5060 1997 9394 6078 2008 3397 2062 2592 2055 2129 4057 20837 3594 1998 26056 2015 2115 5123 2740 2592 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:59.008112 140736101036928 run_classifier.py:465] input_ids: 101 2065 5739 2024 2025 5022 2012 1996 4057 20837 2902 1998 1996 2592 2108 5935 3464 9578 1999 1996 2470 2501 1996 2342 2000 2507 2041 1037 5060 1997 9394 6078 2003 5892 1998 1996 2206 6251 2064 2022 3718 2043 1999 4797 1996 6251 1999 1998 2507 2041 1996 5060 1997 9394 6078 2000 2216 2040 2031 2025 2363 2009 2076 1996 2607 1997 6612 2729 2065 2017 2031 2025 2525 2363 2009 2017 2097 2036 2022 2445 1996 4057 20837 2902 5060 1997 9394 6078 2008 3397 2062 2592 2055 2129 4057 20837 3594 1998 26056 2015 2115 5123 2740 2592 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:59.009768 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:29:59.011045 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:29:59.012215 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:Writing example 0 of 833


I0402 13:30:00.004755 140736101036928 run_classifier.py:774] Writing example 0 of 833


INFO:tensorflow:*** Example ***


I0402 13:30:00.006449 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:30:00.008403 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] acceptable terms include payment re ##mun ##eration rei ##mb ##urse ##ment gift prize token of appreciation etc [SEP]


I0402 13:30:00.009865 140736101036928 run_classifier.py:464] tokens: [CLS] acceptable terms include payment re ##mun ##eration rei ##mb ##urse ##ment gift prize token of appreciation etc [SEP]


INFO:tensorflow:input_ids: 101 11701 3408 2421 7909 2128 23041 16754 24964 14905 28393 3672 5592 3396 19204 1997 12284 4385 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.011963 140736101036928 run_classifier.py:465] input_ids: 101 11701 3408 2421 7909 2128 23041 16754 24964 14905 28393 3672 5592 3396 19204 1997 12284 4385 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.013949 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.016114 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:30:00.017797 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 13:30:00.020951 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:30:00.023303 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] if you are taking part at si ##bley memorial hospital [SEP]


I0402 13:30:00.024724 140736101036928 run_classifier.py:464] tokens: [CLS] if you are taking part at si ##bley memorial hospital [SEP]


INFO:tensorflow:input_ids: 101 2065 2017 2024 2635 2112 2012 9033 29538 3986 2902 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.026412 140736101036928 run_classifier.py:465] input_ids: 101 2065 2017 2024 2635 2112 2012 9033 29538 3986 2902 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.028250 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.030021 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:30:00.031668 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 13:30:00.033627 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:30:00.035274 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] it has no effect on a person because it has no real medicine in it [SEP]


I0402 13:30:00.036697 140736101036928 run_classifier.py:464] tokens: [CLS] it has no effect on a person because it has no real medicine in it [SEP]


INFO:tensorflow:input_ids: 101 2009 2038 2053 3466 2006 1037 2711 2138 2009 2038 2053 2613 4200 1999 2009 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.038071 140736101036928 run_classifier.py:465] input_ids: 101 2009 2038 2053 3466 2006 1037 2711 2138 2009 2038 2053 2613 4200 1999 2009 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.039712 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.041126 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:30:00.042426 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 13:30:00.046907 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:30:00.048279 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] the stanford university administrative panel on human subjects in medical research and any other unit of stanford university as necessary research staff list every other class of persons or organization affiliated with stanford who might need to use and or disclose the participant s information in connection with this study who may receive or use the information the parties listed in the preceding paragraph may disclose your health information to the following persons and organizations for their use in connection with this research study the office for human research protections in the u s [SEP]


I0402 13:30:00.049911 140736101036928 run_classifier.py:464] tokens: [CLS] the stanford university administrative panel on human subjects in medical research and any other unit of stanford university as necessary research staff list every other class of persons or organization affiliated with stanford who might need to use and or disclose the participant s information in connection with this study who may receive or use the information the parties listed in the preceding paragraph may disclose your health information to the following persons and organizations for their use in connection with this research study the office for human research protections in the u s [SEP]


INFO:tensorflow:input_ids: 101 1996 8422 2118 3831 5997 2006 2529 5739 1999 2966 2470 1998 2151 2060 3131 1997 8422 2118 2004 4072 2470 3095 2862 2296 2060 2465 1997 5381 2030 3029 6989 2007 8422 2040 2453 2342 2000 2224 1998 2030 26056 1996 13180 1055 2592 1999 4434 2007 2023 2817 2040 2089 4374 2030 2224 1996 2592 1996 4243 3205 1999 1996 11003 20423 2089 26056 2115 2740 2592 2000 1996 2206 5381 1998 4411 2005 2037 2224 1999 4434 2007 2023 2470 2817 1996 2436 2005 2529 2470 28548 1999 1996 1057 1055 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.051423 140736101036928 run_classifier.py:465] input_ids: 101 1996 8422 2118 3831 5997 2006 2529 5739 1999 2966 2470 1998 2151 2060 3131 1997 8422 2118 2004 4072 2470 3095 2862 2296 2060 2465 1997 5381 2030 3029 6989 2007 8422 2040 2453 2342 2000 2224 1998 2030 26056 1996 13180 1055 2592 1999 4434 2007 2023 2817 2040 2089 4374 2030 2224 1996 2592 1996 4243 3205 1999 1996 11003 20423 2089 26056 2115 2740 2592 2000 1996 2206 5381 1998 4411 2005 2037 2224 1999 4434 2007 2023 2470 2817 1996 2436 2005 2529 2470 28548 1999 1996 1057 1055 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.053025 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.054440 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 1 (id = 1)


I0402 13:30:00.055664 140736101036928 run_classifier.py:468] label: 1 (id = 1)


INFO:tensorflow:*** Example ***


I0402 13:30:00.057498 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 13:30:00.059162 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] the basis for the health care agent or sur ##rogate ##s decision must be the patient s wishes including religious and moral beliefs or if unknown the patient s best interests [SEP]


I0402 13:30:00.060824 140736101036928 run_classifier.py:464] tokens: [CLS] the basis for the health care agent or sur ##rogate ##s decision must be the patient s wishes including religious and moral beliefs or if unknown the patient s best interests [SEP]


INFO:tensorflow:input_ids: 101 1996 3978 2005 1996 2740 2729 4005 2030 7505 21799 2015 3247 2442 2022 1996 5776 1055 8996 2164 3412 1998 7191 9029 2030 2065 4242 1996 5776 1055 2190 5426 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.062355 140736101036928 run_classifier.py:465] input_ids: 101 1996 3978 2005 1996 2740 2729 4005 2030 7505 21799 2015 3247 2442 2022 1996 5776 1055 8996 2164 3412 1998 7191 9029 2030 2065 4242 1996 5776 1055 2190 5426 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.063749 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 13:30:00.064900 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 13:30:00.066213 140736101036928 run_classifier.py:468] label: 0 (id = 0)


In [None]:
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [None]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [None]:
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [None]:
model_fn = jupyter_bert.model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': '../bert_output2019-04-02', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x157865080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0402 13:30:00.546468 140736101036928 estimator.py:201] Using config: {'_model_dir': '../bert_output2019-04-02', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x157865080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [None]:
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

INFO:tensorflow:Calling model_fn.


I0402 13:30:02.026859 140736101036928 estimator.py:1111] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0402 13:30:05.690821 140736101036928 saver.py:1483] Saver not created because there are no variables in the graph to restore


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0402 13:30:05.852097 140736101036928 deprecation.py:506] From /Users/milk/Desktop/git/permission_statement_extraction/notebooks/jupyter_bert.py:83: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


W0402 13:30:05.904537 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/training/learning_rate_decay_v2.py:321: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
Use tf.cast instead.


W0402 13:30:06.014869 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Use tf.cast instead.


W0402 13:30:12.949680 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/metrics_impl.py:455: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Done calling model_fn.


I0402 13:30:15.300033 140736101036928 estimator.py:1113] Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


I0402 13:30:15.304615 140736101036928 basic_session_run_hooks.py:527] Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


I0402 13:30:18.511015 140736101036928 monitored_session.py:222] Graph was finalized.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


W0402 13:30:18.514399 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


INFO:tensorflow:Restoring parameters from ../bert_output2019-04-02/model.ckpt-32


I0402 13:30:18.517419 140736101036928 saver.py:1270] Restoring parameters from ../bert_output2019-04-02/model.ckpt-32


Instructions for updating:
Use standard file utilities to get mtimes.


W0402 13:30:19.855957 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1070: get_checkpoint_mtimes (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file utilities to get mtimes.


INFO:tensorflow:Running local_init_op.


I0402 13:30:20.283195 140736101036928 session_manager.py:491] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0402 13:30:20.509099 140736101036928 session_manager.py:493] Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 32 into ../bert_output2019-04-02/model.ckpt.


I0402 13:30:28.492151 140736101036928 basic_session_run_hooks.py:594] Saving checkpoints for 32 into ../bert_output2019-04-02/model.ckpt.


INFO:tensorflow:loss = 0.22138637, step = 33


I0402 13:31:07.095727 140736101036928 basic_session_run_hooks.py:249] loss = 0.22138637, step = 33


### Bert eval

In [None]:
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [None]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

## (OBSOLETE) 
This section contains code to output annotation data into files for local invocation of BERT.

In [None]:
# # remove old files
# os.remove('../data/bert_training_inputs/train.tsv')
# os.remove('../data/bert_training_inputs/dev.tsv')
# os.remove('../data/bert_training_inputs/test.tsv')

In [None]:
# # split into train, test, and dev 
# train, dummy = train_test_split(df, test_size=0.2)
# dev, test = train_test_split(dummy, test_size=0.5)

# print('original: ', len(df))
# print('train: ', len(train))
# print('dev: ', len(dev))
# print('test: ', len(test))

In [None]:
# # Train to CSV
# train.insert(0, 'New_ID', range(0, 0 + len(train)))
# train.to_csv('../data/bert_training_inputs/train.tsv', sep='\t', index=False, header=False)
# train.head()

In [None]:
# # Dev to CSV
# dev.insert(0, 'New_ID', range(0, 0 + len(dev)))
# dev.to_csv('../data/bert_training_inputs/dev.tsv', sep='\t', index=False, header=False)
# dev.head()

In [None]:
# # Test to csv - requires a little more work 
# test.insert(0, 'New_ID', range(0, 0 + len(test)))
# cols = ['New_ID', 'clean_text']
# test = test[cols]
# test.columns = ['id', 'sentence']
# test.to_csv('../data/bert_training_inputs/test.tsv', sep='\t', index=False, header=True)
# test.head()