# Introduction
Train a bert classifier using tensorflow hub. Adapted from:

[1]TensorFlow code and pre-trained models for BERT. Contribute to google-research/bert development by creating an account on GitHub. Google AI Research, 2019.


In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import json
from pprint import pprint
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import importlib

import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from imblearn.over_sampling import SMOTE

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

# custom data loading functions
import load_data
import clean_data
import jupyter_bert

W0406 09:38:31.198750 140736101036928 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# test = '../data/data_turk/dummy_data.json'
annotations = '../data/data_turk/Annotations04-05-19.json'

### Load Data

In [5]:
importlib.reload(load_data)
df = load_data.getJSONData(annotations)
df.head()

Unnamed: 0,annotation,fileID,text
0,permission_statement,1,i give my permission for photographs/audio/vid...
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi..."
2,NON_permission_statement.,1,this consent form will be filed securely in an...
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial..."
4,permission_statement,1,"""i hereby certify that to the best of my knowl..."


### Reformat Data for Bert Model

In [6]:
to  = 'label'
field = 'annotation'

df[to] = df.apply(lambda row:clean_data.convertAnnotationtoBinary(row,field), axis =1) 
df.head()

Unnamed: 0,annotation,fileID,text,label
0,permission_statement,1,i give my permission for photographs/audio/vid...,1
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0
2,NON_permission_statement.,1,this consent form will be filed securely in an...,0
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial...",0
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1


In [7]:
to  = 'text_a'
field = 'text'
    
df[to] = df.apply(lambda row:clean_data.cleanSents(row, field), axis=1)
df.head()

Unnamed: 0,annotation,fileID,text,label,text_a
0,permission_statement,1,i give my permission for photographs/audio/vid...,1,i give my permission for photographs audio vid...
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0,if applicable add information about individual...
2,NON_permission_statement.,1,this consent form will be filed securely in an...,0,this consent form will be filed securely in an...
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial...",0,include circumstances if any where partial pay...
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1,i hereby certify that to the best of my knowle...


In [8]:
df['text_b'] = 'a'
df.head()

Unnamed: 0,annotation,fileID,text,label,text_a,text_b
0,permission_statement,1,i give my permission for photographs/audio/vid...,1,i give my permission for photographs audio vid...,a
1,NON_permission_statement.,1,"""(if applicable, add) information about indivi...",0,if applicable add information about individual...,a
2,NON_permission_statement.,1,this consent form will be filed securely in an...,0,this consent form will be filed securely in an...,a
3,NON_permission_statement.,1,"""[include circumstances, if any, where partial...",0,include circumstances if any where partial pay...,a
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1,i hereby certify that to the best of my knowle...,a


In [9]:
df = df.rename(index=str, columns={"fileID": "guid"})

### Explore Some Positive Cases

In [10]:
positives = df.loc[df['label'] == 1]
positives["guid"] = pd.to_numeric(positives["guid"])
positives = positives.rename(index=str, columns={"guid": "fileID"})
positives.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,annotation,fileID,text,label,text_a,text_b
0,permission_statement,1,i give my permission for photographs/audio/vid...,1,i give my permission for photographs audio vid...,a
4,permission_statement,1,"""i hereby certify that to the best of my knowl...",1,i hereby certify that to the best of my knowle...,a
5,permission_statement,1,information about you and related to this stud...,1,information about you and related to this stud...,a
7,permission_statement,1,i do not give my permission for my xxx records...,1,i do not give my permission for my xxx records...,a
12,permission_statement,4,"""by signing this document, you are authorizing...",1,by signing this document you are authorizing c...,a


In [11]:
filenames = pd.read_csv('../data/outputs/file_ids-2019-03-26.csv')
filenames = filenames.drop(['path'], axis=1)
filenames.head()

Unnamed: 0,fileID,name
0,1,TAMU - HRPP Informed consent.txt
1,2,Potomac Primary Care_flu-consent-form.txt
2,3,OSU_Scheduled_Delivery_Consent.txt
3,4,consent_biorepository_12-19-14.txt
4,5,Cambridge_Consent_endodontics2.txt


In [12]:
joined = pd.merge(positives, filenames, on='fileID', how='outer')
joined.rename(index=str, columns={"name": "file_name"})
joined = joined.drop(['text', 'label', 'text_b', 'fileID'], axis=1)
joined.head(10)

Unnamed: 0,annotation,text_a,name
0,permission_statement,i give my permission for photographs audio vid...,TAMU - HRPP Informed consent.txt
1,permission_statement,i hereby certify that to the best of my knowle...,TAMU - HRPP Informed consent.txt
2,permission_statement,information about you and related to this stud...,TAMU - HRPP Informed consent.txt
3,permission_statement,i do not give my permission for my xxx records...,TAMU - HRPP Informed consent.txt
4,permission_statement,by signing this document you are authorizing c...,consent_biorepository_12-19-14.txt
5,permission_statement,you consent to the use of your excess tissue b...,HMH - Biobanking-Genetic Research.txt
6,permission_statement,how long will you participqte and will you be ...,HMH - Biobanking-Genetic Research.txt
7,permission_statement,i have a right to request a copy of any of my ...,HMH - Biobanking-Genetic Research.txt
8,permission_statement,o i consent to any other emergency procedure r...,The American College of Surgeons_Lap Choly-Com...
9,permission_statement,my signature below acknowledges that i consent...,Copy of SA Pathology_Genomic-Testing-Form-PUB-...


In [13]:
# joined.to_csv('../data/outputs/positive_class_examples.csv', index=False)

### Sample to Speed it up.

In [14]:
# n_samples = 500
# df = df.sample(n=n_samples, random_state=1729)

In [15]:
cols = ['guid', 'text_a', 'text_b', 'label', ] 
df = df[cols]
df.head()

Unnamed: 0,guid,text_a,text_b,label
0,1,i give my permission for photographs audio vid...,a,1
1,1,if applicable add information about individual...,a,0
2,1,this consent form will be filed securely in an...,a,0
3,1,include circumstances if any where partial pay...,a,0
4,1,i hereby certify that to the best of my knowle...,a,1


### Train - Test split

In [16]:
train, test = train_test_split(df,stratify=df['label'],test_size=0.3)

print(type(train), type(test))
print(train.dtypes)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>
guid      object
text_a    object
text_b    object
label      int64
dtype: object


### SMOTE IT
Unfortunatelty SMOTE does not work for this kind of data as implemented. Need to kleep thinking.

In [17]:
# # very annoyingly, SMOTE implementation does not support passing text. 
# # here's a dumb workaround.

# sm = SMOTE(random_state=12, ratio = 1.0)

# train['index1'] = train.index
# train_prep = train[['guid', 'index1']]

# sm_train_idx, sm_train_label = sm.fit_sample(train_prep,train['label'])

# print(len(train_prep), len(train['label']))
# print(len(sm_train_idx), len(sm_train_label))

In [18]:
# # combine the smote results with the text data

# smote_results = pd.DataFrame({'guid':sm_train_idx[:,0],
#                                   'index1':sm_train_idx[:,1]})

# smote_results['label'] = sm_train_label
# smote_results['guid'] = smote_results['guid'].astype(object)

# rejoin_from_train_df = train[['text_a', 'text_b', 'index1']]
# rejoin_from_train_df['index1'] = rejoin_from_train_df['index1'].astype(float)

# rejoin = pd.merge(smote_results, rejoin_from_train_df, on='index1', how='outer')
# rejoin = rejoin.set_index('index1')
# rejoin['text_b'] = 'a'
# cols = ['guid', 'text_a', 'text_b', 'label'] 
# rejoin = rejoin[cols]
# train = rejoin

In [19]:
# for idx, row in train.iterrows():
#     print(row, '\n')

In [20]:
DATA_COLUMN = 'text_a'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

In [21]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

### BERT config

In [22]:
today = datetime.today().strftime('%Y-%m-%d')
OUTPUT_DIR = '../bert_output' + today
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Model output directory: ../bert_output2019-04-06 *****


In [23]:
tokenizer = jupyter_bert.create_tokenizer_from_hub_module()

Instructions for updating:
Colocations handled automatically by placer.


W0406 09:38:42.083155 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0406 09:38:43.832597 140736101036928 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [24]:
# # test
# tokenizer.tokenize("This here's an example of using the BERT tokenizer")

In [25]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

INFO:tensorflow:Writing example 0 of 1727


I0406 09:38:44.479390 140736101036928 run_classifier.py:774] Writing example 0 of 1727


INFO:tensorflow:*** Example ***


I0406 09:38:44.487730 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:44.490777 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] author ##izing the research team to use your phi means that we can release it only to the people or groups listed above and only for the purposes described in this form [SEP]


I0406 09:38:44.495707 140736101036928 run_classifier.py:464] tokens: [CLS] author ##izing the research team to use your phi means that we can release it only to the people or groups listed above and only for the purposes described in this form [SEP]


INFO:tensorflow:input_ids: 101 3166 6026 1996 2470 2136 2000 2224 2115 13569 2965 2008 2057 2064 2713 2009 2069 2000 1996 2111 2030 2967 3205 2682 1998 2069 2005 1996 5682 2649 1999 2023 2433 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.498462 140736101036928 run_classifier.py:465] input_ids: 101 3166 6026 1996 2470 2136 2000 2224 2115 13569 2965 2008 2057 2064 2713 2009 2069 2000 1996 2111 2030 2967 3205 2682 1998 2069 2005 1996 5682 2649 1999 2023 2433 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.500111 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.501837 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 1 (id = 1)


I0406 09:38:44.503394 140736101036928 run_classifier.py:468] label: 1 (id = 1)


INFO:tensorflow:*** Example ***


I0406 09:38:44.505071 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:44.506475 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] study staff will protect my confidential ##ity as provided by law [SEP]


I0406 09:38:44.508216 140736101036928 run_classifier.py:464] tokens: [CLS] study staff will protect my confidential ##ity as provided by law [SEP]


INFO:tensorflow:input_ids: 101 2817 3095 2097 4047 2026 18777 3012 2004 3024 2011 2375 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.516371 140736101036928 run_classifier.py:465] input_ids: 101 2817 3095 2097 4047 2026 18777 3012 2004 3024 2011 2375 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.518274 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.519335 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:44.520669 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0406 09:38:44.524272 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:44.525652 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] privacy risks for example disclosure of private information where private information is being collected most studies include the following statement there always exists the potential for loss of private information however there are procedures in place to minimize this risk [SEP]


I0406 09:38:44.532480 140736101036928 run_classifier.py:464] tokens: [CLS] privacy risks for example disclosure of private information where private information is being collected most studies include the following statement there always exists the potential for loss of private information however there are procedures in place to minimize this risk [SEP]


INFO:tensorflow:input_ids: 101 9394 10831 2005 2742 19380 1997 2797 2592 2073 2797 2592 2003 2108 5067 2087 2913 2421 1996 2206 4861 2045 2467 6526 1996 4022 2005 3279 1997 2797 2592 2174 2045 2024 8853 1999 2173 2000 18478 2023 3891 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.534143 140736101036928 run_classifier.py:465] input_ids: 101 9394 10831 2005 2742 19380 1997 2797 2592 2073 2797 2592 2003 2108 5067 2087 2913 2421 1996 2206 4861 2045 2467 6526 1996 4022 2005 3279 1997 2797 2592 2174 2045 2024 8853 1999 2173 2000 18478 2023 3891 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.535301 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.537817 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:44.539269 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0406 09:38:44.542635 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:44.549341 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] once provided to the fund ##er the investigator will not be able to destroy your specimens or data if you decide in the future that you do not wish to participate in this research repository [SEP]


I0406 09:38:44.555184 140736101036928 run_classifier.py:464] tokens: [CLS] once provided to the fund ##er the investigator will not be able to destroy your specimens or data if you decide in the future that you do not wish to participate in this research repository [SEP]


INFO:tensorflow:input_ids: 101 2320 3024 2000 1996 4636 2121 1996 14064 2097 2025 2022 2583 2000 6033 2115 9908 2030 2951 2065 2017 5630 1999 1996 2925 2008 2017 2079 2025 4299 2000 5589 1999 2023 2470 22409 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.557832 140736101036928 run_classifier.py:465] input_ids: 101 2320 3024 2000 1996 4636 2121 1996 14064 2097 2025 2022 2583 2000 6033 2115 9908 2030 2951 2065 2017 5630 1999 1996 2925 2008 2017 2079 2025 4299 2000 5589 1999 2023 2470 22409 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.559373 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.563186 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:44.565551 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0406 09:38:44.570173 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:44.571362 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] we will not give information that identifies you to anyone except if required by law [SEP]


I0406 09:38:44.573832 140736101036928 run_classifier.py:464] tokens: [CLS] we will not give information that identifies you to anyone except if required by law [SEP]


INFO:tensorflow:input_ids: 101 2057 2097 2025 2507 2592 2008 14847 2017 2000 3087 3272 2065 3223 2011 2375 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.575495 140736101036928 run_classifier.py:465] input_ids: 101 2057 2097 2025 2507 2592 2008 14847 2017 2000 3087 3272 2065 3223 2011 2375 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.579856 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:44.583019 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 1 (id = 1)


I0406 09:38:44.585134 140736101036928 run_classifier.py:468] label: 1 (id = 1)


INFO:tensorflow:Writing example 0 of 741


I0406 09:38:45.621109 140736101036928 run_classifier.py:774] Writing example 0 of 741


INFO:tensorflow:*** Example ***


I0406 09:38:45.623831 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:45.625324 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] c necessity of root canal therapy placing filling ##s and or build ups and preparing teeth for crowns or bridges nec ##ess ##itate ##s the removal of tooth structure adequate to ins ##ure the complete removal of the disease ##d or otherwise compromised tooth structure [SEP]


I0406 09:38:45.630580 140736101036928 run_classifier.py:464] tokens: [CLS] c necessity of root canal therapy placing filling ##s and or build ups and preparing teeth for crowns or bridges nec ##ess ##itate ##s the removal of tooth structure adequate to ins ##ure the complete removal of the disease ##d or otherwise compromised tooth structure [SEP]


INFO:tensorflow:input_ids: 101 1039 13185 1997 7117 5033 7242 6885 8110 2015 1998 2030 3857 11139 1998 8225 4091 2005 24364 2030 7346 26785 7971 17570 2015 1996 8208 1997 11868 3252 11706 2000 16021 5397 1996 3143 8208 1997 1996 4295 2094 2030 4728 20419 11868 3252 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.633165 140736101036928 run_classifier.py:465] input_ids: 101 1039 13185 1997 7117 5033 7242 6885 8110 2015 1998 2030 3857 11139 1998 8225 4091 2005 24364 2030 7346 26785 7971 17570 2015 1996 8208 1997 11868 3252 11706 2000 16021 5397 1996 3143 8208 1997 1996 4295 2094 2030 4728 20419 11868 3252 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.635390 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.636921 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:45.638489 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0406 09:38:45.641338 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:45.646806 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] witness statement the participant was unable to read or sign this consent form because of the following reason the participant is ill ##iter ##ate the participant is visually impaired the participant is physically unable to sign the consent form [SEP]


I0406 09:38:45.652250 140736101036928 run_classifier.py:464] tokens: [CLS] witness statement the participant was unable to read or sign this consent form because of the following reason the participant is ill ##iter ##ate the participant is visually impaired the participant is physically unable to sign the consent form [SEP]


INFO:tensorflow:input_ids: 101 7409 4861 1996 13180 2001 4039 2000 3191 2030 3696 2023 9619 2433 2138 1997 1996 2206 3114 1996 13180 2003 5665 21646 3686 1996 13180 2003 17453 18234 1996 13180 2003 8186 4039 2000 3696 1996 9619 2433 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.654752 140736101036928 run_classifier.py:465] input_ids: 101 7409 4861 1996 13180 2001 4039 2000 3191 2030 3696 2023 9619 2433 2138 1997 1996 2206 3114 1996 13180 2003 5665 21646 3686 1996 13180 2003 17453 18234 1996 13180 2003 8186 4039 2000 3696 1996 9619 2433 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.656602 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.658073 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:45.662480 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0406 09:38:45.667797 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:45.672672 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] male subjects must agree to use an acceptable method for contra ##ception during the entire study treatment period through 4 months after the last dose of ml ##n ##8 ##23 ##7 [SEP]


I0406 09:38:45.675351 140736101036928 run_classifier.py:464] tokens: [CLS] male subjects must agree to use an acceptable method for contra ##ception during the entire study treatment period through 4 months after the last dose of ml ##n ##8 ##23 ##7 [SEP]


INFO:tensorflow:input_ids: 101 3287 5739 2442 5993 2000 2224 2019 11701 4118 2005 24528 24422 2076 1996 2972 2817 3949 2558 2083 1018 2706 2044 1996 2197 13004 1997 19875 2078 2620 21926 2581 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.683341 140736101036928 run_classifier.py:465] input_ids: 101 3287 5739 2442 5993 2000 2224 2019 11701 4118 2005 24528 24422 2076 1996 2972 2817 3949 2558 2083 1018 2706 2044 1996 2197 13004 1997 19875 2078 2620 21926 2581 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.685351 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.687000 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:45.688802 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0406 09:38:45.691658 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:45.696778 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] if you were getting the th ##x ##33 ##3 pills and they seemed to be helping you your regular doctor might be able to pre ##scribe something like th ##x ##33 ##3 after the study ends [SEP]


I0406 09:38:45.702976 140736101036928 run_classifier.py:464] tokens: [CLS] if you were getting the th ##x ##33 ##3 pills and they seemed to be helping you your regular doctor might be able to pre ##scribe something like th ##x ##33 ##3 after the study ends [SEP]


INFO:tensorflow:input_ids: 101 2065 2017 2020 2893 1996 16215 2595 22394 2509 15345 1998 2027 2790 2000 2022 5094 2017 2115 3180 3460 2453 2022 2583 2000 3653 29234 2242 2066 16215 2595 22394 2509 2044 1996 2817 4515 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.709189 140736101036928 run_classifier.py:465] input_ids: 101 2065 2017 2020 2893 1996 16215 2595 22394 2509 15345 1998 2027 2790 2000 2022 5094 2017 2115 3180 3460 2453 2022 2583 2000 3653 29234 2242 2066 16215 2595 22394 2509 2044 1996 2817 4515 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.712747 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.717681 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:45.719031 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0406 09:38:45.720750 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0406 09:38:45.721909 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] if you have questions later you can ask them of me or of another researcher [SEP]


I0406 09:38:45.723135 140736101036928 run_classifier.py:464] tokens: [CLS] if you have questions later you can ask them of me or of another researcher [SEP]


INFO:tensorflow:input_ids: 101 2065 2017 2031 3980 2101 2017 2064 3198 2068 1997 2033 2030 1997 2178 10753 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.724642 140736101036928 run_classifier.py:465] input_ids: 101 2065 2017 2031 3980 2101 2017 2064 3198 2068 1997 2033 2030 1997 2178 10753 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.732442 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0406 09:38:45.735862 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0406 09:38:45.738023 140736101036928 run_classifier.py:468] label: 0 (id = 0)


In [26]:
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [27]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [28]:
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [29]:
model_fn = jupyter_bert.model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': '../bert_output2019-04-06', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x14f7e6550>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0406 09:38:46.187440 140736101036928 estimator.py:201] Using config: {'_model_dir': '../bert_output2019-04-06', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x14f7e6550>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [30]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [31]:
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
# print("Training took time ", datetime.now() - current_time)

INFO:tensorflow:Calling model_fn.


I0406 09:38:47.056472 140736101036928 estimator.py:1111] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0406 09:38:49.757671 140736101036928 saver.py:1483] Saver not created because there are no variables in the graph to restore


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


W0406 09:38:49.900986 140736101036928 deprecation.py:506] From /Users/milk/Desktop/git/permission_statement_extraction/notebooks/jupyter_bert.py:83: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


W0406 09:38:49.943834 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/training/learning_rate_decay_v2.py:321: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
Use tf.cast instead.


W0406 09:38:50.024766 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Use tf.cast instead.


W0406 09:38:56.936157 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/metrics_impl.py:455: to_float (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.



For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

INFO:tensorflow:Done calling model_fn.


I0406 09:38:58.937247 140736101036928 estimator.py:1113] Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


I0406 09:38:58.940962 140736101036928 basic_session_run_hooks.py:527] Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


I0406 09:39:02.523769 140736101036928 monitored_session.py:222] Graph was finalized.


INFO:tensorflow:Running local_init_op.


I0406 09:39:06.598407 140736101036928 session_manager.py:491] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0406 09:39:06.844200 140736101036928 session_manager.py:493] Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 0 into ../bert_output2019-04-06/model.ckpt.


I0406 09:39:14.787797 140736101036928 basic_session_run_hooks.py:594] Saving checkpoints for 0 into ../bert_output2019-04-06/model.ckpt.


INFO:tensorflow:loss = 0.6528851, step = 1


I0406 09:39:51.567425 140736101036928 basic_session_run_hooks.py:249] loss = 0.6528851, step = 1


INFO:tensorflow:global_step/sec: 0.0222849


I0406 10:54:38.911586 140736101036928 basic_session_run_hooks.py:680] global_step/sec: 0.0222849


INFO:tensorflow:loss = 0.21075012, step = 101 (4487.392 sec)


I0406 10:54:38.959188 140736101036928 basic_session_run_hooks.py:247] loss = 0.21075012, step = 101 (4487.392 sec)


INFO:tensorflow:Saving checkpoints for 161 into ../bert_output2019-04-06/model.ckpt.


I0406 11:28:29.172631 140736101036928 basic_session_run_hooks.py:594] Saving checkpoints for 161 into ../bert_output2019-04-06/model.ckpt.


INFO:tensorflow:Loss for final step: 0.006996846.


I0406 11:28:34.691071 140736101036928 estimator.py:359] Loss for final step: 0.006996846.


<tensorflow_estimator.python.estimator.estimator.Estimator at 0x14f7e6048>

In [32]:
print("Training took time ", datetime.now() - current_time)

Training took time  1:49:48.541084


### Bert eval

In [33]:
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [34]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

INFO:tensorflow:Calling model_fn.


I0406 11:28:35.281996 140736101036928 estimator.py:1111] Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0406 11:28:39.074589 140736101036928 saver.py:1483] Saver not created because there are no variables in the graph to restore
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.


I0406 11:28:46.662927 140736101036928 estimator.py:1113] Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2019-04-06T15:28:46Z


I0406 11:28:46.684400 140736101036928 evaluation.py:257] Starting evaluation at 2019-04-06T15:28:46Z


INFO:tensorflow:Graph was finalized.


I0406 11:28:48.438401 140736101036928 monitored_session.py:222] Graph was finalized.


Instructions for updating:
Use standard file APIs to check for files with this prefix.


W0406 11:28:48.440508 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


INFO:tensorflow:Restoring parameters from ../bert_output2019-04-06/model.ckpt-161


I0406 11:28:48.445054 140736101036928 saver.py:1270] Restoring parameters from ../bert_output2019-04-06/model.ckpt-161


INFO:tensorflow:Running local_init_op.


I0406 11:28:50.311867 140736101036928 session_manager.py:491] Running local_init_op.


INFO:tensorflow:Done running local_init_op.


I0406 11:28:50.615545 140736101036928 session_manager.py:493] Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2019-04-06-15:33:00


I0406 11:33:00.498469 140736101036928 evaluation.py:277] Finished evaluation at 2019-04-06-15:33:00


INFO:tensorflow:Saving dict for global step 161: auc = 0.85997075, eval_accuracy = 0.88933873, f1_score = 0.7734806, false_negatives = 34.0, false_positives = 48.0, global_step = 161, loss = 0.42093647, precision = 0.7446808, recall = 0.8045977, true_negatives = 519.0, true_positives = 140.0


I0406 11:33:00.499677 140736101036928 estimator.py:1979] Saving dict for global step 161: auc = 0.85997075, eval_accuracy = 0.88933873, f1_score = 0.7734806, false_negatives = 34.0, false_positives = 48.0, global_step = 161, loss = 0.42093647, precision = 0.7446808, recall = 0.8045977, true_negatives = 519.0, true_positives = 140.0


INFO:tensorflow:Saving 'checkpoint_path' summary for global step 161: ../bert_output2019-04-06/model.ckpt-161


I0406 11:33:03.081827 140736101036928 estimator.py:2039] Saving 'checkpoint_path' summary for global step 161: ../bert_output2019-04-06/model.ckpt-161


{'auc': 0.85997075,
 'eval_accuracy': 0.88933873,
 'f1_score': 0.7734806,
 'false_negatives': 34.0,
 'false_positives': 48.0,
 'loss': 0.42093647,
 'precision': 0.7446808,
 'recall': 0.8045977,
 'true_negatives': 519.0,
 'true_positives': 140.0,
 'global_step': 161}

## (OBSOLETE) 
This section contains code to output annotation data into files for local invocation of BERT.

In [35]:
# # remove old files
# os.remove('../data/bert_training_inputs/train.tsv')
# os.remove('../data/bert_training_inputs/dev.tsv')
# os.remove('../data/bert_training_inputs/test.tsv')

In [36]:
# # split into train, test, and dev 
# train, dummy = train_test_split(df, test_size=0.2)
# dev, test = train_test_split(dummy, test_size=0.5)

# print('original: ', len(df))
# print('train: ', len(train))
# print('dev: ', len(dev))
# print('test: ', len(test))

In [37]:
# # Train to CSV
# train.insert(0, 'New_ID', range(0, 0 + len(train)))
# train.to_csv('../data/bert_training_inputs/train.tsv', sep='\t', index=False, header=False)
# train.head()

In [38]:
# # Dev to CSV
# dev.insert(0, 'New_ID', range(0, 0 + len(dev)))
# dev.to_csv('../data/bert_training_inputs/dev.tsv', sep='\t', index=False, header=False)
# dev.head()

In [39]:
# # Test to csv - requires a little more work 
# test.insert(0, 'New_ID', range(0, 0 + len(test)))
# cols = ['New_ID', 'clean_text']
# test = test[cols]
# test.columns = ['id', 'sentence']
# test.to_csv('../data/bert_training_inputs/test.tsv', sep='\t', index=False, header=True)
# test.head()