# Introduction
Train a bert classifier using tensorflow hub. Adapted from:

[1]TensorFlow code and pre-trained models for BERT. Contribute to google-research/bert development by creating an account on GitHub. Google AI Research, 2019.


In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import json
from pprint import pprint
import os
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import importlib

import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from imblearn.over_sampling import SMOTE

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

# custom data loading functions
import load_data
import clean_data
import jupyter_bert

W0402 12:01:34.729977 140736101036928 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# test = '../data/data_turk/dummy_data.json'
annotations = '../data/data_turk/Annotations04-02-19.json'

### Load Data

In [5]:
importlib.reload(load_data)
df = load_data.getJSONData(annotations)
df.head()

Unnamed: 0,annotation,fileID,text
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:"""
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu..."
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i..."
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...


### Reformat Data for Bert Model

In [6]:
to  = 'label'
field = 'annotation'

df[to] = df.apply(lambda row:clean_data.convertAnnotationtoBinary(row,field), axis =1) 
df.head()

Unnamed: 0,annotation,fileID,text,label
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0


In [7]:
to  = 'text_a'
field = 'text'
    
df[to] = df.apply(lambda row:clean_data.cleanSents(row, field), axis=1)
df.head()

Unnamed: 0,annotation,fileID,text,label,text_a
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,yes i consent parent legal guardian signature
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,prior to administering the vaccine s the nurse...
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,within 2 business days of immunisation i under...
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,mobile email pre vaccination checklist please ...


In [8]:
df['text_b'] = 'a'
df.head()

Unnamed: 0,annotation,fileID,text,label,text_a,text_b
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,yes i consent parent legal guardian signature,a
1,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...,a
2,NON_permission_statement.,370,"""prior to administering the vaccine(s), the nu...",0,prior to administering the vaccine s the nurse...,a
3,NON_permission_statement.,370,""". - within 2 business days of immunisation, i...",0,within 2 business days of immunisation i under...,a
4,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,mobile email pre vaccination checklist please ...,a


In [9]:
df = df.rename(index=str, columns={"fileID": "guid"})

### Explore Some Positive Cases

In [10]:
positives = df.loc[df['label'] == 1]
positives["guid"] = pd.to_numeric(positives["guid"])
positives = positives.rename(index=str, columns={"guid": "fileID"})
positives.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,annotation,fileID,text,label,text_a,text_b
0,permission_statement,370,"""yes, i consent parent/legal guardian signature:""",1,yes i consent parent legal guardian signature,a
7,permission_statement,490,"""if you sign your name below, it means that yo...",1,if you sign your name below it means that you ...,a
15,permission_statement,565,i also understand that i should not consume al...,1,i also understand that i should not consume al...,a
19,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...,a
36,permission_statement,515,we may need to re- contact you if we are study...,1,we may need to re contact you if we are studyi...,a


In [11]:
filenames = pd.read_csv('../data/outputs/file_ids-2019-03-26.csv')
filenames = filenames.drop(['path'], axis=1)
filenames.head()

Unnamed: 0,fileID,name
0,1,TAMU - HRPP Informed consent.txt
1,2,Potomac Primary Care_flu-consent-form.txt
2,3,OSU_Scheduled_Delivery_Consent.txt
3,4,consent_biorepository_12-19-14.txt
4,5,Cambridge_Consent_endodontics2.txt


In [12]:
joined = pd.merge(positives, filenames, on='fileID', how='outer')
joined.rename(index=str, columns={"name": "file_name"})
joined = joined.drop(['text', 'label', 'text_b', 'fileID'], axis=1)
joined.head(10)

Unnamed: 0,annotation,text_a,name
0,permission_statement,yes i consent parent legal guardian signature,Queensland_AUSTRALIA_ Immunization Consent For...
1,permission_statement,if you sign your name below it means that you ...,MinorAssentForm030708 (1).txt
2,permission_statement,i also understand that i should not consume al...,Cambridge_Consent_endodontics11.txt
3,permission_statement,if you agree to being audiotaped but feel unco...,CF-Sample_Interview_Audiotape.txt
4,permission_statement,we may need to re contact you if we are studyi...,Marshfield-Consent-Form-10.28.13.txt
5,permission_statement,if you become ill or injured from this study m...,Marshfield-Consent-Form-10.28.13.txt
6,permission_statement,in order to minimize the risk of unintended re...,Marshfield-Consent-Form-10.28.13.txt
7,permission_statement,while we work with other researchers that are ...,Marshfield-Consent-Form-10.28.13.txt
8,permission_statement,your relatives will not be contacted because o...,Marshfield-Consent-Form-10.28.13.txt
9,permission_statement,we will code all information entered into the ...,Marshfield-Consent-Form-10.28.13.txt


In [13]:
# joined.to_csv('../data/outputs/positive_class_examples.csv', index=False)

### Sample to Speed it up.

In [14]:
n_samples = 500
df = df.sample(n=n_samples, random_state=1729)

In [15]:
cols = ['guid', 'text_a', 'text_b', 'label', ] 
df = df[cols]
df.head()

Unnamed: 0,guid,text_a,text_b,label
0,370,yes i consent parent legal guardian signature,a,1
1,370,my child has already had dtpa vaccination i do...,a,0
2,370,prior to administering the vaccine s the nurse...,a,0
3,370,within 2 business days of immunisation i under...,a,0
4,370,mobile email pre vaccination checklist please ...,a,0


### Train - Test split

In [16]:
train, test = train_test_split(df,stratify=df['label'],test_size=0.3)
train.head()

Unnamed: 0,guid,text_a,text_b,label
1943,307,i platelets platelets help the blood to clot,a,0
1998,237,explain what commitment you expect from the su...,a,0
1706,334,the doctor could tell you about the effects of...,a,0
1221,539,o yfhastings fact sheet consent form prince ed...,a,0
2061,439,i know enough about the purpose protections ri...,a,0


### SMOTE IT

In [17]:
# very annoyingly, SMOTE implementation does not support passing text. 
# here's a dumb workaround.

sm = SMOTE(random_state=12, ratio = 1.0)

train['index1'] = train.index
train_prep = train[['guid', 'index1']]

sm_train_idx, sm_train_label = sm.fit_sample(train_prep,train['label'])

print(len(train_prep), len(train['label']))
print(len(sm_train_idx), len(sm_train_label))

1942 1942
3388 3388


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
# combine the smote results with the text data

smote_results = pd.DataFrame({'guid':sm_train_idx[:,0],
                                  'index1':sm_train_idx[:,1]})

smote_results['label'] = sm_train_label

rejoin_from_train_df = train[['text_a', 'text_b', 'index1']]
rejoin_from_train_df['index1'] = rejoin_from_train_df['index1'].astype(float)

rejoin = pd.merge(smote_results, rejoin_from_train_df, on='index1', how='left')
rejoin = rejoin.set_index('index1')
cols = ['guid', 'text_a', 'text_b', 'label', ] 
rejoin = rejoin[cols]
train = rejoin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [19]:
DATA_COLUMN = 'text_a'
LABEL_COLUMN = 'label'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

In [20]:
# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

### BERT config

In [21]:
today = datetime.today().strftime('%Y-%m-%d')
OUTPUT_DIR = '../bert_output' + today
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Model output directory: ../bert_output2019-04-02 *****


In [22]:
tokenizer = jupyter_bert.create_tokenizer_from_hub_module()

Instructions for updating:
Colocations handled automatically by placer.


W0402 12:01:44.989603 140736101036928 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0402 12:01:46.749701 140736101036928 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [23]:
# # test
# tokenizer.tokenize("This here's an example of using the BERT tokenizer")

In [24]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

INFO:tensorflow:Writing example 0 of 3388


I0402 12:01:47.215557 140736101036928 run_classifier.py:774] Writing example 0 of 3388


INFO:tensorflow:*** Example ***


I0402 12:01:47.219666 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 12:01:47.221951 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] i plate ##lets plate ##lets help the blood to cl ##ot [SEP]


I0402 12:01:47.226363 140736101036928 run_classifier.py:464] tokens: [CLS] i plate ##lets plate ##lets help the blood to cl ##ot [SEP]


INFO:tensorflow:input_ids: 101 1045 5127 13461 5127 13461 2393 1996 2668 2000 18856 4140 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.230065 140736101036928 run_classifier.py:465] input_ids: 101 1045 5127 13461 5127 13461 2393 1996 2668 2000 18856 4140 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.234595 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.238070 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 12:01:47.241312 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 12:01:47.244499 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 12:01:47.247066 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] explain what commitment you expect from the subject [SEP]


I0402 12:01:47.251434 140736101036928 run_classifier.py:464] tokens: [CLS] explain what commitment you expect from the subject [SEP]


INFO:tensorflow:input_ids: 101 4863 2054 8426 2017 5987 2013 1996 3395 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.253577 140736101036928 run_classifier.py:465] input_ids: 101 4863 2054 8426 2017 5987 2013 1996 3395 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.254719 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.255941 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 12:01:47.257544 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 12:01:47.259579 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 12:01:47.261615 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] the doctor could tell you about the effects of stopping name of study product [SEP]


I0402 12:01:47.263992 140736101036928 run_classifier.py:464] tokens: [CLS] the doctor could tell you about the effects of stopping name of study product [SEP]


INFO:tensorflow:input_ids: 101 1996 3460 2071 2425 2017 2055 1996 3896 1997 7458 2171 1997 2817 4031 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.271508 140736101036928 run_classifier.py:465] input_ids: 101 1996 3460 2071 2425 2017 2055 1996 3896 1997 7458 2171 1997 2817 4031 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.275110 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.279017 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 12:01:47.281839 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 12:01:47.285539 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 12:01:47.289273 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] o y ##f ##has ##ting ##s fact sheet consent form prince edward counties what is human pa ##pi ##llo ##ma ##virus hp ##v hp ##v is a common virus [SEP]


I0402 12:01:47.290937 140736101036928 run_classifier.py:464] tokens: [CLS] o y ##f ##has ##ting ##s fact sheet consent form prince edward counties what is human pa ##pi ##llo ##ma ##virus hp ##v hp ##v is a common virus [SEP]


INFO:tensorflow:input_ids: 101 1051 1061 2546 14949 3436 2015 2755 7123 9619 2433 3159 3487 5721 2054 2003 2529 6643 8197 7174 2863 23350 6522 2615 6522 2615 2003 1037 2691 7865 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.292418 140736101036928 run_classifier.py:465] input_ids: 101 1051 1061 2546 14949 3436 2015 2755 7123 9619 2433 3159 3487 5721 2054 2003 2529 6643 8197 7174 2863 23350 6522 2615 6522 2615 2003 1037 2691 7865 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.294115 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.299968 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 12:01:47.302217 140736101036928 run_classifier.py:468] label: 0 (id = 0)


INFO:tensorflow:*** Example ***


I0402 12:01:47.306040 140736101036928 run_classifier.py:461] *** Example ***


INFO:tensorflow:guid: None


I0402 12:01:47.307583 140736101036928 run_classifier.py:462] guid: None


INFO:tensorflow:tokens: [CLS] i know enough about the purpose protections risks and possible benefits of being included in the bio ##bank to decide that i want to participate [SEP]


I0402 12:01:47.309029 140736101036928 run_classifier.py:464] tokens: [CLS] i know enough about the purpose protections risks and possible benefits of being included in the bio ##bank to decide that i want to participate [SEP]


INFO:tensorflow:input_ids: 101 1045 2113 2438 2055 1996 3800 28548 10831 1998 2825 6666 1997 2108 2443 1999 1996 16012 9299 2000 5630 2008 1045 2215 2000 5589 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.311074 140736101036928 run_classifier.py:465] input_ids: 101 1045 2113 2438 2055 1996 3800 28548 10831 1998 2825 6666 1997 2108 2443 1999 1996 16012 9299 2000 5630 2008 1045 2215 2000 5589 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.312381 140736101036928 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


I0402 12:01:47.313666 140736101036928 run_classifier.py:467] segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


INFO:tensorflow:label: 0 (id = 0)


I0402 12:01:47.315371 140736101036928 run_classifier.py:468] label: 0 (id = 0)


ValueError: Unsupported string type: <class 'float'>

In [None]:
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

In [None]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [None]:
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [None]:
model_fn = jupyter_bert.model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

In [None]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [None]:
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

### Bert eval

In [None]:
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [None]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

## Remove Old Files

In [None]:
# os.remove('../data/bert_training_inputs/train.tsv')
# os.remove('../data/bert_training_inputs/dev.tsv')
# os.remove('../data/bert_training_inputs/test.tsv')

### Define Train, Dev, and Test

In [None]:
# train, dummy = train_test_split(df, test_size=0.2)
# dev, test = train_test_split(dummy, test_size=0.5)

# print('original: ', len(df))
# print('train: ', len(train))
# print('dev: ', len(dev))
# print('test: ', len(test))

In [None]:
# train.insert(0, 'New_ID', range(0, 0 + len(train)))
# train.head()

In [None]:
# train.to_csv('../data/bert_training_inputs/train.tsv', sep='\t', index=False, header=False)

In [None]:
# dev.insert(0, 'New_ID', range(0, 0 + len(dev)))
# dev.head()

In [None]:
# dev.to_csv('../data/bert_training_inputs/dev.tsv', sep='\t', index=False, header=False)

In [None]:
# test.insert(0, 'New_ID', range(0, 0 + len(test)))
# test.head()

In [None]:
# cols = ['New_ID', 'clean_text']
# test = test[cols]
# test.columns = ['id', 'sentence']
# test.head()

In [None]:
# test.to_csv('../data/bert_training_inputs/test.tsv', sep='\t', index=False, header=True)