# Introduction
The goal is to format DataTurk annotations into something that BERT can run with.

In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [8]:
import json
from pprint import pprint
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
import importlib

# custom data loading functions
import load_data
import clean_data

In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# test = '../data/data_turk/dummy_data.json'
annotations = '../data/data_turk/Annotations03-29-19.json'

In [20]:
importlib.reload(load_data)
df = load_data.getJSONData(annotations)
df.head()

Unnamed: 0,annotation,fileID,text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.
4,permission_statement,490,"""if you sign your name below, it means that yo..."


In [21]:
to  = 'label'
field = 'annotation'

df[to] = df.apply(lambda row:clean_data.convertAnnotationtoBinary(row,field), axis =1) 

In [22]:
df.head()

Unnamed: 0,annotation,fileID,text,label
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0
4,permission_statement,490,"""if you sign your name below, it means that yo...",1


In [23]:
to  = 'clean_text'
field = 'text'
    
df[to] = df.apply(lambda row:clean_data.cleanSents(row, field), axis=1)
df.head()

Unnamed: 0,annotation,fileID,text,label,clean_text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,mobile email pre vaccination checklist please ...
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0,date signature of child adolescent participant...
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop
4,permission_statement,490,"""if you sign your name below, it means that yo...",1,if you sign your name below it means that you ...


In [24]:
df['throw_away'] = 'a'

In [25]:
df.head()

Unnamed: 0,annotation,fileID,text,label,clean_text,throw_away
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...,a
1,NON_permission_statement.,370,mobile email pre vaccination checklist (please...,0,mobile email pre vaccination checklist please ...,a
2,NON_permission_statement.,490,date\tsignature of child/adolescent participan...,0,date signature of child adolescent participant...,a
3,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop,a
4,permission_statement,490,"""if you sign your name below, it means that yo...",1,if you sign your name below it means that you ...,a


In [26]:
cols = ['label', 'throw_away', 'clean_text'] 
df = df[cols]
df.head()

Unnamed: 0,label,throw_away,clean_text
0,0,a,my child has already had dtpa vaccination i do...
1,0,a,mobile email pre vaccination checklist please ...
2,0,a,date signature of child adolescent participant...
3,0,a,all you have to do is tell us you want to stop
4,1,a,if you sign your name below it means that you ...


## Remove Old Files

In [27]:
os.remove('../data/bert_training_inputs/train.tsv')
os.remove('../data/bert_training_inputs/dev.tsv')
os.remove('../data/bert_training_inputs/test.tsv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/bert_training_inputs/test.tsv'

### Define Train, Dev, and Test

In [28]:
train, dummy = train_test_split(df, test_size=0.2)
dev, test = train_test_split(dummy, test_size=0.5)

print('original: ', len(df))
print('train: ', len(train))
print('dev: ', len(dev))
print('test: ', len(test))

original:  2063
train:  1650
dev:  206
test:  207


In [29]:
train.insert(0, 'New_ID', range(0, 0 + len(train)))
train.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
2047,0,0,a,contact the calendar review analysis office cr...
1452,1,0,a,i you lose a large amount of your blood
140,2,0,a,you must take your study drug as instructed re...
1206,3,0,a,you can also tell us to stop using your medica...
1893,4,0,a,if you decide to stop no one will be angry or ...


In [31]:
train.to_csv('../data/bert_training_inputs/train.tsv', sep='\t', index=False, header=False)

In [32]:
dev.insert(0, 'New_ID', range(0, 0 + len(dev)))
dev.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
665,0,1,a,you will not be informed if researchers look a...
1789,1,0,a,i understand that if i drive while under the i...
1315,2,0,a,legally every patient is presumed to consent t...
1762,3,0,a,however the research team will be able to use ...
963,4,0,a,the research that may be done is unknown at th...


In [33]:
dev.to_csv('../data/bert_training_inputs/dev.tsv', sep='\t', index=False, header=False)

In [34]:
test.insert(0, 'New_ID', range(0, 0 + len(test)))
test.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
1292,0,0,a,if you drop out of the study you would be aske...
1518,1,0,a,for double blinded studies add neither you nor...
734,2,0,a,date time uw medicine harborview medical cente...
200,3,0,a,all records findings and results of any geneti...
937,4,0,a,there will be about insert number people takin...


In [35]:
cols = ['New_ID', 'clean_text']
test = test[cols]
test.columns = ['id', 'sentence']
test.head()

Unnamed: 0,id,sentence
1292,0,if you drop out of the study you would be aske...
1518,1,for double blinded studies add neither you nor...
734,2,date time uw medicine harborview medical cente...
200,3,all records findings and results of any geneti...
937,4,there will be about insert number people takin...


In [36]:
test.to_csv('../data/bert_training_inputs/test.tsv', sep='\t', index=False, header=True)