# Introduction
The goal is to convert DataTurks annotations into train/dev/test TSV files that BERT can consume.

In [1]:
import json
from pprint import pprint
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split

# custom data loading functions
import load_data
import clean_data

In [2]:
# HACK: spaCy error workaround -- blank out 'parent_appname' in the kernel's
# IPKernelApp config section.
kernel_app_config = get_ipython().config.get('IPKernelApp', {})
kernel_app_config['parent_appname'] = ""

In [3]:
# Path to the annotation export (JSON) that is loaded below.
# NOTE: the name `test` is rebound to the test-split DataFrame later in this
# notebook, so it only holds this path until the train/dev/test split cell runs.
test = '../data/data_turk/dummy_data.json'

In [4]:
# Load the annotations from the JSON file at `test` into a DataFrame using the
# project's load_data helper, then preview the first rows.
df = load_data.getJSONData(test)
df.head()

Unnamed: 0,annotation,fileID,text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.
2,NON_permission_statement.,490,tufts medical center tufts university departme...
3,permission_statement,387,"""if you agree to being audiotaped but feel unc..."
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...


In [5]:
# Derive the 'label' column by running the project's annotation-to-binary
# converter over every row's 'annotation' value.
field = 'annotation'
to = 'label'

df[to] = df.apply(clean_data.convertAnnotationtoBinary, axis=1, args=(field,))

In [6]:
# Preview the first rows to check the newly added 'label' column.
df.head()

Unnamed: 0,annotation,fileID,text,label
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0


In [7]:
# Derive the 'clean_text' column by running the project's sentence cleaner
# over every row's 'text' value, then preview the result.
field = 'text'
to = 'clean_text'

df[to] = df.apply(clean_data.cleanSents, axis=1, args=(field,))
df.head()

Unnamed: 0,annotation,fileID,text,label,clean_text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,tufts medical center tufts university departme...
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,you will be given a copy of this form to keep ...


In [8]:
# Add a constant filler column; it is written out as the second column of the
# train/dev TSV files below.
# NOTE(review): presumably the placeholder column that BERT's CoLA-style input
# layout expects -- confirm against the BERT data processor in use.
df['throw_away'] = 'a'

In [9]:
# Preview the first rows to check the newly added 'throw_away' column.
df.head()

Unnamed: 0,annotation,fileID,text,label,clean_text,throw_away
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...,a
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop,a
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,tufts medical center tufts university departme...,a
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...,a
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,you will be given a copy of this form to keep ...,a


In [10]:
# Keep only the columns that are written to disk, in their output order.
cols = [
    'label',
    'throw_away',
    'clean_text',
]
df = df[cols]
df.head()

Unnamed: 0,label,throw_away,clean_text
0,0,a,my child has already had dtpa vaccination i do...
1,0,a,all you have to do is tell us you want to stop
2,0,a,tufts medical center tufts university departme...
3,1,a,if you agree to being audiotaped but feel unco...
4,0,a,you will be given a copy of this form to keep ...


## Remove Old Files

In [11]:
from contextlib import suppress

# Delete the splits written by a previous run so stale data never lingers.
# A missing file (e.g. on the very first run) is not an error; any other
# OS-level failure (permissions, path is a directory, ...) still propagates.
for stale_path in (
    '../bert/data/train.tsv',
    '../bert/data/dev.tsv',
    '../bert/data/test.tsv',
):
    with suppress(FileNotFoundError):
        os.remove(stale_path)

## Define Train, Dev, and Test

In [12]:
# Fixed seed so the train/dev/test partition (and therefore the TSV files
# written below) is reproducible from run to run.
SPLIT_SEED = 42

# Hold out 20% of the rows, then halve that hold-out into dev and test.
# NOTE: `test` is rebound here from the input path string to a DataFrame.
train, dummy = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
dev, test = train_test_split(dummy, test_size=0.5, random_state=SPLIT_SEED)

# Report the split sizes as a sanity check.
print('original: ', len(df))
print('train: ', len(train))
print('dev: ', len(dev))
print('test: ', len(test))

original:  520
train:  416
dev:  52
test:  52


In [13]:
# Prepend a 'New_ID' column numbering the training rows 0..len(train)-1.
train.insert(loc=0, column='New_ID', value=range(len(train)))
train.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
191,0,1,a,lotus root acupuncture i hereby request and co...
307,1,0,a,there is also a small chance that researchers ...
373,2,0,a,i understand and read the english language or ...
115,3,0,a,telehealth and telehealth consultations i have...
259,4,1,a,there will be no charge to parents of children...


In [14]:
# Write the training split as tab-separated values, without header or index.
train.to_csv(
    '../bert/data/train.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [15]:
# Prepend a 'New_ID' column numbering the dev rows 0..len(dev)-1.
dev.insert(loc=0, column='New_ID', value=range(len(dev)))
dev.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
219,0,0,a,you should not take part in more than one stud...
487,1,0,a,the next sentence should be included if approp...
319,2,0,a,your insurance company might pay these costs b...
97,3,1,a,the stanford university administrative panel o...
256,4,0,a,delete if not applicable


In [16]:
# Write the dev split as tab-separated values, without header or index.
dev.to_csv(
    '../bert/data/dev.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [17]:
# Prepend a 'New_ID' column numbering the test rows 0..len(test)-1.
test.insert(loc=0, column='New_ID', value=range(len(test)))
test.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
340,0,1,a,medical records hippa privacy information i he...
58,1,1,a,study participants are given placebos so that ...
498,2,0,a,feel free to ask all the questions you want be...
338,3,0,a,have you ever had a serious reaction after rec...
383,4,1,a,rti ci pa you are being asked to give samples ...


In [18]:
# The test file carries only an id and the sentence text (no label), so keep
# just those two columns and give them their output names.
cols = ['New_ID', 'clean_text']
test = test[cols].rename(columns={'New_ID': 'id', 'clean_text': 'sentence'})
test.head()

Unnamed: 0,id,sentence
340,0,medical records hippa privacy information i he...
58,1,study participants are given placebos so that ...
498,2,feel free to ask all the questions you want be...
338,3,have you ever had a serious reaction after rec...
383,4,rti ci pa you are being asked to give samples ...


In [19]:
# Write the test split as tab-separated values; unlike train/dev, this file
# keeps its header row ('id', 'sentence') but still omits the index.
test.to_csv(
    '../bert/data/test.tsv',
    sep='\t',
    header=True,
    index=False,
)