# Introduction
The goal is to convert DataTurks annotations into the tab-separated train, dev, and test files that BERT can be run on.

In [1]:
import json
from pprint import pprint
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
test = '../input_data/data_turk/dummy_data.json'

In [3]:
# Parse the annotation export (one JSON object per line) into one row per
# annotated sentence.
new_rows = []

with open(test, encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        dat_dict = json.loads(line)

        # 'content' looks like "[[fileID:<id>]],<sentence>". Split on the
        # FIRST "]]," only, so a sentence that itself contains "]]," is not
        # truncated.
        header, sep, content = dat_dict['content'].partition("]],")
        if not sep:
            raise ValueError(
                "line {}: content has no '[[fileID:...]],' prefix".format(line_no)
            )
        fileID = header.replace("[[fileID:", "")
        # Only the first label is kept; any further labels are ignored.
        annotation = dat_dict['annotation']['labels'][0]

        row = {
            'annotation': annotation,
            'file_id': fileID,
            'text': content,
        }

        new_rows.append(row)

df = pd.DataFrame(new_rows)

df.head()

Unnamed: 0,annotation,file_id,text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.
2,NON_permission_statement.,490,tufts medical center tufts university departme...
3,permission_statement,387,"""if you agree to being audiotaped but feel unc..."
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...


In [4]:
# Name of the DataFrame column that will receive the binary label.
to  = 'label'

def convertAnnotationtoBinary(row):
    """Map a row's annotation to a binary label.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with an
            'annotation' entry.

    Returns:
        0 if the string form of the annotation contains 'NON'
        (case-sensitive), otherwise 1, where 1 = permission_statement.
    """
    # Use the `in` operator rather than calling __contains__ directly.
    # Note that anything without 'NON' is treated as positive.
    return 0 if 'NON' in str(row['annotation']) else 1
    
# Compute the binary label row by row and store it in the `to` column.
df[to] = df.apply(convertAnnotationtoBinary, axis=1)

In [5]:
# Preview the first rows to check the newly added label column.
df.head()

Unnamed: 0,annotation,file_id,text,label
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0


In [6]:
def cleanSents(row):
    """Return the row's text lower-cased and stripped of punctuation.

    The value under 'text' is converted to a string and lower-cased; every
    run of characters other than ASCII letters and digits is replaced by a
    single space, and leading/trailing whitespace is removed.
    """
    lowered = str(row['text']).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()
    
# Clean every sentence row by row and keep the result next to the raw text.
df['clean_text'] = df.apply(cleanSents, axis=1)
df.head()

Unnamed: 0,annotation,file_id,text,label,clean_text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,tufts medical center tufts university departme...
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,you will be given a copy of this form to keep ...


In [7]:
# Constant filler column; every row gets the same value 'a'.
# NOTE(review): presumably a placeholder column required by the TSV layout
# that the BERT data reader expects — confirm against the reader in use.
df['throw_away'] = 'a'

In [8]:
# Preview the first rows to check the newly added filler column.
df.head()

Unnamed: 0,annotation,file_id,text,label,clean_text,throw_away
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...,a
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop,a
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,tufts medical center tufts university departme...,a
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...,a
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,you will be given a copy of this form to keep ...,a


In [9]:
# Keep only the columns that are written out, in the order they are written:
# label, filler column, cleaned sentence.
cols = ['label', 'throw_away', 'clean_text']
df = df.loc[:, cols]
df.head()

Unnamed: 0,label,throw_away,clean_text
0,0,a,my child has already had dtpa vaccination i do...
1,0,a,all you have to do is tell us you want to stop
2,0,a,tufts medical center tufts university departme...
3,1,a,if you agree to being audiotaped but feel unco...
4,0,a,you will be given a copy of this form to keep ...


## Remove Old Files

In [10]:
# Delete the outputs of any previous run. A file that does not exist yet
# (e.g. on the very first run) is skipped instead of aborting the cell with
# FileNotFoundError.
for old_file in ('../bert_inputs/train.tsv',
                 '../bert_inputs/dev.tsv',
                 '../bert_inputs/test.tsv'):
    try:
        os.remove(old_file)
    except FileNotFoundError:
        pass  # nothing to clean up for this split

### Define Train, Dev, and Test

In [11]:
# Fixed seed so that train/dev/test membership is identical on every run;
# without it each execution of this cell produces a different split.
RANDOM_SEED = 42

# 80% of the rows go to train; the remaining 20% is halved into dev and test.
train, dummy = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
# NOTE: from here on `test` is the test DataFrame, no longer the input path
# it was bound to at the top of the notebook.
dev, test = train_test_split(dummy, test_size=0.5, random_state=RANDOM_SEED)

print('original: ', len(df))
print('train: ', len(train))
print('dev: ', len(dev))
print('test: ', len(test))

original:  520
train:  416
dev:  52
test:  52


In [12]:
# Prepend a sequential, 0-based ID as the first column of the training split.
train.insert(0, 'New_ID', range(len(train)))
train.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
393,0,1,a,the authorized information will be used by the...
197,1,0,a,i and or my physician counselor have signed th...
112,2,1,a,and since this study may involve patients and ...
226,3,0,a,afterthe first and the 3lrd dose given 6 month...
7,4,1,a,in order to minimize the risk of unintended re...


In [13]:
# Write the training split as tab-separated values, without the DataFrame
# index and without a header row.
train.to_csv('../bert_inputs/train.tsv', sep='\t', index=False, header=False)

In [14]:
# Prepend a sequential, 0-based ID as the first column of the dev split.
dev.insert(0, 'New_ID', range(len(dev)))
dev.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
248,0,0,a,if you sign up for but cannot complete any of ...
394,1,0,a,text examples for chemoprevention supportive c...
185,2,0,a,i acknowledge that no guarantees or assurances...
192,3,0,a,intravenous solutions used to maintain hydrati...
169,4,1,a,to do this study we will use the following kin...


In [15]:
# Write the dev split as tab-separated values, without the DataFrame index
# and without a header row.
dev.to_csv('../bert_inputs/dev.tsv', sep='\t', index=False, header=False)

In [16]:
# Prepend a sequential, 0-based ID as the first column of the test split.
test.insert(0, 'New_ID', range(len(test)))
test.head()

Unnamed: 0,New_ID,label,throw_away,clean_text
239,0,0,a,section length limit this section should be be...
410,1,0,a,for non us participants please verify existenc...
47,2,0,a,a56 the subjects are only receiving diagnostic...
255,3,0,a,a8 this language can be removed if there is no...
12,4,1,a,we will share coded data with these types of d...


In [17]:
# The test file carries no label: keep only the ID and the cleaned sentence,
# renamed to 'id' and 'sentence'.
cols = ['New_ID', 'clean_text']
test = test[cols].rename(columns={'New_ID': 'id', 'clean_text': 'sentence'})
test.head()

Unnamed: 0,id,sentence
239,0,section length limit this section should be be...
410,1,for non us participants please verify existenc...
47,2,a56 the subjects are only receiving diagnostic...
255,3,a8 this language can be removed if there is no...
12,4,we will share coded data with these types of d...


In [18]:
# Write the test split as tab-separated values, without the DataFrame index
# but, unlike train and dev, with a header row ('id', 'sentence').
test.to_csv('../bert_inputs/test.tsv', sep='\t', index=False, header=True)