# Introduction
The goal is to convert Dataturks annotations into the tab-separated train, dev, and test files that BERT takes as input.

In [1]:
import json
from pprint import pprint
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split

# custom data loading functions
import load_data
import clean_data

In [6]:
# Hack to get around a spaCy error inside the notebook: blank out the
# `parent_appname` entry of the kernel's IPKernelApp config section.
# NOTE(review): if the section is missing, `.get` hands back a temporary dict
# and the assignment has no lasting effect - confirm that is acceptable.
kernel_app_config = get_ipython().config.get('IPKernelApp', {})
kernel_app_config['parent_appname'] = ""

In [8]:
# Path to the annotation export (a dummy sample file) that this notebook converts.
# NOTE: the name `test` is rebound to the test-split DataFrame in the split
# cell further down, so re-run this cell before re-loading the data.
test = '../data/data_turk/dummy_data.json'

In [9]:
# Load the annotation JSON into a DataFrame via the project's loader; the
# preview below shows `annotation`, `fileID` and `text` columns.
df = load_data.getJSONData(test)
# Last expression of the cell, so the notebook renders the first rows.
df.head()

Unnamed: 0,annotation,fileID,text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.
2,NON_permission_statement.,490,tufts medical center tufts university departme...
3,permission_statement,387,"""if you agree to being audiotaped but feel unc..."
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...


In [10]:
# Derive the numeric target column from the raw annotation strings
# (the preview shows 1 for `permission_statement`, 0 otherwise).
to = 'label'
field = 'annotation'

# apply() calls the converter once per row as
# convertAnnotationtoBinary(row, field); `args` supplies the extra argument.
df[to] = df.apply(clean_data.convertAnnotationtoBinary, axis=1, args=(field,))

In [11]:
# Preview the first rows to confirm the new `label` column is present.
df.head()

Unnamed: 0,annotation,fileID,text,label
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0


In [13]:
# Add a cleaned copy of each sentence next to the raw text. Judging by the
# preview, cleaning strips punctuation such as quotes and trailing periods.
to = 'clean_text'
field = 'text'

# apply() calls the cleaner once per row as cleanSents(row, field).
df[to] = df.apply(clean_data.cleanSents, axis=1, args=(field,))
df.head()

Unnamed: 0,annotation,fileID,text,label,clean_text
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,tufts medical center tufts university departme...
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,you will be given a copy of this form to keep ...


In [14]:
# Constant filler column. It ends up between `label` and `clean_text` in the
# train/dev TSVs written below; presumably the BERT input format in use
# expects a placeholder column in that position - TODO confirm.
df['throw_away'] = 'a'

In [15]:
# Preview again; `throw_away` should now appear as the last column.
df.head()

Unnamed: 0,annotation,fileID,text,label,clean_text,throw_away
0,NON_permission_statement.,370,/ my child has already had dtpa vaccination i ...,0,my child has already had dtpa vaccination i do...,a
1,NON_permission_statement.,490,all you have to do is tell us you want to stop.,0,all you have to do is tell us you want to stop,a
2,NON_permission_statement.,490,tufts medical center tufts university departme...,0,tufts medical center tufts university departme...,a
3,permission_statement,387,"""if you agree to being audiotaped but feel unc...",1,if you agree to being audiotaped but feel unco...,a
4,NON_permission_statement.,387,you will be given a copy of this form to keep ...,0,you will be given a copy of this form to keep ...,a


In [None]:
# Keep only the columns that go into the BERT files, already in output order;
# the raw annotation, file id and uncleaned text are dropped here.
cols = ['label', 'throw_away', 'clean_text']
df = df.loc[:, cols]
df.head()

## Remove Old Files

In [None]:
# Delete the TSVs left over from a previous run so stale splits never linger.
# A missing file (the very first run, or a partially completed earlier run) is
# not an error here, so it is skipped instead of aborting the notebook with
# FileNotFoundError.
for stale_path in (
    '../data/bert_inputs/train.tsv',
    '../data/bert_inputs/dev.tsv',
    '../data/bert_inputs/test.tsv',
):
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        # Nothing to clean up for this split.
        pass

### Define Train, Dev, and Test

In [None]:
# Fixed seed so the train/dev/test membership is identical on every run.
# Without it each execution reshuffles the rows, so the written files (and
# any model results based on them) cannot be reproduced.
SPLIT_SEED = 42

# 80% of the rows go to train; the held-out 20% is then halved into dev and
# test (10% of the original each).
# NOTE: `test` is rebound here from the input path string to a DataFrame.
train, dummy = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
dev, test = train_test_split(dummy, test_size=0.5, random_state=SPLIT_SEED)

# Report split sizes as a sanity check.
print('original: ', len(df))
print('train: ', len(train))
print('dev: ', len(dev))
print('test: ', len(test))

In [None]:
# Prepend a sequential, 0-based row id as the first column of the train split
# (in place), then preview the result.
train.insert(loc=0, column='New_ID', value=range(len(train)))
train.head()

In [None]:
# Write the train split as tab-separated text with neither header row nor
# index. Given the cells above, the columns are New_ID, label, throw_away,
# clean_text.
train.to_csv('../data/bert_inputs/train.tsv', sep='\t', index=False, header=False)

In [None]:
# Prepend a sequential, 0-based row id as the first column of the dev split
# (in place), then preview the result.
dev.insert(loc=0, column='New_ID', value=range(len(dev)))
dev.head()

In [None]:
# Write the dev split as tab-separated text with neither header row nor
# index. Given the cells above, the columns are New_ID, label, throw_away,
# clean_text.
dev.to_csv('../data/bert_inputs/dev.tsv', sep='\t', index=False, header=False)

In [None]:
# Prepend a sequential, 0-based row id as the first column of the test split
# (in place), then preview the result.
test.insert(loc=0, column='New_ID', value=range(len(test)))
test.head()

In [None]:
# The test file carries no labels: keep just the row id and the cleaned
# sentence, and give them the header names used in the written file.
cols = ['New_ID', 'clean_text']
test = test[cols].rename(columns={'New_ID': 'id', 'clean_text': 'sentence'})
test.head()

In [None]:
# Write the unlabeled test split as tab-separated text without the index.
# Unlike train/dev, this file keeps its header row (`id`, `sentence`).
test.to_csv('../data/bert_inputs/test.tsv', sep='\t', index=False, header=True)