## Introduction
The goal of this notebook is to ingest all textual content from the `data/raw/all_forms/` directory and reformat it into sentences so that BERT can make predictions on them.

In [1]:
from platform import python_version
# Print the running interpreter version; this notebook was developed on 3.7.0.
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
# fundamentals
import os
import re
import pandas as pd
import numpy as np

# zoomies
import dask.dataframe as dd
from dask.multiprocessing import get
from multiprocessing import cpu_count

# custom data loading functions
import load_data
import clean_data

In [3]:
# dirty spaCy error workaround:
# Blank out the kernel's 'parent_appname' config entry.
# NOTE(review): presumably this sidesteps spaCy's Jupyter-detection check --
# confirm against the installed spaCy/ipykernel versions before removing.
# If 'IPKernelApp' is absent from the config, the {} default receives the
# assignment and the statement has no lasting effect.
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# Data location: directory holding the raw form text files
forms_dir = '../data/raw/all_forms'
# Load the directory into a DataFrame via the project's load_data helper; the
# preview below shows the columns id, name, path and rawText.
raw_df = load_data.getDIRData(forms_dir)
raw_df.head()

Unnamed: 0,id,name,path,rawText
0,1,TAMU - HRPP Informed consent.txt,../data/raw/all_forms/TAMU - HRPP Informed con...,﻿NOT INTENDED FOR USE WITHOUT TAMU/BCD IRB APP...
1,2,Potomac Primary Care_flu-consent-form.txt,../data/raw/all_forms/Potomac Primary Care_flu...,PATIENT CONSENT FORM FOR SEASONAL INFLUENZA VA...
2,3,OSU_Scheduled_Delivery_Consent.txt,../data/raw/all_forms/OSU_Scheduled_Delivery_C...,SCHEDULED DELIVERY: Today’s Date: Da...
3,4,consent_biorepository_12-19-14.txt,../data/raw/all_forms/consent_biorepository_12...,﻿ Informed Consent Form and HIPAA Authorizatio...
4,5,Cambridge_Consent_endodontics2.txt,../data/raw/all_forms/Cambridge_Consent_endodo...,INFORMAT IONAL USE ONLY CONSENT FOR ENDODONTI...


### Faster Processing into spaCy docs
Here we use dask so that we can do things a little faster...

In [5]:
# Work on an independent copy so the column assignments made further down
# never modify raw_df. Plain assignment (df = raw_df) only creates a second
# name for the same DataFrame object, so every new column would also appear
# on raw_df.
df = raw_df.copy()

In [6]:
# Number of CPUs reported by multiprocessing; used below as the dask
# partition count.
nCores = cpu_count()
print(nCores) # 4 on the author's machine

4


In [7]:
from_field = 'rawText'
copy_to = 'minimalCleaning'

# Split the frame into nCores dask partitions, run
# clean_data.minimalTextCleaning row by row inside each partition with the
# threaded scheduler, and store the gathered result as a new column.
df[copy_to] = (
    dd.from_pandas(df, npartitions=nCores)
    .map_partitions(
        lambda part: part.apply(
            lambda row: clean_data.minimalTextCleaning(row, from_field),
            axis=1,
        )
    )
    .compute(scheduler='threads')
)

df.head()

Unnamed: 0,id,name,path,rawText,minimalCleaning
0,1,TAMU - HRPP Informed consent.txt,../data/raw/all_forms/TAMU - HRPP Informed con...,﻿NOT INTENDED FOR USE WITHOUT TAMU/BCD IRB APP...,not intended for use without tamu/bcd irb appr...
1,2,Potomac Primary Care_flu-consent-form.txt,../data/raw/all_forms/Potomac Primary Care_flu...,PATIENT CONSENT FORM FOR SEASONAL INFLUENZA VA...,patient consent form for seasonal influenza va...
2,3,OSU_Scheduled_Delivery_Consent.txt,../data/raw/all_forms/OSU_Scheduled_Delivery_C...,SCHEDULED DELIVERY: Today’s Date: Da...,scheduled delivery: today s date: date of sche...
3,4,consent_biorepository_12-19-14.txt,../data/raw/all_forms/consent_biorepository_12...,﻿ Informed Consent Form and HIPAA Authorizatio...,informed consent form and hipaa authorization ...
4,5,Cambridge_Consent_endodontics2.txt,../data/raw/all_forms/Cambridge_Consent_endodo...,INFORMAT IONAL USE ONLY CONSENT FOR ENDODONTI...,informat ional use only consent for endodontic...


Note: this was not possible on the entire corpus the other way (in heuristic extraction). We limited the conversions to samples only, so even though this is an expensive function, it's possible to manage on a single machine.

In [8]:
%%time

convertFrom = 'minimalCleaning'
convertTo = 'DOC'

# Same partitioned pattern as the cleaning step: each row's cleaned text is
# passed to clean_data.getDocObjects and the results become the DOC column.
# This is the expensive step of the notebook (see the timing output).
df[convertTo] = (
    dd.from_pandas(df, npartitions=nCores)
    .map_partitions(
        lambda part: part.apply(
            lambda row: clean_data.getDocObjects(row, convertFrom),
            axis=1,
        )
    )
    .compute(scheduler='threads')
)

df.head()

CPU times: user 6min 34s, sys: 2min 9s, total: 8min 44s
Wall time: 3min 26s


In [9]:
# Preview the frame in its own cell so the new DOC column shows in the output.
df.head()

Unnamed: 0,id,name,path,rawText,minimalCleaning,DOC
0,1,TAMU - HRPP Informed consent.txt,../data/raw/all_forms/TAMU - HRPP Informed con...,﻿NOT INTENDED FOR USE WITHOUT TAMU/BCD IRB APP...,not intended for use without tamu/bcd irb appr...,"(not, intended, for, use, without, tamu, /, bc..."
1,2,Potomac Primary Care_flu-consent-form.txt,../data/raw/all_forms/Potomac Primary Care_flu...,PATIENT CONSENT FORM FOR SEASONAL INFLUENZA VA...,patient consent form for seasonal influenza va...,"(patient, consent, form, for, seasonal, influe..."
2,3,OSU_Scheduled_Delivery_Consent.txt,../data/raw/all_forms/OSU_Scheduled_Delivery_C...,SCHEDULED DELIVERY: Today’s Date: Da...,scheduled delivery: today s date: date of sche...,"(scheduled, delivery, :, today, s, date, :, da..."
3,4,consent_biorepository_12-19-14.txt,../data/raw/all_forms/consent_biorepository_12...,﻿ Informed Consent Form and HIPAA Authorizatio...,informed consent form and hipaa authorization ...,"(informed, consent, form, and, hipaa, authoriz..."
4,5,Cambridge_Consent_endodontics2.txt,../data/raw/all_forms/Cambridge_Consent_endodo...,INFORMAT IONAL USE ONLY CONSENT FOR ENDODONTI...,informat ional use only consent for endodontic...,"(informat, ional, use, only, consent, for, end..."


In [10]:
# Plain pandas apply here (no dask): clean_data.getSentenceList turns each
# row's DOC into the list stored in the sentenceList column.
getFrom = 'DOC'
convertTo = 'sentenceList'

df[convertTo] = df.apply(
    lambda record: clean_data.getSentenceList(record, getFrom),
    axis=1,
)

In [11]:
# Preview the frame with the new sentenceList column.
df.head()

Unnamed: 0,id,name,path,rawText,minimalCleaning,DOC,sentenceList
0,1,TAMU - HRPP Informed consent.txt,../data/raw/all_forms/TAMU - HRPP Informed con...,﻿NOT INTENDED FOR USE WITHOUT TAMU/BCD IRB APP...,not intended for use without tamu/bcd irb appr...,"(not, intended, for, use, without, tamu, /, bc...","[(not, intended, for, use, without, tamu, /, b..."
1,2,Potomac Primary Care_flu-consent-form.txt,../data/raw/all_forms/Potomac Primary Care_flu...,PATIENT CONSENT FORM FOR SEASONAL INFLUENZA VA...,patient consent form for seasonal influenza va...,"(patient, consent, form, for, seasonal, influe...","[(patient, consent, form, for, seasonal, influ..."
2,3,OSU_Scheduled_Delivery_Consent.txt,../data/raw/all_forms/OSU_Scheduled_Delivery_C...,SCHEDULED DELIVERY: Today’s Date: Da...,scheduled delivery: today s date: date of sche...,"(scheduled, delivery, :, today, s, date, :, da...","[(scheduled, delivery, :, today, s, date, :, d..."
3,4,consent_biorepository_12-19-14.txt,../data/raw/all_forms/consent_biorepository_12...,﻿ Informed Consent Form and HIPAA Authorizatio...,informed consent form and hipaa authorization ...,"(informed, consent, form, and, hipaa, authoriz...","[(informed, consent, form, and, hipaa, authori..."
4,5,Cambridge_Consent_endodontics2.txt,../data/raw/all_forms/Cambridge_Consent_endodo...,INFORMAT IONAL USE ONLY CONSENT FOR ENDODONTI...,informat ional use only consent for endodontic...,"(informat, ional, use, only, consent, for, end...","[(informat, ional, use, only, consent, for, en..."


In [12]:
# Source and destination column names for the sentence-cleaning step.
getFrom = 'sentenceList'
convertTo = 'cleanedSents'

def cleanSentenceList(row, field):
    """Return the sentences in ``row[field]`` as cleaned strings for BERT export.

    Each item of ``row[field]`` must expose a ``.text`` attribute. Every run
    of characters other than ASCII letters and digits is collapsed into a
    single space, then leading and trailing whitespace is stripped.

    The result has one string per input sentence, in the same order; a
    sentence with no letters or digits yields an empty string.
    """
    return [
        re.sub('[^A-Za-z0-9]+', ' ', sent.text).strip()
        for sent in row[field]
    ]

# Clean every row's sentence list and store the resulting list of strings.
df[convertTo] = df.apply(
    lambda record: cleanSentenceList(record, getFrom),
    axis=1,
)

In [13]:
# Preview the frame with the new cleanedSents column.
df.head()

Unnamed: 0,id,name,path,rawText,minimalCleaning,DOC,sentenceList,cleanedSents
0,1,TAMU - HRPP Informed consent.txt,../data/raw/all_forms/TAMU - HRPP Informed con...,﻿NOT INTENDED FOR USE WITHOUT TAMU/BCD IRB APP...,not intended for use without tamu/bcd irb appr...,"(not, intended, for, use, without, tamu, /, bc...","[(not, intended, for, use, without, tamu, /, b...",[not intended for use without tamu bcd irb app...
1,2,Potomac Primary Care_flu-consent-form.txt,../data/raw/all_forms/Potomac Primary Care_flu...,PATIENT CONSENT FORM FOR SEASONAL INFLUENZA VA...,patient consent form for seasonal influenza va...,"(patient, consent, form, for, seasonal, influe...","[(patient, consent, form, for, seasonal, influ...",[patient consent form for seasonal influenza v...
2,3,OSU_Scheduled_Delivery_Consent.txt,../data/raw/all_forms/OSU_Scheduled_Delivery_C...,SCHEDULED DELIVERY: Today’s Date: Da...,scheduled delivery: today s date: date of sche...,"(scheduled, delivery, :, today, s, date, :, da...","[(scheduled, delivery, :, today, s, date, :, d...",[scheduled delivery today s date date of sched...
3,4,consent_biorepository_12-19-14.txt,../data/raw/all_forms/consent_biorepository_12...,﻿ Informed Consent Form and HIPAA Authorizatio...,informed consent form and hipaa authorization ...,"(informed, consent, form, and, hipaa, authoriz...","[(informed, consent, form, and, hipaa, authori...",[informed consent form and hipaa authorization...
4,5,Cambridge_Consent_endodontics2.txt,../data/raw/all_forms/Cambridge_Consent_endodo...,INFORMAT IONAL USE ONLY CONSENT FOR ENDODONTI...,informat ional use only consent for endodontic...,"(informat, ional, use, only, consent, for, end...","[(informat, ional, use, only, consent, for, en...",[informat ional use only consent for endodonti...


In [14]:
# Flatten to one record per cleaned sentence, each tagged with the id of the
# form it came from. Forms keep their frame order and sentences keep their
# order within each form.
new_rows = [
    {'id': form_id, 'sentence': sent_text}
    for form_id, sent_list in zip(df['id'], df['cleanedSents'])
    for sent_text in sent_list
]

output_df = pd.DataFrame(new_rows)
output_df.head()

Unnamed: 0,id,sentence
0,1,not intended for use without tamu bcd irb appr...
1,1,project title
2,1,you are invited to take part in a research stu...
3,1,the information in this form is provided to he...
4,1,if you decide to take part in the study you wi...


## Remove old files
Caution advised: the cell below deletes the previously exported `new_data.tsv`.

In [15]:
# Delete the previous export if there is one. On a first run (or after a
# manual clean-up) the file does not exist; an unguarded os.remove would then
# raise FileNotFoundError and stop the notebook, so that case is ignored.
try:
    os.remove('../data/raw_bert_inputs/new_data.tsv')
except FileNotFoundError:
    pass  # nothing to clean up

### Output for BERT

In [16]:
# Export the sentence table as a tab-separated file with a header row and
# without the DataFrame index.
output_df.to_csv(
    '../data/raw_bert_inputs/new_data.tsv',
    sep='\t',
    header=True,
    index=False,
)