## Config

In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
from spacy.vocab import Vocab
import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import random
import itertools
from itertools import chain

In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# define the spaCy lib for vocab and vectors
nlp = spacy.load('en')
nlp_larg = spacy.load('en_core_web_lg')

## Data Location

In [5]:
forms_dir = '../input_data/all_forms'

## Semantic Clues
These are the verbs that we believe indicate the prescence of a statement of 'allowable action.'

In [6]:
##  taken from: 
# “I found synonyms for ‘permission’ on the new Thesaurus.com!,”
# www.thesaurus.com. [Online]. Available: https://www.thesaurus.com/browse/permission. 
## [Accessed: 19-Feb-2019].

permission_bases = ["permission", 
                    "authorization", 
                    "authorize",
                    "consent",
                    "assent",
                    "permit",
                    "approve",
                    "grant",
                    "allow",
                    "certify"]

## Add Synonyms
Use WordNet to gather synonyms of the semantic clues.

In [7]:
def addWordNetSynsets(word_list):
    " add synsets to new list "
    updated_list = []
    
    for word in word_list:
        updated_list.append(word)
        synonyms = wordnet.synsets(word, 'v')
        synonyms = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
        
        [updated_list.append(x) for x in synonyms if x not in updated_list]
        
    return set(updated_list)
        
permission_extended = addWordNetSynsets(permission_bases)

print('num elements in extended semantic clues:', len(permission_extended), '\n')

num elements in extended semantic clues: 50 



In [8]:
for clue in permission_extended:
    print(clue)

set_aside
go_for
assent
permit
give_up
attest
certify
countenance
accept
approve
acquiesce
tolerate
reserve
authorization
O.K.
manifest
deed_over
pass
indorse
accord
license
appropriate
grant
empower
accede
licence
permission
leave
earmark
evidence
sanction
demonstrate
allow_for
let
cede
admit
concede
allow
allot
authorise
authorize
consent
yield
give
provide
endorse
clear
take_into_account
okay
award


## Manually Remove Inapproriate Clues

In [9]:
permission_extended = [
    'accept',
    'admit',
    'permission',
    'authorise',
    'allow',
    'give',
    'sanction',
    'assent',
    'approve',
    'give',
    'authorization',
    'accede',
    'accord',
    'permit',
    'concede',
    'attest',
    'provide',
    'grant',
    'cede',
    'authorize',
    'let',
    'allot',
    'licence',
    'certify',
    'consent',
]

## build consent form data structure
get dataframe from each consent form on file

In [10]:
def getData(directory):
    """ returns a dataframe with cleaned filenames, full paths,
    and unprocessed text """
    
    new_rows = []
    
    fileID = 0
    
    # iterate through directory
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            
            fileID += 1
            filepath = subdir + os.sep + file
            
            if filepath.endswith('.txt'):
                wordList = [] # this will include duplicates and stop words
                cleanedFileName = ''.join(e for e in file if e.isalnum())[:-3] 
                
                # perform string operations on each file
                with open(filepath, 'r') as myfile:
                    data = myfile.read().replace('\n', ' ')       
                
                new_rows.append(
                    {
                        'id': fileID,
                        'name':cleanedFileName,
                        'path':filepath,
                        'rawText':data
                    }
                )
    
    return pd.DataFrame(new_rows)

In [11]:
"""
NOTE: running this cell will re-load the data in the dataframe from the dir.
"""
# run the function and store to variable 
df = getData(forms_dir)

# set the index as the id, for future access
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,name,path,rawText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,TAMUHRPPInformedconsent,../input_data/all_forms/TAMU - HRPP Informed c...,﻿NOT INTENDED FOR USE WITHOUT TAMU/BCD IRB APP...
2,PotomacPrimaryCarefluconsentform,../input_data/all_forms/Potomac Primary Care_f...,PATIENT CONSENT FORM FOR SEASONAL INFLUENZA VA...
3,OSUScheduledDeliveryConsent,../input_data/all_forms/OSU_Scheduled_Delivery...,SCHEDULED DELIVERY: Today’s Date: Da...
4,consentbiorepository121914,../input_data/all_forms/consent_biorepository_...,﻿ Informed Consent Form and HIPAA Authorizatio...
5,CambridgeConsentendodontics2,../input_data/all_forms/Cambridge_Consent_endo...,INFORMAT IONAL USE ONLY CONSENT FOR ENDODONTI...


## Random sampling to speed up development
This will be removed when we want to process the whole corpus.

In [12]:
# comment out to run on whole document collection
n_samples = 15
df = df.sample(n=n_samples)

## Add Clean Text Version

In [13]:
def minimalTextCleaning(row, field):
    """ perform minimal text processing on raw data to new field """
    
    cleaned_text = str(row[field]).lower() # lowercase
    cleaned_text = re.sub(' +', ' ', cleaned_text) # strip redundant whitespace
    return cleaned_text

df['minimalCleaning'] = df.apply(lambda row:minimalTextCleaning(row, 'rawText'),axis=1)

## Convert Raw Text to Spacy Object

In [14]:
convertFrom = 'rawText'
convertTo = 'docOB'

def getDocObjects(row, field):
    " return spacy doc object"
    doc = nlp_larg(str(row[field]).lower())
    return doc

df[convertTo] = df.apply(lambda row:getDocObjects(row, convertFrom),axis=1)

In [15]:
convertFrom = 'minimalCleaning'
convertTo = 'cleaned_docOB'

def getDocObjects(row, field):
    " return spacy doc object"
    doc = nlp_larg(str(row[field]).lower())
    return doc

df[convertTo] = df.apply(lambda row:getDocObjects(row, convertFrom),axis=1)

In [26]:
df.head()

Unnamed: 0_level_0,name,path,rawText,minimalCleaning,docOB,cleaned_docOB,sentList,candidatePermissionStatements
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
407,assenttemplate,../input_data/all_forms/assent-template.txt,﻿ COMIRB #:\t Person in Charge of the Study: [...,﻿ comirb #:\t person in charge of the study: [...,"(﻿, comirb, #, :, \t , person, in, charge, of,...","(﻿, comirb, #, :, \t , person, in, charge, of,...","[(﻿, comirb, #, :, \t , person, in, charge, of...","[(﻿, comirb, #, :, \t , person, in, charge, of..."
135,BROOKLYNPERIODONTICSConsentforsurgicalconsent,../input_data/all_forms/BROOKLYNPERIODONTICS_C...,"MICHAEL ZIDILE, D.D.S. P RAC T IC E LI MI T ED...","michael zidile, d.d.s. p rac t ic e li mi t ed...","(michael, zidile, ,, d.d.s, ., p, rac, t, ic, ...","(michael, zidile, ,, d.d.s, ., p, rac, t, ic, ...","[(michael, zidile, ,, d.d.s, .), (p, rac, t, i...","[(ti, st, r, y, , consent, for, biopsy, with,..."
154,CHOAICFHIPAATemplate,../input_data/all_forms/CHOA_ICFHIPAA_Template...,﻿ Emory University and Children’s Healthcare ...,﻿ emory university and children’s healthcare o...,"(﻿, , emory, university, and, children, ’s, h...","(﻿, emory, university, and, children, ’s, heal...","[(﻿, , emory, university, and, children, ’s, ...","[(﻿, , emory, university, and, children, ’s, ..."
640,Dukeexcesssampleconsentform,../input_data/all_forms/Duke-excess-sample-con...,Form M0345 DUKE UNIVERSITY HEALTH SYS...,form m0345 duke university health system conse...,"(form, m0345, , duke, university, hea...","(form, m0345, duke, university, health, system...","[(form, m0345, , duke, university, he...","[(form, m0345, , duke, university, he..."
205,FloridaInstituteofTechnologyConsentFormforF,../input_data/all_forms/Florida_Institute_of_T...,Consent Form TIV Inactivated Inﬂuenza Vaccine ...,consent form tiv inactivated inﬂuenza vaccine ...,"(consent, form, tiv, inactivated, inﬂuenza, va...","(consent, form, tiv, inactivated, inﬂuenza, va...","[(consent, form, tiv, inactivated, inﬂuenza, v...","[(consent, form, tiv, inactivated, inﬂuenza, v..."


In [17]:
# for i in df['docOB'].head(2):
#     print((dir(i)), '\n')

## extract sentences

In [18]:
def getSentenceList(row, field):
    """ return list of sentences from doc object; each item will be token span """
    return list(row[field].sents)

df['sentList'] = df.apply(lambda row:getSentenceList(row, 'docOB'),axis=1)

In [19]:
# for i in df['sentList'].head(1):
#     for a in i:
#         print(a, '\n')

In [20]:
# def getWordListasString(row):
#     """ return a list of words, including duplicates.
#     NOTE: light cleaning on ingestion """
#     wordList = [] 
    
#     for word in row['rawText'].split():
#         word = re.sub("[^a-zA-Z]+", " ", word).strip().lower()
#         if not word == "":
#             wordList.append(word)
            
#     return " ".join(wordList)
        
# df['cleanedText'] = df.apply(lambda row: getWordListasString(row),axis=1)

# df.head(2)

## preliminary permisison sentence extraction
look for sentences that have a word from the 'clues' list

## TODO: remove duplicates, add count

### NOTE: DO WE WANT SENTENCE PAIRS? THREE SENTENCES?

In [21]:
def getPossiblePermissions(row, permissions_list):
    """ return list of sentences containing the 
    permissions words """
    
    candidates = []
    
    # iterate through a list of sentences
    for sent in row['sentList']:
        
        # iterate through each clue
        for clue in permissions_list:     
            
            # check if sting contains any clues (need to match case)
            if sent.text.__contains__(clue):
                """ NOTE: right not if ANY clue is found, this is enough """
                
                candidates.append(sent)
    
    return candidates
                
# df.apply(lambda row:getPossiblePermissions(row, permission_extended),axis=1)

df['candidatePermissionStatements'] = df.apply(lambda row:getPossiblePermissions(row, permission_extended),axis=1)

df['candidatePermissionStatements'].head(2)

id
407    [(﻿, comirb, #, :, \t , person, in, charge, of...
135    [(ti, st, r, y,  , consent, for, biopsy, with,...
Name: candidatePermissionStatements, dtype: object

In [22]:
for sent in df['candidatePermissionStatements'][:10]:
    [print('sent:', x.text, '\n') for x in sent[:2]]

sent: ﻿ comirb #:	 person in charge of the study: [pi] version date:			 	 assent form for: [title]   what is this study about? 

sent: if i am in the study, i will: [one idea per bullet] 

sent: ti st r y  consent for biopsy with local anesthesia 

sent: understanding all of the above, i request that and hereby provide my informed consent to the treating doctor and his assistants to perform a biopsy. 

sent: ﻿  emory university and children’s healthcare of atlanta consent to be a research subject and hipaa authorization   

sent: ﻿  emory university and children’s healthcare of atlanta consent to be a research subject and hipaa authorization   

sent: form m0345          duke university health system          consent to participate in a research study         duhs biospecimen repository and processing core (brpc)          

sent: facility pi: shannon mccall, md         participant category: excess tissue collection          this is a consent form for a research project. 

sent: consent

In [23]:
# def getEstimatedPermissionDesnity(row):
#     """ return proportion of sentence total that may
#     be statements indicating perissions. Expect much noise. """
     
#     return len(row['permissionsStatements'])/len(row['sentList'])
    
# df['permissionDensity'] = df.apply(lambda row: getEstimatedPermissionDesnity(row),axis=1)

# df.head(2)

In [24]:
# %matplotlib inline
# df['permissionDensity'].hist()

## Important
this represents an important break away from the primary dataframe. here i start to work with list structures to create a list of possible permissions that is no longer tied to filenames

In [25]:
all_sents = []

for sentlist in df['sentList'].to_list():
    [all_sents.append(x) for x in sentlist]
    
all_permissions = []

for permlist in df['permissionsStatements'].to_list():
    [all_permissions.append(x) for x in permlist]

KeyError: 'permissionsStatements'

In [None]:
print('total rough draft permissions: ', len(all_permissions))
print('unique rough draft permissions: ', len(list(set(all_permissions))))

print('total rough draft sentences: ', len(all_sents))
print('unique rough draft sentences: ', len(list(set(all_sents))))

In [None]:
# remove duplucates
all_permissions = list(set(all_permissions))

## pair-wise similarity between our permission 'guesses' and remaining sentences
this is an important step so that we don't restrict ourselves to permission statements that contain the words we dreampt up

# WARNING: WILL NOT SCALE, need to fix


In [None]:
new_rows = []

for sent in all_sents:
    for perm in all_permissions:
        row = {
            'permission':perm,
            'sentence':sent,
            'similarity':sent.similarity(perm)
        }
        new_rows.append(row)

In [None]:
sf = pd.DataFrame(new_rows)
sf.head()

In [None]:
sf['similarity'].hist()

In [None]:
# reduce dataframe, aggregate by mean sim score (weak sents will balance out)
sf = sf.drop(columns=['permission'])
sf = sf.groupby(['sentence'], as_index=False).mean()

sf.head()

In [None]:
sf['similarity'].hist()

In [None]:
high_sim = sf[sf.similarity > .7]
candidates = high_sim['sentence'].to_list()

In [None]:
print('total rough draft candidates: ', len(candidates))
print('unique rough draft candidates: ', len(list(set(candidates))))

In [None]:
[print('sent: ', x, '\n') for x in candidates[:5]]

In [None]:
rough_draft_permissions = all_permissions + candidates

In [None]:
print('total rough draft candidates: ', len(rough_draft_permissions))
print('unique rough draft candidates: ', len(list(set(rough_draft_permissions))))

In [None]:
rough_draft_permissions = list(set(rough_draft_permissions))

## print statements to file

In [None]:
# import csv
# import datetime
# today = str(datetime.date.today())

# file_path = 'statements-'+today+'.csv'
# print(file_path)

# with open(file_path, 'a') as outcsv:   
#     #configure writer to write standard csv file
#     writer = csv.writer(outcsv, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
#     writer.writerow(['raw_text', 'clean_text'])
#     for perm in rough_draft_permissions:
#         #Write item to outcsv
#         raw_text = perm.text
#         clean_text = re.sub('\s+', ' ', re.sub('\W+',' ',\
#                                re.sub('[^A-Za-z0-9]+',' ', \
#                                       re.sub(r'\d+', " ", raw_text)))).strip()
#         writer.writerow([raw_text, clean_text])

# Annotations?

This is the point where injecting further information via annotations may be a good idea. This then would require in import of another local file and a small methods section.

## matrix representations

a few different matrix representations of the permissions sentence

In [None]:
def cleanPermissions(rough_draft_permissions):
    "string cleaning on permissions for td-idf ingestion"
    corpus = []
    
    
    for permission in rough_draft_permissions:
        cleanSentence = []
        for word in permission.text.split():
            word = re.sub("[^a-zA-Z]+", " ", word).strip().lower()
            if not word == "":
                cleanSentence.append(word)
        corpus.append(" ".join(cleanSentence))
    return corpus
            
corpus = cleanPermissions(rough_draft_permissions)

[print(x, '\n') for x in corpus[:10]]

## NOTE: 
`minTermFrequencyThreshold = 0` will result in all possible n_grams and will not scale as input size or ngramSize increases. However, it is the most robust representation of the sentence, and is worth exploring for the time being...

In [None]:
ngramSize = 5
maxTermFrequencyThreshold = .8
minTermFrequencyThreshold = .001

def getTDIDFMatrix(corpus, ngram_range, max_df, min_df):
    """ return td-idf matrix and terms """
    
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, 
                                       ngram_range=(1,ngram_range),
                                       max_df=max_df,min_df=min_df)
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    terms = tfidf_vectorizer.get_feature_names()
    
    return tfidf_matrix, terms

# save to a variable 
tdidf_matrix, tdidf_terms = getTDIDFMatrix(corpus, 
                                           ngramSize, 
                                           maxTermFrequencyThreshold,
                                           minTermFrequencyThreshold)

# # print tests
# print('\nfirst few terms:')
# [print(" ", x) for x in tdidf_terms[:10]]

print('\nNumber of terms:', len(tdidf_terms))

In [None]:
sdf = pd.SparseDataFrame(tdidf_matrix, columns=tdidf_terms)

# here we add the sentences back in
sdf['sent'] = corpus

sdf.head()

In [None]:
permission_vectors = []

for perm in rough_draft_permissions:
#     print(perm.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])) # same features, different rows
#     print(perm.vector) # same shape
#     print(perm.vector_norm) # single value
#     print(perm.get_lca_matrix()) # differnt shapes
    permission_vectors.append(perm.vector)