## Config

In [1]:
from platform import python_version
print("VERSION: ", python_version()) # expect 3.7.0

VERSION:  3.7.0


In [2]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
from spacy.vocab import Vocab
import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import random
import itertools
from itertools import chain
from PyDictionary import PyDictionary
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess

In [3]:
# dirty spaCy error workaround:
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

In [4]:
# define the spaCy lib for vocab and vectors
nlp = spacy.load('en')
nlp_larg = spacy.load('en_core_web_lg')

In [5]:
dictionary=PyDictionary()

## Data Location

In [6]:
forms_dir = '../input_data/all_forms'

## Semantic Clues
These are the verbs that we believe indicate the prescence of a statement of 'allowable action.'

In [7]:
##  taken from: 
# “I found synonyms for ‘permission’ on the new Thesaurus.com!,”
# www.thesaurus.com. [Online]. Available: https://www.thesaurus.com/browse/permission. 
## [Accessed: 19-Feb-2019].

semantic_bases = ["permission", 
                    "authorization", 
                    "authorize",
                    "consent",
                    "assent",
                    "permit",
                    "approve",
                    "grant",
                    "allow",
                    "certify"]

## Print Base Definitions

In [8]:
for base in semantic_bases:
    print (base)
    
    for (POS,Def) in dictionary.meaning(base).items():
        print("   ", POS, ":", Def, '\n')

permission




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


    Noun : ['approval to do something', 'the act of giving a formal (usually written'] 

authorization
    Noun : ['a document giving an official instruction or command', 'the power or right to give orders or make decisions', 'official permission or approval', 'the act of conferring legality or sanction or formal warrant'] 

authorize
    Verb : ['grant authorization or clearance for', 'give or delegate power or authority to'] 

consent
    Noun : ['permission to do something'] 

    Verb : ['give an affirmative reply to; respond favorably to'] 

assent
    Noun : ['agreement with a statement or proposal to do something'] 

    Verb : ['to agree or express agreement'] 

permit
    Noun : ['a legal document giving official permission to do something', 'the act of giving a formal (usually written', 'large game fish; found in waters of the West Indies'] 

    Verb : ['consent to, give permission', 'make it possible through a specific action or lack of action for something to happen', 'all

## Add Synonyms
Use WordNet to gather synonyms of the semantic clues.

In [9]:
def addWordNetSynsets(word_list):
    """  add synsets to new list """
    updated_list = []
    
    for word in word_list:
        updated_list.append(word)
        
        v_synonyms = wordnet.synsets(word, "v")
        v_synonyms = set(chain.from_iterable([word.lemma_names() for word in v_synonyms]))
        
        [updated_list.append(x) for x in v_synonyms if x not in updated_list]
        
    return set(updated_list)

In [10]:
extended_semantic_clues = addWordNetSynsets(semantic_bases)

In [11]:
# expect 50
print('unique elements in extended semantic clues:', 
      len(extended_semantic_clues), '\n')

unique elements in extended semantic clues: 50 



In [12]:
for clue in extended_semantic_clues:
    print(clue)

set_aside
leave
permit
deed_over
allow_for
accord
let
empower
award
authorization
allot
O.K.
give
manifest
accept
admit
demonstrate
grant
authorise
licence
go_for
endorse
reserve
evidence
pass
license
tolerate
yield
approve
okay
certify
assent
indorse
concede
allow
permission
cede
appropriate
clear
authorize
earmark
take_into_account
provide
acquiesce
consent
accede
attest
sanction
give_up
countenance


## Check Clue Similarities
Turns out not to be very helpful

In [13]:
clue_similarity_matrix = pd.DataFrame(index=extended_semantic_clues, 
                                      columns=extended_semantic_clues)

for index, row in clue_similarity_matrix.iterrows():
    for base in extended_semantic_clues:
        base_syn = wordnet.synsets(base)
        index_syn = wordnet.synsets(index)
        sim = wordnet.wup_similarity(index_syn[0], base_syn[0])
        
        if sim != None:
            row[base] = wordnet.wup_similarity(index_syn[0], base_syn[0])
        else:
            row[base] = 0.0
        
clue_similarity_matrix = clue_similarity_matrix.astype(float)

Unfortunately, this doesn't help us automate 'culling' of terms we don't really like.

In [14]:
clue_similarity_matrix.describe(include = 'all').transpose().sort_values('mean', ascending = False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sanction,50.0,0.30247,0.218631,0.133333,0.157051,0.211111,0.307692,1.0
permission,50.0,0.299393,0.210655,0.133333,0.157051,0.211111,0.307692,1.0
okay,50.0,0.286088,0.220545,0.125,0.145604,0.190909,0.285714,1.0
O.K.,50.0,0.286088,0.220545,0.125,0.145604,0.190909,0.285714,1.0
consent,50.0,0.280374,0.20481,0.125,0.145604,0.190909,0.285714,1.0
license,50.0,0.270695,0.218933,0.117647,0.135714,0.174242,0.266667,1.0
permit,50.0,0.270695,0.218933,0.117647,0.135714,0.174242,0.266667,1.0
authorization,50.0,0.268195,0.211005,0.117647,0.135714,0.174242,0.266667,1.0
manifest,50.0,0.268195,0.211005,0.117647,0.135714,0.174242,0.266667,1.0
countenance,50.0,0.259503,0.144245,0.133333,0.157051,0.211111,0.307692,1.0


## Remove Inapproriate Clues

Here we define a list of terms we will remove from the semantic clues.

In [15]:
"""
Define a list to remove selected words from the semantic clues. 
WordNet has it's weaknesses.
"""
poorly_fitting_semantic_clues = {
     'manifest',
     'yield',
     'demonstrate',
     'endorse',
     'take_into_account',
     'allow_for',
     'set_aside',
     'clear',
     'acquiesce',
     'indorse',
     'go_for',
     'earmark',
     'license',
     'reserve',
     'pass',
     'tolerate',
     'O.K.',
     'deed_over',
     'award',
     'evidence',
     'appropriate',
     'leave',
     'give_up',
     'empower',
     'okay',
     'countenance'
}

In [16]:
permission_extended = list(poorly_fitting_semantic_clues ^ extended_semantic_clues)

In [17]:
print('num terms after culling: ', len(permission_extended), '\n')

for term in permission_extended:
    print(term)

num terms after culling:  24 

permit
accord
let
authorization
allot
give
accept
admit
grant
authorise
licence
approve
certify
assent
concede
allow
permission
cede
authorize
provide
consent
accede
attest
sanction


## Check definitions

In [18]:
for clue in permission_extended:
    print(clue)
    
    for (POS,Def) in dictionary.meaning(clue).items():
        print("   ", POS, ":", Def, '\n')

permit
    Noun : ['a legal document giving official permission to do something', 'the act of giving a formal (usually written', 'large game fish; found in waters of the West Indies'] 

    Verb : ['consent to, give permission', 'make it possible through a specific action or lack of action for something to happen', 'allow the presence of or allow (an activity'] 

accord
    Noun : ["harmony of people's opinions or actions or characters", 'concurrence of opinion', 'a written agreement between two states or sovereigns', 'sympathetic compatibility'] 

    Verb : ['go together', 'allow to have'] 

let
    Noun : ['a brutal terrorist group active in Kashmir; fights against India with the goal of restoring Islamic rule of India', "a serve that strikes the net before falling into the receiver's court; the ball must be served again"] 

    Verb : ['make it possible through a specific action or lack of action for something to happen', 'actively cause something to happen', 'consent to, give perm

    Verb : ['give over; surrender or relinquish to the physical control of another', 'relinquish possession or control over'] 

authorize
    Verb : ['grant authorization or clearance for', 'give or delegate power or authority to'] 

provide
    Verb : ['give something useful or necessary to', 'give what is desired or needed, especially support, food or sustenance', 'determine (what is to happen in certain contingencies', 'mount or put up', 'make a possibility or provide opportunity for; permit to be attainable or cause to remain', 'supply means of subsistence; earn a living', 'take measures in preparation for'] 

consent
    Noun : ['permission to do something'] 

    Verb : ['give an affirmative reply to; respond favorably to'] 

accede
    Verb : ["yield to another's wish or opinion", 'take on duties or office', 'to agree or express agreement'] 

attest
    Verb : ["provide evidence for; stand as proof of; show by one's behavior, attitude, or external attributes", 'authenticate, aff

## build consent form data structure
get dataframe from each consent form on file

In [19]:
def getData(directory):
    """ returns a dataframe with cleaned filenames, full paths,
    and unprocessed text """
    
    new_rows = []
    
    fileID = 0
    
    # iterate through directory
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            
            fileID += 1
            filepath = subdir + os.sep + file
            
            if filepath.endswith('.txt'):
                
                # perform string operations on each file
                with open(filepath, 'r') as myfile:
                    data = myfile.read().replace('\n', ' ')       
                
                new_rows.append(
                    {
                        'id': fileID,
                        'name':str(file),
                        'path':filepath,
                        'rawText':data
                    }
                )
    
    return pd.DataFrame(new_rows)

In [20]:
"""
NOTE: running this cell will re-load the data in the dataframe from the dir.
"""
# run the function and store to variable 
df = getData(forms_dir)

# set the index as the id, for future access
df = df.set_index('id')
df.head()

Unnamed: 0_level_0,name,path,rawText
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,TAMU - HRPP Informed consent.txt,../input_data/all_forms/TAMU - HRPP Informed c...,﻿NOT INTENDED FOR USE WITHOUT TAMU/BCD IRB APP...
2,Potomac Primary Care_flu-consent-form.txt,../input_data/all_forms/Potomac Primary Care_f...,PATIENT CONSENT FORM FOR SEASONAL INFLUENZA VA...
3,OSU_Scheduled_Delivery_Consent.txt,../input_data/all_forms/OSU_Scheduled_Delivery...,SCHEDULED DELIVERY: Today’s Date: Da...
4,consent_biorepository_12-19-14.txt,../input_data/all_forms/consent_biorepository_...,﻿ Informed Consent Form and HIPAA Authorizatio...
5,Cambridge_Consent_endodontics2.txt,../input_data/all_forms/Cambridge_Consent_endo...,INFORMAT IONAL USE ONLY CONSENT FOR ENDODONTI...


## Random sampling to speed up development
This will be removed when we want to process the whole corpus.

In [21]:
# comment out to run on whole document collection
n_samples = 15
df = df.sample(n=n_samples)

## Add Clean Text To DataFrame

In [22]:
def minimalTextCleaning(row, field):
    """ perform minimal text processing on raw data to new field """
    
    cleaned_text = str(row[field]).lower() # lowercase
    cleaned_text = re.sub(' +', ' ', cleaned_text) # strip redundant whitespace
    cleaned_text.replace("_", "") # we don't want signature lines
    return cleaned_text

df['minimalCleaning'] = df.apply(lambda row:minimalTextCleaning(row, 'rawText'),axis=1)

In [23]:
df.head()

Unnamed: 0_level_0,name,path,rawText,minimalCleaning
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
192,TAMU - Parental Permission Form.txt,../input_data/all_forms/TAMU - Parental Permis...,﻿ Project Title: You are invited to take par...,﻿ project title: you are invited to take part ...
671,Lenox Hill_Transfusion Consent.txt,../input_data/all_forms/Lenox Hill_Transfusion...,North Shore-LIJ Health System is now Northwell...,north shore-lij health system is now northwell...
711,New_York_City_Health_and_Hospitals_Corporation...,../input_data/all_forms/New_York_City_Health_a...,NEW YORK CITY HEALTH AND HOSPITALS CORPORATION...,new york city health and hospitals corporation...
411,HSIRB-IC-Template-and-Instructions-3-6-17.txt,../input_data/all_forms/HSIRB-IC-Template-and-...,﻿This document should be used to develop a con...,﻿this document should be used to develop a con...
709,CHMC_Per-Consent-Rev (1).txt,../input_data/all_forms/CHMC_Per-Consent-Rev (...,Patient Label !920001! Patient Last Name Pat...,patient label !920001! patient last name patie...


In [24]:
df['minimalCleaning'].iloc[0]

"\ufeff project title: you are invited to take part in a research study being conducted by investigator name, a researcher from texas a&m university and funded by [name sponsor/funding source]. the information in this form is provided to help you and your child decide whether or not to take part. if you decide to allow your child to take part in the study, you will be asked to sign this permission form. if you decide you do not want your child to participate, there will be no penalty to you or your child, and your child will not lose any benefits they normally would have. why is this study being done? the purpose of this study is to purpose. why is my child being asked to be in this study? your child is being asked to be in this study because inclusion/exclusion criteria. how many people will be asked to be in this study? number people (participants) will be invited to participate in this study locally. overall, a total of number people will be invited at number/multiple study centers 

## Convert Raw Text to Spacy Object
Once for the raw text, once for the cleaned text.

In [29]:
convertFrom = 'rawText'
convertTo = convertFrom + 'DOC'

def getDocObjects(row, field):
    " return spacy doc object"
    doc = nlp_larg(str(row[field]).lower())
    return doc

df[convertTo] = df.apply(lambda row:getDocObjects(row, convertFrom),axis=1)

In [30]:
convertFrom = 'minimalCleaning'
convertTo = convertFrom + 'DOC'

def getDocObjects(row, field):
    " return spacy doc object"
    doc = nlp_larg(str(row[field]).lower())
    return doc

df[convertTo] = df.apply(lambda row:getDocObjects(row, convertFrom),axis=1)

In [31]:
df.head()

Unnamed: 0_level_0,name,path,rawText,minimalCleaning,rawTextdocOB,minimalCleaningdocOB,rawTextDOC,minimalCleaningDOC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
192,TAMU - Parental Permission Form.txt,../input_data/all_forms/TAMU - Parental Permis...,﻿ Project Title: You are invited to take par...,﻿ project title: you are invited to take part ...,"(﻿, project, title, :, , you, are, invited, ...","(﻿, project, title, :, you, are, invited, to, ...","(﻿, project, title, :, , you, are, invited, ...","(﻿, project, title, :, you, are, invited, to, ..."
671,Lenox Hill_Transfusion Consent.txt,../input_data/all_forms/Lenox Hill_Transfusion...,North Shore-LIJ Health System is now Northwell...,north shore-lij health system is now northwell...,"(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now..."
711,New_York_City_Health_and_Hospitals_Corporation...,../input_data/all_forms/New_York_City_Health_a...,NEW YORK CITY HEALTH AND HOSPITALS CORPORATION...,new york city health and hospitals corporation...,"(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp..."
411,HSIRB-IC-Template-and-Instructions-3-6-17.txt,../input_data/all_forms/HSIRB-IC-Template-and-...,﻿This document should be used to develop a con...,﻿this document should be used to develop a con...,"(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo..."
709,CHMC_Per-Consent-Rev (1).txt,../input_data/all_forms/CHMC_Per-Consent-Rev (...,Patient Label !920001! Patient Last Name Pat...,patient label !920001! patient last name patie...,"(patient, label, , !, 920001, !, patient, las...","(patient, label, !, 920001, !, patient, last, ...","(patient, label, , !, 920001, !, patient, las...","(patient, label, !, 920001, !, patient, last, ..."


## Extract Sentences

In [32]:
getFrom = 'rawTextDOC'
convertTo = getFrom + '_sentenceList'

def getSentenceList(row, field):
    """ return list of sentences from doc object; each item will be token span """
    return list(row[field].sents)

df[convertTo] = df.apply(lambda row:getSentenceList(row, getFrom),axis=1)

In [33]:
df.head()

Unnamed: 0_level_0,name,path,rawText,minimalCleaning,rawTextdocOB,minimalCleaningdocOB,rawTextDOC,minimalCleaningDOC,rawTextDOC_sentenceList
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
192,TAMU - Parental Permission Form.txt,../input_data/all_forms/TAMU - Parental Permis...,﻿ Project Title: You are invited to take par...,﻿ project title: you are invited to take part ...,"(﻿, project, title, :, , you, are, invited, ...","(﻿, project, title, :, you, are, invited, to, ...","(﻿, project, title, :, , you, are, invited, ...","(﻿, project, title, :, you, are, invited, to, ...","[(﻿, project, title, :, , you, are, invited,..."
671,Lenox Hill_Transfusion Consent.txt,../input_data/all_forms/Lenox Hill_Transfusion...,North Shore-LIJ Health System is now Northwell...,north shore-lij health system is now northwell...,"(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","[(north, shore, -, lij, health, system, is, no..."
711,New_York_City_Health_and_Hospitals_Corporation...,../input_data/all_forms/New_York_City_Health_a...,NEW YORK CITY HEALTH AND HOSPITALS CORPORATION...,new york city health and hospitals corporation...,"(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","[(new, york, city, health, and, hospitals, cor..."
411,HSIRB-IC-Template-and-Instructions-3-6-17.txt,../input_data/all_forms/HSIRB-IC-Template-and-...,﻿This document should be used to develop a con...,﻿this document should be used to develop a con...,"(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","[(﻿this, document, should, be, used, to, devel..."
709,CHMC_Per-Consent-Rev (1).txt,../input_data/all_forms/CHMC_Per-Consent-Rev (...,Patient Label !920001! Patient Last Name Pat...,patient label !920001! patient last name patie...,"(patient, label, , !, 920001, !, patient, las...","(patient, label, !, 920001, !, patient, last, ...","(patient, label, , !, 920001, !, patient, las...","(patient, label, !, 920001, !, patient, last, ...","[(patient, label, , !), (920001, !), (patient..."


In [34]:
for list_of_setences in df['rawTextDOC_sentenceList'].head(1):
    for sentence in list_of_setences[:10]:
        print(sentence, '\n')

﻿ project title:   you are invited to take part in a research study being conducted by investigator name, a researcher from texas a&m university and funded by [name sponsor/funding source]. 

the information in this form is provided to help you and your child decide whether or not to take part. 

if you decide to allow your child to take part in the study, you will be asked to sign this permission form. 

if you decide you do not want your child to participate, there will be no penalty to you or your child, and your child will not lose any benefits they normally would have.   

why is this study being done? 

the purpose of this study is to purpose.   

why is my child being asked to be in this study?   

your child is being asked to be in this study because inclusion/exclusion criteria.     

how many people will be asked to be in this study? 

number people (participants) will be invited to participate in this study locally. 



In [35]:
"""
Currently, this is an unused, but potentially helpful function.
"""

def getWordListasString(row):
    """ return a list of words, including duplicates.
    NOTE: light cleaning on ingestion """
    wordList = [] 
    
    for word in row['rawText'].split():
        word = re.sub("[^a-zA-Z]+", " ", word).strip().lower()
        if not word == "":
            wordList.append(word)
            
    return " ".join(wordList)

In [36]:
getFrom = 'rawTextDOC_sentenceList'
convertTo = getFrom + '_stripped'
threshold = 5 # inlcudes whitespace

def removeShortSentences(row, field, length_threshold):
    """  remove extremely short sentences, as they are likely not helpful """
    
    stripped_sent = []
    
    for sentence in row[field]:
        if len(sentence.text) >= length_threshold:
            stripped_sent.append(sentence)
            
    return stripped_sent

df[convertTo] = df.apply(lambda row:removeShortSentences(row, getFrom, threshold),axis=1)

In [37]:
df.head()

Unnamed: 0_level_0,name,path,rawText,minimalCleaning,rawTextdocOB,minimalCleaningdocOB,rawTextDOC,minimalCleaningDOC,rawTextDOC_sentenceList,rawTextDOC_sentenceList_stripped
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
192,TAMU - Parental Permission Form.txt,../input_data/all_forms/TAMU - Parental Permis...,﻿ Project Title: You are invited to take par...,﻿ project title: you are invited to take part ...,"(﻿, project, title, :, , you, are, invited, ...","(﻿, project, title, :, you, are, invited, to, ...","(﻿, project, title, :, , you, are, invited, ...","(﻿, project, title, :, you, are, invited, to, ...","[(﻿, project, title, :, , you, are, invited,...","[(﻿, project, title, :, , you, are, invited,..."
671,Lenox Hill_Transfusion Consent.txt,../input_data/all_forms/Lenox Hill_Transfusion...,North Shore-LIJ Health System is now Northwell...,north shore-lij health system is now northwell...,"(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","(north, shore, -, lij, health, system, is, now...","[(north, shore, -, lij, health, system, is, no...","[(north, shore, -, lij, health, system, is, no..."
711,New_York_City_Health_and_Hospitals_Corporation...,../input_data/all_forms/New_York_City_Health_a...,NEW YORK CITY HEALTH AND HOSPITALS CORPORATION...,new york city health and hospitals corporation...,"(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","(new, york, city, health, and, hospitals, corp...","[(new, york, city, health, and, hospitals, cor...","[(new, york, city, health, and, hospitals, cor..."
411,HSIRB-IC-Template-and-Instructions-3-6-17.txt,../input_data/all_forms/HSIRB-IC-Template-and-...,﻿This document should be used to develop a con...,﻿this document should be used to develop a con...,"(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","(﻿this, document, should, be, used, to, develo...","[(﻿this, document, should, be, used, to, devel...","[(﻿this, document, should, be, used, to, devel..."
709,CHMC_Per-Consent-Rev (1).txt,../input_data/all_forms/CHMC_Per-Consent-Rev (...,Patient Label !920001! Patient Last Name Pat...,patient label !920001! patient last name patie...,"(patient, label, , !, 920001, !, patient, las...","(patient, label, !, 920001, !, patient, last, ...","(patient, label, , !, 920001, !, patient, las...","(patient, label, !, 920001, !, patient, last, ...","[(patient, label, , !), (920001, !), (patient...","[(patient, label, , !), (920001, !), (patient..."


## Check Removal of Sents
Here we check to see how many sentences were removed using thresholding.

In [38]:
for index, row in df.iterrows():
    print(index, 'all sents: ', len(row['rawTextDOC_sentenceList']))
    print(index, 'stripped sents: ', len(row['rawTextDOC_sentenceList_stripped']))
    print()

192 all sents:  420
192 stripped sents:  104

671 all sents:  670
671 stripped sents:  29

711 all sents:  7
711 stripped sents:  6

411 all sents:  804
411 stripped sents:  549

709 all sents:  300
709 stripped sents:  81

104 all sents:  18
104 stripped sents:  18

118 all sents:  1011
118 stripped sents:  162

540 all sents:  390
540 stripped sents:  191

674 all sents:  25
674 stripped sents:  24

455 all sents:  240
455 stripped sents:  22

358 all sents:  189
358 stripped sents:  104

466 all sents:  158
466 stripped sents:  26

42 all sents:  49
42 stripped sents:  42

178 all sents:  148
178 stripped sents:  27

39 all sents:  512
39 stripped sents:  370



## preliminary permisison sentence extraction
look for sentences that have a word from the 'clues' list

## TODO: remove duplicates, add count

### NOTE: DO WE WANT SENTENCE PAIRS? THREE SENTENCES?

In [40]:
getCandidates_from = 'rawTextDOC_sentenceList_stripped'

def getPossiblePermissions(row, permissions_list):
    """ return list of sentences containing the 
    permissions words """
    
    candidates = []
    
    # iterate through a list of sentences
    for sent in row[getCandidates_from]:
        
        # iterate through each clue
        for clue in permissions_list:     
            
            # check if sting contains any clues (need to match case)
            if sent.text.__contains__(clue):
                """ NOTE: right not if ANY clue is found, this is enough """
                
                candidates.append(sent)
                break
    
    return candidates
                
# df.apply(lambda row:getPossiblePermissions(row, permission_extended),axis=1)

df['candidatePermissionStatements'] = df.apply(lambda row:getPossiblePermissions(row, permission_extended),axis=1)

In [41]:
df['candidatePermissionStatements'].head(2)

id
192    [(the, information, in, this, form, is, provid...
671    [(north, shore, -, lij, health, system, is, no...
Name: candidatePermissionStatements, dtype: object

In [42]:
for sent in df['candidatePermissionStatements'][:10]:
    [print('sent:', x.text, '\n') for x in sent[:2]]

sent: the information in this form is provided to help you and your child decide whether or not to take part. 

sent: if you decide to allow your child to take part in the study, you will be asked to sign this permission form. 

sent: north shore-lij health system is now northwell health  consent to blood transfusion 1.   

sent: i have had the opportunity to ask questions, and i consent to the transfusion(s).   

sent: ﻿this document should be used to develop a consent form for: * investigator-initiated studies  * other studies where there is no consent template available from an industry sponsor or cooperative group  if there is a consent template available from the sponsor or cooperative group, use the hsirb informed consent template for industry-sponsored, cooperative group, or external irb submissions (available at http://oprs.usc.edu/hsirb/hsirb-forms).     

sent: general instructions for using this template  1. delete all instructions and examples (in red italics) and delete al

In [None]:
# def getEstimatedPermissionDesnity(row):
#     """ return proportion of sentence total that may
#     be statements indicating perissions. Expect much noise. """
     
#     return len(row['permissionsStatements'])/len(row['sentList'])
    
# df['permissionDensity'] = df.apply(lambda row: getEstimatedPermissionDesnity(row),axis=1)

# df.head(2)

In [None]:
# %matplotlib inline
# df['permissionDensity'].hist()

## Important
this represents an important break away from the primary dataframe. here i start to work with list structures to create a list of possible permissions that is no longer tied to filenames

In [None]:
all_sents = []

for sentlist in df['sentList'].to_list():
    [all_sents.append(x) for x in sentlist]
    
all_permissions = []

for permlist in df['permissionsStatements'].to_list():
    [all_permissions.append(x) for x in permlist]

In [None]:
print('total rough draft permissions: ', len(all_permissions))
print('unique rough draft permissions: ', len(list(set(all_permissions))))

print('total rough draft sentences: ', len(all_sents))
print('unique rough draft sentences: ', len(list(set(all_sents))))

In [None]:
# remove duplucates
all_permissions = list(set(all_permissions))

## pair-wise similarity between our permission 'guesses' and remaining sentences
this is an important step so that we don't restrict ourselves to permission statements that contain the words we dreampt up

# WARNING: WILL NOT SCALE, need to fix


In [None]:
new_rows = []

for sent in all_sents:
    for perm in all_permissions:
        row = {
            'permission':perm,
            'sentence':sent,
            'similarity':sent.similarity(perm)
        }
        new_rows.append(row)

In [None]:
sf = pd.DataFrame(new_rows)
sf.head()

In [None]:
sf['similarity'].hist()

In [None]:
# reduce dataframe, aggregate by mean sim score (weak sents will balance out)
sf = sf.drop(columns=['permission'])
sf = sf.groupby(['sentence'], as_index=False).mean()

sf.head()

In [None]:
sf['similarity'].hist()

In [None]:
high_sim = sf[sf.similarity > .7]
candidates = high_sim['sentence'].to_list()

In [None]:
print('total rough draft candidates: ', len(candidates))
print('unique rough draft candidates: ', len(list(set(candidates))))

In [None]:
[print('sent: ', x, '\n') for x in candidates[:5]]

In [None]:
rough_draft_permissions = all_permissions + candidates

In [None]:
print('total rough draft candidates: ', len(rough_draft_permissions))
print('unique rough draft candidates: ', len(list(set(rough_draft_permissions))))

In [None]:
rough_draft_permissions = list(set(rough_draft_permissions))

## print statements to file

In [None]:
# import csv
# import datetime
# today = str(datetime.date.today())

# file_path = 'statements-'+today+'.csv'
# print(file_path)

# with open(file_path, 'a') as outcsv:   
#     #configure writer to write standard csv file
#     writer = csv.writer(outcsv, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
#     writer.writerow(['raw_text', 'clean_text'])
#     for perm in rough_draft_permissions:
#         #Write item to outcsv
#         raw_text = perm.text
#         clean_text = re.sub('\s+', ' ', re.sub('\W+',' ',\
#                                re.sub('[^A-Za-z0-9]+',' ', \
#                                       re.sub(r'\d+', " ", raw_text)))).strip()
#         writer.writerow([raw_text, clean_text])

# Annotations?

This is the point where injecting further information via annotations may be a good idea. This then would require in import of another local file and a small methods section.

## matrix representations

a few different matrix representations of the permissions sentence

In [None]:
def cleanPermissions(rough_draft_permissions):
    "string cleaning on permissions for td-idf ingestion"
    corpus = []
    
    
    for permission in rough_draft_permissions:
        cleanSentence = []
        for word in permission.text.split():
            word = re.sub("[^a-zA-Z]+", " ", word).strip().lower()
            if not word == "":
                cleanSentence.append(word)
        corpus.append(" ".join(cleanSentence))
    return corpus
            
corpus = cleanPermissions(rough_draft_permissions)

[print(x, '\n') for x in corpus[:10]]

## NOTE: 
`minTermFrequencyThreshold = 0` will result in all possible n_grams and will not scale as input size or ngramSize increases. However, it is the most robust representation of the sentence, and is worth exploring for the time being...

In [None]:
ngramSize = 5
maxTermFrequencyThreshold = .8
minTermFrequencyThreshold = .001

def getTDIDFMatrix(corpus, ngram_range, max_df, min_df):
    """ return td-idf matrix and terms """
    
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, 
                                       ngram_range=(1,ngram_range),
                                       max_df=max_df,min_df=min_df)
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    terms = tfidf_vectorizer.get_feature_names()
    
    return tfidf_matrix, terms

# save to a variable 
tdidf_matrix, tdidf_terms = getTDIDFMatrix(corpus, 
                                           ngramSize, 
                                           maxTermFrequencyThreshold,
                                           minTermFrequencyThreshold)

# # print tests
# print('\nfirst few terms:')
# [print(" ", x) for x in tdidf_terms[:10]]

print('\nNumber of terms:', len(tdidf_terms))

In [None]:
sdf = pd.SparseDataFrame(tdidf_matrix, columns=tdidf_terms)

# here we add the sentences back in
sdf['sent'] = corpus

sdf.head()

In [None]:
permission_vectors = []

for perm in rough_draft_permissions:
#     print(perm.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])) # same features, different rows
#     print(perm.vector) # same shape
#     print(perm.vector_norm) # single value
#     print(perm.get_lca_matrix()) # differnt shapes
    permission_vectors.append(perm.vector)