# Dependencies

In [82]:
import sys
import os
import json
import pandas as pd
import hashlib
import spacy
import re
import random
from importlib import reload
import xlsxwriter
from datetime import datetime
from collections import defaultdict

# local file `futils` for utility functions
import futils

# Load Raw Data

In [52]:
"""
Get a list of files from the `data/` directory.
Each is a `.json` file with annotations for multiple informed consent documents.
Each `.json` file holds the annotations of a single annotator.
"""

data_dir = "../data/"
json_file_list = []

# walk `data_dir` recursively and record the path of every file found
for folder, _subfolders, filenames in os.walk(data_dir):
    json_file_list.extend(os.path.join(folder, filename) for filename in filenames)

futils.printl(json_file_list)

../data/2020-01-21_Random_46-60_KATHLEEN.json
../data/2020-01-20_Random_1-15_Kathleen.json
../data/2020-01-21_Random_46-60_LIZ.json
../data/2020-01-21_Random_16-30_LIZ.json
../data/2020-01-20_Random_16-30_KATHLEEN.json
../data/2020-01-27_Random_76-90_KATHLEEN.json
../data/2020-01-21_Random_31-45_KATHLEEN.json
../data/2020-01-27_Random_76-90_LIZ.json
../data/2020-01-27_Random_61-75_KATHLEEN.json
../data/2020-01-21_Random_31-45_KAYCEE.json
../data/2020-01-28_Random_91-105_KAYCEE.json
../data/2020-01-27_Random_76-90_KAYCEE.json
../data/2020-01-28_Random_91-105_LIZ.json
../data/2020-01-28_Random_106-120_KATHLEEN.json
../data/2020-01-21_Random_1-15_LIZ.json
../data/2020-01-21_Random_16-30_KAYCEE.json
../data/2020-01-27_Random_61-75_KAYCEE.json
../data/2020-01-21_Random_31-45_LIZ.json
../data/2020-01-28_Random_106-120_LIZ.json
../data/2020-01-21_Random_46-60_KAYCEE.json
../data/2020-01-28_Random_91-105_KATHLEEN.json
../data/2020-01-28_Random_121-134_KAYCEE.json
../data/2020-01-21_Random_1-15

In [53]:
"""
Get the list of annotators and format their names.

This builds a dictionary keyed by formatted annotator name that records the
`.json` files each annotator annotated.
"""
# pick up any edits made to the local `futils` module since it was imported
reload(futils)

# Maps formatted annotator name -> {'json_files': [paths of that annotator's files]}.
# A plain dict is enough: each annotator's record is created explicitly with
# setdefault the first time the name is seen.
annotators = {}

for file in json_file_list:
    name = futils.format_annotator_name(file)
    annotators.setdefault(name, {'json_files': []})['json_files'].append(file)

futils.printl(annotators, True)

KATHLEEN {'json_files': ['../data/2020-01-21_Random_46-60_KATHLEEN.json', '../data/2020-01-20_Random_1-15_Kathleen.json', '../data/2020-01-20_Random_16-30_KATHLEEN.json', '../data/2020-01-27_Random_76-90_KATHLEEN.json', '../data/2020-01-21_Random_31-45_KATHLEEN.json', '../data/2020-01-27_Random_61-75_KATHLEEN.json', '../data/2020-01-28_Random_106-120_KATHLEEN.json', '../data/2020-01-28_Random_91-105_KATHLEEN.json', '../data/2020-01-28_Random_121-134_KATHLEEN.json']} 

LIZ {'json_files': ['../data/2020-01-21_Random_46-60_LIZ.json', '../data/2020-01-21_Random_16-30_LIZ.json', '../data/2020-01-27_Random_76-90_LIZ.json', '../data/2020-01-28_Random_91-105_LIZ.json', '../data/2020-01-21_Random_1-15_LIZ.json', '../data/2020-01-21_Random_31-45_LIZ.json', '../data/2020-01-28_Random_106-120_LIZ.json', '../data/2020-01-28_Random_121-134_LIZ.json', '../data/2020-01-27_Random_61-75_LIZ.json']} 

KAYCEE {'json_files': ['../data/2020-01-21_Random_31-45_KAYCEE.json', '../data/2020-01-28_Random_91-105_

In [54]:
# show the keys of `annotators`, i.e. the formatted annotator names
print(annotators.keys())

dict_keys(['KATHLEEN', 'LIZ', 'KAYCEE'])


In [55]:
%time
"""
Load json objects into a 'raw' format for processing
"""

reload(futils)

for annotator in annotators:
    df_list = []
    annotators[annotator]['document_map'] = {}
    for file in annotators[annotator]['json_files']:
        
        tmp_df, doc_map = futils.load_annotations(annotator, file)
        df_list.append(tmp_df)
        
        for doc_id, content in doc_map.items():
            if doc_id not in annotators[annotator]['document_map']:
                annotators[annotator]['document_map'][doc_id] = content
        
    annotations = pd.concat(df_list, ignore_index=True)
     # add prepared annotation tables 
    annotators[annotator]['annotations'] = annotations
    
print(annotators.keys())

print(annotators['KATHLEEN'].keys())
print(annotators['LIZ'].keys())
print(annotators['KAYCEE'].keys())

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs
dict_keys(['KATHLEEN', 'LIZ', 'KAYCEE'])
dict_keys(['json_files', 'document_map', 'annotations'])
dict_keys(['json_files', 'document_map', 'annotations'])
dict_keys(['json_files', 'document_map', 'annotations'])


In [56]:
"""
check all document maps for consistency
"""

# number of distinct documents seen by each annotator
for annotator, _dict in annotators.items():
    print(annotator, len(_dict['document_map']))

# Check for individual differences: each printed value is the number of
# documents the first annotator has that the second one lacks.
# All zeros is good: all original documents are represented.
doc_ids = {name: set(record['document_map']) for name, record in annotators.items()}
for first, second in (('KATHLEEN', 'KAYCEE'), ('KATHLEEN', 'LIZ'), ('LIZ', 'KAYCEE')):
    print(len(doc_ids[first] - doc_ids[second]))

KATHLEEN 134
LIZ 134
KAYCEE 134
0
0
0


In [57]:
# preview the first rows of one annotator's annotation table
annotators['KATHLEEN']['annotations'].head()

Unnamed: 0,doc_id,json_filename,KATHLEEN_A,KATHLEEN_B,KATHLEEN_C,KATHLEEN_start,KATHLEEN_end,KATHLEEN_text
0,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,2613,2795,"By signing this form, I am requesting and givi..."
1,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,0,0,0,366,558,I understand that blood or blood products will...
2,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,323,516,I authorize the release of any and all medical...
3,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,104,321,I voluntarily consent to medical care of a rou...
4,36073164,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,4573,4777,Your signature below indicates that you unders...


In [99]:
"""
build the reference structure
"""

# One entry per document id, seeded with the raw content taken from
# KATHLEEN's document map.
document_map = {
    doc_id: {'raw_content': raw_content}
    for doc_id, raw_content in annotators['KATHLEEN']['document_map'].items()
}

document_map.keys()

dict_keys([95581557, 69408590, 36073164, 37497740, 24000641, 784359, 35262426, 46217133, 23099502, 97398937, 64616759, 58795983, 25969537, 94407046, 99175728, 31716101, 98303194, 27509790, 30924115, 6848654, 71475014, 58411629, 64073684, 15463536, 68411469, 62604978, 35790608, 24397520, 1775988, 70530481, 23440484, 2843282, 85954293, 84061096, 560211, 56347879, 5156065, 93118497, 17651082, 24229633, 25946820, 41320281, 16450710, 52351579, 22857905, 9849800, 96936919, 26577152, 23981145, 7319132, 60208881, 74909134, 7863434, 37431393, 95943802, 30820132, 26369189, 73758940, 60696396, 96278299, 38835297, 75494187, 32141700, 96480505, 69390110, 85095297, 67680881, 7960085, 79490243, 24677931, 66694691, 62892380, 17582431, 10532027, 33035501, 20918211, 80604657, 89016020, 58371701, 39000652, 59904054, 99272690, 2803036, 82742889, 71667445, 73694126, 7405868, 43930936, 2085743, 72274887, 78061880, 11170355, 42082024, 8412981, 65104204, 63443341, 67287213, 65639567, 30814585, 11421395, 21614

In [59]:
"""
load spacy large lib
"""
# `nlp` is the spaCy pipeline used below to parse each document's raw content
nlp = spacy.load('en_core_web_lg')

In [100]:
%time
for doc_id, _dict in document_map.items():
    document_map[doc_id]['spacy_obj'] = nlp(_dict['raw_content'])
    document_map[doc_id]['sentence_list'] = list(document_map[doc_id]['spacy_obj'].sents)
                                                                                                                            
document_map.keys()  

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs


dict_keys([95581557, 69408590, 36073164, 37497740, 24000641, 784359, 35262426, 46217133, 23099502, 97398937, 64616759, 58795983, 25969537, 94407046, 99175728, 31716101, 98303194, 27509790, 30924115, 6848654, 71475014, 58411629, 64073684, 15463536, 68411469, 62604978, 35790608, 24397520, 1775988, 70530481, 23440484, 2843282, 85954293, 84061096, 560211, 56347879, 5156065, 93118497, 17651082, 24229633, 25946820, 41320281, 16450710, 52351579, 22857905, 9849800, 96936919, 26577152, 23981145, 7319132, 60208881, 74909134, 7863434, 37431393, 95943802, 30820132, 26369189, 73758940, 60696396, 96278299, 38835297, 75494187, 32141700, 96480505, 69390110, 85095297, 67680881, 7960085, 79490243, 24677931, 66694691, 62892380, 17582431, 10532027, 33035501, 20918211, 80604657, 89016020, 58371701, 39000652, 59904054, 99272690, 2803036, 82742889, 71667445, 73694126, 7405868, 43930936, 2085743, 72274887, 78061880, 11170355, 42082024, 8412981, 65104204, 63443341, 67287213, 65639567, 30814585, 11421395, 21614

In [101]:
"""
build the master data frame
"""

# One row per sentence of every document; sentences are numbered from 1 within
# their document and that number forms the suffix of `sent_id`.
new_rows = [
    {
        'doc_id': doc_id,
        'sent_obj': sent,
        'sentence': sent.text,
        'start': sent.start_char,
        'word_count': len(sent),
        'char_count': len(sent.text),
        'sent_id': f"{doc_id}_{count}",
    }
    for doc_id, doc in document_map.items()
    for count, sent in enumerate(doc['sentence_list'], start=1)
]

df = pd.DataFrame(new_rows)
# the master table is stored alongside the per-document entries
document_map['master'] = df
document_map['master'].head()
    

Unnamed: 0,doc_id,sent_obj,sentence,start,word_count,char_count,sent_id
0,95581557,"(Ct, ), MERCY, HEALTH, \n, GRAND, RAPIDS, ,, M...","Ct) MERCY HEALTH \nGRAND RAPIDS, Ml \nCONSENT ...",0,23,106,95581557_1
1,95581557,"(AND, /, OR, BLOOD, PRODUCTS, \n)",AND/OR BLOOD PRODUCTS \n,107,6,23,95581557_2
2,95581557,"(I, understand, that, I, need, or, may, need, ...",I understand that I need or may need blood and...,130,27,130,95581557_3
3,95581557,"(This, hospital, admission, D, Outpatient, :, ...",This hospital admission D Outpatient: Series o...,261,12,66,95581557_4
4,95581557,"(_, _, _, to)",___to,327,4,5,95581557_5


In [102]:
"""
do some cleaning on the sentences
"""

def clean_sentence(sent):
    """Return an ASCII-only, whitespace-normalised copy of a sentence.

    Args:
        sent: The raw sentence; it is converted with ``str()`` first.

    Returns:
        The text with every non-ASCII character replaced by a space, leading
        and trailing whitespace removed, and every run of whitespace
        (including newlines and tabs) collapsed to a single space. ASCII
        punctuation, including question marks, is left untouched.
    """
    # Replace non-ASCII characters with spaces directly. Encoding with
    # errors='replace' and then blanking every '?' would also erase the
    # genuine question marks in the text.
    ascii_only = re.sub(r'[^\x00-\x7F]', ' ', str(sent))
    # split() with no arguments trims the ends and collapses whitespace runs
    return " ".join(ascii_only.split())

# add a cleaned copy of every sentence as a new column of the master table
document_map['master']['clean_sentence'] = df['sentence'].map(clean_sentence)
document_map['master'].head()

Unnamed: 0,doc_id,sent_obj,sentence,start,word_count,char_count,sent_id,clean_sentence
0,95581557,"(Ct, ), MERCY, HEALTH, \n, GRAND, RAPIDS, ,, M...","Ct) MERCY HEALTH \nGRAND RAPIDS, Ml \nCONSENT ...",0,23,106,95581557_1,"Ct) MERCY HEALTH GRAND RAPIDS, Ml CONSENT TO R..."
1,95581557,"(AND, /, OR, BLOOD, PRODUCTS, \n)",AND/OR BLOOD PRODUCTS \n,107,6,23,95581557_2,AND/OR BLOOD PRODUCTS
2,95581557,"(I, understand, that, I, need, or, may, need, ...",I understand that I need or may need blood and...,130,27,130,95581557_3,I understand that I need or may need blood and...
3,95581557,"(This, hospital, admission, D, Outpatient, :, ...",This hospital admission D Outpatient: Series o...,261,12,66,95581557_4,This hospital admission D Outpatient: Series o...
4,95581557,"(_, _, _, to)",___to,327,4,5,95581557_5,___to


In [103]:
"""
Inclusion exclusion rules
"""

def include_exclude(row):
    """Flag whether a sentence row passes the inclusion criteria.

    Args:
        row: A mapping-like row providing 'char_count' and 'sentence'.

    Returns:
        0 when 'char_count' is below 9 or 'sentence' contains no alphabetic
        character; 1 otherwise.
    """
    # rows shorter than 9 characters are excluded outright
    if row['char_count'] < 9:
        return 0

    # include the row as soon as one alphabetic character is found
    for character in row['sentence']:
        if character.isalpha():
            return 1
    return 0
    

# flag every row of the master table: 1 = include, 0 = exclude
document_map['master']['include'] = document_map['master'].apply(include_exclude, axis=1)
document_map['master'].head()

Unnamed: 0,doc_id,sent_obj,sentence,start,word_count,char_count,sent_id,clean_sentence,include
0,95581557,"(Ct, ), MERCY, HEALTH, \n, GRAND, RAPIDS, ,, M...","Ct) MERCY HEALTH \nGRAND RAPIDS, Ml \nCONSENT ...",0,23,106,95581557_1,"Ct) MERCY HEALTH GRAND RAPIDS, Ml CONSENT TO R...",1
1,95581557,"(AND, /, OR, BLOOD, PRODUCTS, \n)",AND/OR BLOOD PRODUCTS \n,107,6,23,95581557_2,AND/OR BLOOD PRODUCTS,1
2,95581557,"(I, understand, that, I, need, or, may, need, ...",I understand that I need or may need blood and...,130,27,130,95581557_3,I understand that I need or may need blood and...,1
3,95581557,"(This, hospital, admission, D, Outpatient, :, ...",This hospital admission D Outpatient: Series o...,261,12,66,95581557_4,This hospital admission D Outpatient: Series o...,1
4,95581557,"(_, _, _, to)",___to,327,4,5,95581557_5,___to,0


In [109]:
# spot-check a random sample of ten cleaned sentences
futils.printl(document_map['master']['clean_sentence'].sample(10), '\n')

the procedure(s) listed in #2 above including any tissue implants (please initial) Initial the appropriate box: I consent to a pregnancy testing (if appropriate). 

......, 

We understand that these problems also occur in 3-5% of children resulting from natural conception without PGD testing. 

I have read the foregoing information and understand it. 

 

I certify that I am the patient and that I have received a copy of this form. 

Refill requests are to be requested from the pharmacy by fax. 

_ 

M . 

I understand that this procedure is for purposes of diagnosis and/or treatment for (describe reasons for procedure): 



# TODO Manual Correction of forms
# TODO Match annotations to reference 