# Dependencies

In [1]:
import sys
import os
import json
import pandas as pd
import hashlib
import spacy
import re
import random
from importlib import reload
import xlsxwriter
from datetime import datetime
from collections import defaultdict

# local file `futils` for utility funcitons
import futils

# Load Raw Data

In [4]:
"""
Get a list of files from the `data/` directory.
Each is a `.json. file with annotations for multiple informed consent documents. 
Each `.json` file is a single annotator.
"""

class Raw_Annotations():
    """A class to help manage annotations from DataTurks"""
    
    def __init__(self, data_dir = "../data/", nlp_lib="en_core_web_lg"):
        """
        Args: 
            - data_dir (str): a path to a directory of json files 
                containing annotations        
        """
        self.nlp = spacy.load(nlp_lib)
        self.data_dir = data_dir
        self.annotated_files = self.get_json_files()
        self.annotators = self.get_annotators()
        
        # add attributes to hold annotations
        for annotator in self.annotators:
            setattr(self, f"{annotator}_list", [])
            setattr(self, f"{annotator}_df", None)

        self.load_annotations()
        self.build_frames()
        
        self.document_map = self.build_document_map()
        
        
    def format_annotator_name(self, filename):
        """A function to return the formatted name of an annotator given a consistently 
        named file.

        Note: this depends on files named like `_NAME.json`

        Args:
            - filename (str): a file name, expects name after last `_`
                character

        Returns:
            - name (str): a formated string 
        """
        return filename.split("_")[-1].split(".")[0].upper()

    def get_json_files(self):
        """A function to initialize a dictionary for storing annotations 

        Returns:
            - annotated_files (default dict): primary keys are annotators names
        """
        annotated_files = []

        for subdir, dirs, files in os.walk(self.data_dir):
                for file in files:
                    annotated_files.append(os.path.join(subdir, file))

        return annotated_files
    
    def get_annotators(self):
        """A function to get the names of annotators"""
        annotators = []
        for file in self.annotated_files:
            name = self.format_annotator_name(file)
            if name not in annotators:
                annotators.append(name)
                
        return annotators
    
    def get_document_id(self, raw_content):
        """A function to facilitate conversion of raw text into document ids

        Args:
             - raw_content (str): a sufficient portion of the document as to 
                 be unique with a high probability

         Returns:
             - doc_id (int): a document id
        """
        return int(hashlib.sha256(raw_content.encode('utf-8')).hexdigest(), 16) % 10**8
    
    def build_document_map(self):
        """ A function to get a map of documents and ids """
        
        document_map = {}
        
        for json_file in self.annotated_files:
             for annotated_doc in open(json_file):
                json_dump = json.loads(annotated_doc)
                # get raw content
                content = json_dump['content']
                doc_id = self.get_document_id(content)
                document_map[doc_id] = {'raw_content': content,
                                        'from_file':json_file}
                
        return document_map
    
    def get_dumps(self, json_file):
        """A function to return a list of json_dumps
        from a given json_file
        
        Args:
            - json_file (str): the path to the json file
            
        Returns:
            - json_dumps (list): list of json dumps 
        """
        return [json.loads(dump) for dump in open(json_file)]
    
    
    def _clean_raw_annotation_text(self, raw_text_annotation, char_thresh=9):
        """A function to clean sentences
        
        Args: 
            - raw_text_annotation (str): may be multi-sentence annotations
            - char_thresh (int): how many character must the annotation be?
            
        Returns:
            - clean_list (list): a list of clean sentences   
        """
        dirty_str = str(raw_text_annotation).strip().encode(encoding = 'ascii',
                                                       errors = 'replace')
        dirty_str = dirty_str.decode(encoding='ascii', 
                           errors='strict')
        
        dirty_str = str(dirty_str).replace("?", " ")
        # strip redundant whitespace and signature lines 
        dirty_str = re.sub(' +', ' ', dirty_str).replace("_", "")
        
        clean_list = []
        
        for sent in self.nlp(" ".join(dirty_str.split())).sents:
            if len(sent) > char_thresh:
                clean_list.append(sent.text)
        
        return clean_list
    
    def load_annotations(self):
        """A function to load all annotations into a list
        """
        new_rows = []
        
        for json_file in self.annotated_files:
            annotated_docs = self.get_dumps(json_file)
            
            name = self.format_annotator_name(json_file)
            
            for dump in annotated_docs:
                
                # get doc IDs
                doc_id = self.get_document_id(dump['content'])
                
                # handle None annotations
                if dump['annotation'] is None:
                    continue 
                    
                # handle multiple annotations in one spot
                for task in dump['annotation']:
                    label_list = task['label']
                    
                    # handle mutliple points
                    for pt in task['points']:
                        start_char = pt['start']
                        end_char = pt['end']
                        
                        sentences = self._clean_raw_annotation_text(pt['text'])
                        
                        if len(sentences) > 1:
                            for idx, text in enumerate(sentences):
                                if not idx == 0:
                                    start_char += len(sentences[idx - 1])
                                    end_char += len(sentences[idx])
                                    
                                record = {
                                    'doc_id' : doc_id,
                                    'json_filename':json_file,
                                    f'{name}_A': 1 if 'A' in label_list else 0,
                                    f'{name}_B': 1 if 'B' in label_list else 0,
                                    f'{name}_C': 1 if 'B' in label_list else 0,
                                    f'{name}_start':start_char,
                                    f'{name}_end':end_char,
                                    f'{name}_text':text,
                                    f'{name}_sentence_count': idx+1
                                }

                                annotation_list = getattr(self, f"{name}_list")    
                                annotation_list.append(record)
                            
    def build_frames(self):
        """A function to set dataframe attributes
        """
        for name in self.annotators:
            annotation_list = getattr(self, f"{name}_list") 
            setattr(self, f"{name}_df", pd.DataFrame(annotation_list)) 
            
    def save_annotation_files(self, out_dir="../output/"):
        """A function to save files
        
        Args:
            - outdir (str): the directory path of the output
        """
        
        for name in self.annotators:
            df = getattr(self, f"{name}_df") 
            file_name = f"{out_dir}{name}_annotations.csv"
            df.to_csv(file_name, index=False)
            print(f"Saved: '{file_name}'")
            
        
        
        
            
            
        
annotations = Annotations()
print(dir(annotations), '\n')
print(annotations.annotators, '\n')
print(annotations.annotated_files, '\n')
print(annotations.document_map.keys(), '\n')

print(annotations.LIZ_df.head(), '\n')

annotations.save_annotation_files()

['KATHLEEN_df', 'KATHLEEN_list', 'KAYCEE_df', 'KAYCEE_list', 'LIZ_df', 'LIZ_list', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_clean_raw_annotation_text', 'annotated_files', 'annotators', 'build_document_map', 'build_frames', 'data_dir', 'document_map', 'format_annotator_name', 'get_annotators', 'get_document_id', 'get_dumps', 'get_json_files', 'load_annotations', 'nlp', 'save_annotation_files'] 

['KATHLEEN', 'LIZ', 'KAYCEE'] 

['../data/2020-01-21_Random_46-60_KATHLEEN.json', '../data/2020-01-20_Random_1-15_Kathleen.json', '../data/2020-01-21_Random_46-60_LIZ.json', '../data/2020-01-21_Random_16-30_LIZ.json', '../data/2020-01-20_Random_16-30_KATHLEEN.json', '../data/2020-01-27_Random_76-

In [12]:
%time
"""
Load json objects into a 'raw' format for processing
"""

for annotator in annotations:
    df_list = []
    
    for file in annotations[annotator]['json_files']:
        
        tmp_df = load_annotations(annotator, file)
        df_list.append(tmp_df)
        
    annotation_df = pd.concat(df_list, ignore_index=True)
     # add prepared annotation tables 
    annotations[annotator]['annotations'] = annotation_df
    
print(annotations.keys())

print(annotations['KATHLEEN'].keys())
print(annotations['LIZ'].keys())
print(annotations['KAYCEE'].keys())

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs
dict_keys(['KATHLEEN', 'LIZ', 'KAYCEE'])
dict_keys(['json_files', 'annotations'])
dict_keys(['json_files', 'annotations'])
dict_keys(['json_files', 'annotations'])


In [18]:
"""
check all document maps for consistency
"""

for annotator, _dict in annotations.items():
    print(annotator, len(_dict['document_map'].keys()))
    
"""
check for individual differences. this is good:
    all original documents are represented
"""
print(len(set(annotations['KATHLEEN']['document_map'].keys()) - (annotations['KAYCEE']['document_map'].keys())))
print(len(set(annotations['KATHLEEN']['document_map'].keys()) - set(annotations['LIZ']['document_map'].keys())))
print(len(set(annotations['LIZ']['document_map'].keys()) - set(annotations['KAYCEE']['document_map'].keys())))

KATHLEEN 134
LIZ 134
KAYCEE 134
0
0
0


In [19]:
annotations['KATHLEEN']['annotations'].head()

Unnamed: 0,doc_id,json_filename,KATHLEEN_A,KATHLEEN_B,KATHLEEN_C,KATHLEEN_start,KATHLEEN_end,KATHLEEN_text
0,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,2613,2795,"By signing this form, I am requesting and givi..."
1,95581557,../data/2020-01-21_Random_46-60_KATHLEEN.json,0,0,0,366,558,I understand that blood or blood products will...
2,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,323,516,I authorize the release of any and all medical...
3,69408590,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,104,321,I voluntarily consent to medical care of a rou...
4,36073164,../data/2020-01-21_Random_46-60_KATHLEEN.json,1,0,0,4573,4777,Your signature below indicates that you unders...


In [20]:
"""
build the reference structure
"""

document_map = {}

for doc_id, raw_content in annotations['KATHLEEN']['document_map'].items():
    document_map[doc_id] = {'raw_content' : raw_content}
    
document_map.keys()  

dict_keys([95581557, 69408590, 36073164, 37497740, 24000641, 784359, 35262426, 46217133, 23099502, 97398937, 64616759, 58795983, 25969537, 94407046, 99175728, 31716101, 98303194, 27509790, 30924115, 6848654, 71475014, 58411629, 64073684, 15463536, 68411469, 62604978, 35790608, 24397520, 1775988, 70530481, 23440484, 2843282, 85954293, 84061096, 560211, 56347879, 5156065, 93118497, 17651082, 24229633, 25946820, 41320281, 16450710, 52351579, 22857905, 9849800, 96936919, 26577152, 23981145, 7319132, 60208881, 74909134, 7863434, 37431393, 95943802, 30820132, 26369189, 73758940, 60696396, 96278299, 38835297, 75494187, 32141700, 96480505, 69390110, 85095297, 67680881, 7960085, 79490243, 24677931, 66694691, 62892380, 17582431, 10532027, 33035501, 20918211, 80604657, 89016020, 58371701, 39000652, 59904054, 99272690, 2803036, 82742889, 71667445, 73694126, 7405868, 43930936, 2085743, 72274887, 78061880, 11170355, 42082024, 8412981, 65104204, 63443341, 67287213, 65639567, 30814585, 11421395, 21614

In [22]:
"""
load spacy large lib
"""
nlp = spacy.load('en_core_web_lg')

In [23]:
%time
for doc_id, _dict in document_map.items():
    document_map[doc_id]['spacy_obj'] = nlp(_dict['raw_content'])
    document_map[doc_id]['sentence_list'] = list(document_map[doc_id]['spacy_obj'].sents)
                                                                                                                            
document_map.keys()  

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


dict_keys([95581557, 69408590, 36073164, 37497740, 24000641, 784359, 35262426, 46217133, 23099502, 97398937, 64616759, 58795983, 25969537, 94407046, 99175728, 31716101, 98303194, 27509790, 30924115, 6848654, 71475014, 58411629, 64073684, 15463536, 68411469, 62604978, 35790608, 24397520, 1775988, 70530481, 23440484, 2843282, 85954293, 84061096, 560211, 56347879, 5156065, 93118497, 17651082, 24229633, 25946820, 41320281, 16450710, 52351579, 22857905, 9849800, 96936919, 26577152, 23981145, 7319132, 60208881, 74909134, 7863434, 37431393, 95943802, 30820132, 26369189, 73758940, 60696396, 96278299, 38835297, 75494187, 32141700, 96480505, 69390110, 85095297, 67680881, 7960085, 79490243, 24677931, 66694691, 62892380, 17582431, 10532027, 33035501, 20918211, 80604657, 89016020, 58371701, 39000652, 59904054, 99272690, 2803036, 82742889, 71667445, 73694126, 7405868, 43930936, 2085743, 72274887, 78061880, 11170355, 42082024, 8412981, 65104204, 63443341, 67287213, 65639567, 30814585, 11421395, 21614

In [24]:
"""
build the master data frame
"""

new_rows = []

for doc_id, doc in document_map.items():
    count = 0

    for sent in doc['sentence_list']:
        
        count += 1
        
        row = {
            'doc_id':doc_id,
            'sent_obj':sent,
            'sentence':sent.text,
            'start':sent.start_char,
            'word_count':len(sent),
            'char_count':len(sent.text),
            'sent_id':f"{doc_id}_{count}"
        }
        
        new_rows.append(row)   
        
df = pd.DataFrame(new_rows)
document_map['master'] = df
document_map['master'].head()
    

Unnamed: 0,doc_id,sent_obj,sentence,start,word_count,char_count,sent_id
0,95581557,"(Ct, ), MERCY, HEALTH, \n, GRAND, RAPIDS, ,, M...","Ct) MERCY HEALTH \nGRAND RAPIDS, Ml \nCONSENT ...",0,23,106,95581557_1
1,95581557,"(AND, /, OR, BLOOD, PRODUCTS, \n)",AND/OR BLOOD PRODUCTS \n,107,6,23,95581557_2
2,95581557,"(I, understand, that, I, need, or, may, need, ...",I understand that I need or may need blood and...,130,27,130,95581557_3
3,95581557,"(This, hospital, admission, D, Outpatient, :, ...",This hospital admission D Outpatient: Series o...,261,12,66,95581557_4
4,95581557,"(_, _, _, to)",___to,327,4,5,95581557_5


In [25]:
"""
do some cleaning on the sentences
"""

document_map['master']['clean_sentence'] = df['sentence'].map(lambda row: clean_sentence(row))
document_map['master'].head()

Unnamed: 0,doc_id,sent_obj,sentence,start,word_count,char_count,sent_id,clean_sentence
0,95581557,"(Ct, ), MERCY, HEALTH, \n, GRAND, RAPIDS, ,, M...","Ct) MERCY HEALTH \nGRAND RAPIDS, Ml \nCONSENT ...",0,23,106,95581557_1,"Ct) MERCY HEALTH GRAND RAPIDS, Ml CONSENT TO R..."
1,95581557,"(AND, /, OR, BLOOD, PRODUCTS, \n)",AND/OR BLOOD PRODUCTS \n,107,6,23,95581557_2,AND/OR BLOOD PRODUCTS
2,95581557,"(I, understand, that, I, need, or, may, need, ...",I understand that I need or may need blood and...,130,27,130,95581557_3,I understand that I need or may need blood and...
3,95581557,"(This, hospital, admission, D, Outpatient, :, ...",This hospital admission D Outpatient: Series o...,261,12,66,95581557_4,This hospital admission D Outpatient: Series o...
4,95581557,"(_, _, _, to)",___to,327,4,5,95581557_5,___to


In [26]:
"""
Inclusion exclusion rules
"""

def include_exclude(row):
    """A function to return [0,1] based on inclusion exclusion criteria """
    
    if row['char_count'] < 9:
        return 0
    elif not any(c.isalpha() for c in row['sentence']):
        return 0
    else:
        return 1
    

document_map['master']['include'] = document_map['master'].apply(lambda row: include_exclude(row), axis=1)
document_map['master'].head()

Unnamed: 0,doc_id,sent_obj,sentence,start,word_count,char_count,sent_id,clean_sentence,include
0,95581557,"(Ct, ), MERCY, HEALTH, \n, GRAND, RAPIDS, ,, M...","Ct) MERCY HEALTH \nGRAND RAPIDS, Ml \nCONSENT ...",0,23,106,95581557_1,"Ct) MERCY HEALTH GRAND RAPIDS, Ml CONSENT TO R...",1
1,95581557,"(AND, /, OR, BLOOD, PRODUCTS, \n)",AND/OR BLOOD PRODUCTS \n,107,6,23,95581557_2,AND/OR BLOOD PRODUCTS,1
2,95581557,"(I, understand, that, I, need, or, may, need, ...",I understand that I need or may need blood and...,130,27,130,95581557_3,I understand that I need or may need blood and...,1
3,95581557,"(This, hospital, admission, D, Outpatient, :, ...",This hospital admission D Outpatient: Series o...,261,12,66,95581557_4,This hospital admission D Outpatient: Series o...,1
4,95581557,"(_, _, _, to)",___to,327,4,5,95581557_5,___to,0


In [27]:
futils.printl(document_map['master']['clean_sentence'].sample(10), '\n')

I/we have been told that my/our name and address will be kept on, and that this, or any other information which would directly or indirectly identify me/us will not be disclosed or released to any person or entity without my/our written informed consent, except as permitted by law. 

B. ASSIGNMENT OF BENEFITS. 

Consults with other health professionals 

Signature (full name) of Provider Obtaining Consent 

_____________ _ _ Date: 

In addition, any adverse outcomes, including infectious diseases in the recipients or their offspring, and genetic defects in offspring will be reported to the sperm donor if there is any possibility that the donor s reproductive tissue contributed to the adverse outcome. 

Furthermore, I understand that it is possible that a treatment or test that my child receives may have been developed by my child s physician and that he/she may financially benefit from royalty payments accruing from the use of such a test or treatment which has previously been properly

In [39]:
"""
Now we'll 'normalize' and merge the annotations. this means handling multi-sentence annotations. 
"""

for annotator, _dict in annotations.items():
    
    # get the column with text
    text_column = _dict['annotations'].filter(like='text').columns
    
    for idx, row in _dict['annotations'].iterrows():
        
        print(str(row[text_column]))
        
        
#         annotation_obj = nlp(row[text_column])
            
#         if len(list(annotation_obj.sents)) > 1:
#             print(annotation_obj.text, '\n')
        

        
        
#         print(idx, row)

KATHLEEN_text    By signing this form, I am requesting and givi...
Name: 0, dtype: object
KATHLEEN_text    I understand that blood or blood products will...
Name: 1, dtype: object
KATHLEEN_text    I authorize the release of any and all medical...
Name: 2, dtype: object
KATHLEEN_text    I voluntarily consent to medical care of a rou...
Name: 3, dtype: object
KATHLEEN_text    Your signature below indicates that you unders...
Name: 4, dtype: object
KATHLEEN_text    My health care provider has explained that the...
Name: 5, dtype: object
KATHLEEN_text    General description and purpose of the test-My...
Name: 6, dtype: object
KATHLEEN_text    As the patient/patient's authorized representa...
Name: 7, dtype: object
KATHLEEN_text    It is also agreed that frozen donor sperm, qua...
Name: 8, dtype: object
KATHLEEN_text    I/We agree and consent that I/we will obtain t...
Name: 9, dtype: object
KATHLEEN_text    I/We, ________________________________________...
Name: 10, dtype: object
KATHLEEN_

# TODO Manual Correction of forms
# TODO Match annotations to reference