In [1]:
import sys
import os
import json
import pandas as pd
import hashlib
import spacy
import re
import random
from importlib import reload
from datetime import datetime
from collections import defaultdict
from pprint import pprint

In [2]:
"""
Load pre-processed files
"""

# Parse the document map inside a context manager so the file handle is
# closed as soon as parsing finishes instead of lingering until it is
# garbage-collected.
with open("processed_annotations/DOCUMENT_MAP_02-14-2020.json") as document_map_file:
    document_map = json.load(document_map_file)

# The top-level keys are the document IDs (strings).
print(document_map.keys())

dict_keys(['95581557', '69408590', '36073164', '37497740', '24000641', '784359', '35262426', '46217133', '23099502', '97398937', '64616759', '58795983', '25969537', '94407046', '99175728', '31716101', '98303194', '27509790', '30924115', '6848654', '71475014', '58411629', '64073684', '15463536', '68411469', '62604978', '35790608', '24397520', '1775988', '70530481', '23440484', '2843282', '85954293', '84061096', '560211', '56347879', '5156065', '93118497', '17651082', '24229633', '25946820', '41320281', '16450710', '52351579', '22857905', '9849800', '96936919', '26577152', '23981145', '7319132', '60208881', '74909134', '7863434', '37431393', '95943802', '30820132', '26369189', '73758940', '60696396', '96278299', '38835297', '75494187', '32141700', '96480505', '69390110', '85095297', '67680881', '7960085', '79490243', '24677931', '66694691', '62892380', '17582431', '10532027', '33035501', '20918211', '80604657', '89016020', '58371701', '39000652', '59904054', '99272690', '2803036', '82742

In [6]:
"""
Load corrupted files, handle
ANNOYING encoding issues the easy way
"""
dir_path = "corrupt_forms/"

# Maps each file name found in dir_path to {"data": <cleaned file contents>}.
corrupt_files = {}

for filename in os.listdir(dir_path):
    full_path = os.path.join(dir_path, filename)
    # os.listdir() also returns sub-directories (for example Jupyter's
    # .ipynb_checkpoints); open() would raise on those, so skip anything
    # that is not a regular file.
    if not os.path.isfile(full_path):
        continue
    with open(full_path, 'r') as file:
        # The "easy way": strip surrounding whitespace and silently drop every
        # non-ASCII character. Note that .encode() returns bytes, so "data"
        # holds a bytes object, not a str.
        data = file.read().strip().encode('ascii', 'ignore')
    corrupt_files[filename] = {"data": data}

pprint(list(corrupt_files.keys()))

['consent-form-genetics.txt',
 'Routine_Testing_CONSENT_TO_TREATMENT_SAMPLE-DCH_0675CF.txt',
 'ECT consent_Feb2017.txt']


In [7]:
# Find every document in document_map whose raw content contains the
# genetics-services URL, then print its ID, the length of its raw content and
# the length of the re-loaded 'consent-form-genetics.txt' data.
genetics_marker = "http://www.bmc.org/diagnostic-genetics/services.htm"
for doc_id, doc in document_map.items():
    raw_text = doc['raw_content']
    if genetics_marker not in raw_text:
        continue
    print(f"docuement_ID: {doc_id}")
    print(f"dataturks len: {len(raw_text)}")
    genetics_data = corrupt_files['consent-form-genetics.txt']['data']
    print(f"correct_file len: {len(genetics_data)}")

docuement_ID: 98303194
dataturks len: 7405
correct_file len: 7337


In [8]:
# Find every document in document_map whose raw content contains the Michigan
# DHHS header, then print its ID, the length of its raw content and the length
# of the re-loaded 'Routine_Testing_CONSENT_TO_TREATMENT_SAMPLE-DCH_0675CF.txt'
# data.
michigan_marker = "MICHIGAN DEPARTMENT OF HEALTH AND HUMAN SERVICES"
for doc_id, doc in document_map.items():
    raw_text = doc['raw_content']
    if michigan_marker not in raw_text:
        continue
    print(f"docuement_ID: {doc_id}")
    print(f"dataturks len: {len(raw_text)}")
    michigan_data = corrupt_files['Routine_Testing_CONSENT_TO_TREATMENT_SAMPLE-DCH_0675CF.txt']['data']
    print(f"correct_file len: {len(michigan_data)}")

docuement_ID: 65639567
dataturks len: 4561
correct_file len: 4461


In [9]:
# Find every document in document_map whose raw content contains the ECT
# consent page-1 header, then print its ID, the length of its raw content and
# the length of the re-loaded 'ECT consent_Feb2017.txt' data.
ect_marker = "Consent for ElectroConvulsive Therapy (ECT) Page 1 of 2"
for doc_id, doc in document_map.items():
    raw_text = doc['raw_content']
    if ect_marker not in raw_text:
        continue
    print(f"docuement_ID: {doc_id}")
    print(f"dataturks len: {len(raw_text)}")
    ect_data = corrupt_files['ECT consent_Feb2017.txt']['data']
    print(f"correct_file len: {len(ect_data)}")

docuement_ID: 24000641
dataturks len: 14094
correct_file len: 13961


In [10]:
"""
NOTE:
    - I don't see clear evidence that these were duplicated.
    - I believe that there is evidence that these (seemingly duplicated annotations) are human errors
    - Will have to handle on the annotation side
"""

"\nNOTE:\n    - I don't see clear evidence that these where duplicated.\n    - I believe that there is evidence that these (seemingly duplicated annotations) are human errors\n    - Will have to handle on the annotation side\n"

In [11]:
# Print two 1000-character slices of the raw content of document '24000641'
# (characters 8000-9000 and 11900-12900), separated by a dashed rule, so the
# two passages can be read one after the other.
ect_raw_content = document_map['24000641']['raw_content']
print(
    ect_raw_content[8000:9000],
    "-----------------------------------------------------",
    ect_raw_content[11900:12900],
    sep="\n",
)

Y AND BEHAVIORAL SCIENCES 
Patient Identification Information 
Consent for ElectroConvulsive Therapy (ECT) Page 2 of 2 
8) By signing below I agree: 
� 
That a provider has explained and answered all of my questions related to ECT. 

� 
If I have further questions, I have the right to have those questions answered. 

� 
That no guarantees were made concerning the outcome, as the practice of medicine and psychiatry is not an exact science. 

� 
To have ECT. 

� 
That I have identified to a provider any restrictions on the sharing of information learned from the ECT. 

� 
I have not given up my right to refuse treatment at any time. 

� 
That I am entitled to a signed copy of this consent form. 


For the following statement, if the patient does not agree, cross it out with a single line. The patient and provider shall initial, date and time the cross-out: 
� To allow observers or technical advisors to be present during the ECT treatment. 
Patient Signature Date 
******IF THE PATIENT IS 

In [17]:
# Dump the complete raw content of document '24000641' for manual inspection.
ect_document = document_map['24000641']
print(ect_document['raw_content'])

CONSENT 
DEPARTMENT OF PSYCHIATRY AND BEHAVIORAL SCIENCES 
Patient Identification Information 
Consent for ElectroConvulsive Therapy (ECT) Page 1 of 2 
Date: Patient Name (Print) 
1) I, the undersigned, understand the potential benefits as well as the potential risks involved in treatment of my diagnosis of 
 by means of ECT. I acknowledge that Dr.  has explained the purpose of the procedure, the risks/benefits of the procedure, the alternatives with the risks and benefits and the possibility of complications. 
I hereby give my consent and authorize and request the staff of the Johns Hopkins Hospital to give a series of ECT treatments to me. My doctor intends to begin the treatment course with one of the following (check one):  Unilateral treatments: Specify right  or left   Bilateral treatments  The lead placement may be altered during the treatment series based on the clinical response 
2) The indications of ECT have been explained to me in a manner that I understand. These include p