# Purpose

The purpose of this file is to bring together all of the functions for creating fake-PHI-laden files (plus the check file) so that they can be executed from one place.

# Importing necessary libraries
Libraries required in the other iPython notebooks will be imported separately there.

In [1]:
import ipynb #for importing other necessary files
import pandas #for handling datasets
import os #for finding all of the files in a directory

## Importing required iPython notebooks

These lines will simply run the other notebooks within this one (similarly to how the "import <filename>" command with normal python files works). Some notebooks will not be imported here, but those will be imported within the files that need to use them.

In [2]:
%run PHI_detection_replacement.ipynb #for functions relating to detecting de-identified PHIs and replacing them
#note: the PHI_detection_replacement.ipynb file will be importing JSON_read_write.ipynb, which doesn't have any applications here, which is why it is imported separately.
%run create_output.ipynb #for functions relating to creating the output files (the .txt and .xml ones)
%run ./debug/debug.ipynb

              0  1      2
0          Emma  F  18688
1        Olivia  F  17921
2           Ava  F  14924
3      Isabella  F  14464
4        Sophia  F  13928
...         ... ..    ...
18024   Zymirah  F      5
18025     Zynah  F      5
18026   Zyniyah  F      5
18027    Zynlee  F      5
18028     Zyona  F      5

[18029 rows x 3 columns]


main function

In [3]:
def generate_synthetic_text(text, file_ID, output_type="csv", previous_data=None, write=False, debug_mode=False):
    """Detect the PHIs in one text, substitute them, and package the result.

    Parameters
    ----------
    text : the text handed to detect_text_PHIs and substitute_PHIs.
    file_ID : identifier forwarded to the output and debug helpers.
    output_type : either "csv" or "philter_files".
    previous_data : value forwarded to create_csv as ``previous_data`` (csv mode
        only). When omitted, a fresh empty DataFrame with the columns
        "altered_texts" and "XMLs" is used for this call.
    write : when True, create_text is called in "philter_files" mode and the
        flag is forwarded to create_XML ("philter_files") or create_csv ("csv").
    debug_mode : when True, create_debug is called with the intermediate results.

    Returns
    -------
    "philter_files": the tuple (altered_text, <return value of create_XML>).
    "csv": whatever create_csv returns.

    Raises
    ------
    ValueError : if output_type is not one of the two supported values.
    """
    #fail early on an unsupported mode; previously this only surfaced as an UnboundLocalError at the return statement
    if output_type not in ("csv", "philter_files"):
        raise ValueError("output_type must be 'csv' or 'philter_files', got " + repr(output_type))

    #build the default table per call so that no DataFrame object is shared between calls
    if previous_data is None:
        previous_data = pd.DataFrame(columns=["altered_texts", "XMLs"])

    #call functions that detect and replace PHIs within the text
    text_PHIs, unrecognized = detect_text_PHIs(text)
    altered_text, PHI_to_tag = substitute_PHIs(text, text_PHIs)

    if debug_mode:
        #run debug function (create annotated HTMLs, write to unrecognized.json file)
        create_debug(file_ID, text, text_PHIs, unrecognized, altered_text, PHI_to_tag)

    if output_type == "philter_files":
        #the txt file is only created in write mode; the XML helper receives the write flag itself
        if write:
            create_text(altered_text, file_ID)
        return altered_text, create_XML(altered_text, PHI_to_tag, file_ID, write=write)

    #csv mode: the XML is never written on its own here, only handed to create_csv
    xml = create_XML(altered_text, PHI_to_tag, file_ID, write=False)
    return create_csv(altered_text, xml, file_ID, previous_data=previous_data, write=write)


# Defining a function that loads de-identified text files

This will be drawing from the deidentified_notes directory, so it is reasonably important that all of the files within that directory are .csv files (or, apparently, .txt versions of .csv files are ok as well).

The code for scanning through all of the files within the directory comes from this StackOverflow thread: https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory

The reason I'm assuming that there might be more than one notes file is that the goal is to produce labels for evaluating Philter's performance. A substantial amount of labelled data will need to be created to achieve that, so it makes sense to be able to create fake PHI for multiple files' worth of clinical notes rather than one file at a time.

In [5]:
def _iter_source_texts(path, chunksize, text_column=0):
    """Yield every text found in one input file (.csv cells or the whole content of a .txt file)."""
    file_extension = os.path.splitext(path)[1]

    if file_extension == ".csv": #the processing required for reading data from the csv is slightly different from text files, so this is necessary
        for chunk in pd.read_csv(path, chunksize=chunksize):
            #iterating a DataFrame directly yields its column labels, so the cell values of one column are read instead
            #assumes the notes live in a single column (the first one by default) -- TODO confirm against the real data
            if isinstance(text_column, str):
                yield from chunk[text_column]
            else:
                yield from chunk.iloc[:, text_column]

    elif file_extension == ".txt":
        with open(path, 'r') as temp_file:
            yield temp_file.read()


def process_multiple_texts(filepath, filename=None, write=False, write_final=False, debug_mode=False, chunksize=1000, progress_counts=1000, text_column=0):
    """Run generate_synthetic_text on every text found in the files of a directory.

    Parameters
    ----------
    filepath : directory to scan; only entries ending in ".csv" or ".txt" are read.
    filename : forwarded to create_csv as ``filename`` for the final write.
    write : when True, generate_synthetic_text is called a second time per text
        in "philter_files" mode with write=True.
    write_final : when True, create_csv is called once at the end with the
        accumulated data and write=True.
    debug_mode : forwarded to the "csv" call of generate_synthetic_text.
    chunksize : forwarded to pandas.read_csv for .csv inputs.
    progress_counts : a progress line is printed whenever the running text ID is
        a multiple of this value; a falsy value disables the messages.
    text_column : column of a .csv input holding the texts, either a position
        (int, default 0) or a column label (str).

    Returns
    -------
    The accumulated data; an empty DataFrame with the columns "altered_texts"
    and "XMLs" if no text was processed.
    """
    final_data = pd.DataFrame(columns=["altered_texts", "XMLs"]) #creating a dataframe to add final data to (used by the csv mode)
    file_ID = 0

    #collect the paths first so the directory handle is closed right away, and sort them so that file IDs are reproducible
    with os.scandir(filepath) as entries:
        paths = sorted(entry.path for entry in entries)

    #iterate through all files in the given directory
    for path in paths:
        for text in _iter_source_texts(path, chunksize, text_column):
            #assumes create_csv returns previous_data with the new row already added (the .txt handling and the final write rely on this) -- TODO confirm
            final_data = generate_synthetic_text(text, file_ID, output_type="csv", previous_data=final_data, write=False, debug_mode=debug_mode)

            if write:
                #NOTE(review): this runs detection and substitution a second time; if the substitution is randomised the written files may differ from the csv row -- confirm
                generate_synthetic_text(text, file_ID, output_type="philter_files", write=True)

            if progress_counts and file_ID % progress_counts == 0:
                print("Finished processing text " + str(file_ID) + ".")
            file_ID += 1

    if write_final:
        print("writing final")
        create_csv(None, None, filename=filename, previous_data=final_data, write=True)

    return final_data

In [9]:
#directory that is scanned for the input files of this test run
filepath = "./data/deidentified_notes_small_test"

#process every text in the directory, writing the per-text outputs as well as the final csv, and report progress after each text
process_multiple_texts(
    filepath,
    filename="test",
    write=True,
    write_final=True,
    debug_mode=False,
    chunksize=100,
    progress_counts=1,
)

print("done.")

Finished processing text 0.
Finished processing text 1.
Finished processing text 2.
Finished processing text 3.
Finished processing text 4.
Finished processing text 5.
Finished processing text 6.
Finished processing text 7.
Finished processing text 8.
Finished processing text 9.
Finished processing text 10.
Finished processing text 11.
Finished processing text 12.
Finished processing text 13.
writing final
done.


In [8]:
#presumably shows the debug rendering labelled "original" inside the notebook; the meaning of -1 is defined by render_displacy_html (not shown in this file) -- TODO confirm
render_displacy_html(-1, "original", jupyter_render=True)

In [7]:
#presumably shows the debug rendering labelled "altered" inside the notebook; the meaning of -1 is defined by render_displacy_html (not shown in this file) -- TODO confirm
render_displacy_html(-1, "altered", jupyter_render=True)