## File description:

This file creates the following lists and dictionaries:

- Lists of all medical words used in the notes for each race, excluding stop words, punctuation, words containing digits, 1-letter words, and person names

- Lists of all nonmedical words used in the notes for each race, excluding stop words, punctuation, words containing digits, 1-letter words, and person names

- Lists of all (medical and nonmedical combined) words used in the notes for each race, excluding stop words, punctuation, words containing digits, 1-letter words, and person names



In [None]:
### Load required packages ###
from collections import Counter, OrderedDict # This is used to count frequencies of words in a list
import pickle # Use this to save dictionaries and lists
import pandas as pd # Needed to load csv files

import spacy  # This is needed to break sentences into tokens
spacy.prefer_gpu()  # presumably switches spaCy to the GPU when one is available -- confirm against the spaCy docs
import en_core_web_sm
# NOTE(review): the return value of this call is discarded; the pipeline that
# is actually used is the one assigned to `nlp` on the next line.
en_core_web_sm.load()
nlp = spacy.load("en_core_web_sm") # Load the trained pipeline that word_occurrences uses to tokenize notes
from spacy.lang.en import English  # All inputted string data will be in English
from nltk.tokenize import word_tokenize

### Load data:

In [None]:
# NOTE: <Path to dataset> is a placeholder and must be replaced with the path
# to the CSV file before this cell can run. The cells below read the columns
# "text_sections_Cleaned1", "text_sections_Cleaned1_medical",
# "text_sections_Cleaned1_nonmedical" and "Race_group" from this dataframe.
df = pd.read_csv(<Path to dataset>)

### Define a function which creates lists of all words that appear in a column of text

In [None]:
def word_occurrences(data, text_column, value_column, value):
    """
    Collect every token that occurs in a column of notes, optionally restricted to one subgroup.

    data: pandas dataframe
    text_column: column of notes from which to extract words
    value_column: column of patient characteristics that will be used to subset the data (such as race, gender, and insurance columns)
    value: extract words only from the rows of text_column for which the value_column is equal to value;
        pass "all" or "All" to use every row (value_column is then ignored)
    returns: list of the token texts (duplicates and original casing kept) that occur in text_column
        for the selected rows. Entries that are not strings (e.g. missing notes) are skipped, as are
        tokens containing a space, tokens of length 1, and the tokens "`", "'s" and "n't".
    """
    # Tokens that are never kept. The possessive clitic is spelled "'s"; the
    # earlier spelling "'s'" (with a stray trailing quote) could not match the
    # tokenizer's possessive token, so possessives leaked into the word lists.
    excluded_tokens = {"`", "'s", "n't"}

    # Select the notes once so the tokenizing loop is shared by both cases.
    if value in ("all", "All"):
        texts = data[text_column]
    else:
        texts = data[text_column][data[value_column] == value]

    allwords = []
    for elem in texts:
        if not isinstance(elem, str):
            continue
        for token in nlp(elem):
            word = token.text
            if " " not in word and word not in excluded_tokens and len(word) > 1:
                allwords.append(word)  # the token text exactly as written (no lemmatizing or lowercasing)
    return allwords

# Make medical word lists

### Save all MEDICAL words for White_NonHispanic patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_medical" column for rows whose
# Race_group equals "White_NonHispanic".
medical_words_White = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="Race_group",
    value="White_NonHispanic",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('medical_words_White.pkl', "wb") as f:
    pickle.dump(medical_words_White, f)

### Save all MEDICAL words for Black patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_medical" column for rows whose
# Race_group equals "Black".
medical_words_Black = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="Race_group",
    value="Black",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('medical_words_Black.pkl', "wb") as f:
    pickle.dump(medical_words_Black, f)

### Save all MEDICAL words for Asian patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_medical" column for rows whose
# Race_group equals "Asian".
medical_words_Asian = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="Race_group",
    value="Asian",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('medical_words_Asian.pkl', "wb") as f:
    pickle.dump(medical_words_Asian, f)

### Save all MEDICAL words for Hispanic patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_medical" column for rows whose
# Race_group equals "Hispanic".
medical_words_Hispanic = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="Race_group",
    value="Hispanic",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('medical_words_Hispanic.pkl', "wb") as f:
    pickle.dump(medical_words_Hispanic, f)

### Save all MEDICAL words for Other patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_medical" column for rows whose
# Race_group equals "Other".
medical_words_Other = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="Race_group",
    value="Other",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('medical_words_Other.pkl', "wb") as f:
    pickle.dump(medical_words_Other, f)

### Save all MEDICAL words for Unknown patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_medical" column for rows whose
# Race_group equals "Unknown".
medical_words_Unknown = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="Race_group",
    value="Unknown",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('medical_words_Unknown.pkl', "wb") as f:
    pickle.dump(medical_words_Unknown, f)

# Make nonmedical word lists

### Save all NON-MEDICAL words for White_NonHispanic patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_nonmedical" column for rows whose
# Race_group equals "White_NonHispanic".
nonmedical_words_White = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="Race_group",
    value="White_NonHispanic",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('nonmedical_words_White.pkl', "wb") as f:
    pickle.dump(nonmedical_words_White, f)

### Save all NON-MEDICAL words for Black patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_nonmedical" column for rows whose
# Race_group equals "Black".
nonmedical_words_Black = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="Race_group",
    value="Black",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('nonmedical_words_Black.pkl', "wb") as f:
    pickle.dump(nonmedical_words_Black, f)

### Save all NON-MEDICAL words for Asian patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_nonmedical" column for rows whose
# Race_group equals "Asian".
nonmedical_words_Asian = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="Race_group",
    value="Asian",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('nonmedical_words_Asian.pkl', "wb") as f:
    pickle.dump(nonmedical_words_Asian, f)

### Save all NON-MEDICAL words for Hispanic patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_nonmedical" column for rows whose
# Race_group equals "Hispanic". Uses `df`, the dataframe loaded at the top of
# this file and used by every other cell; this call previously referenced
# `thor_df`, a name that is not defined anywhere in this file.
nonmedical_words_Hispanic = word_occurrences(df, "text_sections_Cleaned1_nonmedical", "Race_group", "Hispanic")

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('nonmedical_words_Hispanic.pkl', "wb") as f:
    pickle.dump(nonmedical_words_Hispanic, f)

### Save all NON-MEDICAL words for Other patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_nonmedical" column for rows whose
# Race_group equals "Other".
nonmedical_words_Other = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="Race_group",
    value="Other",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('nonmedical_words_Other.pkl', "wb") as f:
    pickle.dump(nonmedical_words_Other, f)

### Save all NON-MEDICAL words for Unknown patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1_nonmedical" column for rows whose
# Race_group equals "Unknown".
nonmedical_words_Unknown = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="Race_group",
    value="Unknown",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises.
with open('nonmedical_words_Unknown.pkl', "wb") as f:
    pickle.dump(nonmedical_words_Unknown, f)

# Make word lists for medical and nonmedical words combined

### Save ALL words for White_NonHispanic patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1" column for rows whose Race_group
# equals "White".
# NOTE(review): the medical and nonmedical cells filter the same Race_group
# column with "White_NonHispanic"; if "White" is not an actual value of that
# column this list will be empty -- confirm which label the data uses.
all_words_White = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1",
    value_column="Race_group",
    value="White",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises. The Pickle_files/ directory must already exist,
# because open() does not create missing directories.
with open('Pickle_files/words_White.pkl', "wb") as f:
    pickle.dump(all_words_White, f)

### Save ALL words for Hispanic patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1" column for rows whose Race_group
# equals "Hispanic".
all_words_Hispanic = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1",
    value_column="Race_group",
    value="Hispanic",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises. The Pickle_files/ directory must already exist,
# because open() does not create missing directories.
with open('Pickle_files/words_Hispanic.pkl', "wb") as f:
    pickle.dump(all_words_Hispanic, f)

### Save ALL words for Asian patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1" column for rows whose Race_group
# equals "Asian".
all_words_Asian = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1",
    value_column="Race_group",
    value="Asian",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises. The Pickle_files/ directory must already exist,
# because open() does not create missing directories.
with open('Pickle_files/words_Asian.pkl', "wb") as f:
    pickle.dump(all_words_Asian, f)

### Save ALL words for African American patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1" column for rows whose Race_group
# equals "African American".
# NOTE(review): the medical and nonmedical cells filter the same Race_group
# column with "Black"; if "African American" is not an actual value of that
# column this list will be empty -- confirm which label the data uses.
all_words_African_American = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1",
    value_column="Race_group",
    value="African American",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises. The Pickle_files/ directory must already exist,
# because open() does not create missing directories.
with open('Pickle_files/words_African_American.pkl', "wb") as f:
    pickle.dump(all_words_African_American, f)

### Save ALL words for Other patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1" column for rows whose Race_group
# equals "Other".
all_words_Other = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1",
    value_column="Race_group",
    value="Other",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises. The Pickle_files/ directory must already exist,
# because open() does not create missing directories.
with open('Pickle_files/words_Other.pkl', "wb") as f:
    pickle.dump(all_words_Other, f)

### Save ALL words for 'Not available' patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1" column for rows whose Race_group
# equals "Not available".
# NOTE(review): the medical and nonmedical cells filter the same Race_group
# column with "Unknown"; if "Not available" is not an actual value of that
# column this list will be empty -- confirm which label the data uses.
all_words_Not_available = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1",
    value_column="Race_group",
    value="Not available",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises. The Pickle_files/ directory must already exist,
# because open() does not create missing directories.
with open('Pickle_files/words_Not_available.pkl', "wb") as f:
    pickle.dump(all_words_Not_available, f)

### Save ALL words for all patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the "text_sections_Cleaned1" column for every row: the value
# "All" makes word_occurrences skip the Race_group filter entirely.
all_words_All = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1",
    value_column="Race_group",
    value="All",
)

In [None]:
# Pickle the list to disk; the with-block closes the file even if
# pickle.dump raises. The Pickle_files/ directory must already exist,
# because open() does not create missing directories.
with open('Pickle_files/words_All.pkl', "wb") as f:
    pickle.dump(all_words_All, f)