## File description:

This file creates the following lists and saves each one to a pickle file:

- Lists of all medical words used in the notes for each insurance type, excluding stop words, punctuation, words containing digits, 1-letter words, and person names

- Lists of all nonmedical words used in the notes for each insurance type, excluding stop words, punctuation, words containing digits, 1-letter words, and person names

- Lists of all (medical and nonmedical combined) words used in the notes for each insurance type, excluding stop words, punctuation, words containing digits, 1-letter words, and person names


In [None]:
### Load required packages ###
from collections import Counter, OrderedDict  # Used to count frequencies of words in a list
import pickle  # Used to save dictionaries and lists
import pandas as pd

import spacy  # Needed to break sentences into tokens
spacy.prefer_gpu()  # Ask spaCy to use a GPU if one is available; called before the model is loaded
import en_core_web_sm
# Keep a reference to the loaded pipeline: word_occurrences() calls nlp(...) on every note,
# so discarding the return value of load() would leave `nlp` undefined.
nlp = en_core_web_sm.load()
from spacy.lang.en import English  # All inputted string data will be in English
from nltk.tokenize import word_tokenize

## Load data:

In [None]:
# Replace the placeholder with the path to the CSV file of notes before running.
# The cells below read the columns "text_sections_Cleaned1_medical",
# "text_sections_Cleaned1_nonmedical", "text_sections_Cleaned1" and "low_income_insurance".
DATA_PATH = "<Path to dataset>"
df = pd.read_csv(DATA_PATH)

## Define a function which creates lists of all words that appear in a column of text

In [None]:
def word_occurrences(data, text_column, value_column, value, tokenizer=None):
    """
    Collect every token that occurs in a column of notes, optionally restricted to a subset of rows.

    data: pandas dataframe
    text_column: column of notes from which to extract words
    value_column: column of patient characteristics that will be used to subset the data (such as race, gender, and insurance columns)
    value: extract words only from the rows of text_column for which the value_column is equal to value;
           pass "all" or "All" to use every row (value_column is then ignored)
    tokenizer: optional callable that maps a string to an iterable of tokens exposing a `.text` attribute;
               defaults to the module-level spaCy pipeline `nlp`
    returns: list of token texts (original casing, duplicates kept, in order of appearance) for the selected rows

    Entries of text_column that are not strings (e.g. missing values) are skipped.
    A token is dropped when it contains a space, is one of the fragments "`", "'s", "n't",
    or is a single character long.
    """
    # Select the notes once so that the tokenising loop is not duplicated per branch.
    if value in ("all", "All"):
        notes = data[text_column]
    else:
        notes = data.loc[data[value_column] == value, text_column]

    pipeline = nlp if tokenizer is None else tokenizer
    # "'s" and "n't" are clitic fragments, not words in their own right.
    excluded = {"`", "'s", "n't"}

    allwords = []
    for note in notes:
        if not isinstance(note, str):
            continue
        for token in pipeline(note):
            text = token.text
            if " " not in text and text not in excluded and len(text) > 1:
                allwords.append(text)  # Keep the raw token text (no lowercasing or lemmatisation)
    return allwords

# Make medical word lists

### Save all MEDICAL words for low-income insurance patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the medical note sections of rows where low_income_insurance == 1.
medical_words_lowincome = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="low_income_insurance",
    value=1,
)

In [None]:
# Persist the word list; the context manager closes the file even if pickling raises.
with open('medical_words_lowincome.pkl', "wb") as f:
    pickle.dump(medical_words_lowincome, f)

### Save all MEDICAL words for non-low-income patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the medical note sections of rows where low_income_insurance == 0.
medical_words_nonlowincome = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_medical",
    value_column="low_income_insurance",
    value=0,
)

In [None]:
# Persist the word list; the context manager closes the file even if pickling raises.
with open('medical_words_nonlowincome.pkl', "wb") as f:
    pickle.dump(medical_words_nonlowincome, f)

# Make nonmedical word lists

### Save all NONMEDICAL words for low-income insurance patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the nonmedical note sections of rows where low_income_insurance == 1.
nonmedical_words_lowincome = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="low_income_insurance",
    value=1,
)

In [None]:
# Persist the word list; the context manager closes the file even if pickling raises.
with open('nonmedical_words_lowincome.pkl', "wb") as f:
    pickle.dump(nonmedical_words_lowincome, f)

### Save all NONMEDICAL words for non-low-income patients to lists, then save the lists to pickle files

In [None]:
# Tokens from the nonmedical note sections of rows where low_income_insurance == 0.
nonmedical_words_nonlowincome = word_occurrences(
    data=df,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="low_income_insurance",
    value=0,
)

In [None]:
# Persist the word list; the context manager closes the file even if pickling raises.
with open('nonmedical_words_nonlowincome.pkl', "wb") as f:
    pickle.dump(nonmedical_words_nonlowincome, f)

# Make all-word lists (medical + nonmedical combined)

In [None]:
# Combined (medical + nonmedical) notes for rows where low_income_insurance == 1.
# Passing "all" as the value would ignore the insurance column and collect words from every row.
all_words_lowincome = word_occurrences(df, "text_sections_Cleaned1", "low_income_insurance", 1)

In [None]:
# Persist the word list; the context manager closes the file even if pickling raises.
with open('all_words_lowincome.pkl', "wb") as f:
    pickle.dump(all_words_lowincome, f)

In [None]:
# Combined (medical + nonmedical) notes for rows where low_income_insurance == 0.
# Passing "all" as the value would ignore the insurance column and collect words from every row.
all_words_nonlowincome = word_occurrences(df, "text_sections_Cleaned1", "low_income_insurance", 0)

In [None]:
# Persist the word list; the context manager closes the file even if pickling raises.
with open('all_words_nonlowincome.pkl', "wb") as f:
    pickle.dump(all_words_nonlowincome, f)