## File description:

This file creates the following lists and dictionaries:

- Lists of all medical words used in the notes for each gender, excluding stop words, punctuation, words containing digits, 1-letter words, and person names

- Lists of all nonmedical words used in the notes for each gender, excluding stop words, punctuation, words containing digits, 1-letter words, and person names

- Lists of all (medical and nonmedical combined) words used in the notes for each gender, excluding stop words, punctuation, words containing digits, 1-letter words, and person names



In [None]:
### Load required packages ###
from Word_Frequencies import nonmedical_sentiment_new #Import functions to create lists and dictionaries of words
from collections import Counter, OrderedDict #This is used to count frequencies of words in a list
import pickle #Use this to save dictionaries and lists
import pandas as pd

import spacy  #This is needed to break sentences into tokens
spacy.prefer_gpu()  #Request the GPU (if one is available) before the pipeline is loaded
import en_core_web_sm
# Keep a handle on the loaded pipeline: word_occurrences() tokenizes every note
# through the global name `nlp`, so the result of load() must not be discarded.
nlp = en_core_web_sm.load()
from spacy.lang.en import English  #All inputted string data will be in English
from nltk.tokenize import word_tokenize

### Load data:

In [None]:
df = pd.read_csv(<Path to dataset>)

## Define a function which creates lists of all words that appear in a column of text

In [None]:
def word_occurrences(data, text_column, value_column, value, tokenizer=None):
    """
    Collect every token that occurs in a column of notes, optionally restricted to a subgroup of rows.

    data: pandas dataframe
    text_column: column of notes from which to extract words
    value_column: column of patient characteristics that will be used to subset the data (such as race, gender, and insurance columns)
    value: extract words only from the rows of text_column for which the value_column is equal to value;
           pass "all" or "All" to use every row (value_column is then not read)
    tokenizer: optional callable that maps a string to an iterable of objects exposing a `.text` attribute;
               defaults to the module-level spaCy pipeline `nlp`
    returns: list of the raw text of all tokens (duplicates kept, in order of appearance, no lowercasing
             or lemmatization) that occur in text_column for the specified rows. Non-string cells
             (e.g. missing values) are skipped, as are tokens containing a space, tokens of length 1,
             and the clitics "'s" and "n't".
    """
    if tokenizer is None:
        tokenizer = nlp  # spaCy pipeline defined at module level

    # Tokens that are never counted as words. The original set listed "'s'",
    # apparently a typo for the possessive clitic "'s"; both spellings are excluded.
    excluded_tokens = {"`", "'s", "'s'", "n't"}

    # Select the notes once, so that a single loop serves both the subgroup and the "all" case.
    if value in ("all", "All"):
        notes = data[text_column]
    else:
        notes = data[text_column][data[value_column] == value]

    allwords = []
    for elem in notes:
        if not isinstance(elem, str):
            continue  # skip NaN / None / non-text cells
        for token in tokenizer(elem):
            text = token.text
            if len(text) > 1 and " " not in text and text not in excluded_tokens:
                allwords.append(text)  # Add the token's text exactly as it appears in the note
    return allwords

### Exclude patients with gendered disease sites

In [None]:
from utils import filter_data

In [None]:
# Disease-site labels treated as gender-specific; passed to filter_data below.
GENDERED_DISEASE_SITES = [
    "GU-Prostate",
    "Breast-NOS",
    "Gyn-Uterus",
    "Breast-DCIS",
    "Breast",
    "Gyn-Cervix",
    "Gyn-Vulva",
    "Gyn-NOS",
    "Gyn-Ovary",
    "GU-Testicular",
    "Gyn-Vagina",
    "GU-Penis",
    "Breast-Gynecomastia",
]


In [None]:
# Run filter_data on the "disease_site" column with the gendered-site list.
# NOTE(review): presumably this drops patients whose primary disease site is in the
# list -- confirm against utils.filter_data.
df_filtered = filter_data(
    df,
    "disease_site",
    GENDERED_DISEASE_SITES,
)

# Make medical word lists

### Save all MEDICAL words for female patients to a list, then save that list to a pickle file

In [None]:
# Tokens from the medical text column, restricted to rows where Gender == "Female".
medical_words_Female = word_occurrences(
    data=df_filtered,
    text_column="text_sections_Cleaned1_medical",
    value_column="Gender",
    value="Female",
)

In [None]:
# Write the list to disk; the context manager closes the file even if pickling fails.
with open('medical_words_Female.pkl', "wb") as f:
    pickle.dump(medical_words_Female, f)

### Save all MEDICAL words for male patients to a list, then save that list to a pickle file

In [None]:
# Tokens from the medical text column, restricted to rows where Gender == "Male".
medical_words_Male = word_occurrences(
    data=df_filtered,
    text_column="text_sections_Cleaned1_medical",
    value_column="Gender",
    value="Male",
)

In [None]:
# Write the list to disk; the context manager closes the file even if pickling fails.
with open('medical_words_Male.pkl', "wb") as f:
    pickle.dump(medical_words_Male, f)

# Make nonmedical word lists

### Save all NON-MEDICAL words for female patients to a list, then save that list to a pickle file

In [None]:
# Tokens from the nonmedical text column, restricted to rows where Gender == "Female".
nonmedical_words_Female = word_occurrences(
    data=df_filtered,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="Gender",
    value="Female",
)

In [None]:
# Write the list to disk; the context manager closes the file even if pickling fails.
with open('nonmedical_words_Female.pkl', "wb") as f:
    pickle.dump(nonmedical_words_Female, f)

### Save all NON-MEDICAL words for male patients to a list, then save that list to a pickle file

In [None]:
# Tokens from the nonmedical text column, restricted to rows where Gender == "Male".
nonmedical_words_Male = word_occurrences(
    data=df_filtered,
    text_column="text_sections_Cleaned1_nonmedical",
    value_column="Gender",
    value="Male",
)

In [None]:
# Write the list to disk; the context manager closes the file even if pickling fails.
with open('nonmedical_words_Male.pkl', "wb") as f:
    pickle.dump(nonmedical_words_Male, f)

# Make all (medical + nonmedical) word lists

### Save ALL words for female patients to a list, then save that list to a pickle file

In [None]:
# All (medical + nonmedical) words for female patients.
# The previous call used only the nonmedical column with value "All", which returned
# nonmedical words for every patient regardless of gender. Combining the two female
# lists built above gives the intended result without re-tokenizing the notes.
all_words_Female = medical_words_Female + nonmedical_words_Female

In [None]:
# Write the list to disk; the context manager closes the file even if pickling fails.
with open('all_words_Female.pkl', "wb") as f:
    pickle.dump(all_words_Female, f)

### Save ALL words for male patients to a list, then save that list to a pickle file

In [None]:
# All (medical + nonmedical) words for male patients.
# The previous call used only the nonmedical column with value "All", which returned
# nonmedical words for every patient regardless of gender (and was identical to the
# female list). Combining the two male lists built above gives the intended result
# without re-tokenizing the notes.
all_words_Male = medical_words_Male + nonmedical_words_Male

In [None]:
# Write the list to disk; the context manager closes the file even if pickling fails.
with open('all_words_Male.pkl', "wb") as f:
    pickle.dump(all_words_Male, f)