In [None]:
from tmnt.estimator import BowEstimator
import numpy as np
import gluonnlp as nlp
import os
import umap
from tmnt.preprocess.vectorizer import TMNTVectorizer
from tmnt.configuration import TMNTConfigBOW
from tmnt.trainer import BowVAETrainer
from tmnt.selector import BaseSelector
import pyLDAvis
import funcy
from tmnt.inference import BowVAEInferencer
import regex as re
import pandas as pd
from utils import filter_data, grab_sections, load_data
import random

## Load Data

In [None]:
# Locations of the two note exports. Both are the literal string 'PATH' here
# and must be pointed at real files before this cell can run.
bb_path = 'PATH'
thor_path = 'PATH'

# Read both exports with the project's load_data helper (broadband first).
broadband_data, thoracic_data = (
    load_data(source_path) for source_path in (bb_path, thor_path)
)

In [None]:
# Stack the two sources into one frame and renumber the rows 0..n-1,
# discarding the per-source indices.
data = pd.concat([broadband_data, thoracic_data]).reset_index(drop=True)

We only use notes that contain at least one of the following sections: Interval History, Assessment and Plan, and History of Present Illness.

In [None]:
# A note is kept only when grab_sections (called with its default arguments)
# returns something other than the empty string for its text.
has_sections = data['text'].apply(grab_sections) != ''

# Drop the remaining notes and renumber the rows 0..n-1.
data = data[has_sections].reset_index(drop=True)

Create demographic dataframes

In [None]:
# Insurance cohorts: one frame per value of the 'insurance' column.
# NOTE(review): include=True presumably keeps rows matching the listed
# values -- confirm against utils.filter_data.
low_df, reg_df = (
    filter_data(data, 'insurance', [tier], include=True)
    for tier in ('low', 'reg')
)

# Gender cohorts, built the same way from the 'Gender' column.
female_df, male_df = (
    filter_data(data, 'Gender', [gender_value], include=True)
    for gender_value in ('Female', 'Male')
)

# Race cohorts. The first call does not pass `include`, so filter_data's
# default applies -- presumably the listed groups are excluded, leaving
# patients that are neither White_NonHispanic nor Unknown (TODO confirm).
nwh_df = filter_data(data, 'Race_group', ['White_NonHispanic', 'Unknown'])
wh_df = filter_data(data, 'Race_group', ['White_NonHispanic'], include=True)

Sample 100 patients per demographic for inference

In [None]:
# Fix the RNG state so the held-out patients (and therefore the train/test
# split) are identical on every Restart & Run All.
RANDOM_SEED = 42
N_TEST_PATIENTS = 100
random.seed(RANDOM_SEED)


def _sample_patients(df, k=N_TEST_PATIENTS):
    """Draw `k` distinct patient ids (column 'PMRN') from `df`.

    Ids are de-duplicated in order of first appearance rather than through
    a `set`, because set iteration order for string ids can change between
    interpreter sessions and would defeat the seed above.

    Raises:
        ValueError: from `random.sample` when `df` holds fewer than `k`
            distinct patients.
    """
    unique_pmrns = df['PMRN'].drop_duplicates().tolist()
    return random.sample(unique_pmrns, k)


# The call order is kept fixed so each cohort always consumes the same part
# of the random stream.
f_test_pmrns = _sample_patients(female_df)
m_test_pmrns = _sample_patients(male_df)
nw_test_pmrns = _sample_patients(nwh_df)
w_test_pmrns = _sample_patients(wh_df)
l_test_pmrns = _sample_patients(low_df)
r_test_pmrns = _sample_patients(reg_df)

In [None]:
def _rows_for_patients(cohort_df, pmrns):
    """Return the rows of `cohort_df` whose 'PMRN' is one of `pmrns`."""
    return cohort_df[cohort_df['PMRN'].isin(pmrns)]


# One inference frame per cohort, holding every note of its sampled patients.
female_test_df = _rows_for_patients(female_df, f_test_pmrns)
reg_test_df = _rows_for_patients(reg_df, r_test_pmrns)
male_test_df = _rows_for_patients(male_df, m_test_pmrns)
nwh_test_df = _rows_for_patients(nwh_df, nw_test_pmrns)
wh_test_df = _rows_for_patients(wh_df, w_test_pmrns)
low_test_df = _rows_for_patients(low_df, l_test_pmrns)

Separate the patients used for training from those held out for inference

In [None]:
# Every patient id held out for inference, across all six cohorts. A patient
# sampled in more than one cohort appears more than once in this list.
all_test_pmrns = [
    *m_test_pmrns,
    *f_test_pmrns,
    *nw_test_pmrns,
    *w_test_pmrns,
    *l_test_pmrns,
    *r_test_pmrns,
]

# Training data is every note whose patient was not held out.
is_held_out = data['PMRN'].isin(all_test_pmrns)
train_df = data[~is_held_out]

# Guard against leakage: no held-out patient may remain in the training set.
assert set(train_df['PMRN']).isdisjoint(all_test_pmrns)

In [None]:
# Raw note texts of the training patients, as a plain Python list.
train_notes = train_df['text'].tolist()

Sectionize training data to include 300 tokens of each section: Interval History, Assessment and Plan, and History of Present Illness

In [None]:
# Build the training corpus. A note contributes only if grab_sections with
# its default arguments returns a truthy value; the text actually stored is
# the result of grab_sections called with token_len=300.
notes = []
for raw_note in train_notes:
    if grab_sections(raw_note):
        notes.append(grab_sections(raw_note, token_len=300))

## Train Topic Model

Set up hyperparameter search space

In [None]:
# YAML file handed to TMNTConfigBOW; the resulting configuration space is
# what the model selector searches over.
config_space = './config_files/config_nolabels.yaml'
config_loader = TMNTConfigBOW(config_space)
tmnt_config = config_loader.get_configspace()

In [None]:
# Directory prefix for exported artefacts; the pyLDAvis HTML report is
# written under it further down.
model_outputs_path = './inference_all_providers_model_outs/'

#### Automatic Model Selection

In [None]:
# Vectorize the sectionized training notes with a 4000-term vocabulary.
tf_vectorizer = TMNTVectorizer(vocab_size=4000)
X, _ = tf_vectorizer.fit_transform(notes)
vocab = tf_vectorizer.get_vocab()

# Random search with a hyperband scheduler over the configuration space.
selector = BaseSelector(
    tmnt_config,
    iterations=50,
    searcher='random',
    scheduler='hyperband',
    cpus_per_task=2,
    log_dir='./inference_all_providers_models/_full_model_out',
)

# Same train/validation set: X is passed for both.
trainer = BowVAETrainer(
    vocab,
    X,
    X,
    log_out_dir='./inference_all_providers_models/_full_exps',
    model_out_dir='./inference_all_providers_models/_full_model_out',
)

# Later cells index this result: [0] is handed to the inferencer and [2]
# is printed as the evaluation summary.
estimator = selector.select_model(trainer)

In [None]:
# Wrap the selected model (first element of the selection result) together
# with the fitted vectorizer so raw text can be encoded later.
inferencer = BowVAEInferencer(estimator[0], pre_vectorizer=tf_vectorizer)

# Collect the inputs pyLDAvis needs from the training matrix, and request
# the 'mmds' projection for the topic map.
full_model_dict = inferencer.get_pyldavis_details(X)
pylda_opts = funcy.merge(full_model_dict, {'mds': 'mmds'})
vis_data = pyLDAvis.prepare(**pylda_opts)

# Nothing earlier in the notebook creates the output directory, so create it
# here; otherwise save_html fails on a fresh checkout after the expensive
# model search has already finished.
os.makedirs(model_outputs_path, exist_ok=True)
pyLDAvis.save_html(vis_data, os.path.join(model_outputs_path, 'radOnc_EMR_topics.html'))
pyLDAvis.display(vis_data)

In [None]:
# Persist the inferencer so the inference section can reload it from disk.
saved_model_dir = './inference_all_providers_models/_full_model_dir'
inferencer.save(model_dir=saved_model_dir)

### Evaluation 

Print out model perplexity and coherence (NPMI)

In [None]:
# Third element of the selection result; per the notebook text this holds
# the model's perplexity and coherence (NPMI).
selection_summary = estimator[2]
print(selection_summary)

#### Inference

In [None]:
# Reload the inferencer from the directory it was saved to above.
reloaded_inference = BowVAEInferencer.from_saved(
    model_dir='./inference_all_providers_models/_full_model_dir',
)

Top 10 words per topic

In [None]:
# Number of words to list per topic.
top_k = 10
reloaded_inference.get_top_k_words_per_topic(top_k)

In [None]:
# Raw note texts for each held-out cohort, as plain Python lists.
(
    female_test_notes,
    reg_test_notes,
    male_test_notes,
    nwh_test_notes,
    wh_test_notes,
    low_test_notes,
) = (
    cohort_df['text'].to_list()
    for cohort_df in (
        female_test_df,
        reg_test_df,
        male_test_df,
        nwh_test_df,
        wh_test_df,
        low_test_df,
    )
)

Sectionize inference data based on same rules as training data

In [None]:
# Replace every raw inference note with its sectionized text.
# NOTE(review): the training corpus is built with token_len=300, whereas
# these calls rely on grab_sections' default token_len. Confirm the default
# is 300; otherwise training and inference texts are truncated differently.
# This cell overwrites its own inputs, so it is not safe to run twice.
wh_test_notes = list(map(grab_sections, wh_test_notes))
nwh_test_notes = list(map(grab_sections, nwh_test_notes))
male_test_notes = list(map(grab_sections, male_test_notes))
female_test_notes = list(map(grab_sections, female_test_notes))
low_test_notes = list(map(grab_sections, low_test_notes))
reg_test_notes = list(map(grab_sections, reg_test_notes))

Append insurance information to each datapoint 

In [None]:
# Patient ids in each insurance cohort, as sets for O(1) membership tests.
low_pmrns = set(low_df['PMRN'])
reg_pmrns = set(reg_df['PMRN'])


def _insurance_label(pmrn):
    """Map a patient id to 'low', 'non-low' or 'unknown'.

    Membership in the low-insurance cohort wins over membership in the
    regular cohort; ids found in neither are labelled 'unknown'.
    """
    if pmrn in low_pmrns:
        return 'low'
    if pmrn in reg_pmrns:
        return 'non-low'
    return 'unknown'


def _attach_insurance(notes, cohort_df):
    """Pair each sectionized note with its patient's metadata.

    Args:
        notes: sectionized note texts, one per row of `cohort_df` and in the
            same order.
        cohort_df: frame providing the 'PMRN', 'Race_group' and 'Gender'
            columns.

    Returns:
        A list of (note, pmrn, race, gender, insurance_label) tuples.

    Raises:
        AssertionError: if `notes` and `cohort_df` differ in length, which
            would otherwise make zip() silently drop the surplus entries.
    """
    assert len(notes) == len(cohort_df), 'notes and metadata rows are misaligned'
    return [
        (note, pmrn, race, gender, _insurance_label(pmrn))
        for note, pmrn, race, gender in zip(
            notes, cohort_df['PMRN'], cohort_df['Race_group'], cohort_df['Gender']
        )
    ]


wh_test_info = _attach_insurance(wh_test_notes, wh_test_df)
nwh_test_info = _attach_insurance(nwh_test_notes, nwh_test_df)
male_test_info = _attach_insurance(male_test_notes, male_test_df)
female_test_info = _attach_insurance(female_test_notes, female_test_df)
low_test_info = _attach_insurance(low_test_notes, low_test_df)
reg_test_info = _attach_insurance(reg_test_notes, reg_test_df)

In [None]:
# Every cohort must have exactly one info tuple per sectionized note.
# All six cohorts are checked, not only the regular-insurance one.
for _cohort_name, _cohort_info, _cohort_notes in (
    ('white', wh_test_info, wh_test_notes),
    ('non_white', nwh_test_info, nwh_test_notes),
    ('male', male_test_info, male_test_notes),
    ('female', female_test_info, female_test_notes),
    ('low_inc', low_test_info, low_test_notes),
    ('reg_inc', reg_test_info, reg_test_notes),
):
    assert len(_cohort_info) == len(_cohort_notes), (
        f'{_cohort_name}: {len(_cohort_info)} info tuples '
        f'for {len(_cohort_notes)} notes'
    )

Sample 4000 notes for each demographic

In [None]:
# Down-sample every cohort to the same number of notes, without replacement.
# random.sample raises ValueError when a cohort holds fewer notes than this.
# The cell overwrites its inputs, so re-running it draws from the already
# reduced lists.
notes_per_cohort = 4000

wh_test_info = random.sample(wh_test_info, notes_per_cohort)
nwh_test_info = random.sample(nwh_test_info, notes_per_cohort)
male_test_info = random.sample(male_test_info, notes_per_cohort)
female_test_info = random.sample(female_test_info, notes_per_cohort)
low_test_info = random.sample(low_test_info, notes_per_cohort)
reg_test_info = random.sample(reg_test_info, notes_per_cohort)

In [None]:
# Each info tuple is (note, pmrn, race, gender, insurance); keep the note
# text only, in the sampled order.
wh_test_notes = [text for text, _pmrn, _race, _gender, _inc in wh_test_info]
nwh_test_notes = [text for text, _pmrn, _race, _gender, _inc in nwh_test_info]
male_test_notes = [text for text, _pmrn, _race, _gender, _inc in male_test_info]
female_test_notes = [text for text, _pmrn, _race, _gender, _inc in female_test_info]
low_test_notes = [text for text, _pmrn, _race, _gender, _inc in low_test_info]
reg_test_notes = [text for text, _pmrn, _race, _gender, _inc in reg_test_info]

Get topic encodings for each dataset

In [None]:
# Encode every cohort's notes with the reloaded model.
encode = reloaded_inference.encode_texts

wh_encodings = encode(wh_test_notes)
nwh_encodings = encode(nwh_test_notes)
male_encodings = encode(male_test_notes)
female_encodings = encode(female_test_notes)
low_encodings = encode(low_test_notes)
reg_encodings = encode(reg_test_notes)

Export encodings to JSON

In [None]:
def _as_nested_lists(encodings):
    """Call .tolist() on every encoding so the result can be dumped as JSON."""
    return [encoding.tolist() for encoding in encodings]


# This cell overwrites its inputs; running it a second time fails because
# the converted elements no longer have a .tolist() method.
wh_encodings = _as_nested_lists(wh_encodings)
nwh_encodings = _as_nested_lists(nwh_encodings)
female_encodings = _as_nested_lists(female_encodings)
male_encodings = _as_nested_lists(male_encodings)
low_encodings = _as_nested_lists(low_encodings)
reg_encodings = _as_nested_lists(reg_encodings)

# Export payload: one entry per cohort, keyed by cohort label.
out_d = {
    'white': wh_encodings,
    'non_white': nwh_encodings,
    'female': female_encodings,
    'male': male_encodings,
    'low_inc': low_encodings,
    'reg_inc': reg_encodings,
}

In [None]:
import json

# Write the per-cohort encodings to a JSON file in the working directory.
encodings_file = 'all_provider_inference_encodings.json'
with open(encodings_file, 'w') as j_out:
    json.dump(out_d, j_out)