#  Notebook 4(a) of MBC topic modeling in BHC. 
***
This notebook is building LDA using the 15-topic model parameters 
***

### **Load Libriaries**

In [1]:
# This is the library of packages used in this model with a comment as to why they are needed

# Turn off pesky warnings
import warnings
warnings.filterwarnings("ignore")

# Need numpy and pandas for easier data manipulation
import pandas as pd
import numpy as np

# In order to save data at intermediate points for Excel pivoting and visualization, need the ability to write data to csv
import csv

from openpyxl import load_workbook

# Need datetime for manipulating date-time data in analysis and filtering
import datetime

# Bring in the copy package in case we need it to copy some lists or dataframes
import copy

# Need matplotlib and seaborne for visualizing the data
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt 

# show all columns when displaying pandas dfs
pd.set_option('display.max_columns', None)

# show a time running when executing long loops
from tqdm import tqdm

# Needed for parralellizing the running of the lemmatization and lda model runs, saves big time when assessing data
from joblib import Parallel, delayed 

# Need to be able to hit urls for some of the data tools
import requests

# Bring in the package for regular expressions for easier data manipulation
import re
import pickle
from tqdm import tqdm

# To enable pretty printing, we need to load pprint
from pprint import pprint

# Import the applicable gensim package components for manipulating data and doing LDA modeling
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# bring in spacy for lemmatization of our corpora
import spacy

# Plotting tools that we will need to use 
import pyLDAvis
import pyLDAvis.gensim_models

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en ... This was needed when first setting up spacy in the environment
nlp = spacy.load('en_core_web_sm')

# Bring in tqdm to track progress of for loops
from tqdm import tqdm

# Turn off pesky warnings by ignoring deprecation warnings
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import os

  from imp import reload


### **Load processed data**

***
Now we bring in the processed data in pickle files for the list of word lists representing the abstracts
***

In [2]:
# Now load in the dataset we prepped from WoS, first set the path file and filename
with open('Preprocessed data.pickle', "rb") as f:
    data_words = pickle.load(f)  

### **Create the corpus and id2word objects**

In [3]:
# KEY PARAMETERS FOR THIS BLOCK
min_number_of_text = 15 
max_pct_of_text = 0.30
the_word_list = data_words

# Create Dictionary
# id2word = corpora.Dictionary(data_lemmatized)
id2word = corpora.Dictionary(the_word_list)

# First print out the total number of words
print("Before filtering, the total words in the dictionary are:", len(id2word))

# Then run the filter extremes code to get rid of very frequent and infrequent terms
id2word.filter_extremes(no_below = min_number_of_text, no_above = max_pct_of_text)

# Then print out the number of words again
print("After filtering, the total words in the dictionary are:", len(id2word))

# Create the corpus
corpus = [id2word.doc2bow(text) for text in the_word_list]

print("corpus length = ", str(len(corpus)), "\nid2word length = ", str(len(id2word)))

Before filtering, the total words in the dictionary are: 13348
After filtering, the total words in the dictionary are: 2740
corpus length =  1719 
id2word length =  2740


***
### **Building a Range of Topic Models**

***
We need to evaluate the "best number of topics" by creating a range of topic count lda models.
 
We have everything required to train the LDA model. In addition to the corpus and dictionary, you need to provide the number of topics as well. 
Apart from that, alpha and eta are hyperparameters that affect sparsity of the topics. According to the Gensim docs, both defaults to 1.0/num_topics prior. Chunksize is the number of documents to be used in each training chunk. update_every determines how often the model parameters should be updated and passes is the total number of training passes. According to Blei & Hoffman, the best settings for offset (tau_o) and decay (kappa) are tau_o = 64 and kappa = 0.5 with batch size >= 256
***

In [4]:
# # This is my attempt to set up the parallel processing function and do this much quicker in the future
from tqdm import tqdm
lda_mods = []
lower_bound = 10
upper_bound = 30
num_CPUs = 10
isVisual = 25
alpha_val = 'auto'          
random_state = 42           
eta_val = 'auto'            
decay_val = 0.5
offset_val = 64
eval_every_val = 1
passes_val = 40             
iterations_val = 100        
chunksize_val = 256         

# These are the parameter settings from the topic coherence tutorial
# passes=10, iterations=100, random_state=42, eval_every=None,
#         alpha='asymmetric',  # shown to be better than symmetric in most cases
#         decay=0.5, offset=64  # best params from Hoffman paper

# Define a function we will use with is input being the number of topics to run the lda model creation in gensim
def lda_loop(num_topics):

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics, 
                                               random_state=random_state,
                                               update_every=1,
                                               chunksize=chunksize_val,
                                               passes=passes_val,
                                               alpha=alpha_val,
                                               decay = decay_val,
                                               iterations = iterations_val,
                                               offset = offset_val,
                                               eval_every = eval_every_val,
                                               per_word_topics=True)
    return(lda_model)

  
# And now we run the jobs in parallel using the function defined above   
# lda_mods = Parallel(n_jobs = num_CPUs, verbose = isVisual, batch_size = 1)(delayed(dons_lda_loop)(num_topics) 
#                                                                               for num_topics in range(lower_bound, upper_bound))

In [5]:
for num_topics in tqdm(range(lower_bound, upper_bound+1)):
    temp_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics, 
                                               random_state=random_state,
                                               chunksize=chunksize_val,
                                               passes=passes_val,
                                               decay = decay_val,
                                               iterations = iterations_val,
                                               offset = offset_val,
                                               eval_every = eval_every_val,
                                               per_word_topics=True)
    lda_mods.append(temp_model)

100%|██████████| 21/21 [15:36<00:00, 44.57s/it]


***
Let's save these models using pickle so we don't lose the time we spent calculating them all <br>
***

In [6]:
# Import the pickle package
import pickle

# Now save the dataframe to a pickle file
with open("LDA Models.pickle", 'wb') as f:
    pickle.dump(lda_mods, f)

***
If we need to read in the file later, this is the code we will use to do so<br>
***

In [7]:
# This is the code needed to actually load the list of lda models in if we have saved them to a pickle file
import pickle

# Now read the pickle dataframe to a csv file 
with open("LDA Models.pickle", "rb") as f:
    lda_mods_loaded = pickle.load(f)

len(lda_mods_loaded)

21

#### Compute Model Perplexity and Coherence Score
Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. I found topic coherence score more helpful.
***

In [8]:
# from tqdm import tqdm
keep_em = []

for l, i in tqdm(enumerate(lda_mods)):# Compute Perplexity
    lda_model = i
    pplex = lda_model.log_perplexity(corpus)
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words_trigrams2, dictionary=id2word, coherence='c_v')
    coherence_model_lda = CoherenceModel(model=lda_model, texts = the_word_list, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    keep_em.append([l+lower_bound, pplex, coherence_lda])
    ##print('\nPerplexity: ', )  # a measure of how good the model is. lower the better.
    # Compute Coherence Score
    # print('\nCoherence Score: ', coherence_lda)
    
    pprint(keep_em[l])
print("Completed the run")

1it [00:20, 20.16s/it]

[10, -7.031104207275535, 0.5061998459213439]


2it [00:40, 20.30s/it]

[11, -7.027120153605062, 0.5409591629108247]


3it [01:00, 20.30s/it]

[12, -7.033381941970648, 0.5268854653844749]


4it [01:22, 20.78s/it]

[13, -7.036881837873324, 0.5639511926349284]


5it [01:43, 20.91s/it]

[14, -7.033436780934565, 0.5565689095073323]


6it [02:04, 20.95s/it]

[15, -7.038631090461407, 0.5511520359919386]


7it [02:26, 21.40s/it]

[16, -7.04328343048125, 0.5485439813170135]


8it [02:48, 21.48s/it]

[17, -7.048979445623456, 0.5051741537666141]


9it [03:10, 21.61s/it]

[18, -7.047214896043586, 0.5272514103013184]


10it [03:33, 21.96s/it]

[19, -7.044671433830526, 0.5312004387498718]


11it [03:56, 22.24s/it]

[20, -7.057341493808021, 0.48851963661232994]


12it [04:19, 22.72s/it]

[21, -7.051016204714973, 0.4932336565714549]


13it [04:43, 23.04s/it]

[22, -7.051351673734361, 0.4976564616198068]


14it [05:07, 23.41s/it]

[23, -7.054298091737433, 0.48889005916517464]


15it [05:31, 23.47s/it]

[24, -7.0659724839135185, 0.45127985220349304]


16it [05:56, 23.83s/it]

[25, -7.064547037357756, 0.4582942994291956]


17it [06:20, 23.90s/it]

[26, -7.063375938386607, 0.4470100347746033]


18it [06:45, 24.28s/it]

[27, -7.0563295807090975, 0.46485277893853516]


19it [07:10, 24.59s/it]

[28, -7.067945660754975, 0.43932405284203047]


20it [07:34, 24.39s/it]

[29, -7.062452976173203, 0.4327431722342714]


21it [07:58, 22.80s/it]

[30, -7.064449520276917, 0.4177706757261012]
Completed the run





In [9]:
# Save the file with the perplexity and coherence by # of topics
keep_em2 = pd.DataFrame(keep_em)
keep_em2.columns = ["num_topics", "perplexity", "coherence"]

# Now save the dataframe to a csv file
keep_em2.to_csv("Topic Coherence and Perplexity for the Models (15).csv", mode ='a', header = False)

### **Exploring Individual Topic Models**
***
This is where we can pull out one of the specific topic models to explore the results in some of the visualizations below. <br>
What we need to do is put in the number of topics that we want to look at and then step through the rest of the visualizations below.
***

In [10]:
# Which count of topics model do you want to explore
counts_wanted = 15

# # Use this command if you loaded the file from pickle
# lda_mods = lda_mods_loaded
lower_bound = 10

# Pull the desired model out of the list of topic models
lda_model = lda_mods[counts_wanted - lower_bound]

#### View the topics in LDA model
The above LDA model is built with 15 different topics where each topic is a combination of keywords and each keyword contributes a certain weightage to the topic.<br>
You can see the top ten keywords for each topic and their weightage(importance) using lda_model.print_topics() as shown below. <br>
***

In [11]:
# Print the Keywords in the topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

[(0,
  '0.044*"assessment" + 0.029*"disorder" + 0.022*"diagnosis" + '
  '0.022*"interview" + 0.017*"screen" + 0.017*"appointment" + '
  '0.017*"questionnaire" + 0.016*"phq" + 0.014*"reminder" + 0.012*"diagnose"'),
 (1,
  '0.021*"telehealth" + 0.019*"session" + 0.015*"virtual" + 0.015*"visit" + '
  '0.014*"information" + 0.012*"video" + 0.012*"security" + '
  '0.012*"appointment" + 0.009*"covid" + 0.009*"online"'),
 (2,
  '0.057*"student" + 0.043*"school" + 0.019*"family" + 0.013*"child" + '
  '0.011*"screen" + 0.009*"teen" + 0.008*"social" + 0.008*"district" + '
  '0.008*"pandemic" + 0.007*"resource"'),
 (3,
  '0.042*"code" + 0.032*"real" + 0.026*"study" + 0.022*"analysis" + '
  '0.022*"symptom" + 0.021*"world" + 0.020*"drug" + 0.017*"disorder" + '
  '0.017*"cohort" + 0.016*"project"'),
 (4,
  '0.025*"ehr" + 0.015*"software" + 0.013*"billing" + 0.010*"staff" + '
  '0.010*"claim" + 0.008*"record" + 0.008*"code" + 0.008*"cost" + '
  '0.008*"management" + 0.007*"note"'),
 (5,
  '0.042*"ad

In [12]:
top_words_per_topic = []
for t in range(lda_model.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 10)])

pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'weight']).to_csv("top_words.csv")

***
#### Visualize the topics keywords
Now that the LDA model is built, the next step is to examine its topics and the associated keywords.
There is no better tool than pyLDAvis package’s interactive chart and is designed to work well with jupyter notebooks.
This is a great exploratory tool for us to look at the topics and explore some of the lemmatized words.

The study according to Sievert and Shirley found that the optimal value to use for lambda is 0.6 based on testing real world data with assessments by experts.
The recommendation actually means that it is probably best not to use the top 10 or top 30 weighted words by probability overall

Reference is:
Sievert, Carson, and Kenneth Shirley. "LDAvis: A method for visualizing and interpreting topics."
Proceedings of the workshop on interactive language learning, visualization, and interfaces. 2014.
https://www.aclweb.org/anthology/W14-3110.pdf

Another good reference that discusses the parameters is this one from the writer of gensim
https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html
***

In [13]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics = False, R = 10)
vis

In [14]:
pyLDAvis.save_html(vis, 'pyLDAvis for 15 topic model.html')

#### How to interpret the visual
Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.
A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart.
If you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.
We have successfully built a relatively good topic model.
***

***
#### Dominant topic and its percentage contribution in each document
In LDA models, each document is composed of multiple topics. But, typically only one of the topics is dominant. <br> 
The below code extracts this dominant topic for each sentence and shows the weight of the topic and the keywords in a nicely formatted output. <br>
This way, you will know which document belongs predominantly to which topic. <br>
We will save the output of this table so we can explore it in a spreadsheet. <br>
***

In [22]:
data = pd.read_csv('MBC Dataset Combined.csv')

def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=the_word_list)

In [29]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.Dominant_Topic = df_dominant_topic.Dominant_Topic + 1 # set topic number to start from 1
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,5,0.8478,"ehr, software, billing, staff, claim, record, ...","[medication, assist, treatment, simplify, sche..."
1,1,5,0.9462,"ehr, software, billing, staff, claim, record, ...","[essential, step, business, grow, present, day..."
2,2,5,0.9258,"ehr, software, billing, staff, claim, record, ...","[billing, lab, click, add, panel, lab, code, c..."
3,3,5,0.9708,"ehr, software, billing, staff, claim, record, ...","[report, capitation, billing, overview, capita..."
4,4,11,0.4981,"measurement, progress, assessment, clinician, ...","[integrate, care, pathway, integrate, care, pa..."


In [30]:
# Now save the dataframe to a csv file with a datetime stamp 
dominant_filename = "Dominant Topics by Document for "+ str(counts_wanted) + " Topics Model " + ".csv"
df_dominant_topic.to_csv(dominant_filename)

***
#### Most Representative Topic
We explore particular texts if we want to see what a representative article looks like by taking the documents with the highest topic scores <br>
***

In [31]:
# Call up the document you want to look at by the biggest score for each topic
df_dt2 = df_dominant_topic
df_dt2.sort_values("Topic_Perc_Contrib", ascending = False, inplace = True)
df_dt2.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
285,285,7,0.9994,"therapist, feedback, session, study, rating, a...","[even, therapist, improvement, note, session, ..."
226,226,7,0.9992,"therapist, feedback, session, study, rating, a...","[detect, potential, breach, alliance, breach, ..."
272,272,15,0.9992,"scale, rating, score, sample, validity, study,...","[evaluation, outcome, become, increasingly, ro..."
236,236,7,0.9991,"therapist, feedback, session, study, rating, a...","[couple, complete, outcome, rating, scale, fin..."
283,283,7,0.999,"therapist, feedback, session, study, rating, a...","[distress, goal, well, relationship, item, que..."


In [36]:
df_dt2.to_csv("Most representative documents.csv")

In [32]:
# Get the count of documents per dominant topic
counts_dom = pd.DataFrame(df_dt2.Dominant_Topic.value_counts())
counts_dom.reset_index(inplace = True)
counts_dom.columns = ["Dominant_Topic", "Count_of_Docs"]
counts_dom.to_csv("count of documents per dominant topic.csv")
counts_dom

Unnamed: 0,Dominant_Topic,Count_of_Docs
0,9,367
1,12,276
2,5,252
3,13,220
4,11,116
5,7,92
6,8,80
7,10,74
8,6,62
9,14,52


In [33]:
df_dt3 = df_dt2.drop_duplicates(["Dominant_Topic", "Keywords"], keep = "first" )
df_dt3.sort_values("Dominant_Topic", ascending = True, inplace = True)
df_dt3 = pd.merge(df_dt3, counts_dom, on = "Dominant_Topic", how = "left")
df_dt3.sort_values("Dominant_Topic")

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Count_of_Docs
0,863,1,0.9982,"assessment, disorder, diagnosis, interview, sc...","[come, look, learn, patient, health, questionn...",28
1,1509,2,0.9973,"telehealth, session, virtual, visit, informati...","[practice, opportunity, create, list, add, add...",41
2,156,3,0.9962,"student, school, family, child, screen, teen, ...","[ayer, baisd, supervisor, special, education, ...",32
3,519,4,0.9979,"code, real, study, analysis, symptom, world, d...","[behavioral, disorder, characterize, presence,...",9
4,1429,5,0.9971,"ehr, software, billing, staff, claim, record, ...","[mental, health, billing, feel, gauntlet, diff...",252
5,740,6,0.9968,"addiction, opioid, substance, recovery, medica...","[medication, assist, treatment, involve, speci...",62
6,285,7,0.9994,"therapist, feedback, session, study, rating, a...","[even, therapist, improvement, note, session, ...",92
7,1330,8,0.9973,"community, woman, depression, adult, illness, ...","[pride, unify, force, bring, people, together,...",80
8,1293,9,0.9974,"digital, company, user, team, deliver, product...","[health, company, health, lead, provider, evid...",367
9,719,10,0.9944,"suicide, veteran, risk, crisis, awareness, res...","[recognize, suicide, prevention, awareness, mo...",74


***
Here we print the full text of the most representative document
***

In [35]:
# Now let's print out the actual text for each document with the highest topic score
for i in range(0, len(df_dt3), 1):
    print("Topic", df_dt3.iloc[i].Dominant_Topic.astype(int), "dominates", df_dt3.iloc[i].Count_of_Docs, "documents and has keywords\n")
    print(df_dt3.iloc[i].Keywords,"\n")
    print("The most representative document for Topic", df_dt3.iloc[i].Dominant_Topic.astype(int), "is:\n")
    print(data.iloc[df_dt3.iloc[i]["Document_No"]].text, "\n\n")
#     print(df_dt3.iloc[topic]["Document_No"])

Topic 1 dominates 28 documents and has keywords

assessment, disorder, diagnosis, interview, screen, appointment, questionnaire, phq, reminder, diagnose 

The most representative document for Topic 1 is:

If you've come here looking to learn more about patient health questionnaires used to assess mental and behavioral health disorders, you're in the right place. This blog provides information that helps answer the common question "What is a patient health questionnaire?" Well discuss one of the most common patient health questionnaires and how it's scored and then briefly touch on a popular variant of this patient mental health screener. More importantly, though, well provide an overview of why commonly used patient health questionnaires are coming up short. Well conclude the piece by sharing some of the key qualities that providers should look for when choosing a better alternative to these flawed solutions. Patient Health Questionnaire: Defining the Concept Despite sounding like a br

***
#### Topic distribution across documents
We want to understand the volume and distribution of topics in order to judge how widely it was discussed. The below table exposes that information. <br>
***

In [42]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
df_dominant_topics.Dominant_Topic = df_dominant_topics.Dominant_Topic + 1 # set topic number to start from 1

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0,5,"ehr, software, billing, staff, claim, record, ...",28.0,0.0163
1,5,"ehr, software, billing, staff, claim, record, ...",41.0,0.0239
2,5,"ehr, software, billing, staff, claim, record, ...",32.0,0.0186
3,5,"ehr, software, billing, staff, claim, record, ...",9.0,0.0052
4,11,"measurement, progress, assessment, clinician, ...",252.0,0.1466
...,...,...,...,...
1714,9,"digital, company, user, team, deliver, product...",,
1715,12,"feel, think, thing, day, stress, come, start, ...",,
1716,5,"ehr, software, billing, staff, claim, record, ...",,
1717,9,"digital, company, user, team, deliver, product...",,


In [43]:
df_dominant_topics.to_csv("Percentage of Documents for Each Topic.csv")

#### Topic weights per document

In [44]:
# Pull off the topic weights per document
num_docs = len(corpus)
num_topics = len(lda_model.get_topics())

# Now create a dataframe to store the topic weights for each document
doc_topics = pd.DataFrame(np.zeros((num_docs, num_topics)))
doc_topics.columns = ["Topic"+str(i) for i in range(1, num_topics+1)]

# Start the count and then loop through the lda_model to pull the topic weights off for each document and populate the df doc_topics
count = 0
for i in tqdm(range(0,num_docs)):
    temp_list = lda_model.get_document_topics(corpus[i])
    for j in range(0, len(temp_list)):
        doc_topics.iloc[count, temp_list[j][0]] = temp_list[j][1]
    count = count+1

100%|██████████| 1719/1719 [00:01<00:00, 1101.59it/s]


In [45]:
doc_topics.columns = ['MBC intake', 'Telehealth', 'School-Based Treatment', 'Drug Study', 'Reimbursement', 'Substance Abuse', 'Therapeutic Alliance',
                     'Community Mental Health', 'MBC Platform', 'Veterans', 'Value-Based Care', 'Therapy Process', 'Integrated healthcare', 'Family and Autism', 'Psychometrics']
doc_topics.index.name = 'Document'
doc_topics

Unnamed: 0_level_0,MBC intake,Telehealth,School-Based Treatment,Drug Study,Reimbursement,Substance Abuse,Therapeutic Alliance,Community Mental Health,MBC Platform,Veterans,Value-Based Care,Therapy Process,Integrated healthcare,Family and Autism,Psychometrics
Document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0.00000,0.000000,0.0,0.000000,0.847811,0.146523,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,0.00000,0.000000,0.0,0.038076,0.946162,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2,0.02788,0.000000,0.0,0.000000,0.925781,0.041338,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,0.00000,0.000000,0.0,0.000000,0.970829,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.00000,0.000000,0.0,0.000000,0.326422,0.015836,0.000000,0.0,0.000000,0.000000,0.498076,0.000000,0.157992,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1714,0.00000,0.000000,0.0,0.000000,0.000000,0.215382,0.000000,0.0,0.724446,0.000000,0.000000,0.000000,0.000000,0.0,0.046609
1715,0.00000,0.161522,0.0,0.000000,0.000000,0.291192,0.000000,0.0,0.196345,0.000000,0.000000,0.332132,0.000000,0.0,0.000000
1716,0.00000,0.000000,0.0,0.000000,0.243594,0.089915,0.050582,0.0,0.213650,0.000000,0.206039,0.052803,0.142391,0.0,0.000000
1717,0.00000,0.000000,0.0,0.000000,0.000000,0.243742,0.043388,0.0,0.286185,0.013418,0.166354,0.079801,0.166159,0.0,0.000000


In [46]:
doc_topics.to_csv("Topic Weights.csv")

In [37]:
sent_topics_df = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for i, grp in sent_topics_outdf_grpd:
    sent_topics_df = pd.concat([sent_topics_df,
grp.sort_values(['Perc_Contribution'], ascending=[0]).head(2)], axis=0)
sent_topics_df.reset_index(drop=True, inplace=True)
sent_topics_df.columns = [
   'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text'
]
sent_topics_df.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
0,0,0.9982,"assessment, disorder, diagnosis, interview, sc...","[come, look, learn, patient, health, questionn..."
1,0,0.9976,"assessment, disorder, diagnosis, interview, sc...","[health, questionnaire, become, standard, ment..."
2,1,0.9973,"telehealth, session, virtual, visit, informati...","[practice, opportunity, create, list, add, add..."
3,1,0.9962,"telehealth, session, virtual, visit, informati...","[mental, health, practice, owner, important, p..."
4,2,0.9962,"student, school, family, child, screen, teen, ...","[ayer, baisd, supervisor, special, education, ..."


In [38]:
sent_topics_df.to_csv("Top two most representative documents.csv")