<a href="https://colab.research.google.com/github/Colsai/shirabayashi_data606/blob/main/data_preparation_eda/LDA_vis_initial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Workplan Topic Modeling
## Initial Test of LDA Modeling
- Looking through the current OIG work projects and group them using LDA.

In [70]:
#Pip install
!pip install pyLDAvis
!python -m spacy download en_core_web_md -qq
from IPython.display import clear_output
clear_output()

In [71]:
###########################
# Packages                #
###########################
import pandas as pd
import sklearn
import nltk
import spacy
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis.gensim_models
import en_core_web_md
import gensim
import random
import pyLDAvis
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
from gensim.test.utils import datapath
import warnings

warnings.filterwarnings('ignore')

# Set options for specific packages
nltk.download(['punkt', 
               'stopwords'])

# Visualise inside a notebook
pyLDAvis.enable_notebook()

sns.set()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [72]:
###########################
# Load CSVs (Github)      #
###########################
workplan_df = pd.read_csv("https://github.com/Colsai/scott_data606/blob/main/Data_Sources/HHS_OIG_workplans.csv?raw=true")

#Drop dead column
workplan_df.drop(columns = 'Office of Evaluation and Inspections', 
                 inplace = True)

reports_df = pd.read_csv("https://github.com/Colsai/scott_data606/blob/main/Data_Sources/HHS_OIG_Reports.csv?raw=true")

In [None]:
# Save model to disk.
def save_model(path = '/content/drive/MyDrive/DATA_606/lda_model', 
               model = Lda_model)
    temp_file = datapath([path])
    model.save(temp_file)

# Load a potentially pretrained model from disk.
def load_model(path = ''):
    saved_model = LdaModel.load(path)
    return saved_model

## Topic Modeling for Workplan df
Look at topics of previous projects decided on

In [73]:
workplan_df.head()

Unnamed: 0,Announced or Revised,Agency,Title,Component,Report Number(s),Expected Issue Date (FY),Website_Link,Summary
0,Completed,Administration for Children and Families,States' Accuracy of Reporting TANF Spending In...,Office of Audit Services,A-02-17-02005; W-00-17-25100,2021,https://oig.hhs.gov/reports-and-publications/w...,The Temporary Assistance for Needy Families (T...
1,Completed,Administration for Children and Families,Head Start: Review of Single Audit Findings an...,Office of Audit Services,"A-02-16-02009, A-09-16-01004, A-06-17-07003;...",2018,https://oig.hhs.gov/reports-and-publications/w...,Effective for awards made on or after December...
2,Completed,Administration for Children and Families,Unaccompanied Children Program Grantee Reviews,Office of Audit Services,A-02-16-02013; A-04-16-03566; A-02-16-02007;...,2020,https://oig.hhs.gov/reports-and-publications/w...,"Under the Homeland Security Act of 2002, § 462..."
3,Nov-16,Administration for Children and Families,Recommendation Follow-Up: Office of Refugee Re...,Office of Evaluation and Inspections,OEI-09-16-00260,2017,https://oig.hhs.gov/reports-and-publications/w...,"Under the Homeland Security Act of 2002, § 462..."
4,October 2020,Administration for Children and Families,ACF Oversight of Guardian Ad Litem Requirement...,Office of Evaluation and Inspections,OEI-12-16-00120,2022,https://oig.hhs.gov/reports-and-publications/w...,As a condition of receiving Child Abuse Preven...


In [74]:
###########################
# Clean Data              #
###########################

from nltk.stem import RegexpStemmer

stopwords = nltk.corpus.stopwords.words('english')

#Add stopwords
stopwords.append(['on',
                  'or', 
                  'to', 
                  'a', 
                  'as', 
                  'of', 
                  'for',
                  'this', 
                  'by', 
                  's',
                  'oig'])

#Set tokenizer to additionally remove punctuation
tokenizer = RegexpTokenizer(r'\w+')

wp_init_srs = [paragraph.lower() for paragraph in workplan_df["Summary"]]

tokenized_sums = [[i for i in tokenizer.tokenize(sent) if i not in stopwords] 
                  for sent in wp_init_srs]

# Regex Stemmer (Other Stemmers didn't seem to work well)
Reg_stemmer = RegexpStemmer("ing$|s$|ies$")

tokenized_stemmed_sums = [[Reg_stemmer.stem(word) for word in sent] for sent in tokenized_sums]

# Set new column of tokenized summaries
workplan_df["summary_tokenized"] = tokenized_stemmed_sums

workplan_df["summary_token_num"] = [len(sent) for sent in tokenized_stemmed_sums]

In [75]:
dictionary = Dictionary(workplan_df["summary_tokenized"])

#Filter extreme token values (no tokens below 5, no tokens in over 60% of documents)
dictionary.filter_extremes(no_below=5, 
                           no_above=0.5, 
                           keep_n=1000)

In [76]:
corpus = [dictionary.doc2bow(doc) for doc in workplan_df["summary_tokenized"]]

In [77]:
###############################
# LDA Data: Normal LDA        #
###############################
lda_model = LdaMulticore(corpus=corpus, 
                         id2word=dictionary, 
                         iterations=100, 
                         num_topics=10,
                         workers = 8, 
                         passes=20)

#Visualize the model with pyLDAvis
lda_model_visualize = pyLDAvis.gensim_models.prepare(lda_model,
                                                     corpus = corpus, 
                                                     dictionary = dictionary)
lda_model_visualize

In [81]:
#Topics Generated
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.064*"hospital" + 0.055*"medicare" + 0.040*"payment" + 0.034*"care" + 0.030*"inpatient" + 0.017*"claim" + 0.015*"beneficiary" + 0.013*"snf" + 0.013*"hospice" + 0.013*"transfer"
Topic: 1 
Words: 0.030*"federal" + 0.030*"fund" + 0.028*"health" + 0.027*"hh" + 0.022*"grant" + 0.018*"program" + 0.016*"control" + 0.015*"audit" + 0.014*"award" + 0.012*"center"
Topic: 2 
Words: 0.035*"financial" + 0.029*"data" + 0.025*"act" + 0.023*"drug" + 0.021*"review" + 0.020*"audit" + 0.016*"program" + 0.015*"care" + 0.014*"require" + 0.014*"statement"
Topic: 3 
Words: 0.030*"nih" + 0.026*"risk" + 0.024*"cm" + 0.024*"health" + 0.022*"laboratory" + 0.022*"test" + 0.020*"act" + 0.018*"clinical" + 0.015*"diagnose" + 0.015*"medicare"
Topic: 4 
Words: 0.062*"state" + 0.037*"care" + 0.025*"program" + 0.023*"federal" + 0.021*"children" + 0.020*"medicaid" + 0.018*"provider" + 0.017*"requirement" + 0.016*"health" + 0.012*"act"
Topic: 5 
Words: 0.048*"nurs" + 0.043*"home" + 0.023*"report" + 0.022*

In [78]:
#Check how it classifies

n = random.randint(0,len(workplan_df))

print(workplan_df.iloc[n]['Summary'], end = '')

for index, score in sorted(lda_model[corpus[n]], key=lambda tup: -1*tup[1]):

    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

Histocompatibility laboratories typically provide testing required for bone marrow and solid organ transplantation services.  Cost information for these laboratories must be accurate and in sufficient detail to support payments made for services provided (42 CFR § 413.24(a) and (c)). Costs claimed in the cost report must be related to the care of beneficiaries; reasonable, necessary, and proper; and allowable under Medicare regulations (42 CFR § 413.9(a), (b), and (c)(3)). From March 31, 2013, through September 30, 2014, histocompatibility laboratories reported $131 million in reimbursable costs on their most recent cost reports. We will determine whether payments to histocompatibility laboratories were made in accordance with Medicare requirements.
Score: 0.5455043315887451	 
Topic: 0.055*"payment" + 0.031*"medicare" + 0.018*"provider" + 0.014*"treatment" + 0.012*"use" + 0.012*"improper" + 0.011*"cm" + 0.011*"review" + 0.010*"procedure" + 0.010*"part"

Score: 0.4406994879245758	 
Topi

## Coherence Model Score
Score Coherence of Model - u_mass coherence score

In [80]:
#LDA Model Coherence Score
cm = CoherenceModel(model=lda_model, 
                    corpus=corpus, 
                    coherence="u_mass")

coherence = cm.get_coherence()  # get coherence value

print(coherence)

-2.4697035389113178


## Tfidf Model

In [82]:
from gensim import corpora, models
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=8, id2word=dictionary, passes=20, workers=8)

In [83]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.016*"medicare" + 0.012*"payment" + 0.012*"claim" + 0.010*"hospital" + 0.009*"physician" + 0.008*"hospice" + 0.008*"bill" + 0.008*"inpatient" + 0.007*"financial" + 0.006*"b"
Topic: 1 Word: 0.013*"opioid" + 0.008*"covid" + 0.008*"19" + 0.008*"drug" + 0.007*"response" + 0.006*"data" + 0.006*"fda" + 0.006*"use" + 0.006*"nurs" + 0.006*"health"
Topic: 2 Word: 0.011*"saving" + 0.008*"efficient" + 0.006*"promote" + 0.005*"investment" + 0.005*"infrastructure" + 0.004*"coordinate" + 0.004*"delivery" + 0.004*"initiative" + 0.004*"accountability" + 0.003*"quality"
Topic: 3 Word: 0.016*"telehealth" + 0.016*"mco" + 0.015*"drug" + 0.012*"part" + 0.012*"payment" + 0.011*"snf" + 0.011*"beneficiar" + 0.010*"medicaid" + 0.010*"managed" + 0.010*"transfer"
Topic: 4 Word: 0.012*"state" + 0.010*"children" + 0.008*"fund" + 0.008*"program" + 0.007*"medicaid" + 0.007*"federal" + 0.007*"nih" + 0.007*"provider" + 0.007*"cost" + 0.007*"grant"
Topic: 5 Word: 0.017*"charge" + 0.014*"incentive" + 0.0

In [84]:
#corpus = random.randint(0,len(corpus))
n = 300
print(workplan_df.iloc[n]['Summary'], end = '')
for index, score in sorted(lda_model_tfidf[corpus[n]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

Opioid abuse and overdose deaths are at crisis levels in the United States, with approximately 49,000 Americans dying from opioids in 2017, an increase from more than 42,000 in 2016.   Consistent with previous OIG work in Medicare Part D and Medicaid, we will determine the extent to which beneficiaries are receiving extreme amounts of opioids through Indian Health Service (IHS), as well as IHS-employed prescribers and IHS-run pharmacies that have questionable prescribing or dispensing patterns.  This review will also determine how IHS prevents and detects opioid misuse or abuse, as well as how it enforces its opioid-related policies.
Score: 0.867845892906189	 
Topic: 0.013*"opioid" + 0.008*"covid" + 0.008*"19" + 0.008*"drug" + 0.007*"response" + 0.006*"data" + 0.006*"fda" + 0.006*"use" + 0.006*"nurs" + 0.006*"health"

Score: 0.11825235933065414	 
Topic: 0.016*"telehealth" + 0.016*"mco" + 0.015*"drug" + 0.012*"part" + 0.012*"payment" + 0.011*"snf" + 0.011*"beneficiar" + 0.010*"medicaid"

In [85]:
from gensim import corpora, models
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.08032733781460998),
 (1, 0.06680720739964825),
 (2, 0.09129169023397705),
 (3, 0.3014371703370182),
 (4, 0.10750763877481936),
 (5, 0.0975324093123733),
 (6, 0.07958149937396115),
 (7, 0.14055324611860445),
 (8, 0.07958149937396115),
 (9, 0.06680720739964825),
 (10, 0.11986450085023498),
 (11, 0.09129169023397705),
 (12, 0.10207762237340029),
 (13, 0.07027662305930223),
 (14, 0.5171559088252629),
 (15, 0.14270484092135052),
 (16, 0.25582682754837544),
 (17, 0.09486684244312205),
 (18, 0.043921290159713654),
 (19, 0.07958149937396115),
 (20, 0.17840877449025908),
 (21, 0.07958149937396115),
 (22, 0.05115627371246922),
 (23, 0.11031683130214026),
 (24, 0.05036841079505216),
 (25, 0.06824174153246501),
 (26, 0.11986450085023498),
 (27, 0.07485289362938097),
 (28, 0.11986450085023498),
 (29, 0.0936236473738031),
 (30, 0.07485289362938097),
 (31, 0.07675029713657146),
 (32, 0.08714078579554667),
 (33, 0.1950648186247466),
 (34, 0.19233576186664642),
 (35, 0.05546744955815209),
 (36, 