In [1]:
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd
from gensim import corpora
from nltk.stem.wordnet import WordNetLemmatizer
import pickle
import gensim
import pyLDAvis.gensim
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import enchant
import contractions
import re
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
import os
from collections import Counter
import warnings
from docx import Document
from docx.shared import Inches
warnings.filterwarnings('ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


# Import Data

In [2]:
comments_df = pd.read_excel(r'assets/Federal Data Strategy Comments_UsecaseTaggingAssignments_CombinedAll_v2.xlsx',
                           sheet_name='All Comments',
                           usecols=11)

def upper_apply(x):
    """
    Converts strings to uppercase, conserving nan values. Meant to be applied to pandas df column.
    """
    if x is not np.nan:
        return str(x).upper()
    else:
        return np.nan

comments_df['Principle'] = comments_df['Principle'].apply(upper_apply)
comments_df['Best Practice'] = comments_df['Best Practice'].apply(upper_apply)
comments_df['Use Case'] = comments_df['Use Case'].apply(upper_apply)
comments_df['Mechanism'] = comments_df['Mechanism'].apply(upper_apply)
comments_df['Other'] = comments_df['Other'].apply(upper_apply)
comments_df['Exclude'] = comments_df['Exclude'].apply(upper_apply)

df = comments_df[(comments_df["Exclude"] != "X")&((comments_df['Use Case'] == "X"))].dropna(subset=['Instance']).reset_index().drop(labels='index',axis=1)

# Text Cleaning
We'll use the following function to clean our texts and return a list of tokens:

In [3]:
raw_text = df['Instance'].tolist()

In [4]:
def clean(doc, spellcheck=True):
    """
    Prepares text for NLP by stripping html tags, urls, email addresses, and misspellings.
    It also expands contractions and lowercases everything. Finally, it only keeps words that 
    are at least three characters long, do not contain a number, and are no more than 17 chars long.
    
    Arguments:
        doc (str): A single instance of feedback.
        spellcheck (bool): Whether or not to use the enchant library to strip misspellings.
        
    Returns:
        normalized (str): The normalized string.
    """
    
    def strip_html_tags(text):
        soup = BeautifulSoup(text, "html.parser")
        stripped_text = soup.get_text()
        return stripped_text

    def strip_urls(text):
        #url regex
        url_re = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")
        stripped_text = url_re.sub('',text)
        return stripped_text

    def strip_emails(text):
        #email address regex
        email_re = re.compile(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)')
        stripped_text = email_re.sub('',text)
        return stripped_text

    def strip_nonsense(text):
        # leave words that are at least three characters long, do not contain a number, and are no more 
        # than 17 chars long
        no_nonsense = re.findall(r'\b[a-z][a-z][a-z]+\b',text)
        stripped_text = ' '.join(w for w in no_nonsense if w != 'nan' and len(w) <= 17)
        return stripped_text

    def expand_contractions(text, contraction_mapping=contractions.contractions_dict):

            contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                              flags=re.IGNORECASE|re.DOTALL)
            def expand_match(contraction):
                match = contraction.group(0)
                first_char = match[0]
                if contraction_mapping.get(match):
                    expanded_contraction = contraction_mapping.get(match)
                else:
                    expanded_contraction = contraction_mapping.get(match.lower())
                if expanded_contraction:
                    expanded_contraction = first_char+expanded_contraction[1:]
                    return expanded_contraction
                else:
                    pass

            expanded_text = contractions_pattern.sub(expand_match, text)
            expanded_text = re.sub("'", "", expanded_text)
            return expanded_text

    def strip_misspellings(text):
        d = enchant.Dict("en_US")
        words_to_add = ['api','git','github','apis']
        for w in words_to_add:
            d.add(w)
        
        tokenizer = ToktokTokenizer()
        tokens = tokenizer.tokenize(text)
        non_dict_words = set([word for word in tokens if d.check(word) is False and re.match('^[a-zA-Z ]*$',word)])
        stripped_text = " ".join([x for x in tokens if x not in non_dict_words])
        return stripped_text
    
    doc = doc.lower()
    contraction_free = expand_contractions(doc)
    tag_free = strip_html_tags(contraction_free)
    url_free = strip_urls(tag_free)
    email_free = strip_emails(url_free)
    if spellcheck:
        misspelling_free = strip_misspellings(email_free)
        normalized = strip_nonsense(misspelling_free)

    else:
        normalized = strip_nonsense(email_free)
    
    return normalized

# Tokenization

In [6]:
def tokenize(text):
    """
    Tokenizes a string.
    
    Arguments:
        text (str):  Text to tokenize.
     
    Returns:
        lda_tokens (list): a list of tokens
    """
    parser = English()
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [7]:
def get_lemma(word):
    '''
    Uses WordNetLemmatizer to get the lemma for each token.
    
    Arugments:
        token (str): a single token (i.e. word)
        
    Returns:
        lemma (str): the token's lemma.
    '''
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    elif 'research' in lemma:
        return 'research'
    elif 'dataset' in lemma:
        return 'dataset'
    else:
        return lemma
    


In [8]:
def prepare_text_for_lda(text, lemma = True, remove_extra_stopwords=True):
    '''
    Prepares text of latent dirichlet allocation by cleaning documents with clean(),
    tokenize documents with tokenize(), removing stopwords, and lemmatizing. Optionally,
    you can expand the stopword list and/or use stemming instead of lemmatizing.
    
    Arguments:
        text (str): a single instance of feedback (i.e. one document)
        lemma (bool): Whether or not to use lemmas. Default True. If False, stem.
        remove_extra_stopwords (bool): Whether or not to expand nltk stopword list with
            user-defined stopwords. Highly advised.
        
    Returns:
        tokens (list): a list of lemmas
    
    '''
    
    en_stop = set(nltk.corpus.stopwords.words('english'))
    other_stopwords = ['process','better','governance','form','create','identify','liked','across','support','within','level','project',
                   'exist','file','quality','example','privacy','public','policy','readable','require','availability','national','emerge',
                   'using','college','improve','well','one','critical','include','key','air','state','high_priority','potential',
                   'stakeholder','source','develop','information','help','solution','machine','private','available','research','meta',
                   'liking','data','service_third','text','via','open','sector','use','change','also','need','standard','business',
                   'would','rate','making','continue','provide','pre_can','enable','issue','verify','product','randomize_evaluation',
                   'management','enterprise','language','outcome','real_time','strategy','access','small','agency','release','allow',
                   'solution_fill','evaluation','apply','best','local','want','randomize','scorecard','case','set','entity','ensure',
                   'mission','many_different','ass','user','datasets','government','work','system','decision','exchange','real','likes',
                   'security','individual','administration','post','important','new','time','among','federal','analysis','could','practice',
                   'common','unite','make','times','like','protect','organization','program','base','list','tool','different','must',
                   'resource','multiple','result','share','sharing','record','study','relate','link','platform','may','obtain','effort',
                   'focus','regard','build','contain','easy','reduce','impact','group','current','receive','area','advance','inform',
                   'additionally','bulk','combine','team','utilize','found','find','approach','idea','significant','address','say','id',
                   'firm','report','application','measure','understand','center','analyze','format','sample','match','high','host','single',
                   'utilizer','vocabulary','elements','aim','involve','ways','increase','large','goal','third','site','move','etc','often',
                   'implement','office','phase','table','problem','trend','annual','specific','dive','cross','many','plan','interest']
    
    clean_text = clean(text)
    tokens = tokenize(clean_text)
    tokens = [token for token in tokens if token not in en_stop]
    
    if remove_extra_stopwords:
        other_stopwords = set(other_stopwords)
        #stems to remove
        useless_stems = set(['agenc','public','govern','feder','data','would'])
        
        if lemma:
            tokens = [get_lemma(token) for token in tokens]
            tokens = [token for token in tokens if token not in other_stopwords]
        else:
            stemmer = nltk.stem.SnowballStemmer('english')
            tokens = [stemmer.stem(token) for token in tokens]
            tokens = [token for token in tokens if token not in useless_stems]
        
    else:
        if lemma:
            tokens = [get_lemma(token) for token in tokens]
        else:
            stemmer = nltk.stem.SnowballStemmer('english')
            tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

In [9]:
def make_topics(text_data, num_topics=10, num_words = 5, passes=15):
    '''
    Writes docs and corpus to disk and then generates topic models using LDA.
    
    Arguments:
        text_data (list):
    
    '''
    
    if not os.path.exists('uc_corpus_path'):
        os.makedirs('uc_corpus_path')
    
    
    corpus_path = os.path.join(os.getcwd(),'uc_corpus_path','corpus.pkl')
    dict_path = os.path.join(os.getcwd(),'uc_corpus_path','dictionary.gensim')
    model_path = os.path.join(os.getcwd(),'uc_corpus_path','model.gensim')

    
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    pickle.dump(corpus, open(corpus_path, 'wb'))
    dictionary.save(dict_path)
    
    num_topics = num_topics
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=passes,random_state=123)
    ldamodel.save(model_path)
    topics = ldamodel.print_topics(num_words=num_words)
    for i, topic in enumerate(topics):
        topics = ", ".join(topic[1].split('"')[1::2])
        print(f'Topic {i}:  {topics}')

# Finding Topics with Lemmatization & Including Bigram Phrases

In [106]:
lemma_bigram_path = os.path.join(os.getcwd(),'uc_lemma_bigram_path')

#if the path doesn't exist, make it
if not os.path.exists(lemma_bigram_path):
    os.makedirs(lemma_bigram_path)
    
#if the path exists, delete all the files in it
else:
    for dirpath, dirnames, filenames in os.walk(lemma_bigram_path):
        for file_name in filenames:
            file_path = os.path.join(dirpath,file_name)
            os.remove(file_path)

In [107]:
lemma_phrase_corpus_path = os.path.join(lemma_bigram_path,'uc_lemma_phrase_corpus.txt')
df['Lemma Normalized Instance'] = df['Instance'].apply(lambda x: " ".join(prepare_text_for_lda(x,lemma=True)))

#create corpus for phrases
for doc in df['Lemma Normalized Instance']:
    with open(lemma_phrase_corpus_path,'a') as f:
        f.write(doc)
        
sentences = Text8Corpus(lemma_phrase_corpus_path)
phrases = Phrases(sentences, min_count=1, threshold=1)  # train model
bigram = Phraser(phrases) # construct bigram model

In [108]:
#write each instance to its own txt file so gensim can access it
for i, doc in enumerate(df['Lemma Normalized Instance']):
    phrase_txt_path = os.path.join(lemma_bigram_path,f'uc_lemma_phrase_text_{i}.txt')
    with open(phrase_txt_path,'w+') as f:
        f.write(doc)

In [109]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

other_stopwords = set(other_stopwords)

lemma_phrase_text_data = []
for dirpath, dirnames, filenames in os.walk(lemma_bigram_path):
    for file_name in filenames:
        if hasNumbers(file_name):
            file_path = os.path.join(dirpath,file_name)
            sentences = Text8Corpus(file_path)
            phrase_tokens = []
            for token in bigram[sentences]:
                phrase_tokens.append(token)
            
            lemma_phrase_text_data.append([word for words in phrase_tokens for word in words if word not in other_stopwords])
        else:
            pass     

In [110]:
num_topics = [2,4,5,8,10]
num_words = [5,8,10,15,20]

for n_topics, n_words in product(num_topics,num_words):
    print("="*80)
    print(f'Finding {n_topics} topics of {n_words} keywords...')
    make_topics(lemma_phrase_text_data,num_topics=n_topics,num_words=n_words,passes=50)

Finding 2 topics of 5 keywords...
Topic 0:  model, industry, services, provider, training
Topic 1:  benefit, department, opioid_overdose, health, integrate
Finding 2 topics of 8 keywords...
Topic 0:  model, industry, services, provider, training, community, analytics, administrative
Topic 1:  benefit, department, opioid_overdose, health, integrate, risk, performance, dashboard
Finding 2 topics of 10 keywords...
Topic 0:  model, industry, services, provider, training, community, analytics, administrative, reporting, development
Topic 1:  benefit, department, opioid_overdose, health, integrate, risk, performance, dashboard, funding, patient
Finding 2 topics of 15 keywords...
Topic 0:  model, industry, services, provider, training, community, analytics, administrative, reporting, development, standardize, workforce, collect, capability, grant
Topic 1:  benefit, department, opioid_overdose, health, integrate, risk, performance, dashboard, funding, patient, design, services, community, pilo

Topic 0:  provider, claim, child, head_start, grant
Topic 1:  taxonomy, benefit, sec, contract, surety
Topic 2:  pilot, cohort, financial_aid, contract, funding
Topic 3:  dashboard, opioid_overdose, health, design, services
Topic 4:  industry, standardize, company, manage, intelligent_assistant
Topic 5:  student, employment, partnership, community, cloud
Topic 6:  regulatory, opportunity, job, similar, food
Topic 7:  foreign_assistance, budget, student, department, reporting
Finding 8 topics of 8 keywords...
Topic 0:  provider, claim, child, head_start, grant, collect, development, pattern
Topic 1:  taxonomy, benefit, sec, contract, surety, vacancy, dot, market
Topic 2:  pilot, cohort, financial_aid, contract, funding, long_term, warehouse, insight
Topic 3:  dashboard, opioid_overdose, health, design, services, epidemic, opioid_epidemic, patient
Topic 4:  industry, standardize, company, manage, intelligent_assistant, structure, contract, surety_bond
Topic 5:  student, employment, partn

Topic 0:  claim, provider, job, document, claim_dataset, procedure, dataset, burden, fraud, clinical_registry, protocol, market, objective, request, patient
Topic 1:  taxonomy, benefit, risk, sec, contract, market, surety, vacancy, credit, capital, technology, reference, statement, tag, analytics
Topic 2:  financial_aid, warehouse, insight, dictionary, environment, store, take, streamline, request, benefit, element, years, student_aid, long_term, legal
Topic 3:  design, health, payment, criminal_justice, patient, dashboard, administrative, health_care, department, clinical_trial, partner, million, performance, services, provider
Topic 4:  industry, standardize, structure, contract, intelligent_assistant, workforce, company, surety_bond, learning, regulatory, investment, model, web_service, reporting, manage
Topic 5:  administrative, survey, eligible, cloud, benefit, design, forensics, regional, snap_administrative, capture, health, services, expand, backup, household
Topic 6:  food, re

## Topic Model Viz


In [139]:
num_topics = 9
num_words = 10
data = lemma_phrase_text_data


print("="*80)
print(f'Finding {num_topics} topics of {num_words} keywords...')
make_topics(data,num_topics=num_topics,num_words=num_words)

Finding 9 topics of 10 keywords...
Topic 0:  provider, claim, payment, document, health, patient, health_care, claim_dataset, procedure, type
Topic 1:  contract, taxonomy, surety, benefit, sec, tag, risk, market, web_service, statement
Topic 2:  financial_aid, warehouse, dictionary, vacancy, insight, store, school, environment, take, streamline
Topic 3:  dashboard, opioid_overdose, funding, pilot, services, design, department, epidemic, health, laws
Topic 4:  company, standardize, intelligent_assistant, costs, structure, conduct, automate, vet, eviction, database
Topic 5:  student, employment, design, partnership, job, training, administrative, capability, expand, services
Topic 6:  regulatory, food, manage, domain, similar, safety, framework, regulate, model, risk
Topic 7:  foreign_assistance, budget, reporting, department, student, performance, community, comment, department_veteran, veteran_affairs
Topic 8:  head_start, child, analytics, provider, early_childhood, grant, visualizati

In [140]:
dictionary = gensim.corpora.Dictionary.load('uc_corpus_path/dictionary.gensim')
corpus = pickle.load(open('uc_corpus_path/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('uc_corpus_path/model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)

pyLDAvis.display(lda_display)           #to display inline
#pyLDAvis.show(lda_display)             #to display in browser

# Writing Topic Shares to Excel

### Use Cases
The model will be the one saved above using 7 topics with 8 keywords

In [142]:
dictionary = gensim.corpora.Dictionary.load('uc_corpus_path/dictionary.gensim')
corpus = pickle.load(open('uc_corpus_path/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('uc_corpus_path/model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.save_html(data=lda_display,fileobj='Use Case Topics.html')

In [143]:
use_case_topics = ['Healthcare','Finance','Financial Aid','Pilot Programs','Other','Employment','Regulation',
                   'Foreign Affairs and Veterans','Children']
use_case_topic_cols = []
for topic in use_case_topics:
    use_case_topic_col = f'Use Case Topic - {topic}'
    use_case_topic_cols.append(use_case_topic_col)
    df[use_case_topic_col] = np.nan

In [144]:
for i, doc in enumerate(corpus):
    for topics in lda.get_document_topics(bow=doc):
        topic_num = topics[0]
        topic_share = topics[1] 
        df.at[i,f'Use Case Topic - {use_case_topics[topic_num]}'] = topic_share  

In [145]:
#make sure every doc has at least one topic score
df[use_case_topic_cols].dropna(how='all').shape[0] == df.shape[0]

True

In [146]:
df['Primary Use Case Topic'] = np.nan
#Create Col with name of most prevalent topic
primary_topics = []
for row in df[use_case_topic_cols].values:
    values = list(np.nan_to_num(row))
    max_value = max(values)
    max_index = values.index(max_value)
    primary_topics.append(use_case_topics[max_index])

df['Primary Use Case Topic'] = primary_topics

### Best Practices
Need to run and save best model in the other notebook first!

In [148]:
bp_comments_df = pd.read_excel(r'assets/Federal Data Strategy Comments_UsecaseTaggingAssignments_CombinedAll_v2.xlsx',
                              sheet_name='All Comments',
                              usecols=11)

bp_comments_df['Principle'] = bp_comments_df['Principle'].apply(upper_apply)
bp_comments_df['Best Practice'] = bp_comments_df['Best Practice'].apply(upper_apply)
bp_comments_df['Use Case'] = bp_comments_df['Use Case'].apply(upper_apply)
bp_comments_df['Mechanism'] = bp_comments_df['Mechanism'].apply(upper_apply)
bp_comments_df['Other'] = bp_comments_df['Other'].apply(upper_apply)
bp_comments_df['Exclude'] = bp_comments_df['Exclude'].apply(upper_apply)

bp_df = bp_comments_df[(bp_comments_df["Exclude"] != "X")&((bp_comments_df['Best Practice'] == "X"))].dropna(subset=['Instance']).reset_index().drop(labels='index',axis=1)

In [149]:
bp_dictionary = gensim.corpora.Dictionary.load('bp_corpus_path/dictionary.gensim')
bp_corpus = pickle.load(open('bp_corpus_path/corpus.pkl', 'rb'))
bp_lda = gensim.models.ldamodel.LdaModel.load('bp_corpus_path/model.gensim')
bp_lda_display = pyLDAvis.gensim.prepare(bp_lda, bp_corpus, bp_dictionary, sort_topics=False)
pyLDAvis.save_html(data=bp_lda_display, fileobj='Best Practice Topics.html')

In [150]:
best_practice_topics = ['Governance','Privacy and Security','Open Access','Usability','Quality and Standards']
best_practice_topic_cols = []
for topic in best_practice_topics:
    best_practice_topic_col = f'Best Practice Topic - {topic}'
    best_practice_topic_cols.append(best_practice_topic_col)
    bp_df[best_practice_topic_col] = np.nan

In [151]:
for i, doc in enumerate(bp_corpus):
    for topics in bp_lda.get_document_topics(bow=doc):
        topic_num = topics[0]
        topic_share = topics[1] 
        bp_df.at[i,f'Best Practice Topic - {best_practice_topics[topic_num]}'] = topic_share  

In [152]:
#make sure every doc has at least one topic score
bp_df[best_practice_topic_cols].dropna(how='all').shape[0] == bp_df.shape[0]

True

In [153]:
bp_df['Primary Best Practice Topic'] = np.nan
#Create Col with name of most prevalent topic
primary_topics = []
for row in bp_df[best_practice_topic_cols].values:
    values = list(np.nan_to_num(row))
    max_value = max(values)
    max_index = values.index(max_value)
    primary_topics.append(best_practice_topics[max_index])

bp_df['Primary Best Practice Topic'] = primary_topics

### Write it all to one Excel file

In [154]:
writer = pd.ExcelWriter('Federal Data Strategy Comments.xlsx')
comments_df.to_excel(writer,sheet_name='All Comments',index=False)
df.to_excel(writer,sheet_name='Use Cases',index=False)
bp_df.to_excel(writer,sheet_name='Best Practices',index=False)
writer.save()

## Create word docs with top instances for each topic
For each topic, create a word doc for the top 50% of instances within that topic.

In [155]:
bp_doc_counts = round(bp_df['Primary Best Practice Topic'].value_counts() / 2).to_dict()
uc_doc_counts = round(df['Primary Use Case Topic'].value_counts() / 2).to_dict()

In [156]:
bp_doc_path = os.path.join(os.getcwd(),'bp_word_docs')

#if the path doesn't exist, make it
if not os.path.exists(bp_doc_path):
    os.makedirs(bp_doc_path)
    
#if the path exists, delete all the files in it
else:
    for dirpath, dirnames, filenames in os.walk(bp_doc_path):
        for file_name in filenames:
            file_path = os.path.join(dirpath,file_name)
            os.remove(file_path)

for k in bp_doc_counts:
    rows = bp_df[bp_df['Primary Best Practice Topic'] == k].sort_values(by=f'Best Practice Topic - {k}',ascending=False).head(int(bp_doc_counts[k]))
    for i, instance in enumerate(rows['Instance'].str.strip()):
        document = Document()
        heading = f'Best Practices - {k} - #{i+1}'
        document.add_heading(heading, 0)
        p = document.add_paragraph(instance)
        file_path = os.path.join(bp_doc_path,heading+'.docx')
        document.save(file_path)


In [157]:
uc_doc_path = os.path.join(os.getcwd(),'uc_word_docs')

#if the path doesn't exist, make it
if not os.path.exists(uc_doc_path):
    os.makedirs(uc_doc_path)
    
#if the path exists, delete all the files in it
else:
    for dirpath, dirnames, filenames in os.walk(uc_doc_path):
        for file_name in filenames:
            file_path = os.path.join(dirpath,file_name)
            os.remove(file_path)

for k in uc_doc_counts:
    rows = df[df['Primary Use Case Topic'] == k].sort_values(by=f'Use Case Topic - {k}',ascending=False).head(int(uc_doc_counts[k]))
    for i, instance in enumerate(rows['Instance'].str.strip()):
        document = Document()
        heading = f'Use Cases - {k} - #{i+1}'
        document.add_heading(heading, 0)
        p = document.add_paragraph(instance)
        file_path = os.path.join(uc_doc_path,heading+'.docx')
        document.save(file_path)
