In [1]:
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd
from gensim import corpora
from nltk.stem.wordnet import WordNetLemmatizer
import pickle
import gensim
import pyLDAvis.gensim
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import enchant
import contractions
import re
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
import os
from collections import Counter
import warnings
from docx import Document
from docx.shared import Inches
warnings.filterwarnings('ignore')

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


# Import Data
Import the spreadsheet and get the instances that have been tagged as a Best Practice.

In [2]:
comments_df = pd.read_excel(r'assets/2018-08-29_Federal Data Strategy Comments_UsecaseTaggingAssignments_CombinedAll_v4.xlsx',
                            sheet_name='All Comments',
                            usecols=11)

In [3]:
def upper_apply(x):
    """
    Converts strings to uppercase, conserving nan values. Meant to be applied to pandas df column.
    """
    
    if x is not np.nan:
        return str(x).upper()
    else:
        return np.nan

comments_df['Principle'] = comments_df['Principle'].apply(upper_apply)
comments_df['Best Practice'] = comments_df['Best Practice'].apply(upper_apply)
comments_df['Use Case'] = comments_df['Use Case'].apply(upper_apply)
comments_df['Mechanism'] = comments_df['Mechanism'].apply(upper_apply)
comments_df['Other'] = comments_df['Other'].apply(upper_apply)
comments_df['Exclude'] = comments_df['Exclude'].apply(upper_apply)

# Define Functions

In [6]:
def clean(doc, spellcheck=True):
    """
    Prepares text for NLP by stripping html tags, urls, email addresses, and misspellings.
    It also expands contractions and lowercases everything. Finally, it only keeps words that 
    are at least three characters long, do not contain a number, and are no more than 17 chars long.
    
    Arguments:
        doc (str): A single instance of feedback.
        spellcheck (bool): Whether or not to use the enchant library to strip misspellings.
        
    Returns:
        normalized (str): The normalized string.
    """
    
    
    def strip_html_tags(text):
        """
        Strips html from a string.
        """
        
        soup = BeautifulSoup(text, "html.parser")
        stripped_text = soup.get_text()
        return stripped_text

    def strip_urls(text):
        """
        Strips urls from a string.
        """
        
        url_re = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")
        stripped_text = url_re.sub('',text)
        return stripped_text

    def strip_emails(text):
        """
        Strips emails from a string.
        """
        
        email_re = re.compile(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)')
        stripped_text = email_re.sub('',text)
        return stripped_text

    def strip_nonsense(text):
        """
        Returns words from a string that are at least three characters long, do not contain a number, and are no more 
        than 17 chars long.
        """
        no_nonsense = re.findall(r'\b[a-z][a-z][a-z]+\b',text)
        stripped_text = ' '.join(w for w in no_nonsense if w != 'nan' and len(w) <= 17)
        return stripped_text

    def expand_contractions(text, contraction_mapping=contractions.contractions_dict):
        """
        Expands contractions within a string. For example, can't would become cannot.
        """

        contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                              flags=re.IGNORECASE|re.DOTALL)
        def expand_match(contraction):
            match = contraction.group(0)
            first_char = match[0]
            if contraction_mapping.get(match):
                expanded_contraction = contraction_mapping.get(match)
            else:
                expanded_contraction = contraction_mapping.get(match.lower())
            if expanded_contraction:
                expanded_contraction = first_char+expanded_contraction[1:]
                return expanded_contraction
            else:
                pass

        expanded_text = contractions_pattern.sub(expand_match, text)
        expanded_text = re.sub("'", "", expanded_text)
        return expanded_text

    def strip_misspellings(text):
        """
        Strips misspelled words from a string.
        """
        
        d = enchant.Dict("en_US")
        #add these words to dictionary
        words_to_add = ['api','git','github','apis']
        for w in words_to_add:
            d.add(w)
        
        tokenizer = ToktokTokenizer()
        tokens = tokenizer.tokenize(text)
        non_dict_words = set([word for word in tokens if d.check(word) is False and re.match('^[a-zA-Z ]*$',word)])
        stripped_text = " ".join([x for x in tokens if x not in non_dict_words])
        return stripped_text
    
    doc = doc.lower()
    contraction_free = expand_contractions(doc)
    tag_free = strip_html_tags(contraction_free)
    url_free = strip_urls(tag_free)
    email_free = strip_emails(url_free)
    if spellcheck:
        misspelling_free = strip_misspellings(email_free)
        normalized = strip_nonsense(misspelling_free)

    else:
        normalized = strip_nonsense(email_free)
    
    return normalized

def get_wordnet_pos(treebank_tag):
    """
    Converts the part of speech tag returned by nltk.pos_tag() to a value that can be passed to the pos kwarg
    of wordnet_lemmatizer.lemmatize()
    """
    if treebank_tag.startswith('J'):
        return wn.ADJ
    elif treebank_tag.startswith('V'):
        return wn.VERB
    elif treebank_tag.startswith('N'):
        return wn.NOUN
    elif treebank_tag.startswith('R'):
        return wn.ADV
    else:
        return wn.NOUN

def get_lemmas(document,useless_lemmas=None):
    en_stop = set(nltk.corpus.stopwords.words('english'))
    
    if not useless_lemmas:
        useless_lemmas = set(['data', 'strategy', 'government','federal','agency',
                              'level','program','provide','public','issue','create','practice',
                              'need','receive','example','include','would','could','across','include','making','also', 
                              'like','likes','liked','liking','make','include','use','using','within','need','often','want','best',
                              'practice','may','one','ensure','base','across','sec','might','lei','must','well'])
    else:
        useless_lemmas = set(useless_lemmas)

    stopwords = en_stop | useless_lemmas

    text = nltk.word_tokenize(clean(document))
    word_pos = nltk.pos_tag(text)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, pos in word_pos:
        pos = get_wordnet_pos(pos)
        lemma = wordnet_lemmatizer.lemmatize(word,pos=pos)
        if 'research' in lemma:
            lemmas.append('research')
        elif 'dataset' in lemma:
            lemmas.append('dataset')
        else:
            lemmas.append(lemma)

    lemmas = [lemma for lemma in lemmas if lemma not in stopwords]
    return " ".join(lemma for lemma in lemmas)

def get_lemmas_and_phrases(all_lemmas, instance_type):
    
    #create path if it doesnt exist
    lemma_bigram_path = os.path.join(os.getcwd(),f'{instance_type}_lemma_bigram_path')
    if not os.path.exists(lemma_bigram_path):
        os.makedirs(lemma_bigram_path)

    #if the path exists, delete all the files in it
    else:
        for dirpath, dirnames, filenames in os.walk(lemma_bigram_path):
            for file_name in filenames:
                file_path = os.path.join(dirpath,file_name)
                os.remove(file_path)
    lemma_phrase_corpus_path = os.path.join(lemma_bigram_path,f'{instance_type}_lemma_phrase_corpus.txt')

    #create corpus for phrases
    for doc in all_lemmas:
        with open(lemma_phrase_corpus_path,'a') as f:
            f.write(doc)
    
    #train bigram model
    sentences = Text8Corpus(lemma_phrase_corpus_path)
    phrases = Phrases(sentences, min_count=1, threshold=1)  # train model
    bigram = Phraser(phrases) # construct bigram model           
    
    
    #write each doc to its own txt file so gensim can access it later
    for i, doc in enumerate(all_lemmas):
        phrase_txt_path = os.path.join(lemma_bigram_path,f'{instance_type}_lemma_phrase_text_{i}.txt')
        with open(phrase_txt_path,'w+') as f:
            f.write(doc)
            
    def hasNumbers(inputString):
        """
        Quick check to see if str has a digit in it.
        """
        return any(char.isdigit() for char in inputString)

    lemma_phrase_text_data = []
    for dirpath, dirnames, filenames in os.walk(lemma_bigram_path):
        for file_name in filenames:
            if hasNumbers(file_name):
                file_path = os.path.join(dirpath,file_name)
                sentences = Text8Corpus(file_path)
                phrase_tokens = []
                for token in bigram[sentences]:
                    phrase_tokens.append(token)

                lemma_phrase_text_data.append([item for sublist in phrase_tokens for item in sublist])
            else:
                pass  
    return lemma_phrase_text_data

def make_topics(text_data, instance_type, num_topics=10, num_words = 5, passes=15):
    '''
    Writes docs and corpus to disk and then generates topic models using LDA.
    
    Arguments:
        text_data (list):
    
    '''
    
    if not os.path.exists(f'{instance_type}_corpus_path'):
        os.makedirs(f'{instance_type}_corpus_path')
    
    
    corpus_path = os.path.join(os.getcwd(),f'{instance_type}_corpus_path','corpus.pkl')
    dict_path = os.path.join(os.getcwd(),f'{instance_type}_corpus_path','dictionary.gensim')
    model_path = os.path.join(os.getcwd(),f'{instance_type}_corpus_path','model.gensim')

    
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    pickle.dump(corpus, open(corpus_path, 'wb'))
    dictionary.save(dict_path)
    
    num_topics = num_topics
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=passes,random_state=123)
    ldamodel.save(model_path)
    topics = ldamodel.print_topics(num_words=num_words)
    for i, topic in enumerate(topics):
        topics = ", ".join(topic[1].split('"')[1::2])
        print(f'Topic {i}:  {topics}')

# Best Practices Topic Modeling

In [9]:
# filter out exclusions, select only best practices, drop rows without a comment
df = comments_df[(comments_df["Exclude"] != "X")&((comments_df['Best Practice'] == "X"))].dropna(subset=['Instance']).reset_index().drop(labels='index',axis=1)

In [10]:
all_lemmas = df['Instance'].apply(get_lemmas).tolist()

In [11]:
lemma_phrase_text_data = get_lemmas_and_phrases(all_lemmas,'bp')

In [None]:
num_topics = [2,4,5,8,10]
num_words = [5,8,10,15,20]

for n_topics, n_words in product(num_topics,num_words):
    print("="*80)
    print(f'Finding {n_topics} topics of {n_words} keywords...')
    make_topics(lemma_phrase_text_data,'bp',num_topics=n_topics,num_words=n_words)

# Best Practices Topic Model Visualization
Take an interesting model from the output above and plug in the values for `num_topics`, `num_words`, and `data`. This will recreate the model, writing it to disk. Then we can use the pyLDAvis library to visually inspect this topic model.

In [12]:
num_topics = 4
num_words = 8
data = lemma_phrase_text_data
instance_type = 'bp' #bp means best practice

print("="*80)
print(f'Finding {num_topics} topics of {num_words} keywords...')
make_topics(data,instance_type,num_topics=num_topics,num_words=num_words)

Finding 4 topics of 8 keywords...
Topic 0:  information, governance, management, organization, decision, business, process, policy
Topic 1:  research, dataset, effort, state, available, access, service, information
Topic 2:  standard, access, policy, system, process, user, support, require
Topic 3:  access, work, system, state, organization, research, information, service


In [13]:
dictionary = gensim.corpora.Dictionary.load(f'{instance_type}_corpus_path/dictionary.gensim')
corpus = pickle.load(open(f'{instance_type}_corpus_path/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load(f'{instance_type}_corpus_path/model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)

 
pyLDAvis.display(lda_display)           #to display inline
#pyLDAvis.show(lda_display)             #to display in browser

# Use Cases

In [14]:
uc_df = comments_df[(comments_df["Exclude"] != "X")&((comments_df['Use Case'] == "X"))].dropna(subset=['Instance']).reset_index().drop(labels='index',axis=1)

In [15]:
other_stopwords = ['process','better','governance','form','create','identify','liked','across','support','within','level','project',
                   'exist','file','quality','example','public','policy','readable','require','availability','national','emerge',
                   'using','college','improve','well','one','critical','include','key','air','state','high_priority','potential',
                   'stakeholder','source','develop','information','help','solution','machine','private','available','research','meta',
                   'liking','data','service_third','text','via','open','sector','use','change','also','need','standard','business',
                   'would','rate','making','continue','provide','pre_can','enable','issue','verify','product','randomize_evaluation',
                   'management','enterprise','language','outcome','real_time','strategy','access','small','agency','release','allow',
                   'solution_fill','evaluation','apply','best','local','want','randomize','scorecard','case','set','entity','ensure',
                   'mission','many_different','ass','user','datasets','government','work','system','decision','exchange','real','likes',
                   'security','individual','administration','post','important','new','time','among','federal','analysis','could','practice',
                   'common','unite','make','times','like','protect','organization','program','base','list','tool','different','must',
                   'resource','multiple','result','share','sharing','record','study','relate','link','platform','may','obtain','effort',
                   'focus','regard','build','contain','easy','reduce','impact','group','current','receive','area','advance','inform',
                   'additionally','bulk','combine','team','utilize','found','find','approach','idea','significant','address','say','id',
                   'firm','report','application','measure','understand','center','analyze','format','sample','match','high','host','single',
                   'utilizer','vocabulary','element','aim','involve','ways','increase','large','goal','third','site','move','etc','often',
                   'implement','office','phase','table','problem','trend','annual','specific','dive','cross','many','plan','interest',
                   'elements']

In [16]:
all_lemmas = uc_df['Instance'].apply(lambda x: get_lemmas(x,useless_lemmas=other_stopwords)).tolist()

In [17]:
lemma_phrase_text_data = get_lemmas_and_phrases(all_lemmas,'uc')

In [26]:
num_topics = [2,4,5,8,10]
num_words = [5,8,10,15,20]

for n_topics, n_words in product(num_topics,num_words):
    print("="*80)
    print(f'Finding {n_topics} topics of {n_words} keywords...')
    make_topics(lemma_phrase_text_data,'uc',num_topics=n_topics,num_words=n_words)

Finding 2 topics of 5 keywords...
Topic 0:  contract, service, health, design, patient
Topic 1:  benefit, employment, dashboard, value, service
Finding 2 topics of 8 keywords...
Topic 0:  contract, service, health, design, patient, community, dataset, model
Topic 1:  benefit, employment, dashboard, value, service, document, student, good
Finding 2 topics of 10 keywords...
Topic 0:  contract, service, health, design, patient, community, dataset, model, department, pilot
Topic 1:  benefit, employment, dashboard, value, service, document, student, good, administrative, community
Finding 2 topics of 15 keywords...
Topic 0:  contract, service, health, design, patient, community, dataset, model, department, pilot, provider, way, cost, integrate, industry
Topic 1:  benefit, employment, dashboard, value, service, document, student, good, administrative, community, integrate, analytics, dataset, number, standardize
Finding 2 topics of 20 keywords...
Topic 0:  contract, service, health, design, 

Topic 0:  provider, regulatory, health, veteran_suicide, mental_health
Topic 1:  dashboard, benefit, taxonomy, warehouse, capability
Topic 2:  veteran, standardize, administrative, privacy, manage
Topic 3:  contract, pilot, design, fund, service
Topic 4:  opioid_overdose, law, service, epidemic, dashboard
Topic 5:  claim_dataset, fraud, track, insight, vacancy
Topic 6:  industry, community, student, workforce, economy
Topic 7:  budget, performance, analytics, student, department
Finding 8 topics of 8 keywords...
Topic 0:  provider, regulatory, health, veteran_suicide, mental_health, integrate, health_care, way
Topic 1:  dashboard, benefit, taxonomy, warehouse, capability, community, metric, opportunity
Topic 2:  veteran, standardize, administrative, privacy, manage, intelligent_assistant, amp, service
Topic 3:  contract, pilot, design, fund, service, life, community, child
Topic 4:  opioid_overdose, law, service, epidemic, dashboard, opioid_epidemic, health, opioid_disorder
Topic 5:  c

Topic 0:  regulatory, claim, claim_dataset, dataset, provider, health, patient, health_care, metric, payment, school, identity, framework, grant, way
Topic 1:  taxonomy, dashboard, benefit, sec, community, opportunity, capability, technology, cognitive, tag, dataset, mobile, skill, digital, market
Topic 2:  pilot, fund, service, administrative, snap, veteran, standardize, funding, start, cohort, way, engagement, intelligent_assistant, service_delivery, scale
Topic 3:  contract, surety, warehouse, head_start, child, financial_aid, serve, community, year, dictionary, surety_bond, initiative, standardize, creation, early_childhood
Topic 4:  service, design, education, workforce, analytics, web_service, privacy, requirement, protocol, clinical_registry, outcomes, development, dashboard, establish, order
Topic 5:  fraud, criminal_justice, year, veteran, expect, united, lead, forensics, award, supplier, dot, million, activity, prevent, transparency
Topic 6:  industry, community, comment, eco

# Use Cases Topic Model Visualization
Take an interesting model from the output above and plug in the values for `num_topics`, `num_words`, and `data`. This will recreate the model, writing it to disk. Then we can use the pyLDAvis library to visually inspect this topic model.

In [18]:
num_topics = 8
num_words = 8
data = lemma_phrase_text_data
instance_type = 'uc' #bp means best practice

print("="*80)
print(f'Finding {num_topics} topics of {num_words} keywords...')
make_topics(data,instance_type,num_topics=num_topics,num_words=num_words)

Finding 8 topics of 8 keywords...
Topic 0:  provider, regulatory, health, veteran_suicide, mental_health, integrate, health_care, way
Topic 1:  dashboard, benefit, taxonomy, warehouse, capability, community, metric, opportunity
Topic 2:  veteran, standardize, administrative, privacy, manage, intelligent_assistant, amp, service
Topic 3:  contract, pilot, design, fund, service, life, community, child
Topic 4:  opioid_overdose, law, service, epidemic, dashboard, opioid_epidemic, health, opioid_disorder
Topic 5:  claim_dataset, fraud, track, insight, vacancy, criminal_justice, united, claim
Topic 6:  industry, community, student, workforce, economy, employment, risk, good
Topic 7:  budget, performance, analytics, student, department, document, integrate, web_service


In [19]:
dictionary = gensim.corpora.Dictionary.load(f'{instance_type}_corpus_path/dictionary.gensim')
corpus = pickle.load(open(f'{instance_type}_corpus_path/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load(f'{instance_type}_corpus_path/model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)

 
pyLDAvis.display(lda_display)           #to display inline
#pyLDAvis.show(lda_display)             #to display in browser

Given the outputs, we'll settle for these models, naming them in bold.

### Best Practices
4  - Topics of 8 keywords
 - Topic 1 (**Governance**):  information, governance, management, organization, decision, business, process, policy
 - Topic 2 (**Research**):  research, dataset, effort, state, available, access, service, information
 - Topic 3 (**Standards**):  standard, access, policy, system, process, user, support, require
 - Topic 4 (**Access**):  access, work, system, state, organization, research, information, service

### Use Cases
8  - Topics of 8 keywords
 - Topic 1 (**Healthcare**):  provider, regulatory, health, veteran_suicide, mental_health, integrate, health_care, way
 - Topic 2 (**Analytics**):  dashboard, benefit, taxonomy, warehouse, capability, community, metric, opportunity
 - Topic 3 (**Veterans**):  veteran, standardize, administrative, privacy, manage, intelligent_assistant, amp, service
 - Topic 4 (**Communities**):  contract, pilot, design, fund, service, life, community, child
 - Topic 5 (**Opioids**):  opioid_overdose, law, service, epidemic, dashboard, opioid_epidemic, health, opioid_disorder
 - Topic 6 (**Fraud**):  claim_dataset, fraud, track, insight, vacancy, criminal_justice, united, claim
 - Topic 7 (**Economy**):  industry, community, student, workforce, economy, employment, risk, good
 - Topic 8 (**Budgeting**):  budget, performance, analytics, student, department, document, integrate, web_service

# Calculating Topic Shares and Writing to Excel and Word

In [20]:
# Use Cases
dictionary = gensim.corpora.Dictionary.load('uc_corpus_path/dictionary.gensim')
corpus = pickle.load(open('uc_corpus_path/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('uc_corpus_path/model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.save_html(data=lda_display,fileobj='Use Case Topics.html')

use_case_topics = ['Healthcare','Analytics','Veterans','Communities','Opioids','Fraud','Economy',
                   'Budgeting']
use_case_topic_cols = []
for topic in use_case_topics:
    use_case_topic_col = f'Use Case Topic - {topic}'
    use_case_topic_cols.append(use_case_topic_col)
    uc_df[use_case_topic_col] = np.nan
    
for i, doc in enumerate(corpus):
    for topics in lda.get_document_topics(bow=doc):
        topic_num = topics[0]
        topic_share = topics[1] 
        uc_df.at[i,f'Use Case Topic - {use_case_topics[topic_num]}'] = topic_share  
        
#make sure every doc has at least one topic score
if uc_df[use_case_topic_cols].dropna(how='all').shape[0] == uc_df.shape[0]:
    print("Passed Test!")
else:
    raise Exception
    
uc_df['Primary Use Case Topic'] = np.nan
#Create Col with name of most prevalent topic
primary_topics = []
for row in uc_df[use_case_topic_cols].values:
    values = list(np.nan_to_num(row))
    max_value = max(values)
    max_index = values.index(max_value)
    primary_topics.append(use_case_topics[max_index])

uc_df['Primary Use Case Topic'] = primary_topics

Passed Test!


In [21]:
# Best Practices
bp_df = df
bp_dictionary = gensim.corpora.Dictionary.load('bp_corpus_path/dictionary.gensim')
bp_corpus = pickle.load(open('bp_corpus_path/corpus.pkl', 'rb'))
bp_lda = gensim.models.ldamodel.LdaModel.load('bp_corpus_path/model.gensim')
bp_lda_display = pyLDAvis.gensim.prepare(bp_lda, bp_corpus, bp_dictionary, sort_topics=False)
pyLDAvis.save_html(data=bp_lda_display, fileobj='Best Practice Topics.html')

best_practice_topics = ['Governance','Research','Standards','Access']
best_practice_topic_cols = []
for topic in best_practice_topics:
    best_practice_topic_col = f'Best Practice Topic - {topic}'
    best_practice_topic_cols.append(best_practice_topic_col)
    bp_df[best_practice_topic_col] = np.nan
    
for i, doc in enumerate(bp_corpus):
    for topics in bp_lda.get_document_topics(bow=doc):
        topic_num = topics[0]
        topic_share = topics[1] 
        bp_df.at[i,f'Best Practice Topic - {best_practice_topics[topic_num]}'] = topic_share  
        
#make sure every doc has at least one topic score
if bp_df[best_practice_topic_cols].dropna(how='all').shape[0] == bp_df.shape[0]:
    print("Test passed!")
else:
    raise Exception
    
bp_df['Primary Best Practice Topic'] = np.nan
#Create Col with name of most prevalent topic
primary_topics = []
for row in bp_df[best_practice_topic_cols].values:
    values = list(np.nan_to_num(row))
    max_value = max(values)
    max_index = values.index(max_value)
    primary_topics.append(best_practice_topics[max_index])

bp_df['Primary Best Practice Topic'] = primary_topics

Test passed!


In [22]:
# Write both dataframes to single excel file with the rest of the data
writer = pd.ExcelWriter('Federal Data Strategy Comments.xlsx')
comments_df.to_excel(writer,sheet_name='All Comments',index=False)
uc_df.to_excel(writer,sheet_name='Use Cases',index=False)
bp_df.to_excel(writer,sheet_name='Best Practices',index=False)
writer.save()

## Create word docs with top instances for each topic
For each topic, create a word doc for the top 50% of instances within that topic.

In [62]:
bp_doc_counts = round(bp_df['Primary Best Practice Topic'].value_counts() / 2).to_dict()
uc_doc_counts = round(uc_df['Primary Use Case Topic'].value_counts() / 2).to_dict()

In [65]:
bp_doc_path = os.path.join(os.getcwd(),'bp_word_docs')

#if the path doesn't exist, make it
if not os.path.exists(bp_doc_path):
    os.makedirs(bp_doc_path)
    
#if the path exists, delete all the files in it
else:
    for dirpath, dirnames, filenames in os.walk(bp_doc_path):
        for file_name in filenames:
            file_path = os.path.join(dirpath,file_name)
            os.remove(file_path)

for k in bp_doc_counts:
    rows = bp_df[bp_df['Primary Best Practice Topic'] == k].sort_values(by=f'Best Practice Topic - {k}',ascending=False).head(int(bp_doc_counts[k]))
    for i, instance in enumerate(rows['Instance'].str.strip()):
        document = Document()
        heading = f'Best Practices - {k} - #{i+1}'
        document.add_heading(heading, 0)
        p = document.add_paragraph(instance)
        file_path = os.path.join(bp_doc_path,heading+'.docx')
        document.save(file_path)


In [66]:
uc_doc_path = os.path.join(os.getcwd(),'uc_word_docs')

#if the path doesn't exist, make it
if not os.path.exists(uc_doc_path):
    os.makedirs(uc_doc_path)
    
#if the path exists, delete all the files in it
else:
    for dirpath, dirnames, filenames in os.walk(uc_doc_path):
        for file_name in filenames:
            file_path = os.path.join(dirpath,file_name)
            os.remove(file_path)

for k in uc_doc_counts:
    rows = uc_df[uc_df['Primary Use Case Topic'] == k].sort_values(by=f'Use Case Topic - {k}',ascending=False).head(int(uc_doc_counts[k]))
    for i, instance in enumerate(rows['Instance'].str.strip()):
        document = Document()
        heading = f'Use Cases - {k} - #{i+1}'
        document.add_heading(heading, 0)
        p = document.add_paragraph(instance)
        file_path = os.path.join(uc_doc_path,heading+'.docx')
        document.save(file_path)
