In [53]:
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd
from gensim import corpora
from nltk.stem.wordnet import WordNetLemmatizer
import pickle
import gensim
import pyLDAvis.gensim
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
import enchant
import contractions
import re
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser
import os
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Import Data
Import the spreadsheet and get the instances that have been tagged as a Best Practice.

In [54]:
comments_df = pd.read_excel(r'assets/Federal Data Strategy Comments_UsecaseTaggingAssignments_CombinedAll_v2.xlsx',
                            sheet_name='All Comments',
                            usecols=11)

In [55]:
def upper_apply(x):
    """
    Converts strings to uppercase, conserving nan values. Meant to be applied to pandas df column.
    """
    
    if x is not np.nan:
        return str(x).upper()
    else:
        return np.nan

comments_df['Principle'] = comments_df['Principle'].apply(upper_apply)
comments_df['Best Practice'] = comments_df['Best Practice'].apply(upper_apply)
comments_df['Use Case'] = comments_df['Use Case'].apply(upper_apply)
comments_df['Mechanism'] = comments_df['Mechanism'].apply(upper_apply)
comments_df['Other'] = comments_df['Other'].apply(upper_apply)
comments_df['Exclude'] = comments_df['Exclude'].apply(upper_apply)

In [56]:
df = comments_df[(comments_df["Exclude"] != "X")&((comments_df['Best Practice'] == "X"))].dropna(subset=['Instance']).reset_index().drop(labels='index',axis=1)

# Text Cleaning
We'll use the following function to clean our texts and return a list of tokens:

In [57]:
#get comments in a list
raw_text = df['Instance'].tolist()

In [58]:
def clean(doc, spellcheck=True):
    """
    Prepares text for NLP by stripping html tags, urls, email addresses, and misspellings.
    It also expands contractions and lowercases everything. Finally, it only keeps words that 
    are at least three characters long, do not contain a number, and are no more than 17 chars long.
    
    Arguments:
        doc (str): A single instance of feedback.
        spellcheck (bool): Whether or not to use the enchant library to strip misspellings.
        
    Returns:
        normalized (str): The normalized string.
    """
    
    
    def strip_html_tags(text):
        soup = BeautifulSoup(text, "html.parser")
        stripped_text = soup.get_text()
        return stripped_text

    def strip_urls(text):
        #url regex
        url_re = re.compile(r"""(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""")
        stripped_text = url_re.sub('',text)
        return stripped_text

    def strip_emails(text):
        #email address regex
        email_re = re.compile(r'(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)')
        stripped_text = email_re.sub('',text)
        return stripped_text

    def strip_nonsense(text):
        # leave words that are at least three characters long, do not contain a number, and are no more 
        # than 17 chars long
        no_nonsense = re.findall(r'\b[a-z][a-z][a-z]+\b',text)
        stripped_text = ' '.join(w for w in no_nonsense if w != 'nan' and len(w) <= 17)
        return stripped_text

    def expand_contractions(text, contraction_mapping=contractions.contractions_dict):

            contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                              flags=re.IGNORECASE|re.DOTALL)
            def expand_match(contraction):
                match = contraction.group(0)
                first_char = match[0]
                if contraction_mapping.get(match):
                    expanded_contraction = contraction_mapping.get(match)
                else:
                    expanded_contraction = contraction_mapping.get(match.lower())
                if expanded_contraction:
                    expanded_contraction = first_char+expanded_contraction[1:]
                    return expanded_contraction
                else:
                    pass

            expanded_text = contractions_pattern.sub(expand_match, text)
            expanded_text = re.sub("'", "", expanded_text)
            return expanded_text

    def strip_misspellings(text):
        d = enchant.Dict("en_US")
        words_to_add = ['api','git','github','apis']
        for w in words_to_add:
            d.add(w)
        
        tokenizer = ToktokTokenizer()
        tokens = tokenizer.tokenize(text)
        non_dict_words = set([word for word in tokens if d.check(word) is False and re.match('^[a-zA-Z ]*$',word)])
        stripped_text = " ".join([x for x in tokens if x not in non_dict_words])
        return stripped_text
    
    doc = doc.lower()
    contraction_free = expand_contractions(doc)
    tag_free = strip_html_tags(contraction_free)
    url_free = strip_urls(tag_free)
    email_free = strip_emails(url_free)
    if spellcheck:
        misspelling_free = strip_misspellings(email_free)
        normalized = strip_nonsense(misspelling_free)

    else:
        normalized = strip_nonsense(email_free)
    
    return normalized

# Tokenization

In [59]:
def tokenize(text):
    """
    Tokenizes a string.
    
    Arguments:
        text (str):  Text to tokenize.
     
    Returns:
        lda_tokens (list): a list of tokens
    """    
    parser = English()
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [60]:
def get_lemma(token):
    '''
    Uses WordNetLemmatizer to get the lemma for each token.
    
    Arugments:
        token (str): a single token (i.e. word)
        
    Returns:
        lemma (str): the token's lemma.
    '''
    
    lemma = wn.morphy(token)
    if lemma is None:
        return token
    elif 'research' in lemma:
        return 'research'
    elif 'dataset' in lemma:
        return 'datasets'
    else:
        return lemma

In [61]:
def prepare_text_for_lda(text, lemma = True, remove_extra_stopwords=True):
    '''
    Prepares text of latent dirichlet allocation by cleaning documents with clean(),
    tokenize documents with tokenize(), removing stopwords, and lemmatizing. Optionally,
    you can expand the stopword list and/or use stemming instead of lemmatizing.
    
    Arguments:
        text (str): a single instance of feedback (i.e. one document)
        lemma (bool): Whether or not to use lemmas. Default True. If False, stem.
        remove_extra_stopwords (bool): Whether or not to expand nltk stopword list with
            user-defined stopwords. Highly advised.
        
    Returns:
        tokens (list): a list of lemmas
    
    '''
    
    
    #create stopwords
    en_stop = set(nltk.corpus.stopwords.words('english'))
    
    clean_text = clean(text)
    tokens = tokenize(clean_text)
    tokens = [token for token in tokens if token not in en_stop]
    
    if remove_extra_stopwords:
        useless_lemmas = set(['data', 'strategy', 'government','federal','agency',
                             'level','program','provide','public','issue','create','practice',
                             'need','receive','example','include','would','could','across','include','making','also', 
                             'like','likes','liked','liking','make','include','use','using','within','need','often','want','best',
                             'practice','may','one','ensure','base','across','sec','might'])
        #stems to remove
        useless_stems = set(['agenc','public','govern','feder','data','would'])
        
        if lemma:
            tokens = [get_lemma(token) for token in tokens]
            tokens = [token for token in tokens if token not in useless_lemmas]
        else:
            stemmer = nltk.stem.SnowballStemmer('english')
            tokens = [stemmer.stem(token) for token in tokens]
            tokens = [token for token in tokens if token not in useless_stems]
        
    else:
        if lemma:
            tokens = [get_lemma(token) for token in tokens]
        else:
            stemmer = nltk.stem.SnowballStemmer('english')
            tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

In [62]:
def make_topics(text_data, num_topics=10, num_words = 5, passes=15):
    '''
    Writes docs and corpus to disk and then generates topic models using LDA.
    
    Arguments:
        text_data (list):
    
    '''
    
    if not os.path.exists('bp_corpus_path'):
        os.makedirs('bp_corpus_path')
    
    
    corpus_path = os.path.join(os.getcwd(),'bp_corpus_path','corpus.pkl')
    dict_path = os.path.join(os.getcwd(),'bp_corpus_path','dictionary.gensim')
    model_path = os.path.join(os.getcwd(),'bp_corpus_path','model.gensim')

    
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    pickle.dump(corpus, open(corpus_path, 'wb'))
    dictionary.save(dict_path)
    
    num_topics = num_topics
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=passes,random_state=123)
    ldamodel.save(model_path)
    topics = ldamodel.print_topics(num_words=num_words)
    for i, topic in enumerate(topics):
        topics = ", ".join(topic[1].split('"')[1::2])
        print(f'Topic {i}:  {topics}')

# Finding Topics with Lemmatization & Including Bigram Phrases

In [63]:
lemma_bigram_path = os.path.join(os.getcwd(),'bp_lemma_bigram_path')

if not os.path.exists(lemma_bigram_path):
    os.makedirs(lemma_bigram_path)
    
#if the path exists, delete all the files in it
else:
    for dirpath, dirnames, filenames in os.walk(lemma_bigram_path):
        for file_name in filenames:
            file_path = os.path.join(dirpath,file_name)
            os.remove(file_path)

In [64]:
lemma_phrase_corpus_path = os.path.join(lemma_bigram_path,'bp_lemma_phrase_corpus.txt')
df['Lemma Normalized Instance'] = df['Instance'].apply(lambda x: " ".join(prepare_text_for_lda(x,lemma=True)))

#create corpus for phrases
for doc in df['Lemma Normalized Instance']:
    with open(lemma_phrase_corpus_path,'a') as f:
        f.write(doc)
        
sentences = Text8Corpus(lemma_phrase_corpus_path)
phrases = Phrases(sentences, min_count=1, threshold=1)  # train model
bigram = Phraser(phrases) # construct bigram model

In [65]:
#write each instance to its own txt file so gensim can access it
for i, doc in enumerate(df['Lemma Normalized Instance']):
    phrase_txt_path = os.path.join(lemma_bigram_path,f'bp_lemma_phrase_text_{i}.txt')
    with open(phrase_txt_path,'w+') as f:
        f.write(doc)

In [66]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

lemma_phrase_text_data = []
for dirpath, dirnames, filenames in os.walk(lemma_bigram_path):
    for file_name in filenames:
        if hasNumbers(file_name):
            file_path = os.path.join(dirpath,file_name)
            sentences = Text8Corpus(file_path)
            phrase_tokens = []
            for token in bigram[sentences]:
                phrase_tokens.append(token)
            
            lemma_phrase_text_data.append([item for sublist in phrase_tokens for item in sublist])
        else:
            pass     

In [74]:
num_topics = [2,4,5,8,10]
num_words = [5,8,10,15,20]

for n_topics, n_words in product(num_topics,num_words):
    print("="*80)
    print(f'Finding {n_topics} topics of {n_words} keywords...')
    make_topics(lemma_phrase_text_data,num_topics=n_topics,num_words=n_words)

Finding 2 topics of 5 keywords...
Topic 0:  access, private_sector, governance, quality, process
Topic 1:  access, policy, information, governance, state
Finding 2 topics of 8 keywords...
Topic 0:  access, private_sector, governance, quality, process, policy, share, open
Topic 1:  access, policy, information, governance, state, enterprise_governance, management, system
Finding 2 topics of 10 keywords...
Topic 0:  access, private_sector, governance, quality, process, policy, share, open, user, standard
Topic 1:  access, policy, information, governance, state, enterprise_governance, management, system, standard, decision
Finding 2 topics of 15 keywords...
Topic 0:  access, private_sector, governance, quality, process, policy, share, open, user, standard, information, access_augmentation, system, privacy_security, guidance
Topic 1:  access, policy, information, governance, state, enterprise_governance, management, system, standard, decision, college_scorecard, privacy_security, census_bur

Topic 0:  process, access, legal_entity, governance, non_sensitive, quality, sensitive, system, private_sector, information, decision_accountability, sharing, decision, share, high_quality, protect_privacy, state, maximize_amount, organization, student_success
Topic 1:  policy, governance, access, enterprise_governance, strategic_asset, privacy_security, state, information, system, meta_management, address, open_standard, individual, impact_evaluation, security, private_partnership, security_privacy, match, properly_manage, assistance
Topic 2:  policy, access, information, state_local, available, census_bureau, open, new_technology, system, manage, standard, share, private_sector, product, governance_framework, governance, protect, review, decision, global_supply
Topic 3:  access, state, college_scorecard, policy, open, standard, access_augmentation, user, infrastructure, exchange, privacy_security, work, set, available_quickly, commercial_venture, governance, research, information, re

Topic 0:  quality, governance, access, sharing, student_success, educational_equity, system, decision
Topic 1:  access, governance, enterprise_governance, impact_evaluation, open_standard, match, resource_constraint, randomize_evaluation
Topic 2:  information, governance, access, user, census_bureau, available, private_sector, product
Topic 3:  access, exchange, individual, commercial_venture, user, open, system, state_local
Topic 4:  standard, quality, open, information_collection, working_group, management, census_bureau, important
Topic 5:  access, commercialization_innovation, private_sector, state_local, system, decision_accountability, security_privacy, policy_purpose
Topic 6:  access, state, information, tool, open, policy, work, access_augmentation
Topic 7:  access, state, information, legal_entity, policy, non_sensitive, private_sector, sensitive
Topic 8:  policy, governance, access, research, privacy_security, process, interoperability_framework, enable
Topic 9:  college_scor

# Topic Model Viz

In [93]:
num_topics = 5
num_words = 10
data = lemma_phrase_text_data

print("="*80)
print(f'Finding {num_topics} topics of {num_words} keywords...')
make_topics(data,num_topics=num_topics,num_words=num_words)

Finding 5 topics of 10 keywords...
Topic 0:  process, access, legal_entity, governance, non_sensitive, quality, sensitive, system, private_sector, information
Topic 1:  policy, governance, access, enterprise_governance, strategic_asset, privacy_security, state, information, system, meta_management
Topic 2:  policy, access, information, state_local, available, census_bureau, open, new_technology, system, manage
Topic 3:  access, state, college_scorecard, policy, open, standard, access_augmentation, user, infrastructure, exchange
Topic 4:  access, quality, governance, information, standard, research, private_sector, management, interoperability_framework, new


In [94]:
dictionary = gensim.corpora.Dictionary.load('bp_corpus_path/dictionary.gensim')
corpus = pickle.load(open('bp_corpus_path/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('bp_corpus_path/model.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)

 
pyLDAvis.display(lda_display)           #to display inline
#pyLDAvis.show(lda_display)             #to display in browser