In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from collections import OrderedDict

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim import utils, models


# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import nltk
from nltk.corpus import stopwords
import string
from bs4 import BeautifulSoup
from nltk.stem.wordnet import WordNetLemmatizer

from utils.clean_funcs.clean import remove_stopwords, make_bigrams, lemmatization, sent_to_words
from utils.calc_funcs.calc import compute_coherence_values

In [None]:
# Silence every warning category for the rest of the session (this is broader than
# the DeprecationWarning-only filter installed in the import cell).
warnings.filterwarnings('ignore')
# Fetch the NLTK resources by name; nltk.download performs network / disk I/O.
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
#code to clean text
# stop = set(stopwords.words('english'))
# exclude =set(string.punctuation)
# useless_words = ['would','could','should','le','non','federal','way','hour','lack','make','lot','getting','use','believe','thing']
# for word in useless_words:
#     stop.add(word)

## Set up dataframe

In [None]:
# Load the raw workbook, keep the respondent metadata plus the free-text answer,
# and give the long question header the short name 'TEXT'.
metadata_columns = ['AGENCY', 'COMPONENT', 'SUB_COMPONENT', 'GRADELEVEL', 'SUP_STATUS']
question_column = 'Please briefly describe an example of one burdensome administrative task or process which you believe is "low value"'

data = pd.read_excel('data.xlsx')

df = data[metadata_columns + [question_column]]
df.columns = metadata_columns + ['TEXT']

In [None]:
# Preview the first rows of the trimmed frame (rendered as the cell output).
df.head()

In [None]:
# Keep only respondents who actually wrote an answer.
# (isnull and isna are aliases, so the original two statements were the same
# filter executed twice; one notna() mask is sufficient.)
full_df = df[df['TEXT'].notna()]

In [None]:
# Rows of df whose COMPONENT is present.
# NOTE(review): this starts again from df, so it replaces (rather than narrows)
# whatever full_df held before, and the next cell rebinds full_df from df once
# more -- confirm whether this filter is meant to take effect at all.
full_df = df[df['COMPONENT'].notna()]

In [None]:
# Rows of df whose GRADELEVEL is present.
# NOTE(review): built from df, not from the previous full_df, so earlier
# filters on full_df are not carried over -- confirm that is intended.
full_df = df[df['GRADELEVEL'].notna()]

In [None]:
# Drop respondents without an answer. Rebinding the name (instead of
# inplace=True) avoids mutating a frame that was sliced out of df, which is
# what triggers pandas' SettingWithCopyWarning, and keeps the cell re-runnable.
full_df = full_df.dropna(subset=['TEXT'])

In [None]:
#df_ag = full_df[full_df['AGENCY']=='Department of Agriculture']


In [None]:
#data_ag = df_ag['TEXT'].values.tolist()

In [None]:
#full_df.fillna('other')
#full_df.replace(pd.isna,'other')

In [None]:
# Distinct component and agency labels, in order of first appearance.
unique_comps = pd.unique(full_df['COMPONENT'])
unique_agenics = pd.unique(full_df['AGENCY'])

## Remove emails and newline characters

In [None]:
'''# Remove Emails
data_ag = [re.sub('\S*@\S*\s?', '', str(sent)) for sent in data_ag]

# Remove new line characters
data_ag = [re.sub('\s+', ' ', str(sent)) for sent in data_ag]

# Remove distracting single quotes
data_ag = [re.sub("\'", "", str(sent)) for sent in data_ag]'''

In [None]:
# def sent_to_words(sentences):
#     for sentence in sentences:
#         yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

#data_ag_words = list(sent_to_words(data_ag))

In [None]:
# print(data_ag_words[:2])

## Build bigrams and trigrams

In [None]:
'''# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_ag_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_ag_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)'''

In [None]:
# print(trigram_mod[bigram_mod[data_ag_words[12]]])

## Remove Stopwords, make Bigrams and lemmatize

In [None]:
# nlp = spacy.load('en', disable=['parser', 'ner'])


In [None]:
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# stop_words.extend(['from', 'subject', 're', 'edu', 'use','none'])

In [None]:
# # Remove Stop Words
# data_words_nostops = remove_stopwords(data_ag_words)

# # Form Bigrams
# data_words_bigrams = make_bigrams(data_words_nostops)

# # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# # python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# # Do lemmatization keeping only noun, adj, vb, adv
# data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])



In [None]:
# print(data_lemmatized[:3])

## Create dictionary and corpus

In [None]:
# Create Dictionary
# id2word = corpora.Dictionary(data_lemmatized)

# # Create Corpus
# texts = data_lemmatized

# # Term Document Frequency
# corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# print(corpus[:2])

## Find optimal number of topics

In [None]:
#moved to utils

In [None]:
#model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=20, step=2)


In [None]:
#moved to  calc


In [None]:
# Per-agency topic counts, one row per agency: (agency, total, manager, nonmanager).
# These are the values later handed to get_topics_agencies as topic_num.
# NOTE(review): presumably taken from an earlier coherence search -- confirm the source.
_AGENCY_TOPIC_COUNTS = [
    ('Department of Agriculture', 8, 4, 8),
    ('Department of Commerce', 6, 4, 6),
    ('Department of Defense', 10, 16, 8),
    ('Department of Education', 6, 14, 8),
    ('Department of Energy', 4, 18, 6),
    ('Department of Health and Human Services', 6, 10, 10),
    ('Department of Homeland Security', 6, 6, 6),
    ('Department of Housing and Urban Development', 4, 10, 4),
    ('Department of Justice', 8, 12, 4),
    ('Department of Labor', 8, 14, 6),
    ('Department of State', 14, 6, 6),
    ('Department of the Interior', 10, 14, 4),
    ('Department of the Treasury', 8, 10, 8),
    ('Department of Transportation', 6, 16, 6),
    ('Department of Veterans Affairs', 8, 12, 6),
    ('Environmental Protection Agency', 14, 12, 10),
    ('General Services Administration', 18, 8, 6),
    ('National Aeronautics and Space Administration', 4, 18, 6),
    ('National Science Foundation', 4, 16, 12),
    ('Nuclear Regulatory Commission', 6, 16, 6),
    ('Office of Personnel Management', 6, 12, 6),
    ('Social Security Administration', 10, 6, 6),
    ('Small Business Administration', 6, 16, 6),
    ('U.S. Agency for International Development', 4, 12, 8),
]

# Same nested layout as before: scores[agency]['scores'][group] -> topic count.
scores = {
    agency: {'scores': {'total': total, 'manager': manager, 'nonmanager': nonmanager}}
    for agency, total, manager, nonmanager in _AGENCY_TOPIC_COUNTS
}

In [None]:
# scores.keys()

# Get topics for agency-level data

In [None]:
def get_topics_agencies(df, topic_num, random_state=None):
    """Fit an LDA model with a fixed number of topics on ``df['TEXT']``.

    Each response is stripped of e-mail-like tokens, whitespace runs and single
    quotes, then run through the project helpers ``sent_to_words``,
    ``remove_stopwords``, ``make_bigrams`` and ``lemmatization`` (imported from
    ``utils.clean_funcs.clean``) before a gensim dictionary and bag-of-words
    corpus are built from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``TEXT`` column; values are coerced with ``str``.
    topic_num : int
        Number of topics handed to ``LdaModel``.
    random_state : optional
        Forwarded to ``LdaModel(random_state=...)``. The default ``None`` keeps
        the previous unseeded behaviour; pass an int for repeatable topics.

    Returns
    -------
    list or str
        ``LdaModel.show_topics(num_words=8, formatted=True)`` on success.
        ``'empty'`` when ``df`` has no rows and ``'no data'`` when no token
        survives preprocessing -- the same sentinels ``get_topics`` uses --
        instead of letting ``LdaModel`` raise on an empty collection.
    """
    responses = df['TEXT'].values.tolist()
    if len(responses) == 0:
        return 'empty'

    # Remove e-mail-like tokens (raw strings: '\S' in a plain literal is an
    # invalid escape sequence and warns on current Python versions).
    responses = [re.sub(r'\S*@\S*\s?', '', str(sent)) for sent in responses]

    # Collapse newlines / whitespace runs into a single space.
    responses = [re.sub(r'\s+', ' ', sent) for sent in responses]

    # Remove distracting single quotes.
    responses = [sent.replace("'", "") for sent in responses]

    data_words = list(sent_to_words(responses))

    # Only the bigram model is consumed below; the trigram models the original
    # trained here were never used, so they are no longer built.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Remove stop words, then merge detected bigrams.
    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Lemmatize keeping only noun, adj, vb, adv. The spaCy pipeline that used
    # to be loaded into a local here was never passed to lemmatization(), so
    # the (slow) per-call spacy.load has been dropped.
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Dictionary and term-document frequency corpus.
    id2word = corpora.Dictionary(data_lemmatized)
    if len(id2word) == 0:
        return 'no data'
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=topic_num,
        id2word=id2word,
        random_state=random_state,
    )

    return model.show_topics(num_words=8, formatted=True)


In [None]:
def get_topics(df, random_state=None):
    """Pick a topic count by coherence, then fit LDA on ``df['TEXT']``.

    Preprocessing matches ``get_topics_agencies``: e-mail-like tokens,
    whitespace runs and single quotes are removed, then the project helpers
    ``sent_to_words``, ``remove_stopwords``, ``make_bigrams`` and
    ``lemmatization`` are applied. ``compute_coherence_values`` (from
    ``utils.calc_funcs.calc``) is called with ``start=2, limit=20, step=2``;
    the topic count at the position of the highest coherence value is used to
    train the model whose topics are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``TEXT`` column; values are coerced with ``str``.
    random_state : optional
        Forwarded to the final ``LdaModel``. Default ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    list or str
        ``LdaModel.show_topics(num_words=8, formatted=True)`` on success,
        ``'empty'`` when ``df`` has no rows, ``'no data'`` when the coherence
        search raises ``ValueError`` or yields no scores.
    """
    responses = df['TEXT'].values.tolist()
    if len(responses) == 0:
        return 'empty'

    # Remove e-mail-like tokens (raw strings: '\S' in a plain literal is an
    # invalid escape sequence and warns on current Python versions).
    responses = [re.sub(r'\S*@\S*\s?', '', str(sent)) for sent in responses]

    # Collapse newlines / whitespace runs into a single space.
    responses = [re.sub(r'\s+', ' ', sent) for sent in responses]

    # Remove distracting single quotes.
    responses = [sent.replace("'", "") for sent in responses]

    data_words = list(sent_to_words(responses))

    # Only the bigram model is consumed below; the unused trigram models are
    # no longer trained.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    # Remove stop words, then merge detected bigrams.
    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Lemmatize keeping only noun, adj, vb, adv. The spaCy pipeline formerly
    # loaded into an unused local on every call has been dropped.
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Dictionary and term-document frequency corpus.
    id2word = corpora.Dictionary(data_lemmatized)
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    # Candidate topic counts are start, start + step, ... (below limit).
    start, limit, step = 2, 20, 2
    try:
        _models, coherence_values = compute_coherence_values(
            dictionary=id2word, corpus=corpus, texts=data_lemmatized,
            start=start, limit=limit, step=step, id2word=id2word)
    except ValueError:
        return 'no data'

    if len(coherence_values) == 0:
        # max() below would raise on an empty score list.
        return 'no data'

    # Position i in coherence_values corresponds to start + i * step topics
    # (identical to the former "(index + 1) * 2" for start=2, step=2).
    best_position = coherence_values.index(max(coherence_values))
    best_topic_num = start + best_position * step

    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=best_topic_num,
        id2word=id2word,
        random_state=random_state,
    )

    return model.show_topics(num_words=8, formatted=True)

In [None]:
# Agency-level topics over all respondents, using each agency's 'total' topic count.
topic_dict_agency = {}
key_values = ['total', 'manager', 'nonmanager']

for agency in unique_agenics:
    agency_rows = full_df[full_df['AGENCY'] == agency]
    agency_topic_num = scores.get(agency).get('scores').get('total')
    topic_dict_agency[agency] = get_topics_agencies(agency_rows, agency_topic_num)


In [None]:
# One row per agency, one column per topic position; written out as CSV.
agency_topic_series = {name: pd.Series(topics) for name, topics in topic_dict_agency.items()}
agency_df = pd.DataFrame(agency_topic_series).T
agency_df.to_csv('Agency_Topics.csv')

In [None]:
# Agency-level topics for rows with SUP_STATUS == 1, using the 'manager' topic count.
topic_dict_agency_mang = {}
key_values = ['total', 'manager', 'nonmanager']

for agency in unique_agenics:
    manager_rows = full_df[(full_df['AGENCY'] == agency) & (full_df['SUP_STATUS'] == 1)]
    manager_topic_num = scores.get(agency).get('scores').get('manager')
    topic_dict_agency_mang[agency] = get_topics_agencies(manager_rows, manager_topic_num)

In [None]:
# One row per agency (SUP_STATUS == 1 subset), one column per topic position.
manager_topic_series = {name: pd.Series(topics) for name, topics in topic_dict_agency_mang.items()}
agency_df_mang = pd.DataFrame(manager_topic_series).T
agency_df_mang.to_csv('Agency_Topics_Senior_Manager.csv')

In [None]:
# Agency-level topics for rows with SUP_STATUS == 0, using the 'nonmanager' topic count.
topic_dict_agency_nonmang = {}
key_values = ['total', 'manager', 'nonmanager']

for agency in unique_agenics:
    nonmanager_rows = full_df[(full_df['AGENCY'] == agency) & (full_df['SUP_STATUS'] == 0)]
    nonmanager_topic_num = scores.get(agency).get('scores').get('nonmanager')
    topic_dict_agency_nonmang[agency] = get_topics_agencies(nonmanager_rows, nonmanager_topic_num)

In [None]:
# One row per agency (SUP_STATUS == 0 subset), one column per topic position.
nonmanager_topic_series = {name: pd.Series(topics) for name, topics in topic_dict_agency_nonmang.items()}
agency_df_nonmang = pd.DataFrame(nonmanager_topic_series).T
agency_df_nonmang.to_csv('Agency_Non_Manager_Topics.csv')

In [None]:
# Topics per grade level; get_topics chooses the topic count itself.
grade_column = full_df['GRADELEVEL']
unique_grades = grade_column.unique()
grade_topic_dict = {}
for grade in unique_grades:
    grade_rows = full_df[grade_column == grade]
    grade_topic_dict[grade] = get_topics(grade_rows)

In [None]:
# One row per grade level, one column per topic position; written out as CSV.
grade_topic_series = {grade_key: pd.Series(topics) for grade_key, topics in grade_topic_dict.items()}
gs_df = pd.DataFrame(grade_topic_series).T
gs_df.to_csv('GS_Topics.csv')

In [None]:
# agency -> {component -> topics} for rows with SUP_STATUS == 1.
unique_comps_topics_mang = {}

for agency in unique_agenics:
    # Select the agency's SUP_STATUS == 1 rows once, then split them by component.
    agency_manager_rows = full_df[(full_df['AGENCY'] == agency) & (full_df['SUP_STATUS'] == 1)]

    temp_dict = {}
    for comps in agency_manager_rows['COMPONENT'].unique():
        component_rows = agency_manager_rows[agency_manager_rows['COMPONENT'] == comps]
        temp_dict[comps] = get_topics(component_rows)

    unique_comps_topics_mang[agency] = temp_dict

    

In [None]:
# Flatten agency -> component -> topics into one row per (agency, component);
# the 'topics' cell holds the topics wrapped in a Series.
comps_mang_df = pd.DataFrame(
    [
        (agency_name, component_name, pd.Series(component_topics))
        for agency_name, topics_by_component in unique_comps_topics_mang.items()
        for component_name, component_topics in topics_by_component.items()
    ],
    columns=['Agency', 'Component', 'topics'],
)

In [None]:
# Add ten empty 'topic N' columns after the existing three.
headers_list = ['Agency', 'Component', 'topics'] + ['topic ' + str(t) for t in range(10)]
comps_mang_df = comps_mang_df.reindex(columns=headers_list)

In [None]:
# Copy the text of topic t out of each row's 'topics' Series into 'topic t'.
# Whole columns are assigned at once instead of the former chained
# df[col].iloc[i] = ... writes, which pandas does not guarantee reach the frame.
# Only (topic_id, text) tuples are unpacked: get_topics' 'empty' / 'no data'
# sentinels are plain strings, and indexing them with [1] used to put a single
# character ('m' / 'o') into 'topic 0'.
for t in range(10):
    topic_strings = []
    for topics in comps_mang_df['topics']:
        try:
            entry = topics[t]
        except KeyError:
            # This row has fewer than t + 1 topics.
            entry = None
        topic_strings.append(entry[1] if isinstance(entry, tuple) else np.nan)
    comps_mang_df['topic ' + str(t)] = topic_strings


In [None]:
# Persist the per-component table for the SUP_STATUS == 1 subset.
comps_mang_df.to_csv('Component_SR_Manager.csv')

In [None]:
# agency -> {component -> topics} for rows with SUP_STATUS == 0.
unique_comps_topics_nonmang = {}

for agency in unique_agenics:
    # Select the agency's SUP_STATUS == 0 rows once, then split them by component.
    agency_nonmanager_rows = full_df[(full_df['AGENCY'] == agency) & (full_df['SUP_STATUS'] == 0)]

    temp_dict = {}
    for comps in agency_nonmanager_rows['COMPONENT'].unique():
        component_rows = agency_nonmanager_rows[agency_nonmanager_rows['COMPONENT'] == comps]
        temp_dict[comps] = get_topics(component_rows)

    unique_comps_topics_nonmang[agency] = temp_dict

In [None]:
# One row per (agency, component); the 'topics' cell holds the topics wrapped in a Series.
comps_nonmang_df = pd.DataFrame(
    [
        (agency_name, component_name, pd.Series(component_topics))
        for agency_name, topics_by_component in unique_comps_topics_nonmang.items()
        for component_name, component_topics in topics_by_component.items()
    ],
    columns=['Agency', 'Component', 'topics'],
)
headers_list = ['Agency','Component','topics','topic 0','topic 1','topic 2', 'topic 3','topic 4','topic 5','topic 6','topic 7', 'topic 8','topic 9']
comps_nonmang_df = comps_nonmang_df.reindex(columns = headers_list)

# Copy the text of topic t out of each row's 'topics' Series into 'topic t'.
# Whole columns are assigned at once instead of the former chained
# df[col].iloc[i] = ... writes, which pandas does not guarantee reach the frame.
# Only (topic_id, text) tuples are unpacked: get_topics' 'empty' / 'no data'
# sentinels are plain strings, and indexing them with [1] used to put a single
# character ('m' / 'o') into 'topic 0'.
for t in range(10):
    topic_strings = []
    for topics in comps_nonmang_df['topics']:
        try:
            entry = topics[t]
        except KeyError:
            # This row has fewer than t + 1 topics.
            entry = None
        topic_strings.append(entry[1] if isinstance(entry, tuple) else np.nan)
    comps_nonmang_df['topic ' + str(t)] = topic_strings


In [None]:
# Persist the per-component table for the SUP_STATUS == 0 subset.
comps_nonmang_df.to_csv('Component_Non_Manager.csv')

In [None]:
# agency -> {component -> topics} over all respondents.
unique_comps_topics = {}

for agency in unique_agenics:
    # Select the agency's rows once, then split them by component.
    agency_rows = full_df[(full_df['AGENCY'] == agency)]

    temp_dict = {}
    for comps in agency_rows['COMPONENT'].unique():
        component_rows = agency_rows[agency_rows['COMPONENT'] == comps]
        temp_dict[comps] = get_topics(component_rows)

    unique_comps_topics[agency] = temp_dict

In [None]:
# One row per (agency, component); the 'topics' cell holds the topics wrapped in a Series.
comps_df = pd.DataFrame(
    [
        (agency_name, component_name, pd.Series(component_topics))
        for agency_name, topics_by_component in unique_comps_topics.items()
        for component_name, component_topics in topics_by_component.items()
    ],
    columns=['Agency', 'Component', 'topics'],
)
headers_list = ['Agency','Component','topics','topic 0','topic 1','topic 2', 'topic 3','topic 4','topic 5','topic 6','topic 7', 'topic 8','topic 9']
comps_df = comps_df.reindex(columns = headers_list)

# Copy the text of topic t out of each row's 'topics' Series into 'topic t'.
# Whole columns are assigned at once instead of the former chained
# df[col].iloc[i] = ... writes, which pandas does not guarantee reach the frame.
# Only (topic_id, text) tuples are unpacked: get_topics' 'empty' / 'no data'
# sentinels are plain strings, and indexing them with [1] used to put a single
# character ('m' / 'o') into 'topic 0'.
for t in range(10):
    topic_strings = []
    for topics in comps_df['topics']:
        try:
            entry = topics[t]
        except KeyError:
            # This row has fewer than t + 1 topics.
            entry = None
        topic_strings.append(entry[1] if isinstance(entry, tuple) else np.nan)
    comps_df['topic ' + str(t)] = topic_strings

In [None]:
# Persist the per-component table built from all respondents.
comps_df.to_csv('Component_Topics.csv')

In [None]:
# Open the grade-level topics CSV for reading; the handle is read in the next
# cell and closed in the one after it, so it stays open across cells.
f = open('GS_Topics.csv','r')


In [None]:
# Read the grade-level CSV and collapse every run of non-letters to one space.
# (A stray bare `g` expression used to end this cell; `g` is only bound in a
# later cell, so it raised NameError on a fresh kernel and has been removed.)
f_text = f.read()
list1 = re.sub(r"[^a-zA-Z]+", ' ', f_text)

In [None]:
# Release the GS_Topics.csv handle opened two cells earlier.
f.close()

In [None]:
# Read the agency-level CSV and collapse every run of non-letters to one space.
# The context manager closes the file even if read() raises.
with open('Agency_Topics.csv', 'r') as g:
    g_text = g.read()
listg = re.sub(r"[^a-zA-Z]+", ' ', g_text)

In [None]:
# Read the manager component CSV and collapse every run of non-letters to one space.
# The file is written as 'Component_SR_Manager.csv' (capital M); the former
# lower-case 'manager' spelling fails on case-sensitive filesystems.
with open('Component_SR_Manager.csv', 'r') as h:
    h_text = h.read()
listh = re.sub(r"[^a-zA-Z]+", ' ', h_text)

In [None]:
# Read the component-level CSV and collapse every run of non-letters to one space.
# The context manager closes the file even if read() raises.
with open('Component_Topics.csv', 'r') as j:
    j_text = j.read()
listj = re.sub(r"[^a-zA-Z]+", ' ', j_text)

In [None]:
# Read a topics CSV and collapse every run of non-letters to one space.
# NOTE(review): this is the same file the previous cell reads into listj, so its
# words are counted twice in the combined text, while 'Component_Non_Manager.csv'
# and 'Agency_Non_Manager_Topics.csv' are written but never read back --
# confirm which file was intended here.
with open('Component_Topics.csv', 'r') as k:
    k_text = k.read()
listk = re.sub(r"[^a-zA-Z]+", ' ', k_text)

In [None]:
# Read the agency manager CSV and collapse every run of non-letters to one space.
# The context manager closes the file even if read() raises.
with open('Agency_Topics_Senior_Manager.csv', 'r') as l:
    l_text = l.read()
listl = re.sub(r"[^a-zA-Z]+", ' ', l_text)

In [None]:
# Concatenate the cleaned texts in the same order as before, with no separator added.
new_word = ''.join([listl, listk, listj, listh, listg, list1])

In [None]:
# Write the combined word text; the context manager flushes and closes the
# file even if write() raises.
with open('words.txt', "w") as text_file:
    text_file.write(new_word)

In [1]:
import gensim.downloader as api

# Fetch the "text8" dataset through gensim's downloader and materialise it as a list.
dataset = list(api.load("text8"))


In [None]:
from gensim import corpora
import gensim

# Dictionary and bag-of-words corpus over the text8 documents.
dct = corpora.Dictionary(dataset)
corpus = list(map(dct.doc2bow, dataset))

# Train the bigram phrase model.
bigram = gensim.models.phrases.Phrases(dataset, min_count=3, threshold=10)

# Show the first document with detected bigrams merged.
first_doc_bigrams = bigram[dataset[0]]
print(first_doc_bigrams)

In [None]:
# Train a second phrase model on the bigram-transformed corpus.
bigram_corpus = bigram[dataset]
trigram = gensim.models.phrases.Phrases(bigram_corpus, threshold=10)

# Show the first document after both phrase passes.
first_doc_trigrams = trigram[bigram[dataset[0]]]
print(first_doc_trigrams)

In [4]:
from ngrams_maker import NGramsMaker

# Build the project-local NGramsMaker over the text8 documents.
# NOTE(review): the recorded output ("n = 2" .. "n = 4", "Done.") suggests the
# first argument is the highest n-gram order to train -- confirm in ngrams_maker.
maker = NGramsMaker(4, dataset)




n = 2
n = 3
n = 4
Done.


In [6]:
#print(maker._ngram_dict[3]['model'][maker._ngram_dict[3]['model'][dataset[0]]])

# Run make_ngrams on a one-element list holding the third text8 document and
# print whatever it returns.
results = maker.make_ngrams([dataset[2]])
print(results)



In [None]:
# print(ngrams_dict[2][0][ngrams_dict[1][0][dataset[0]]])
# NOTE(review): ngrams_dict is not assigned in any cell visible here, so this
# raises NameError on a fresh kernel unless it is defined elsewhere -- confirm,
# or switch to the `maker` object built above.
# Applies the entries at ngrams_dict[1][1], [2][1] and [3][1] in turn to the first document.
print(ngrams_dict[3][1][ngrams_dict[2][1][ngrams_dict[1][1][dataset[0]]]])