### Prepare the environment

In [27]:
import pandas as pd
from gensim.models.phrases import Phrases
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
import re
from gensim import utils
from gensim.corpora.dictionary import Dictionary
from gensim.models import ldamodel
import pyLDAvis.gensim_models as gensim_models

In [28]:
# removed "_" from regular expression
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^`{|}~"""
RE_PUNCT = re.compile(r'([%s])+' % re.escape(punctuation), re.UNICODE)


def strip_punctuation(s):
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)

### Import dataset

In [29]:
df_text = pd.read_excel(open('data/diffusion_innovation_open_ended.xls','rb'), sheet_name='diffusion_innovation_open_ended')

### Knowledge topic model

In [30]:
knowledge_stream = df_text['knowledge'].tolist()
#Preprocess
my_filter = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short]

bigram = Phrases(knowledge_stream, min_count=1, threshold=2)
knowledge_bigrams = ["".join(bigram[i]) for i in knowledge_stream]
knowledge_clean = [preprocess_string(i, filters=my_filter) for i in knowledge_bigrams]

print(knowledge_clean[0:5])    

[['share', 'insights'], ['space', 'central', 'person', 'access', 'useful', 'topics', 'trainings', 'accessed'], ['share', 'examples', 'works'], ['continuous', 'exposure', 'upskilling', 'training'], ['spread', 'responsibility', 'great', 'effort', 'upskilling', 'obtaining', 'buy', 'non', 'departments']]


In [31]:
dct_knowledge = Dictionary(knowledge_clean)
corpus = [dct_knowledge.doc2bow(doc) for doc in knowledge_clean]
print(len(dct_knowledge))
print(len(corpus))

435
142


In [32]:
lda = ldamodel.LdaModel(corpus, id2word=dct_knowledge, num_topics = 5)
lda.show_topics(num_words=10)

[(0,
  '0.094*"training" + 0.016*"people" + 0.013*"knowledge" + 0.013*"technology" + 0.012*"sessions" + 0.012*"workshops" + 0.012*"new" + 0.012*"experience" + 0.012*"create" + 0.012*"skills"'),
 (1,
  '0.048*"training" + 0.035*"data" + 0.031*"knowledge" + 0.026*"sharing" + 0.019*"employees" + 0.015*"teams" + 0.013*"share" + 0.013*"platforms" + 0.012*"work" + 0.012*"sessions"'),
 (2,
  '0.057*"training" + 0.034*"people" + 0.016*"awareness" + 0.012*"data" + 0.012*"technology" + 0.008*"employees" + 0.008*"best" + 0.008*"ideas" + 0.008*"use" + 0.008*"volunteers"'),
 (3,
  '0.023*"sharing" + 0.020*"tools" + 0.020*"product" + 0.019*"knowledge" + 0.017*"data" + 0.016*"need" + 0.015*"sessions" + 0.012*"plan" + 0.009*"training" + 0.009*"skills"'),
 (4,
  '0.027*"training" + 0.025*"data" + 0.021*"skills" + 0.021*"knowledge" + 0.016*"forums" + 0.014*"business" + 0.014*"having" + 0.014*"online" + 0.014*"trainings" + 0.013*"use"')]

In [38]:
# vis = gensim_models.prepare(lda, corpus, dct_knowledge)
# pyLDAvis.save_html(vis, 'knowledge_lda.html')




import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dct_knowledge)
vis



  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


### Persuasion topic model

In [39]:
persuasion_stream = df_text['persuasion'].tolist()
print(persuasion_stream[0:10])
#Preprocess
my_filter = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short]

#bigram = Phrases(persuasion_stream, min_count=1, threshold=2)
#persuasion_bigrams = ["".join(bigram[i]) for i in persuasion_stream]
persuasion_clean = [preprocess_string(i, filters=my_filter) for i in persuasion_stream]

print(persuasion_clean[0:5])   

['Lead by example, do not lead on opinion', 'Describe the benefits and how it can lead to more informed decisions and the accuracy using a data driven approach rather than gut-feeling or based on misinterpreted analysis.', 'Share examples of how it works', 'Introduction presentations and demos of data driven technologies', 'Selling the value and efficiency that it brings. This might require somebody working closely with the dept to understand were the value add lies.', 'training  workshops', 'Awards/Certificates for participation and different levels of competency. Possible implementation of ideas brought about through the contribution of the participants.', 'Highlight competitive advantage, highlight the existence of trends hidden in data, highlight the criticality of decisions based on trusted information, highlight the need to have an appetite for change (willingness to change direction based on observations within the data).', 'have a prominent display section in for instance the d

TypeError: decoding to str: need a bytes-like object, float found