### Prepare the environment

In [36]:
import re

import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models as gensim_models
from gensim import utils
from gensim.corpora.dictionary import Dictionary
from gensim.models import ldamodel
from gensim.models.phrases import Phrases
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric

In [18]:
# removed "_" from regular expression
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^`{|}~"""
RE_PUNCT = re.compile(r'([%s])+' % re.escape(punctuation), re.UNICODE)


def strip_punctuation(s):
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)

### Import dataset

In [7]:
df_text = pd.read_excel(open('diffusion_innovation_open_ended.xls','rb'), sheet_name='diffusion_innovation_open_ended')

                                           knowledge  \
0                                     Share insights   
1  Have a space that is central in which all pers...   
2                     Share examples of how it works   
3       Continuous exposure, upskilling and training   
4  I would spread the responsibility and put in a...   

                                          persuasion  \
0            Lead by example, do not lead on opinion   
1  Describe the benefits and how it can lead to m...   
2                     Share examples of how it works   
3  Introduction presentations and demos of data d...   
4  Selling the value and efficiency that it bring...   

                                            decision  \
0                                      I do not know   
1  Clear advantages - For an organisation and its...   
2                Will the benefits outweigh the cons   
3                       Cross platform compatibility   
4  Whether you want to survive as an organisat

### Knowledge topic model

In [22]:
knowledge_stream = df_text['knowledge'].tolist()
#Preprocess
my_filter = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short]

bigram = Phrases(knowledge_stream, min_count=1, threshold=2)
knowledge_bigrams = ["".join(bigram[i]) for i in knowledge_stream]
knowledge_clean = [preprocess_string(i, filters=my_filter) for i in knowledge_bigrams]

print(knowledge_clean[0:5])    

['Share insights', 'Have a space that is central in which all person can access, where useful topics and trainings can be accessed.', 'Share examples of how it works', 'Continuous exposure, upskilling and training', 'I would spread the responsibility and put in a great amount of effort in upskilling and obtaining buy-in of non-IT departments']
[['share', 'insights'], ['space', 'central', 'person', 'access', 'useful', 'topics', 'trainings', 'accessed'], ['share', 'examples', 'works'], ['continuous', 'exposure', 'upskilling', 'training'], ['spread', 'responsibility', 'great', 'effort', 'upskilling', 'obtaining', 'buy', 'non', 'departments']]


In [31]:
dct_knowledge = Dictionary(knowledge_clean)
corpus = [dct_knowledge.doc2bow(doc) for doc in knowledge_clean]
print(len(dct_knowledge))
print(len(corpus))

435
142


In [29]:
lda = ldamodel.LdaModel(corpus, id2word=dct_knowledge, num_topics = 5)
lda.show_topics(num_words=10)

[(0,
  '0.042*"training" + 0.022*"skills" + 0.017*"people" + 0.016*"data" + 0.014*"knowledge" + 0.013*"work" + 0.012*"need" + 0.012*"tools" + 0.012*"teams" + 0.009*"sharing"'),
 (1,
  '0.046*"training" + 0.037*"data" + 0.026*"sessions" + 0.019*"sharing" + 0.017*"knowledge" + 0.016*"people" + 0.013*"employees" + 0.010*"right" + 0.010*"create" + 0.009*"use"'),
 (2,
  '0.029*"training" + 0.019*"knowledge" + 0.018*"data" + 0.017*"people" + 0.016*"sharing" + 0.013*"individuals" + 0.010*"send" + 0.009*"tools" + 0.009*"forums" + 0.009*"product"'),
 (3,
  '0.069*"training" + 0.022*"employees" + 0.014*"business" + 0.014*"users" + 0.014*"data" + 0.014*"systems" + 0.013*"implement" + 0.012*"knowledge" + 0.011*"increase" + 0.009*"management"'),
 (4,
  '0.054*"training" + 0.026*"knowledge" + 0.019*"share" + 0.017*"technology" + 0.017*"awareness" + 0.013*"product" + 0.012*"work" + 0.009*"sessions" + 0.009*"build" + 0.009*"workshops"')]

In [38]:
vis = gensim_models.prepare(lda, corpus, dct_knowledge)
pyLDAvis.save_html(vis, 'knowledge_lda.html')

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


### Persuasion topic model

In [43]:
persuasion_stream = df_text['persuasion'].tolist()
print(persuasion_stream[0:10])
#Preprocess
my_filter = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short]

#bigram = Phrases(persuasion_stream, min_count=1, threshold=2)
#persuasion_bigrams = ["".join(bigram[i]) for i in persuasion_stream]
persuasion_clean = [preprocess_string(i, filters=my_filter) for i in persuasion_stream]

print(persuasion_clean[0:5])   

['Lead by example, do not lead on opinion', 'Describe the benefits and how it can lead to more informed decisions and the accuracy using a data driven approach rather than gut-feeling or based on misinterpreted analysis.', 'Share examples of how it works', 'Introduction presentations and demos of data driven technologies', 'Selling the value and efficiency that it brings. This might require somebody working closely with the dept to understand were the value add lies.', 'training  workshops', 'Awards/Certificates for participation and different levels of competency. Possible implementation of ideas brought about through the contribution of the participants.', 'Highlight competitive advantage, highlight the existence of trends hidden in data, highlight the criticality of decisions based on trusted information, highlight the need to have an appetite for change (willingness to change direction based on observations within the data).', 'have a prominent display section in for instance the d

TypeError: decoding to str: need a bytes-like object, float found