### Prepare the environment

In [15]:
import pandas as pd
from gensim.models.phrases import Phrases
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
import re
from gensim import utils
from gensim.corpora.dictionary import Dictionary
from gensim.models import ldamodel
import pyLDAvis.gensim_models as gensim_models
import pyLDAvis
import pyLDAvis.gensim_models

In [16]:
# removed "_" from regular expression
punctuation = r"""!"#$%&'‘()*+,-.\/:;<=>?@[\]^`{|}~"""
RE_PUNCT = re.compile(r'([%s])+' % re.escape(punctuation), re.UNICODE)


def strip_punctuation(s):
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)

def prep_clean_data(input_list):
    input_list = [re.sub(r"[^a-zA-Z0-9]", " ", str(i_list)) for i_list in input_list]

    print(input_list[0:10])
    
    #Preprocess
    my_filter = [
        lambda x: x.lower(), strip_tags, strip_punctuation,
        strip_multiple_whitespaces, strip_numeric,
        remove_stopwords, strip_short]

    #bigram = Phrases(persuasion_stream, min_count=1, threshold=2)
    #persuasion_bigrams = ["".join(bigram[i]) for i in persuasion_stream]
    preprocess_clean = [preprocess_string(i, filters=my_filter) for i in input_list]

    print(preprocess_clean[0:5])
    
    return preprocess_clean

### Import dataset

In [17]:
df_text = pd.read_excel(open('data/diffusion_innovation_open_ended.xls','rb'), sheet_name='diffusion_innovation_open_ended')

### Knowledge topic model

In [18]:
knowledge_stream = df_text['knowledge'].tolist()
knowledge_clean = prep_clean_data(knowledge_stream)

['Share insights', 'Have a space that is central in which all person can access  where useful topics and trainings can be accessed ', 'Share examples of how it works', 'Continuous exposure  upskilling and training', 'I would spread the responsibility and put in a great amount of effort in upskilling and obtaining buy in of non IT departments', 'by sending people to training and helping them obtain new skills and making sure they work with the tools as frequent', 'Training individuals and having in house opportunities for them to be involved ', 'Drive business process ownership in a collective fashion  business and IT ', 'implement such systems as part of daily use to make users more familiar with thinking outside the box   e g  in HR systems allow analysis of leave   hours worked or something like that   compare my leave with the average  etc ', '  By improving data quality    Access contro  ']
[['share', 'insights'], ['space', 'central', 'person', 'access', 'useful', 'topics', 'traini

In [19]:
dct_knowledge = Dictionary(knowledge_clean)
corpus = [dct_knowledge.doc2bow(doc) for doc in knowledge_clean]
print(len(dct_knowledge))
print(len(corpus))

434
142


In [20]:
lda = ldamodel.LdaModel(corpus, id2word=dct_knowledge, num_topics = 5)
lda.show_topics(num_words=10)

[(0,
  '0.064*"training" + 0.018*"sharing" + 0.018*"knowledge" + 0.018*"people" + 0.015*"share" + 0.015*"trainings" + 0.015*"workshops" + 0.015*"data" + 0.011*"platforms" + 0.011*"technology"'),
 (1,
  '0.071*"training" + 0.025*"sessions" + 0.021*"online" + 0.021*"product" + 0.020*"sharing" + 0.013*"data" + 0.013*"driven" + 0.013*"awareness" + 0.013*"communication" + 0.012*"knowledge"'),
 (2,
  '0.021*"knowledge" + 0.015*"exposure" + 0.013*"training" + 0.010*"create" + 0.010*"data" + 0.010*"people" + 0.010*"individuals" + 0.010*"technology" + 0.010*"discussion" + 0.010*"contribute"'),
 (3,
  '0.057*"training" + 0.042*"data" + 0.020*"work" + 0.016*"use" + 0.016*"skills" + 0.013*"knowledge" + 0.013*"employees" + 0.013*"technologies" + 0.012*"organization" + 0.010*"people"'),
 (4,
  '0.028*"knowledge" + 0.021*"people" + 0.020*"training" + 0.019*"business" + 0.015*"employees" + 0.015*"skills" + 0.015*"tools" + 0.010*"data" + 0.010*"sharing" + 0.010*"work"')]

In [21]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dct_knowledge)
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


### Persuasion topic model

In [26]:
persuasion_stream = df_text['persuasion'].tolist()
persuasion_clean = prep_clean_data(persuasion_stream)

['Lead by example  do not lead on opinion', 'Describe the benefits and how it can lead to more informed decisions and the accuracy using a data driven approach rather than gut feeling or based on misinterpreted analysis ', 'Share examples of how it works', 'Introduction presentations and demos of data driven technologies', 'Selling the value and efficiency that it brings  This might require somebody working closely with the dept to understand were the value add lies ', 'training  workshops', 'Awards Certificates for participation and different levels of competency  Possible implementation of ideas brought about through the contribution of the participants ', 'Highlight competitive advantage  highlight the existence of trends hidden in data  highlight the criticality of decisions based on trusted information  highlight the need to have an appetite for change  willingness to change direction based on observations within the data  ', 'have a prominent display section in for instance the d

In [27]:
dct_knowledge = Dictionary(persuasion_clean)
corpus = [dct_knowledge.doc2bow(doc) for doc in persuasion_clean]
print(len(dct_knowledge))
print(len(corpus))

518
142


In [28]:

lda = ldamodel.LdaModel(corpus, id2word=dct_knowledge, num_topics = 4)
lda.show_topics(num_words=10)
    

[(0,
  '0.025*"benefits" + 0.023*"data" + 0.013*"teams" + 0.013*"incentives" + 0.010*"showing" + 0.010*"real" + 0.009*"need" + 0.009*"learning" + 0.008*"awareness" + 0.007*"driven"'),
 (1,
  '0.032*"data" + 0.029*"value" + 0.028*"people" + 0.011*"driven" + 0.009*"create" + 0.009*"demonstrate" + 0.009*"understand" + 0.009*"work" + 0.009*"proof" + 0.008*"workshops"'),
 (2,
  '0.054*"data" + 0.028*"driven" + 0.014*"benefits" + 0.013*"training" + 0.010*"highlight" + 0.010*"examples" + 0.009*"use" + 0.009*"based" + 0.009*"results" + 0.008*"awareness"'),
 (3,
  '0.039*"data" + 0.019*"driven" + 0.013*"use" + 0.012*"value" + 0.010*"user" + 0.008*"management" + 0.008*"decisions" + 0.008*"results" + 0.007*"result" + 0.007*"tech"')]

In [29]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dct_knowledge)
vis


  default_term_info = default_term_info.sort_values(
