
# Prerequisites – Download nltk stopwords and spacy model

In [1]:
# Run in python console
import nltk; nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/darora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import Packages


In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [3]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
# spacy for lemmatization
import spacy

In [5]:

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

  and should_run_async(code)


In [7]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  and should_run_async(code)


# Prepare Stopwords

In [8]:
# NLTK Stop words
from nltk.corpus import stopwords
# Start from NLTK's English stop word list.
stop_words = stopwords.words('english')

# Load additional stop words from the text file; it is parsed as CSV and the
# words are taken from its "Archive" column below.
sw = pd.read_csv('VP_assertions_stop_words_5303_W2021.txt')

# Convert to list
sw_data = sw.Archive.tolist()

# Append the extra words to the NLTK list in place.
stop_words.extend(sw_data)


# Import Assertions Data

In [9]:
# Load the assertions corpus; the preview below shows `name` and `content` columns.
df = pd.read_csv('230 VP assertions corpus - Group 6.csv')

In [10]:
df.head(5)

Unnamed: 0,name,content
0,G6A002,Access resources required to scale at relative...
1,A003,Adapt offers to each market
2,A005,"Align interests of investors, the company top ..."
3,A006,Allow resource owners to make money using your...
4,A007,Apply big data analytics to produce insightful...


# Data pre-processing

In [11]:
# Convert the `content` column to a plain Python list (one entry per row)
data = df.content.values.tolist()

In [12]:
# Collapse every run of whitespace (including new line characters) into a single space.
# The pattern is a raw string so that "\s" reaches the regex engine verbatim instead of
# being parsed as an invalid Python string escape (a warning on recent Python versions).
data = [re.sub(r'\s+', ' ', sent) for sent in data]

In [13]:
# Remove distracting single quotes
# ("\'" is simply the one-character pattern ', so every apostrophe is deleted)
data = [re.sub("\'", "", sent) for sent in data]

In [14]:
pprint(data[:1])

['Access resources required to scale at relatively low cost or for free by '
 'creating benefits for the resource owners that they cannot create alone']


# Tokenize words and Clean-up text



In [15]:
# Tokenize each sentence into a list of words, removing punctuation and unnecessary characters

def sent_to_words(sentences):
    """Lazily tokenize each sentence into a list of words.

    Every item is cast to ``str`` and passed through gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded per
    input sentence, in input order.
    """
    for raw_sentence in sentences:
        text = str(raw_sentence)
        # deacc=True strips accent marks from the tokens
        tokens = gensim.utils.simple_preprocess(text, deacc=True)
        yield tokens

# Materialize the generator so data_words can be indexed and iterated more than once.
data_words = list(sent_to_words(data))

# Show the tokens of the first document.
print(data_words[:1])

[['access', 'resources', 'required', 'to', 'scale', 'at', 'relatively', 'low', 'cost', 'or', 'for', 'free', 'by', 'creating', 'benefits', 'for', 'the', 'resource', 'owners', 'that', 'they', 'cannot', 'create', 'alone']]


# Creating Bigram and Trigram Models

In [16]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed corpus (note: no min_count is passed here).
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  


# Wrap the trained models in Phraser objects; these are what make_bigrams / make_trigrams apply.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['access', 'resources', 'required', 'to', 'scale', 'at', 'relatively', 'low', 'cost', 'or', 'for', 'free', 'by', 'creating', 'benefits', 'for', 'the', 'resource_owners', 'that', 'they', 'cannot', 'create', 'alone']


# Remove Stopwords & Make Bigrams

In [17]:
# Define functions for stopword removal, bigrams and trigrams
def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents; each one is cast to ``str`` and tokenized
            with ``simple_preprocess``.

    Returns:
        A list containing, for each document, the list of tokens that are not
        stop words.
    """
    # Build the lookup set once per call: a membership test against the
    # module-level ``stop_words`` list is linear in its length, and that cost
    # would otherwise be paid for every token of every document.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Return one bigram-merged token sequence per document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        # Indexing the phraser with a token list applies the bigram model to it.
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Return one trigram-merged token sequence per document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        # Apply the bigram model first, then the trigram model on its output.
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs


In [18]:
# Remove Stop Words
# (operates on the raw sentences in `data`; remove_stopwords tokenizes them itself)
data_words_nostops = remove_stopwords(data)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [19]:
pprint(data_words_bigrams[:1])

[['resources', 'low', 'benefits', 'resource_owners']]


# Create the Dictionary and Corpus needed for Topic Modeling

In [20]:
# Create Dictionary: a mapping between integer ids and the tokens of the bigrammed documents
id2word = corpora.Dictionary(data_words_bigrams)

In [21]:
# Create Corpus
# `texts` is another name for the bigrammed token lists (no copy is made).
texts = data_words_bigrams

In [22]:
# Term Document Frequency
# Each document becomes a bag of words: a list of (token id, count) pairs.
corpus = [id2word.doc2bow(text) for text in texts]

In [23]:

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1)]]


In [24]:
id2word[0]

'benefits'

In [25]:
# Human readable format of corpus (term-frequency): each token id is swapped for its word
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('benefits', 1), ('low', 1), ('resource_owners', 1), ('resources', 1)]]

# Building the Topic Model

In [26]:
# Build LDA model
# Trains a 20-topic model on the bag-of-words corpus, using id2word to map ids back to tokens.
# random_state is fixed, presumably so repeated runs give the same topics.
# NOTE(review): the remaining keyword arguments (update_every, chunksize, passes, alpha,
# per_word_topics) are training options — confirm their meaning against the installed
# gensim version's LdaModel documentation before tuning them.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# View the topics in the LDA model

In [27]:
# Print the top keywords of each topic (the model above is built with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus and keep the result for later use.
doc_lda = lda_model[corpus]

[(0,
  '0.003*"resources" + 0.003*"existing" + 0.003*"innovations" + '
  '0.003*"treatment" + 0.003*"allocate" + 0.003*"customizing" + 0.003*"list" + '
  '0.003*"wishes" + 0.003*"preferentially" + 0.003*"advocacy"'),
 (1,
  '0.618*"stakeholders" + 0.001*"strategic" + 0.001*"budget" + '
  '0.001*"agencies" + 0.001*"government" + 0.001*"limitations" + '
  '0.001*"preferred" + 0.001*"reconcile" + 0.001*"aligned" + '
  '0.001*"allocating"'),
 (2,
  '0.156*"products" + 0.133*"benefit" + 0.112*"chain" + 0.089*"services" + '
  '0.049*"brand" + 0.045*"ways" + 0.041*"channels" + 0.023*"barriers" + '
  '0.010*"interfaces" + 0.010*"applications"'),
 (3,
  '0.336*"product" + 0.115*"customers" + 0.088*"service" + 0.070*"brand" + '
  '0.057*"products" + 0.023*"benefits" + 0.009*"competitor" + 0.009*"terms" + '
  '0.009*"bound" + 0.001*"anchor"'),
 (4,
  '0.359*"proposition" + 0.082*"resource" + 0.078*"skills" + 0.066*"portfolio" '
  '+ 0.044*"investors" + 0.038*"returns" + 0.030*"board" + 0.009*"gov

# Visualize the topics-keywords

In [28]:
# Visualize the topics
# Set pyLDAvis up for display inside the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualization from the trained model, the corpus and the dictionary.
# NOTE(review): newer pyLDAvis releases expose this module as pyLDAvis.gensim_models —
# confirm against the installed version before upgrading.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# A bare last expression makes the notebook display the visualization.
vis