
# Prerequisites – Download nltk stopwords and spacy model

In [1]:
# Run in python console
import nltk; nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/darora/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Import Packages


In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [4]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [5]:
# spacy for lemmatization
import spacy

In [8]:

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

  and should_run_async(code)


In [10]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  and should_run_async(code)


# Prepare Stopwords

In [11]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Import Assrtions Data

In [12]:
df = pd.read_csv('230 VP assertions corpus - Group 6.csv')

In [16]:
df.head()

Unnamed: 0,name,content
0,G6A002,Access resources required to scale at relative...
1,A003,Adapt offers to each market
2,A005,"Align interests of investors, the company top ..."
3,A006,Allow resource owners to make money using your...
4,A007,Apply big data analytics to produce insightful...


# Data pre-processing

In [19]:
# Convert to list
data = df.content.values.tolist()

In [22]:
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

In [23]:
# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [24]:
pprint(data[:1])

['Access resources required to scale at relatively low cost or for free by '
 'creating benefits for the resource owners that they cannot create alone']


# Remove Stopwords

In [25]:
# Define functions for stopwords, 
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [28]:
# Remove Stop Words
data_nostops = remove_stopwords(data)

In [29]:
pprint(data[:1])

['Access resources required to scale at relatively low cost or for free by '
 'creating benefits for the resource owners that they cannot create alone']


# Create the Dictionary and Corpus needed for Topic Modeling

In [30]:
# Create Dictionary
id2word = corpora.Dictionary(data_nostops)

In [31]:
# Create Corpus
texts = data_nostops

In [32]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [33]:

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]


In [34]:
id2word[0]

'access'

In [35]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('access', 1),
  ('alone', 1),
  ('benefits', 1),
  ('cannot', 1),
  ('cost', 1),
  ('create', 1),
  ('creating', 1),
  ('free', 1),
  ('low', 1),
  ('owners', 1),
  ('relatively', 1),
  ('required', 1),
  ('resource', 1),
  ('resources', 1),
  ('scale', 1)]]

# Building the Topic Model

In [36]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# View the topics in LDA model

In [37]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.076*"delivery" + 0.063*"including" + 0.045*"quality" + '
  '0.042*"expectations" + 0.039*"improve" + 0.036*"brand" + '
  '0.028*"continuously" + 0.026*"experience" + 0.017*"centric" + '
  '0.017*"implement"'),
 (1,
  '0.143*"time" + 0.078*"partners" + 0.073*"money" + 0.036*"work" + '
  '0.026*"need" + 0.023*"adapt" + 0.013*"directors" + 0.013*"people" + '
  '0.013*"communication" + 0.012*"board"'),
 (2,
  '0.132*"reduce" + 0.020*"end" + 0.017*"supports" + 0.017*"solution" + '
  '0.008*"deliver" + 0.007*"inventory" + 0.007*"number" + 0.007*"procurement" '
  '+ 0.007*"believe" + 0.007*"able"'),
 (3,
  '0.118*"increase" + 0.081*"information" + 0.055*"sales" + 0.035*"suppliers" '
  '+ 0.029*"customers" + 0.020*"provide" + 0.018*"users" + 0.016*"enhance" + '
  '0.014*"service" + 0.012*"apply"'),
 (4,
  '0.039*"deploy" + 0.029*"offers" + 0.027*"resources" + 0.025*"company" + '
  '0.008*"differently" + 0.008*"arrange" + 0.008*"broaden" + 0.008*"existing" '
  '+ 0.001*"ecommerce" + 0

# Visualize the topics-keywords

In [38]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis