
# Prerequisites – Download nltk stopwords and spacy model

In [1]:
# Run in python console
import nltk; nltk.download('stopwords'); nltk.download('wordnet')
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/georgetanev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/georgetanev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import Packages


In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [3]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [4]:
# spacy for lemmatization
import spacy

In [5]:

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

spacy.load("en_core_web_sm")
from spacy.lang.en import English

parser = English()
ss = SnowballStemmer("english")
lem = WordNetLemmatizer()

  and should_run_async(code)


In [7]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  and should_run_async(code)


# Define General Functions

In [8]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in stop_words]
    # tokens = [get_lemma(token) for token in tokens]
    return tokens


def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


def getDocTopicWeight(lda_model_in, new_doc):
    doc = prepare_text_for_lda(new_doc)
    new_doc_bow = id2word.doc2bow(doc)
    return lda_model_in.get_document_topics(new_doc_bow)

# Prepare Stopwords

In [9]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

sw = pd.read_csv('VP_assertions_stop_words_5303_W2021.txt')

# Convert to list
sw_data = sw.Archive.tolist()
# stop_words.extend(sw_data)
stop_words = sw_data

# Import Assrtions Data

In [10]:
df = pd.read_csv('230 VP assertions corpus - Group 6.csv')

In [11]:
df.head(5)

Unnamed: 0,name,content
0,G6A002,Access resources required to scale at relative...
1,A003,Adapt offers to each market
2,A005,"Align interests of investors, the company top ..."
3,A006,Allow resource owners to make money using your...
4,A007,Apply big data analytics to produce insightful...


# Data pre-processing

In [12]:
# Convert to list
data = df.content.values.tolist()

In [13]:
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

In [14]:
# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [15]:
pprint(data[:1])

['Access resources required to scale at relatively low cost or for free by '
 'creating benefits for the resource owners that they cannot create alone']


# Tokenize words and Clean-up text



In [16]:
#Tokenize each sentence into a list of words, removing punctuations and unnecessary characters 

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['access', 'resources', 'required', 'to', 'scale', 'at', 'relatively', 'low', 'cost', 'or', 'for', 'free', 'by', 'creating', 'benefits', 'for', 'the', 'resource', 'owners', 'that', 'they', 'cannot', 'create', 'alone']]


# Creating Bigram and Trigram Models

In [17]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  


bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['access', 'resources', 'required', 'to', 'scale', 'at', 'relatively', 'low', 'cost', 'or', 'for', 'free', 'by', 'creating', 'benefits', 'for', 'the', 'resource_owners', 'that', 'they', 'cannot', 'create', 'alone']


# Remove Stopwords, Make Bigrams & Stem words

In [18]:
# Define functions for stopwords, 
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def stem_words(texts):
    return [[ss.stem(word) for word in simple_preprocess(str(doc))] for doc in texts]

def lem_words(texts):
    return [[lem.lemmatize(word) for word in simple_preprocess(str(doc))] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


In [19]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data)

#data_stemmed = stem_words(data_words_nostops)
data_lemmatized = lem_words(data_words_nostops)

# Form Bigrams
data_words_bigrams = make_bigrams(data_lemmatized)



In [20]:
pprint(data_words_bigrams[:1])

[['resource', 'low', 'benefit', 'resource', 'owner']]


# Create the Dictionary and Corpus needed for Topic Modeling

In [21]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

In [22]:
# Create Corpus
texts = data_words_bigrams

In [23]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [24]:

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2)]]


In [25]:
id2word[0]

'benefit'

In [26]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('benefit', 1), ('low', 1), ('owner', 1), ('resource', 2)]]

# Building the Topic Model

In [33]:
num_topics=50
runs = range(1, 6, 1)

for run in runs:

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics(num_topics=-1))
    doc_lda = lda_model[corpus]

    # Print out words per topic
    top_words_per_topic = []
    for t in range(lda_model.num_topics):
        top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 10)])

    pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("output/"+str(num_topics)+"_"+str(run)+"_top_words.csv")

    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    vis

    # Calculate Document Topic Weights and Print to CSV

    df_doctop = pd.DataFrame(np.zeros((len(data),num_topics),dtype=float), index=np.arange(len(data)), columns=[list(range(num_topics))])
    count=0

    for a in data:
        doc_topic_weights = getDocTopicWeight(lda_model, a)
        for b in doc_topic_weights:
            # df_doctop = pd.DataFrame(b columns=range(20))
            df_doctop.at[count, b[0]] = b[1]

            # df_doctop.append(b[1]: doc_topic_weights)
        # print(doc_topic_weights[:][1])
        count=count+1

    df_doctop['Name']=df['name']
    df_doctop['Assertion']=data
    df_doctop.to_csv("output/"+str(num_topics)+"_"+str(run)+"_document_topic_weights.csv")
    
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
    pyLDAvis.save_html(vis, "output/" + str(num_topics) + '_LDA_Visualization.html')

[(0,
  '0.601*"channel" + 0.001*"cybersecurity" + 0.001*"offer" + 0.001*"resource" '
  '+ 0.001*"stream" + 0.001*"tend" + 0.001*"advocacy" + 0.001*"aligned" + '
  '0.001*"initiative" + 0.001*"preferentially"'),
 (1,
  '0.003*"treatment" + 0.003*"customizing" + 0.003*"aligned" + 0.003*"tend" + '
  '0.003*"stream" + 0.003*"loyal" + 0.003*"advocacy" + 0.003*"wish" + '
  '0.003*"associated" + 0.003*"preferentially"'),
 (2,
  '0.287*"master" + 0.002*"resource" + 0.002*"owner" + 0.002*"aligned" + '
  '0.002*"loyal" + 0.002*"stream" + 0.002*"wish" + 0.002*"tend" + '
  '0.002*"initiative" + 0.002*"customizing"'),
 (3,
  '0.902*"supplier" + 0.000*"customer" + 0.000*"proposition" + '
  '0.000*"advocacy" + 0.000*"wish" + 0.000*"treatment" + 0.000*"aligned" + '
  '0.000*"tend" + 0.000*"customizing" + 0.000*"stream"'),
 (4,
  '0.302*"owner" + 0.287*"resource" + 0.176*"investor" + 0.026*"proposition" + '
  '0.002*"customer" + 0.001*"return" + 0.001*"benefit" + 0.001*"loyal" + '
  '0.001*"stream" + 0

TypeError: Object of type complex is not JSON serializable