
# Prerequisites – Download nltk stopwords and spacy model

In [390]:
# Run in python console
import nltk; nltk.download('stopwords')
from nltk.stem import SnowballStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/georgetanev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import Packages


In [391]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

In [392]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [393]:
# spacy for lemmatization
import spacy

In [394]:

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [395]:
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

spacy.load("en_core_web_sm")
from spacy.lang.en import English

parser = English()
ss = SnowballStemmer("english")

In [396]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Set number of topics

In [397]:
num_topics=30

# Define General Functions

In [398]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in stop_words]
    # tokens = [get_lemma(token) for token in tokens]
    return tokens


def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


def getDocTopicWeight(lda_model_in, new_doc):
    doc = prepare_text_for_lda(new_doc)
    new_doc_bow = id2word.doc2bow(doc)
    return lda_model_in.get_document_topics(new_doc_bow)

# Prepare Stopwords

In [399]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

sw = pd.read_csv('VP_assertions_stop_words_5303_W2021.txt')

# Convert to list
sw_data = sw.Archive.tolist()
# stop_words.extend(sw_data)
stop_words = sw_data

# Import Assrtions Data

In [400]:
df = pd.read_csv('230 VP assertions corpus - Group 6.csv')

In [401]:
df.head(5)

Unnamed: 0,name,content
0,G6A002,Access resources required to scale at relative...
1,A003,Adapt offers to each market
2,A005,"Align interests of investors, the company top ..."
3,A006,Allow resource owners to make money using your...
4,A007,Apply big data analytics to produce insightful...


# Data pre-processing

In [402]:
# Convert to list
data = df.content.values.tolist()

In [403]:
# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

In [404]:
# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

In [405]:
pprint(data[:1])

['Access resources required to scale at relatively low cost or for free by '
 'creating benefits for the resource owners that they cannot create alone']


# Tokenize words and Clean-up text



In [406]:
#Tokenize each sentence into a list of words, removing punctuations and unnecessary characters 

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['access', 'resources', 'required', 'to', 'scale', 'at', 'relatively', 'low', 'cost', 'or', 'for', 'free', 'by', 'creating', 'benefits', 'for', 'the', 'resource', 'owners', 'that', 'they', 'cannot', 'create', 'alone']]


# Creating Bigram and Trigram Models

In [407]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  


bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['access', 'resources', 'required', 'to', 'scale', 'at', 'relatively', 'low', 'cost', 'or', 'for', 'free', 'by', 'creating', 'benefits', 'for', 'the', 'resource_owners', 'that', 'they', 'cannot', 'create', 'alone']


# Remove Stopwords, Make Bigrams & Stem words

In [408]:
# Define functions for stopwords, 
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def stem_words(texts):
    return [[ss.stem(word) for word in simple_preprocess(str(doc))] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


In [409]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data)

data_stemmed = stem_words(data_words_nostops)

# Form Bigrams
data_words_bigrams = make_bigrams(data_stemmed)



In [410]:
pprint(data_words_bigrams[:1])

[['resourc', 'low', 'benefit', 'resourc', 'owner']]


# Create the Dictionary and Corpus needed for Topic Modeling

In [411]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

In [412]:
# Create Corpus
texts = data_words_bigrams

In [413]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [414]:

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2)]]


In [415]:
id2word[0]

'benefit'

In [416]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('benefit', 1), ('low', 1), ('owner', 1), ('resourc', 2)]]

# Building the Topic Model

In [417]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

# View the topics in LDA model

In [418]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics(num_topics=-1))
doc_lda = lda_model[corpus]

[(0,
  '0.304*"chain" + 0.193*"portfolio" + 0.193*"life" + 0.026*"infrastructur" + '
  '0.001*"custom" + 0.001*"proposit" + 0.001*"offer" + 0.001*"resourc" + '
  '0.001*"stakehold" + 0.001*"owner"'),
 (1,
  '0.003*"definit" + 0.003*"defin" + 0.003*"list" + 0.003*"social" + '
  '0.003*"group" + 0.003*"emerg" + 0.003*"effici" + 0.003*"aggress" + '
  '0.003*"alloc" + 0.003*"increas"'),
 (2,
  '0.003*"definit" + 0.003*"defin" + 0.003*"list" + 0.003*"social" + '
  '0.003*"group" + 0.003*"emerg" + 0.003*"effici" + 0.003*"aggress" + '
  '0.003*"alloc" + 0.003*"increas"'),
 (3,
  '0.494*"proposit" + 0.317*"employe" + 0.086*"investor" + 0.046*"vision" + '
  '0.000*"master" + 0.000*"list" + 0.000*"group" + 0.000*"order" + '
  '0.000*"alloc" + 0.000*"definit"'),
 (4,
  '0.746*"resourc" + 0.001*"aggress" + 0.001*"defin" + 0.001*"order" + '
  '0.001*"list" + 0.001*"social" + 0.001*"group" + 0.001*"emerg" + '
  '0.001*"effici" + 0.001*"preferenti"'),
 (5,
  '0.003*"definit" + 0.003*"defin" + 0.003*"

# Write Top Topics Words to CSV

In [419]:
top_words_per_topic = []
for t in range(lda_model.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 10)])

pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv(str(num_topics)+"_top_words.csv")

# Visualize the topics-keywords

In [420]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

TypeError: Object of type complex is not JSON serializable

PreparedData(topic_coordinates=                        x                   y  topics  cluster       Freq
topic                                                                    
39    -0.328834+0.000000j -0.164144+0.000000j       1        1  13.059709
48    -0.207379+0.000000j -0.188400+0.000000j       2        1   9.047496
3     -0.316624+0.000000j  0.114991+0.000000j       3        1   8.680734
31    -0.197415+0.000000j -0.172730+0.000000j       4        1   5.120734
45    -0.109940+0.000000j -0.157368+0.000000j       5        1   4.858637
12    -0.261641+0.000000j  0.066934+0.000000j       6        1   4.780161
38    -0.188115+0.000000j  0.236445+0.000000j       7        1   4.728322
23    -0.032614+0.000000j -0.074493+0.000000j       8        1   4.570810
27    -0.139070+0.000000j -0.060528+0.000000j       9        1   4.301633
34    -0.243149+0.000000j  0.201665+0.000000j      10        1   3.704871
49    -0.075020+0.000000j -0.102587+0.000000j      11        1   3.667506
18    -

# Calculate Document Topic Weights and Print to CSV

In [421]:
df_doctop = pd.DataFrame(np.zeros((len(data),num_topics),dtype=float), index=np.arange(len(data)), columns=[list(range(num_topics))])
count=0

for a in data:
    doc_topic_weights = getDocTopicWeight(lda_model, a)
    for b in doc_topic_weights:
        # df_doctop = pd.DataFrame(b columns=range(20))
        df_doctop.at[count, b[0]] = b[1]

        # df_doctop.append(b[1]: doc_topic_weights)
    # print(doc_topic_weights[:][1])
    count=count+1

df_doctop['Name']=df['name']
df_doctop['Assertion']=data
df_doctop.to_csv(str(num_topics)+"_document_topic_weights.csv")