In [43]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import pickle
import gensim
import spacy
import logging
import warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

In [2]:
import nltk
# nltk.download('punkt')

In [3]:
stopwords = nltk.corpus.stopwords.words('english')

In [4]:
stopwords.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 
                   'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
                   'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 
                   'right', 'line', 'even', 'also', 'may', 'take', 'come'])

In [5]:
df = pd.read_csv("../Data/metrics.csv")

In [6]:
# df = df[:100]

In [7]:
def remove_emails_nl_quotes(sentence):
    sentence = re.sub('\S*@\S*\s?', '', sentence)
    sentence = re.sub('\s+', ' ', sentence)
    sentence = re.sub("\'", "", sentence)
    sentence = gensim.utils.simple_preprocess(str(sentence), deacc=True) 
    return sentence

In [8]:
def word_tokenization(sentence):
    return [token for token in sentence if token not in stopwords]

In [9]:
df['Preprocessed'] = df['Review'].apply(remove_emails_nl_quotes)

In [10]:
df['No_Stopwords'] = df['Preprocessed'].apply(word_tokenization)

In [11]:
data_words = df['No_Stopwords'].values

In [12]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

In [13]:

# or in python:
from spacy.cli import download
# download("en_core_web_sm")

In [14]:
def preprocess_words(texts, stopwords=stopwords, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts = [bigram_model[doc] for doc in texts]
    texts = [trigram_model[bigram_model[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load("en_core_web_sm")
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in texts_out]
        
    return texts_out
        

In [15]:
data_ready = preprocess_words(data_words)


In [16]:
len(data_ready)

90

In [17]:
df['Review'][0]

'ROI on SEO and PPC. SEO has some additional beneﬁts in the long run.'

In [18]:
id2word = corpora.Dictionary(data_ready)

In [19]:
corpus = [id2word.doc2bow(text) for text in data_ready]

In [20]:
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=90, random_state=100, update_every=1, chunksize=10,
                    passes=5, alpha='symmetric', iterations=100,
                    per_word_topics=True)

In [21]:
df

Unnamed: 0.1,Unnamed: 0,Review,Preprocessed,No_Stopwords
0,0,ROI on SEO and PPC. SEO has some additional be...,"[roi, on, seo, and, ppc, seo, has, some, addit...","[roi, seo, ppc, seo, additional, beneﬁts, long]"
1,1,Growth is an important metric to measure the g...,"[growth, is, an, important, metric, to, measur...","[growth, important, metric, measure, growth, p..."
2,2,Return on ad spend (ROAS) is a metric used to ...,"[return, on, ad, spend, roas, is, metric, used...","[return, ad, spend, roas, metric, used, measur..."
3,3,Customer acquisition cost (CAC) enables compan...,"[customer, acquisition, cost, cac, enables, co...","[customer, acquisition, cost, cac, enables, co..."
4,4,make sure that the cost of acquiring customers...,"[make, sure, that, the, cost, of, acquiring, c...","[sure, cost, acquiring, customers, exceed, amo..."
...,...,...,...,...
85,85,Revenue Growth Rate measures the month-over-mo...,"[revenue, growth, rate, measures, the, month, ...","[revenue, growth, rate, measures, month, month..."
86,86,The virality / Viral coeﬃcient is the number o...,"[the, virality, viral, coeﬃcient, is, the, num...","[virality, viral, coeﬃcient, number, new, user..."
87,87,to use your product - but the key is that thes...,"[to, use, your, product, but, the, key, is, th...","[product, key, users, convert, paying, custome..."
88,88,Simply it is the dependence on one source of t...,"[simply, it, is, the, dependence, on, one, sou...","[simply, dependence, one, source, traﬃc, custo..."


In [22]:
def format_topic_sentences(ldamodel=None, corpus=corpus, texts=data_ready):
    sent_topics_df = pd.DataFrame()
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key= lambda x: (x[1]), reverse=True)
        
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                cls_df = pd.DataFrame({
                    'Dominant_Topic': [int(topic_num)],
                    'Perc_Contribution': [round(prop_topic,4)],
                    'Topic_Keywords': [topic_keywords]
                })
                sent_topics_df = pd.concat([sent_topics_df, cls_df], axis=0)
            else:
                break
    
    return sent_topics_df

In [34]:
df_topics_sents_keywords = format_topic_sentences(ldamodel=lda_model, corpus=corpus)
df_topics_sents_keywords['Review'] = df['Review'].values
df_dominant_topic = df_topics_sents_keywords.reset_index()

In [35]:
df_topics_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Review
0,88,0.8352,"additional, roi, long, seo, beneﬁts, ppc, deep...",ROI on SEO and PPC. SEO has some additional be...
0,57,0.8007,"monthly, important, growth, business, metric, ...",Growth is an important metric to measure the g...
0,46,0.5380,"revenue, total, metric, generate, campaign, re...",Return on ad spend (ROAS) is a metric used to ...
0,51,0.7624,"customer, saas, marketing, new, calculate, muc...",Customer acquisition cost (CAC) enables compan...
0,51,0.8116,"customer, saas, marketing, new, calculate, muc...",make sure that the cost of acquiring customers...
...,...,...,...,...
0,46,0.2509,"revenue, total, metric, generate, campaign, re...",Revenue Growth Rate measures the month-over-mo...
0,51,0.4667,"customer, saas, marketing, new, calculate, muc...",The virality / Viral coeﬃcient is the number o...
0,51,0.3370,"customer, saas, marketing, new, calculate, muc...",to use your product - but the key is that thes...
0,51,0.5056,"customer, saas, marketing, new, calculate, muc...",Simply it is the dependence on one source of t...


In [36]:
df_dominant_topic

Unnamed: 0,index,Dominant_Topic,Perc_Contribution,Topic_Keywords,Review
0,0,88,0.8352,"additional, roi, long, seo, beneﬁts, ppc, deep...",ROI on SEO and PPC. SEO has some additional be...
1,0,57,0.8007,"monthly, important, growth, business, metric, ...",Growth is an important metric to measure the g...
2,0,46,0.5380,"revenue, total, metric, generate, campaign, re...",Return on ad spend (ROAS) is a metric used to ...
3,0,51,0.7624,"customer, saas, marketing, new, calculate, muc...",Customer acquisition cost (CAC) enables compan...
4,0,51,0.8116,"customer, saas, marketing, new, calculate, muc...",make sure that the cost of acquiring customers...
...,...,...,...,...,...
85,0,46,0.2509,"revenue, total, metric, generate, campaign, re...",Revenue Growth Rate measures the month-over-mo...
86,0,51,0.4667,"customer, saas, marketing, new, calculate, muc...",The virality / Viral coeﬃcient is the number o...
87,0,51,0.3370,"customer, saas, marketing, new, calculate, muc...",to use your product - but the key is that thes...
88,0,51,0.5056,"customer, saas, marketing, new, calculate, muc...",Simply it is the dependence on one source of t...


In [37]:
df_topics_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Review
0,88,0.8352,"additional, roi, long, seo, beneﬁts, ppc, deep...",ROI on SEO and PPC. SEO has some additional be...
0,57,0.8007,"monthly, important, growth, business, metric, ...",Growth is an important metric to measure the g...
0,46,0.5380,"revenue, total, metric, generate, campaign, re...",Return on ad spend (ROAS) is a metric used to ...
0,51,0.7624,"customer, saas, marketing, new, calculate, muc...",Customer acquisition cost (CAC) enables compan...
0,51,0.8116,"customer, saas, marketing, new, calculate, muc...",make sure that the cost of acquiring customers...
...,...,...,...,...
0,46,0.2509,"revenue, total, metric, generate, campaign, re...",Revenue Growth Rate measures the month-over-mo...
0,51,0.4667,"customer, saas, marketing, new, calculate, muc...",The virality / Viral coeﬃcient is the number o...
0,51,0.3370,"customer, saas, marketing, new, calculate, muc...",to use your product - but the key is that thes...
0,51,0.5056,"customer, saas, marketing, new, calculate, muc...",Simply it is the dependence on one source of t...


In [38]:
df_topics_sents_keywords['Topic_Keywords'][0]

0    additional, roi, long, seo, beneﬁts, ppc, deep...
0    monthly, important, growth, business, metric, ...
0    revenue, total, metric, generate, campaign, re...
0    customer, saas, marketing, new, calculate, muc...
0    customer, saas, marketing, new, calculate, muc...
                           ...                        
0    revenue, total, metric, generate, campaign, re...
0    customer, saas, marketing, new, calculate, muc...
0    customer, saas, marketing, new, calculate, muc...
0    customer, saas, marketing, new, calculate, muc...
0    customer, saas, marketing, new, calculate, muc...
Name: Topic_Keywords, Length: 90, dtype: object

In [39]:
pprint(lda_model.print_topics())

[(77,
  '0.002*"deep" + 0.002*"form" + 0.002*"heavily" + 0.002*"rely" + '
  '0.002*"concern" + 0.002*"weekly" + 0.002*"drop" + 0.002*"refer" + '
  '0.002*"primary" + 0.002*"software"'),
 (31,
  '0.002*"deep" + 0.002*"form" + 0.002*"heavily" + 0.002*"rely" + '
  '0.002*"concern" + 0.002*"weekly" + 0.002*"drop" + 0.002*"refer" + '
  '0.002*"primary" + 0.002*"software"'),
 (14,
  '0.002*"deep" + 0.002*"form" + 0.002*"heavily" + 0.002*"rely" + '
  '0.002*"concern" + 0.002*"weekly" + 0.002*"drop" + 0.002*"refer" + '
  '0.002*"primary" + 0.002*"software"'),
 (2,
  '0.002*"deep" + 0.002*"form" + 0.002*"heavily" + 0.002*"rely" + '
  '0.002*"concern" + 0.002*"weekly" + 0.002*"drop" + 0.002*"refer" + '
  '0.002*"primary" + 0.002*"software"'),
 (75,
  '0.002*"deep" + 0.002*"form" + 0.002*"heavily" + 0.002*"rely" + '
  '0.002*"concern" + 0.002*"weekly" + 0.002*"drop" + 0.002*"refer" + '
  '0.002*"primary" + 0.002*"software"'),
 (55,
  '0.002*"deep" + 0.002*"form" + 0.002*"heavily" + 0.002*"rely" +

In [40]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()


In [41]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)

In [42]:
vis

TypeError: Object of type complex is not JSON serializable

PreparedData(topic_coordinates=                        x                   y  topics  cluster       Freq
topic                                                                    
51     0.371791+0.000000j -0.322218+0.000000j       1        1  30.273666
46     0.359624+0.000000j  0.201584+0.000000j       2        1  12.157574
57     0.317427+0.000000j  0.231903+0.000000j       3        1  11.792038
59     0.170810+0.000000j  0.043843+0.000000j       4        1   7.350139
71     0.155831+0.000000j  0.036410+0.000000j       5        1   5.748721
...                   ...                 ...     ...      ...        ...
28    -0.023277+0.000000j -0.000423+0.000000j      86        1   0.199523
27    -0.023277+0.000000j -0.000423+0.000000j      87        1   0.199523
26    -0.023277+0.000000j -0.000423+0.000000j      88        1   0.199523
25    -0.023277+0.000000j -0.000423+0.000000j      89        1   0.199523
89    -0.023277+0.000000j -0.000423+0.000000j      90        1   0.199523

[90 ro

AttributeError: 'LdaModel' object has no attribute 'predict'