In [1]:
#text processing
import re
import string
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np

In [2]:
#read the csv file with amazon reviews
reviews_df=pd.read_csv('dataset.csv',error_bad_lines=False)

In [3]:
reviews_df['tweet'] = reviews_df['tweet'].astype(str)

In [4]:
print(reviews_df.head(6))

   Unnamed: 0                                              tweet
0           0  RT @JeffreyGuterman: @DonaldJTrumpJr I think o...
1           1  RT @RealJamesWoods: President Trump simply kno...
2           2  RT @GOP: Tuesday’s #SOTU made it clear: Democr...
3           3  @SenSanders Donald Trump left out a few detail...
4           4  RT @SenTedCruz: .@realDonaldTrump's #SOTU spee...
5           5  RT @RyanAFournier: How can a sitting United St...


In [5]:
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    """
    text = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", text)
    text = text.lower()  # lower case text
    text = nltk.word_tokenize(text)
    return (text)

In [6]:
stop_words = stopwords.words('english')
stop_words.extend(['news', 'say','use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do','took','time','year',
'done', 'try', 'many', 'some','nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line','even', 'also', 'may', 'take', 'come', 'new','said', 'like','people'])

In [7]:
def remove_stop_words(text):
     return [word for word in text if word not in stop_words]

In [8]:
stemmer = PorterStemmer()

In [9]:
def stem_words(text):
    """
    Function to stem words
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # no single letter words
    except IndexError:
        pass

    return text

In [10]:
def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return stem_words(remove_stop_words(initial_clean(text)))

In [11]:
# clean reviews and create new column "tokenized"
import time

In [12]:
t1 = time.time()
reviews_df['tokenized_reviews'] = reviews_df['tweet'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(reviews_df), "reviews:", (t2-t1)/60, "min") #Time to clean and tokenize 3209 reviews: 0.21254388093948365 min

Time to clean and tokenize 103055 reviews: 0.8351837515830993 min


In [13]:
print("reviews with their respective tokenize version:" )
print(reviews_df.head(5))

reviews with their respective tokenize version:
   Unnamed: 0                                              tweet  \
0           0  RT @JeffreyGuterman: @DonaldJTrumpJr I think o...   
1           1  RT @RealJamesWoods: President Trump simply kno...   
2           2  RT @GOP: Tuesday’s #SOTU made it clear: Democr...   
3           3  @SenSanders Donald Trump left out a few detail...   
4           4  RT @SenTedCruz: .@realDonaldTrump's #SOTU spee...   

                                   tokenized_reviews  
0  [rt, one, absurd, moment, sotu, disgust, fathe...  
1  [rt, presid, trump, simpli, knock, park, respo...  
2  [rt, tuesday, sotu, made, clear, democrat, won...  
3  [donald, trump, left, detail, sotu, 2018, lost...  
4  [rt, sotu, speech, strong, start, incred, succ...  


In [14]:
#LDA
import gensim
import pyLDAvis.gensim

In [15]:
#Create a Gensim dictionary from the tokenized data
tokenized = reviews_df['tokenized_reviews']

In [16]:
#Creating term dictionary of corpus, where each unique term is assigned an index.
dictionary = corpora.Dictionary(tokenized)

In [17]:
#Filter terms which occurs in less than 1 review and more than 80% of the reviews.
dictionary.filter_extremes(no_below=1, no_above=0.8)

In [18]:
#convert the dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


In [19]:
print([[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[('absurd', 1), ('disgust', 1), ('father', 1), ('man', 1), ('moment', 1), ('one', 1), ('prob', 1), ('sotu', 1)]]


In [20]:
#LDA
t1 = time.time()
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 4, id2word=dictionary, passes=15)
t2 = time.time()
print("Time to clean and tokenize", len(reviews_df), "tweet:", (t2-t1)/60, "min")

Time to clean and tokenize 103055 tweet: 5.261151293913524 min


In [25]:
#saving the model
ldamodel.save('model_combined.gensim')
topics = ldamodel.print_topics(num_words=10)

In [26]:
print("Now printing the topics and their composition")
print("This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic")
for topic in topics:
    print(topic)

Now printing the topics and their composition
This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic
(0, '0.043*"sotu" + 0.026*"watch" + 0.022*"america" + 0.021*"call" + 0.019*"person" + 0.018*"israel" + 0.017*"hero" + 0.017*"antisemit" + 0.017*"democrat" + 0.017*"admit"')
(1, '0.062*"sotu" + 0.026*"great" + 0.021*"choos" + 0.020*"trump" + 0.019*"ask" + 0.019*"must" + 0.018*"matter" + 0.015*"el" + 0.012*"en" + 0.012*"keep"')
(2, '0.066*"sotu" + 0.022*"watch" + 0.017*"speech" + 0.013*"trump" + 0.011*"talk" + 0.009*"forward" + 0.009*"america" + 0.009*"presid" + 0.008*"follow" + 0.008*"politician"')
(3, '0.040*"trump" + 0.028*"state" + 0.024*"sotu" + 0.021*"presid" + 0.020*"respons" + 0.017*"union" + 0.017*"women" + 0.014*"democrat" + 0.012*"well" + 0.012*"poll"')


In [27]:
#finding the similarity of the first review with topics
print('\n')
print("first review is:")
print(reviews_df.tweet[0])
get_document_topics = ldamodel.get_document_topics(corpus[0])
print('\n')
print("The similarity of this review with the topics and respective similarity score are ")
print(get_document_topics)



first review is:
RT @JeffreyGuterman: @DonaldJTrumpJr I think one of the most absurd moments of the #SOTU is when your disgusting father, a man who has prob…


The similarity of this review with the topics and respective similarity score are 
[(0, 0.33565742), (1, 0.028023863), (2, 0.6078731), (3, 0.028445657)]


In [None]:
#visualizing topics
lda_viz = gensim.models.ldamodel.LdaModel.load('model_combined.gensim')
lda_display = pyLDAvis.gensim.prepare(lda_viz, corpus, dictionary, sort_topics=True)
pyLDAvis.show(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [08/Mar/2020 19:17:01] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2020 19:17:01] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2020 19:17:01] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [08/Mar/2020 19:17:01] "GET /LDAvis.js HTTP/1.1" 200 -
