In [1]:
import pandas as pd
import gensim
import nltk
import re

nltk.download('wordnet')

# download word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# filename = 'abcnews-date-text.csv'
filename = 'twitter_trump_2019_05.csv'
raw_docs = pd.read_csv(filename, error_bad_lines=False)

In [3]:
len(raw_docs)

677

In [4]:
raw_docs.head()

Unnamed: 0,text,created_at,id_str
0,Robert Mueller came to the Oval Office (along ...,05-30-2019 15:34:11,1134120831389392896
1,“Comey and Brennan are turning on each other.”...,05-30-2019 14:41:24,1134107544681455616
2,Congressman John Ratcliffe “The Trump Campaign...,05-30-2019 13:41:43,1134092525218590721
3,Russia Russia Russia! That’s all you heard at ...,05-30-2019 11:57:47,1134066371510378501
4,....say he fought back against this phony crim...,05-30-2019 11:57:47,1134066372584062976


In [29]:
raw_docs['text'][:10]

0    Robert Mueller came to the Oval Office (along ...
1    “Comey and Brennan are turning on each other.”...
2    Congressman John Ratcliffe “The Trump Campaign...
3    Russia Russia Russia! That’s all you heard at ...
4    ....say he fought back against this phony crim...
5    Russia Russia Russia! That’s all you heard at ...
6    ....say he fought back against this phony crim...
7    The Greatest Presidential Harassment in histor...
8    I was not informed about anything having to do...
9    Great show tonight @seanhannity you really get...
Name: text, dtype: object

In [30]:
import pdb

def preprocess_docs(raw_docs, num_docs=None):
    """
    Args:
        raw_docs: pandas.DataFrame

    Returns:
        list(list): return list of list
    """
    if num_docs is None:
        num_docs = len(raw_docs)
        
    docs = list()

    stemmer = nltk.stem.SnowballStemmer('english')
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    
    stopwords_http = ['https', 'rt', 'amp']

    for d in raw_docs['text'][:num_docs]:
        
#         print(d)
        processed_tokens = list()
        # normalize and tokenize
        tokens = gensim.utils.simple_preprocess(d)
    
#         print(tokens)
#         pdb.set_trace()

        # lemmatize then stem
        for t in tokens:
            # remove stop words
            if t not in gensim.parsing.preprocessing.STOPWORDS:
                if t not in stopwords_http:
    #             print(t)
                    p_t = stemmer.stem(lemmatizer.lemmatize(t, pos='v'))
                    processed_tokens.append(p_t)
            
    #             pdb.set_trace()

        docs.append(processed_tokens)
    
    return docs


docs = preprocess_docs(raw_docs)

In [44]:
print(len(docs))
# print(docs)

677


In [32]:
def create_dictionary(docs):
    return gensim.corpora.Dictionary(docs)

dictionary = create_dictionary(docs)

## Function to recover stem

In [33]:
def get_original_word(stem, raw_docs):
    index = raw_docs['text'].str.contains(stem, flags=re.IGNORECASE)
    first_index = index.idxmax()
    
    matched_text = raw_docs['text'][first_index]

    words = nltk.word_tokenize(matched_text)
    
    matched_word = None
    
    for w in words:
        if stem in w.lower():
            matched_word = w
            break
    
    if matched_word is None:
        return stem
    
    return matched_word

# BOW

In [34]:
bows = [dictionary.doc2bow(doc) for doc in docs]

In [35]:
bows[0][:5]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]

In [36]:
lda_model_bow = gensim.models.LdaModel(bows,
                                        num_topics=3,
                                        id2word=dictionary,
                                        passes=2)

In [37]:
for index, topic in lda_model_bow.print_topics(-1):
    print('index {}: topic= {}'.format(index, topic))

index 0: topic= 0.012*"china" + 0.010*"great" + 0.009*"realdonaldtrump" + 0.008*"dollar" + 0.007*"billion" + 0.007*"state" + 0.007*"tariff" + 0.007*"presid" + 0.006*"mueller" + 0.006*"trump"
index 1: topic= 0.015*"great" + 0.008*"time" + 0.008*"know" + 0.008*"democrat" + 0.007*"state" + 0.006*"work" + 0.005*"new" + 0.005*"presid" + 0.004*"dbongino" + 0.004*"job"
index 2: topic= 0.011*"trump" + 0.011*"presid" + 0.008*"great" + 0.008*"countri" + 0.008*"china" + 0.007*"dbongino" + 0.006*"want" + 0.005*"peopl" + 0.005*"year" + 0.005*"deal"


In [38]:
for index, topic in lda_model_bow.show_topics(-1, formatted=False):
    print('Topic {}: '.format(index))
    
    print('\n\tStems:')
    
    for word, weight in topic:
        print('\t\t{}'.format(word), end='\n')
#         print('{} '.format(word))

    print('\n\n\tRecovered Words:')
    
    for word, weight in topic:
        print('\t\t{}'.format(get_original_word(word, raw_docs)), end='\n')

    print('\n')

Topic 0: 

	Stems:
		china
		great
		realdonaldtrump
		dollar
		billion
		state
		tariff
		presid
		mueller
		trump


	Recovered Words:
		China
		Greatest
		realDonaldTrump
		Dollar
		Billion
		Statement
		tariff
		Presidential
		Mueller
		Trump


Topic 1: 

	Stems:
		great
		time
		know
		democrat
		state
		work
		new
		presid
		dbongino
		job


	Recovered Words:
		Greatest
		time
		know
		Democrats
		Statement
		work
		FoxNews
		Presidential
		dbongino
		job


Topic 2: 

	Stems:
		trump
		presid
		great
		countri
		china
		dbongino
		want
		peopl
		year
		deal


	Recovered Words:
		Trump
		Presidential
		Greatest
		countries
		China
		dbongino
		wanted
		people
		years
		deals




## TF-IDF

In [39]:

def bows2tfidf(bows):
    tfidf = gensim.models.TfidfModel(bows)
    corpus_tfidf = tfidf[bows]
    
    return corpus_tfidf

corpus_tfidf = bows2tfidf(bows)



In [40]:
type(corpus_tfidf)

gensim.interfaces.TransformedCorpus

In [41]:
lda_model_tfidf = gensim.models.LdaModel(corpus_tfidf,
                                   num_topics=3,
                                   id2word=dictionary,
                                   passes=2)

In [42]:
for index, topic in lda_model_tfidf.print_topics(-1):
    print('Index {}: topic= {}'.format(index, topic))

Index 0: topic= 0.004*"realdonaldtrump" + 0.003*"dbongino" + 0.003*"presid" + 0.003*"trump" + 0.003*"america" + 0.003*"china" + 0.002*"great" + 0.002*"news" + 0.002*"countri" + 0.002*"peopl"
Index 1: topic= 0.004*"dbongino" + 0.003*"democrat" + 0.003*"great" + 0.003*"realdonaldtrump" + 0.002*"republican" + 0.002*"state" + 0.002*"vote" + 0.002*"china" + 0.002*"work" + 0.002*"want"
Index 2: topic= 0.004*"great" + 0.003*"china" + 0.003*"trump" + 0.003*"time" + 0.003*"billion" + 0.002*"presid" + 0.002*"year" + 0.002*"dollar" + 0.002*"state" + 0.002*"new"


In [43]:
for index, topic in lda_model_tfidf.show_topics(-1, formatted=False):
    print('Topic {}: '.format(index))
    
    print('\n\tStems:')
    
    for word, weight in topic:
        print('\t\t{}'.format(word), end='\n')
#         print('{} '.format(word))

    print('\n\n\tRecovered Words:')
    
    for word, weight in topic:
        print('\t\t{}'.format(get_original_word(word, raw_docs)), end='\n')

    print('\n')

Topic 0: 

	Stems:
		realdonaldtrump
		dbongino
		presid
		trump
		america
		china
		great
		news
		countri
		peopl


	Recovered Words:
		realDonaldTrump
		dbongino
		Presidential
		Trump
		Americans
		China
		Greatest
		FoxNews
		countries
		people


Topic 1: 

	Stems:
		dbongino
		democrat
		great
		realdonaldtrump
		republican
		state
		vote
		china
		work
		want


	Recovered Words:
		dbongino
		Democrats
		Greatest
		realDonaldTrump
		Republican
		Statement
		vote
		China
		work
		wanted


Topic 2: 

	Stems:
		great
		china
		trump
		time
		billion
		presid
		year
		dollar
		state
		new


	Recovered Words:
		Greatest
		China
		Trump
		time
		Billion
		Presidential
		years
		Dollar
		Statement
		FoxNews


