In [1]:
import pandas as pd
import gensim
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# filename = 'abcnews-date-text.csv'
filename = 'twitter_trump_2019_05.csv'
raw_docs = pd.read_csv(filename, error_bad_lines=False)

In [3]:
len(raw_docs)

677

In [4]:
raw_docs.head()

Unnamed: 0,text,created_at,id_str
0,Robert Mueller came to the Oval Office (along ...,05-30-2019 15:34:11,1134120831389392896
1,“Comey and Brennan are turning on each other.”...,05-30-2019 14:41:24,1134107544681455616
2,Congressman John Ratcliffe “The Trump Campaign...,05-30-2019 13:41:43,1134092525218590721
3,Russia Russia Russia! That’s all you heard at ...,05-30-2019 11:57:47,1134066371510378501
4,....say he fought back against this phony crim...,05-30-2019 11:57:47,1134066372584062976


In [5]:
raw_docs['text'][:10]

0    Robert Mueller came to the Oval Office (along ...
1    “Comey and Brennan are turning on each other.”...
2    Congressman John Ratcliffe “The Trump Campaign...
3    Russia Russia Russia! That’s all you heard at ...
4    ....say he fought back against this phony crim...
5    Russia Russia Russia! That’s all you heard at ...
6    ....say he fought back against this phony crim...
7    The Greatest Presidential Harassment in histor...
8    I was not informed about anything having to do...
9    Great show tonight @seanhannity you really get...
Name: text, dtype: object

In [6]:
import pdb

def preprocess_docs(raw_docs, num_docs=None):
    """
    Args:
        raw_docs: pandas.DataFrame

    Returns:
        list(list): return list of list
    """
    if num_docs is None:
        num_docs = 10
        
    docs = list()

    stemmer = nltk.stem.SnowballStemmer('english')
    lemmatizer = nltk.stem.WordNetLemmatizer()

    for d in raw_docs['text'][:num_docs]:
        
#         print(d)
        processed_tokens = list()
        # normalize and tokenize
        tokens = gensim.utils.simple_preprocess(d)
    
#         print(tokens)
#         pdb.set_trace()

        # lemmatize then stem
        for t in tokens:
            # remove stop words
            if t not in gensim.parsing.preprocessing.STOPWORDS:
    #             print(t)
                p_t = stemmer.stem(lemmatizer.lemmatize(t, pos='v'))
                processed_tokens.append(p_t)
            
    #             pdb.set_trace()

        docs.append(processed_tokens)
    
    return docs


docs = preprocess_docs(raw_docs)

In [7]:
print(len(docs))
print(docs)

10
[['robert', 'mueller', 'come', 'oval', 'offic', 'potenti', 'candid', 'seek', 'name', 'director', 'fbi', 'posit', 'year', 'tell', 'day', 'name', 'special', 'counsel', 'total', 'conflict', 'nice'], ['comey', 'brennan', 'turn', 'kilmead'], ['congressman', 'john', 'ratcliff', 'trump', 'campaign', 'clear', 'conspir', 'collud', 'foxnew'], ['russia', 'russia', 'russia', 'hear', 'begin', 'witch', 'hunt', 'hoax', 'russia', 'disappear', 'russia', 'help', 'elect', 'crime', 'exist', 'dem', 'partner', 'fake', 'news', 'media'], ['fight', 'phoni', 'crime', 'exist', 'horrend', 'fals', 'accus', 'shouldn', 'fight', 'sit', 'obstruct', 'mueller', 'obstruct', 'presidenti', 'harass'], ['russia', 'russia', 'russia', 'hear', 'begin', 'witch', 'hunt', 'hoax', 'russia', 'disappear', 'russia', 'help', 'elect', 'crime', 'exist', 'dem', 'partner', 'fake', 'news', 'media'], ['fight', 'phoni', 'crime', 'exist', 'horrend', 'fals', 'acquisit', 'shouldn', 'fight', 'sit', 'obstruct', 'mueller', 'obstruct', 'president

In [8]:
def create_dictionary(docs):
    return gensim.corpora.Dictionary(docs)

dictionary = create_dictionary(docs)

# BOW

In [9]:
bows = [dictionary.doc2bow(doc) for doc in docs]

In [49]:
bows[0][:5]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]

In [11]:
lda_model_bow = gensim.models.LdaModel(bows,
                                        num_topics=3,
                                        id2word=dictionary,
                                        passes=2)

In [12]:
for index, topic in lda_model_bow.print_topics(-1):
    print('index {}: topic= {}'.format(index, topic))

index 0: topic= 0.115*"russia" + 0.026*"exist" + 0.026*"crime" + 0.026*"dem" + 0.026*"partner" + 0.026*"media" + 0.026*"hoax" + 0.026*"hunt" + 0.026*"begin" + 0.026*"help"
index 1: topic= 0.051*"fight" + 0.051*"obstruct" + 0.040*"mueller" + 0.039*"harass" + 0.039*"presidenti" + 0.027*"crime" + 0.027*"exist" + 0.027*"phoni" + 0.027*"shouldn" + 0.027*"horrend"
index 2: topic= 0.030*"john" + 0.028*"name" + 0.017*"foxnew" + 0.017*"have" + 0.017*"great" + 0.017*"inform" + 0.017*"militari" + 0.017*"japan" + 0.017*"men" + 0.017*"mccain"


In [48]:
for index, topic in lda_model_bow.show_topics(-1, formatted=False):
    print('Topic {}:'.format(index))
    
    for word, weight in topic:
        print('{} '.format(word), end='')
        
    print('\n')

Topic 0:
russia exist crime dem partner media hoax hunt begin help 

Topic 1:
fight obstruct mueller harass presidenti crime exist phoni shouldn horrend 

Topic 2:
john name foxnew have great inform militari japan men mccain 



## TF-IDF

In [13]:

def bows2tfidf(bows):
    tfidf = gensim.models.TfidfModel(bows)
    corpus_tfidf = tfidf[bows]
    
    return corpus_tfidf

corpus_tfidf = bows2tfidf(bows)



In [14]:
type(corpus_tfidf)

gensim.interfaces.TransformedCorpus

In [15]:
lda_model_tfidf = gensim.models.LdaModel(corpus_tfidf,
                                   num_topics=3,
                                   id2word=dictionary,
                                   passes=2)

In [16]:
for index, topic in lda_model_tfidf.print_topics(-1):
    print('Index {}: topic= {}'.format(index, topic))

Index 0: topic= 0.040*"russia" + 0.017*"brennan" + 0.017*"turn" + 0.017*"kilmead" + 0.017*"comey" + 0.015*"john" + 0.014*"campaign" + 0.014*"clear" + 0.014*"conspir" + 0.014*"congressman"
Index 1: topic= 0.013*"obstruct" + 0.013*"fight" + 0.013*"accus" + 0.012*"horrend" + 0.012*"fals" + 0.012*"phoni" + 0.011*"sit" + 0.011*"shouldn" + 0.011*"exist" + 0.011*"crime"
Index 2: topic= 0.024*"fight" + 0.024*"obstruct" + 0.017*"number" + 0.016*"name" + 0.016*"charg" + 0.016*"bring" + 0.016*"harass" + 0.016*"presidenti" + 0.015*"shouldn" + 0.015*"sit"


In [46]:
for index, topic in lda_model_tfidf.show_topics(-1, formatted=False):
    print('Topic {}: '.format(index))
    
    for word, weight in topic:
        print('{} '.format(word), end='')
#         print('{} '.format(word))

    print('\n')

Topic 0: 
russia brennan turn kilmead comey john campaign clear conspir congressman 

Topic 1: 
obstruct fight accus horrend fals phoni sit shouldn exist crime 

Topic 2: 
fight obstruct number name charg bring harass presidenti shouldn sit 

