In [1]:
import pandas as pd
import gensim
import nltk

# Fetch the WordNet corpus; the WordNetLemmatizer used in preprocess_docs
# below looks words up in it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# filename = 'abcnews-date-text.csv'
filename = 'twitter_trump_2019_05.csv'
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is the direct replacement for
# `error_bad_lines=False`: malformed rows are skipped and reported
# instead of aborting the load.
raw_docs = pd.read_csv(filename, on_bad_lines='warn')

In [27]:
# Number of rows loaded from the CSV.
len(raw_docs)

677

In [3]:
# Preview the first rows to check the columns that were parsed.
raw_docs.head()

Unnamed: 0,text,created_at,id_str
0,Robert Mueller came to the Oval Office (along ...,05-30-2019 15:34:11,1134120831389392896
1,“Comey and Brennan are turning on each other.”...,05-30-2019 14:41:24,1134107544681455616
2,Congressman John Ratcliffe “The Trump Campaign...,05-30-2019 13:41:43,1134092525218590721
3,Russia Russia Russia! That’s all you heard at ...,05-30-2019 11:57:47,1134066371510378501
4,....say he fought back against this phony crim...,05-30-2019 11:57:47,1134066372584062976


In [4]:
# Peek at the first 10 raw texts; this is the column preprocess_docs consumes.
raw_docs['text'][:10]

0    Robert Mueller came to the Oval Office (along ...
1    “Comey and Brennan are turning on each other.”...
2    Congressman John Ratcliffe “The Trump Campaign...
3    Russia Russia Russia! That’s all you heard at ...
4    ....say he fought back against this phony crim...
5    Russia Russia Russia! That’s all you heard at ...
6    ....say he fought back against this phony crim...
7    The Greatest Presidential Harassment in histor...
8    I was not informed about anything having to do...
9    Great show tonight @seanhannity you really get...
Name: text, dtype: object

In [28]:
import pdb

def preprocess_docs(raw_docs, num_docs=None):
    """Turn the ``text`` column of ``raw_docs`` into lists of processed tokens.

    Each text is normalized and tokenized with
    ``gensim.utils.simple_preprocess``. Tokens found in gensim's
    ``STOPWORDS`` are dropped; every remaining token is lemmatized as a
    verb (``WordNetLemmatizer``) and then stemmed (English
    ``SnowballStemmer``).

    Args:
        raw_docs: pandas.DataFrame with a ``text`` column.
        num_docs: number of leading rows to process. ``None`` (the
            default) processes only the first 10 rows.

    Returns:
        list(list(str)): one list of processed tokens per processed row,
        in row order. A row whose ``text`` is not a string (e.g. NaN for a
        missing value) yields an empty list, so positions stay aligned
        with the rows of ``raw_docs``.
    """
    if num_docs is None:
        num_docs = 10

    stemmer = nltk.stem.SnowballStemmer('english')
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Looked up once here instead of once per token.
    stopwords = gensim.parsing.preprocessing.STOPWORDS

    docs = []
    for text in raw_docs['text'].iloc[:num_docs]:
        # pandas represents missing text as a float NaN, which the
        # tokenizer cannot handle; keep the slot but leave it empty.
        if not isinstance(text, str):
            docs.append([])
            continue

        tokens = gensim.utils.simple_preprocess(text)
        # Drop stop words, then lemmatize before stemming.
        docs.append([
            stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in tokens
            if token not in stopwords
        ])

    return docs


# num_docs is left at its default, so only the first 10 rows are preprocessed.
docs = preprocess_docs(raw_docs)

In [29]:
# Sanity check: how many documents were processed, then their token lists.
print(len(docs))
print(docs)

10
[['robert', 'mueller', 'come', 'oval', 'offic', 'potenti', 'candid', 'seek', 'name', 'director', 'fbi', 'posit', 'year', 'tell', 'day', 'name', 'special', 'counsel', 'total', 'conflict', 'nice'], ['comey', 'brennan', 'turn', 'kilmead'], ['congressman', 'john', 'ratcliff', 'trump', 'campaign', 'clear', 'conspir', 'collud', 'foxnew'], ['russia', 'russia', 'russia', 'hear', 'begin', 'witch', 'hunt', 'hoax', 'russia', 'disappear', 'russia', 'help', 'elect', 'crime', 'exist', 'dem', 'partner', 'fake', 'news', 'media'], ['fight', 'phoni', 'crime', 'exist', 'horrend', 'fals', 'accus', 'shouldn', 'fight', 'sit', 'obstruct', 'mueller', 'obstruct', 'presidenti', 'harass'], ['russia', 'russia', 'russia', 'hear', 'begin', 'witch', 'hunt', 'hoax', 'russia', 'disappear', 'russia', 'help', 'elect', 'crime', 'exist', 'dem', 'partner', 'fake', 'news', 'media'], ['fight', 'phoni', 'crime', 'exist', 'horrend', 'fals', 'acquisit', 'shouldn', 'fight', 'sit', 'obstruct', 'mueller', 'obstruct', 'president

In [30]:
def create_dictionary(docs):
    """Build a gensim ``Dictionary`` from tokenized documents.

    Args:
        docs: token lists, one per document (as returned by
            ``preprocess_docs``).

    Returns:
        The ``gensim.corpora.Dictionary`` constructed from ``docs``.
    """
    vocabulary = gensim.corpora.Dictionary(docs)
    return vocabulary

# Vocabulary built from the preprocessed documents; reused below for the
# bag-of-words conversion and as id2word for both LDA models.
dictionary = create_dictionary(docs)

In [31]:
# Convert every token list into its bag-of-words form: (token_id, count) pairs.
bows = [dictionary.doc2bow(doc) for doc in docs]

In [32]:
# Inspect the bag-of-words vector of the first document.
bows[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 2),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1)]

In [37]:
# LDA training is stochastic; pinning random_state makes the fitted topics
# reproducible across kernel restarts.
lda_model_bow = gensim.models.LdaModel(
    bows,
    num_topics=3,
    id2word=dictionary,
    passes=2,
    random_state=42,
)

In [38]:
# Print every topic of the bag-of-words model (-1 asks for all topics).
for topic_id, topic_terms in lda_model_bow.print_topics(num_topics=-1):
    print('index {}: topic= {}'.format(topic_id, topic_terms))

index 0: topic= 0.033*"name" + 0.022*"robert" + 0.022*"john" + 0.021*"year" + 0.021*"congressman" + 0.021*"conspir" + 0.021*"campaign" + 0.021*"trump" + 0.021*"foxnew" + 0.021*"ratcliff"
index 1: topic= 0.029*"number" + 0.028*"have" + 0.028*"great" + 0.025*"bring" + 0.024*"charg" + 0.018*"tell" + 0.017*"conflict" + 0.017*"foxnew" + 0.017*"congrat" + 0.017*"tonight"
index 2: topic= 0.096*"russia" + 0.040*"obstruct" + 0.040*"exist" + 0.040*"crime" + 0.040*"fight" + 0.022*"harass" + 0.022*"presidenti" + 0.022*"mueller" + 0.022*"disappear" + 0.022*"elect"


## TF-IDF

In [42]:

def bows2tfidf(bows):
    """Fit a TF-IDF model on ``bows`` and apply it to that same corpus.

    Args:
        bows: bag-of-words corpus, one vector per document.

    Returns:
        The result of indexing the fitted ``gensim.models.TfidfModel``
        with ``bows``, i.e. the TF-IDF weighted view of the corpus.
    """
    return gensim.models.TfidfModel(bows)[bows]

# TF-IDF weighted version of the same bag-of-words corpus.
corpus_tfidf = bows2tfidf(bows)



In [43]:
# Check what kind of object the TF-IDF transformation returned.
type(corpus_tfidf)

gensim.interfaces.TransformedCorpus

In [44]:
# LDA training is stochastic; pinning random_state makes the fitted topics
# reproducible across kernel restarts.
lda_model_tfidf = gensim.models.LdaModel(
    corpus_tfidf,
    num_topics=3,
    id2word=dictionary,
    passes=2,
    random_state=42,
)

In [45]:
# Print every topic of the TF-IDF model (-1 asks for all topics).
for topic_id, topic_terms in lda_model_tfidf.print_topics(num_topics=-1):
    print('Index {}: topic= {}'.format(topic_id, topic_terms))

Index 0: topic= 0.015*"ship" + 0.015*"job" + 0.015*"men" + 0.015*"recent" + 0.015*"love" + 0.015*"militari" + 0.015*"visit" + 0.015*"spectacular" + 0.015*"mccain" + 0.015*"flotus"
Index 1: topic= 0.030*"fight" + 0.030*"obstruct" + 0.019*"fals" + 0.019*"horrend" + 0.019*"shouldn" + 0.019*"phoni" + 0.019*"sit" + 0.017*"name" + 0.016*"presidenti" + 0.016*"harass"
Index 2: topic= 0.038*"russia" + 0.017*"number" + 0.016*"comey" + 0.016*"kilmead" + 0.016*"turn" + 0.016*"brennan" + 0.015*"bring" + 0.015*"charg" + 0.015*"foxnew" + 0.013*"collud"
