### Preliminaries
Source: https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730

In [27]:
import pandas as pd
import nltk

In [28]:
# Load the tweet export and keep only the columns this analysis relies on.
# read_csv already returns a DataFrame, so no extra constructor is needed.
df = pd.read_csv('Data/trump_climate_tweets.csv')[
    ['text', 'created_at', 'retweet_count', 'favorite_count', 'is_retweet']
]

# The tweet text column is the corpus handed to the vectorizers below.
documents = df.loc[:, 'text']

### Data Preprocessing

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000

# NMF is able to use tf-idf weighted features.
# The fitted TF-IDF vectorizer gets its own name: the CountVectorizer cell
# assigns `tf_vectorizer`, which would otherwise replace this object and make
# it impossible to transform new documents for the NMF model later on.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=no_features,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is normalised to a plain list so downstream indexing is unchanged.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [42]:
# LDA is a probabilistic graphical model, so it is fit on raw term counts
# rather than on tf-idf weights.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is normalised to a plain list so downstream indexing is unchanged.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

### NMF and LDA with Scikit Learn

In [43]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# --- NMF on the tf-idf matrix ---
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replaced by alpha_W / alpha_H, which are scaled
# differently) -- confirm the scikit-learn version this notebook is pinned to.
nmf = NMF(
    n_components=no_topics,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=1,
)
# fit() hands back the estimator itself; rebinding keeps the cell output-free.
nmf = nmf.fit(tfidf)

# --- LDA on the raw term-count matrix ---
lda = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    learning_offset=50,
    max_iter=5,
    random_state=0,
)
lda = lda.fit(tf)

### Displaying and Evaluating Topics

In [44]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line holding the top terms separated by spaces,
    strongest first.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row must provide ``argsort()``
        (presumably a NumPy array of term weights -- verify for new models).
    feature_names : sequence of str
        Indexed by the column positions returned from ``argsort()``.
    no_top_words : int
        How many terms to print per topic.
    """
    for index, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in strongest_first]
        print("Topic %d:" % index)
        print(" ".join(top_terms))

In [46]:
# Number of terms listed per topic in the printed summaries.
no_top_words = 10

# Top terms for each NMF topic (fit on the tf-idf features).
display_topics(
    model=nmf,
    feature_names=tfidf_feature_names,
    no_top_words=no_top_words,
)

Topic 0:
new york freezing snowing 32º need really inches weekend snowed
Topic 1:
change changed working climate wasn just cold concept term constantly
Topic 2:
record temperatures setting massive snow cold wow country low degrees
Topic 3:
air clean expensive hoax focused beautiful healthy total bullshit distracted
Topic 4:
coldest march recorded history like 1996 looks having month long
Topic 5:
work didn change anymore climate words let people wiseguys fighting
Topic 6:
hell need outside freezing fast world july 32º record late
Topic 7:
data emails manipulated http science based faulty leaked 5beahats proven
Topic 8:
caused nuclear weapons incompetent crazy hands leaders concerned weak leader
Topic 9:
nyc snowing started springtime going happened amp just freezing hell
Topic 10:
lebanon jerusalem snowing zero flow fat faulty fear feyahq7lgb fictional
Topic 11:
china destroy competitiveness factories amp prepare nonexistent thrilled order happy
Topic 12:
country freezing weather bruta

In [47]:
# Top terms for each LDA topic (fit on the raw term counts).
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)

Topic 0:
air temperatures clean total caps record beautiful change changed expensive
Topic 1:
amp china concept people air competitive obamacare spending beautiful bullshit
Topic 2:
data emails http manipulated believe scientists proven united coldest anymore
Topic 3:
change area climate working anymore united snow hoaxsters years known
Topic 4:
freezing york new need country hell money wasting fast size
Topic 5:
really stupid believe jonathan gruber wasn parts emails fight climate
Topic 6:
20 bad worse years happened data bullshit obamacare wasn freezing
Topic 7:
don having hoaxsters want believe didn freezing article hell changing
Topic 8:
snowing really money use competitiveness cold area nyc cont freeze
Topic 9:
ship massive air record country changing weather ice focused weapons
Topic 10:
factories fight competitiveness let destroy china manufacturing amp scientists climate
Topic 11:
obama york new coldest freezing years major 50 recorded called
Topic 12:
record cold country setti