In [1]:
import pyspark, pickle
from pyspark import SparkContext
from pyspark.sql.functions import countDistinct, regexp_replace, monotonically_increasing_id, lit
from pyspark.storagelevel import StorageLevel
import pandas as pd
import numpy as np
from pyspark.ml.feature import CountVectorizer, StopWordsRemover, RegexTokenizer
from pyspark.ml.clustering import LDA, LocalLDAModel

from nltk.corpus import stopwords
import nltk, re

from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

pd.options.display.max_colwidth = -1



In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Helper functions for LDA analysis

In [3]:
def com_lda(df, community_number, k):
    """
    This function performs LDA on a given Twitter community with number of topics set to k.
    
    df: spark dataframe of tweet data
    community_number: ID number for a community
    k: number of topics
    returns: model vocabulary, count vectorized tweets, and model object
    """
    com_df = df.filter('community = {0}'.format(community_number))
    temp = pipeline.fit(com_df)
    vocab = temp.stages[2].vocabulary
    com_df_features = temp.transform(com_df)
    
    lda = LDA(k=k, maxIter=100, optimizer='online')
    model = lda.fit(com_df_features)
    
    return vocab, com_df_features, model

def print_top_words(model, vocab):
    """
    Prints the highest weighted words for each topic in a given LDA model.
    """
    top_words = model.describeTopics().rdd.map(lambda x: x['termIndices']).collect()
    
    for i, topic in enumerate(top_words):
        print('Topic {0}:'.format(i), end=' ')
        for index in topic:
            print(vocab[index], end=', ')
        print('\n')
        
def get_top_tweet_ids(model, df):
    """
    Uses the dot product of topic-by-word vectors and tweet-by-word-count vectors to score each tweet's
    relevance to an LDA topic.
    returns: 2D array of tweets for each LDA topic, sorted in order of relevance descending
    """
    ids = []
    m = model.topicsMatrix().toArray()
    n_topics = m.shape[1]
    for col in range(n_topics):
        topic = m[:, col]
        tweet_scores = df.rdd.map(lambda x: (x['features'].dot(topic), x['tweet_id'])).collect()
        ids.extend([tweet_id for score, tweet_id in sorted(tweet_scores, reverse=True)])
    res = np.array(ids).reshape(n_topics, -1)
    return res

# Load tweet data and community data

In [3]:
tweets = spark.read.parquet('tweets_all.parquet')
coms = spark.read.parquet('communities.parquet')

In [6]:
# Remove URLs and @mentions from tweet text
# Use only tweets that were written originals, not retweets or quotes
lda_tweets = tweets.filter('retweet_id is null and quote_id is null').select('tweet_id', 'screen_name', 'name',
                  regexp_replace('text', r'(https?://[^ ,]+)|(@[\w_]+)', '').alias('text')) \
    .join(coms, 'screen_name', 'inner')
lda_tweets.persist(StorageLevel.MEMORY_AND_DISK)
lda_tweets.registerTempTable('lda_tweets')

In [7]:
lda_tweets.show()

+---------------+------------------+--------------------+--------------------+---------+
|    screen_name|          tweet_id|                name|                text|community|
+---------------+------------------+--------------------+--------------------+---------+
| iansomerhalder|228120301790167040|     Ian Somerhalder|KIDS,TEENS,PARENT...|       45|
|realDonaldTrump|435741126440808448|     Donald J. Trump|":   Whether Glob...|        9|
|  YEARSofLIVING|655409401533218816|               YEARS|5 ways to reduce ...|       10|
|  powerglobalus|657771285968109568|         Powerglobal|Progressive expec...|        3|
|  powerglobalus|682330791427194882|         Powerglobal|UN Climate Change...|        3|
|  sydneysleroux|709818869368688641|              andrea|glad to see flori...|        0|
|      WorldBank|729761681723772930|          World Bank|#Climatechange + ...|        4|
|       KottieCB|774609086415511552|    K Christie-Blick|Teaching ideas fo...|       10|
|           Esri|7794

In [8]:
# How many users per community?
spark.sql("""
    select community, count(distinct screen_name) as n_nodes
    from lda_tweets
    where community is not null
    group by community
    order by n_nodes desc
    limit 10
""").show()

+---------+-------+
|community|n_nodes|
+---------+-------+
|        3|  13434|
|       10|  11534|
|        9|   9583|
|        2|   7586|
|       11|   6734|
|        4|   5506|
|       18|   4354|
|       21|   2893|
|       28|   2588|
|       20|   2487|
+---------+-------+



# Set parameters for text preprocessing

In [9]:
tweet_stopwords = stopwords.words('english') + \
    ['rt', 'climate', 'change', 'global', 'warming', 'climatechange', 'climate', 'globalwarming', 'https', 'http',
        'amp', 'via', 'one', 'around', 'would', 'let', 'could', 'going', 'like', 'get', 'may', 'says', 'say', 'make',
        'based', 'even', 'another', 'completely', 'thanks', 'way', 'find', 'used', 'thing', '2017', 'see', 'need',
        'know', 'global-warming', 'climate-change', 'knows', 'think', 'thinks', 'take', 'new', 'day', 'days']

In [10]:
# Create regex tokenizer that is useful for Twitter data (preserves emoticons, hashtags, etc.)
# I used code from here, with some modifications: https://github.com/adonoho/TweetTokenizers/blob/master/PottsTweetTokenizer.py

pattern = r"""(?:\[link\])|(?:(?:\+?[01][\-\s.]*)?(?:[\(]?\d{3}[\-\s.\)]*)?\d{3}[\-\s.]*\d{4})|(?:(?<= )[<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP/\:\}\{@\|\\]|[\)\]\(\[dDpP/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)|(<[^>]+>)|(?:@[\w_]+)|(?:["\'][a-z0-9/-]+["\'])|(?:[a-z][a-z\-_]+[a-z])|(?:[+\-]?\d+[,/.:-]\d+[+\-]?)|(?:[\w_]+)"""

word_re = re.compile(pattern, re.VERBOSE | re.I | re.UNICODE)

In [11]:
# Tokenize tweets
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", gaps=False, pattern=word_re.pattern,
                              minTokenLength = 2)

# Remove stopwords
stp_rmv = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='new_tokens',
                           stopWords=tweet_stopwords)

# Count occurences of words
cnvk = CountVectorizer(inputCol=stp_rmv.getOutputCol(), outputCol='features', vocabSize=10000)

# Pipeline
pipeline = Pipeline(stages=[tokenizer, stp_rmv, cnvk])

# Run LDA for each community, with n_topics ranging from k=6 to k=8

In [None]:
# Run LDA and save a list with all of the models

communities = [3,4,2,18,12,28]
vocab_list = []
df_features_list = []
model_list = []
ks = range(6,9)
for community in communities:
    for k in ks:
        vocab, com_df_features, model = com_lda(lda_tweets, community, k)
        vocab_list.append(vocab)
        df_features_list.append(com_df_features)
        model_list.append(model)

# Choose model for Com 28

In [102]:
# Separate out Com 28 models
vocab28 = vocab_list[15:18]
df_features28 = df_features_list[15:18]
model28 = model_list[15:18]

In [103]:
# View top words for each k
for model, vocab in zip(model28, vocab28):
    print('Top Words for Community 12')
    print_top_words(model, vocab)
    print('\n')

Top Words for Community 12
Topic 0: solar, action, energy, join, help, risk, uk, business, renewables, water, 

Topic 1: latest, daily, news, environment, due, green, scientists, ocean, protect, forests, 

Topic 2: trump, us, budget, science, donald, environment, scientists, news, research, epa, 

Topic 3: earth, time, weather, people, world, still, deniers, oceans, well, first, 

Topic 4: carbon, epa, pruitt, scott, dioxide, plan, energy, head, science, arctic, 

Topic 5: great, shell, oil, 1991, trump, scientists, us, impacts, already, film, 



Top Words for Community 12
Topic 0: shell, oil, 1991, knew, film, danger, impact, giant, court, warned, 

Topic 1: latest, daily, due, ocean, us, people, green, environment, uk, forests, 

Topic 2: trump, us, scientists, environment, world, budget, research, news, donald, epa, 

Topic 3: scientists, great, dead, australia, reef, large, sections, also, women, humans, 

Topic 4: carbon, energy, epa, pruitt, scott, action, dioxide, solar, chief,

In [104]:
# Get Com 28 k=7 top tweets
top28_7 = get_top_tweet_ids(model28[1], df_features28[1])

In [114]:
# Best tweets
lda_tweets.filter('tweet_id in {0}'.format(tuple(top28_7[3,:10]))).select('screen_name', 'community', 'text') \
                    .show(40, truncate=False)

+-------------+---------+-------------------------------------------------------------------------------------------------------------+
|screen_name  |community|text                                                                                                         |
+-------------+---------+-------------------------------------------------------------------------------------------------------------+
|GomeKarim    |28       |Top story: Large Sections of Australia’s Great Reef Are Now Dead, Scientists Fi… , see more                  |
|Worldnews_top|28       |Large Sections of Australiaâs Great Reef Are Now Dead, Scientists Find  #worldnews #news #breakingnews     |
|BiancaJagger |28       |Devastating news RT Large Sections of Australia’s Great Reef Are Now Dead, Scientists Find                   |
|MandalaCosmos|28       |#SaveEarth =&gt; Large Sections of Australia’s Great Reef Are Now Dead, Scientists Find - The New York Times |
|adamspector2 |28       |Large Sections of Austr

In [111]:
# Save best Com 28 models
com28_model = model28[1]
com28_df = df_features28[1]
com28_vocab = vocab28[1]

In [187]:
com28_model.estimatedDocConcentration()

DenseVector([0.0838, 0.0914, 0.182, 0.0777, 0.1355, 0.0934, 0.0955])

In [181]:
# Save all model data
com28_model.save('./lda_data/com28_lda_model') # Load with LocalLDAModel.load('lda_data/com28_lda_model')
with open('./lda_data/com28_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com28_vocab, pklfile)
com28_df.write.parquet('./lda_data/com28_df.parquet')

# Choose model for Com 12 (Canadians!)

In [88]:
# Separate out Com 12 models
vocab12 = vocab_list[12:15]
df_features12 = df_features_list[12:15]
model12 = model_list[12:15]

In [89]:
# View top words for each k
for model, vocab in zip(model12, vocab12):
    print('Top Words for Community 12')
    print_top_words(model, vocab)
    print('\n')

Top Words for Community 12
Topic 0: epa, carbon, chief, co2, pruitt, scott, dioxide, science, energy, cause, 

Topic 1: science, house, daily, canada, white, talk, latest, fossil, budget, trump, 

Topic 2: trump, communities, adapt, canada, administration, fund, inuit, knowledge, great, archive, 

Topic 3: work, planet, permafrost, deniers, science, canada, release, carbon, cdnpoli, read, 

Topic 4: study, help, world, action, weather, daily, meet, tillerson, cdnpoli, latest, 

Topic 5: trump, cdnpoli, science, canada, environment, us, oil, news, people, shell, 



Top Words for Community 12
Topic 0: energy, world, earth, africa, education, meet, power, melting, green, battle, 

Topic 1: great, daily, canada, latest, study, carbon, scientists, help, plan, action, 

Topic 2: years, planet, water, ago, women, work, case, read, youth, stop, 

Topic 3: cdnpoli, canada, carbon, news, future, record, permafrost, tax, rising, show, 

Topic 4: canada, communities, adapt, cdnpoli, fund, inuit, 

In [90]:
# Get Com 12 k=8 top tweets
top12_8 = get_top_tweet_ids(model12[2], df_features12[2])

In [99]:
# Best tweets
lda_tweets.filter('tweet_id in {0}'.format(tuple(top12_8[7,:10]))).select('screen_name', 'community', 'text') \
                    .show(40, truncate=False)

+---------------+---------+----------------------------------------------------------------------------------------------------------------------------------------+
|screen_name    |community|text                                                                                                                                    |
+---------------+---------+----------------------------------------------------------------------------------------------------------------------------------------+
|environmentguru|12       |EPA head Scott Pruitt denies that carbon dioxide causes global warming: Scott Pruitt…                                                   |
|bluemassgroup  |12       |(202-564-4700) =&gt; Scott Pruitt’s office deluged with angry callers after he questions the science of global warming                  |
|regwhit1       |12       |Scott Pruitt’s office deluged with angry callers after he questions the science of global warming                                       |
|Sciencesh

In [100]:
# Save best Com 12 models
com12_model = model12[2]
com12_df = df_features12[2]
com12_vocab = vocab12[2]

In [101]:
com12_model.estimatedDocConcentration()

DenseVector([0.0855, 0.0826, 0.0928, 0.0711, 0.0862, 0.0907, 0.1193, 0.1319])

In [182]:
# Save all model data
com12_model.save('./lda_data/com12_lda_model')
with open('./lda_data/com12_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com12_vocab, pklfile)
com12_df.write.parquet('./lda_data/com12_df.parquet')

# Choose model for Com 3

In [31]:
# Separate out Com 3 models
vocab3 = vocab_list[:3]
df_features3 = df_features_list[:3]
model3 = model_list[:3]

In [32]:
# View top words for each k
for model, vocab in zip(model3, vocab3):
    print('Top Words for Community 3')
    print_top_words(model, vocab)
    print('\n')

Top Words for Community 3
Topic 0: scientists, trump, snow, news, caused, weather, right, fake, real, believe, 

Topic 1: epa, carbon, trump, science, pruitt, chief, weather, scott, 21, dioxide, 

Topic 2: science, hoax, security, ice, models, national, threat, mattis, sea, story, 

Topic 3: trump, money, science, research, bill, waste, director, nye, budget, scam, 

Topic 4: gore, al, man-made, trump, course, deniers, bernie, golf, sanders, eric, 

Topic 5: science, consensus, 97, talk, epa, cut, lie, blog, fake, house, 



Top Words for Community 3
Topic 0: scientists, trump, science, bill, believe, hoax, real, nye, news, obama, 

Topic 1: trump, epa, pruitt, scott, chief, 21, us, great, cop, programs, 

Topic 2: weather, science, models, security, story, national, claims, mattis, threat, experts, 

Topic 3: money, trump, research, waste, director, consensus, budget, scam, talk, 97, 

Topic 4: trump, course, denier, golf, narrative, millions, protesters, deny, snow, look, 

Topic 5: 

In [None]:
# Get Com 3 k=8 top tweets
top3_8 = get_top_tweet_ids(model_list[2], df_features_list[2])

In [None]:
# Save best Com 3 models
com3_model = model_list[2]
com3_df = df_features_list[2]
com3_vocab = vocab_list[2]

In [49]:
com3_model.estimatedDocConcentration()

DenseVector([0.1275, 0.1524, 0.0946, 0.1115, 0.0813, 0.0871, 0.1008, 0.0929])

In [183]:
# Save all model data
com3_model.save('./lda_data/com3_lda_model')
with open('./lda_data/com3_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com3_vocab, pklfile)
com3_df.write.parquet('./lda_data/com3_df.parquet')

# Choose model for Com 4

In [39]:
# Separate out Com 4 models
vocab4 = vocab_list[3:6]
df_features4 = df_features_list[3:6]
model4 = model_list[3:6]

In [41]:
# View top words for each k
for model, vocab in zip(model4, vocab4):
    print('Top Words for Community 4')
    print_top_words(model, vocab)
    print('\n')

Top Words for Community 4
Topic 0: action, cities, news, cop, last, de, taking, nature, la, 22, 

Topic 1: women, latest, us, daily, 4climate, fight, join, world, impact, impacts, 

Topic 2: threat, children, arctic, energy, agriculture, world, years, ice, food, environment, 

Topic 3: trump, science, epa, carbon, pruitt, chief, energy, budget, scott, environmental, 

Topic 4: terra, forming, solar, long, hoax, di, power, back, g20, south, 

Topic 5: gt, adaptation, scientists, climateaction, resilience, great, women, sustainable, health, australia, 



Top Words for Community 4
Topic 0: epa, scientists, great, trump, us, chief, carbon, pruitt, science, dioxide, 

Topic 1: women, latest, daily, 4climate, fight, water, join, us, action, world, 

Topic 2: energy, de, la, tillerson, video, storage, farmers, zimbabwe, el, en, 

Topic 3: trump, adaptation, health, cop, news, science, budget, green, national, policy, 

Topic 4: weather, cities, report, trump, gender, meeting, mayor, live, sm

In [42]:
# Get Com 4 k=8 top tweets
top4_8 = get_top_tweet_ids(model4[2], df_features4[2])

In [44]:
# Best tweets
lda_tweets.filter('tweet_id in {0}'.format(tuple(top4_8[1,:10]))).select('screen_name', 'community', 'text') \
                    .show(40, truncate=False)

+--------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------+
|screen_name   |community|text                                                                                                                                            |
+--------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------+
|c40cities     |4        |Mayors &amp; global leaders are gathering in NY today to highlight the critical role of women in the fight against climate change #Women4Climate|
|AnnickNicky   |4        |Understanding the significant role of women in the fight against climate change is crucial, not just for women #women4climate #climateaction    |
|Anne_Hidalgo  |4        |Women are more than half of humanity and the  showed that women are more vulnerable to climate change disasters. #

In [None]:
# Save best Com 4 models
com4_model = model4[2]
com4_df = df_features4[2]
com4_vocab = vocab4[2]

In [50]:
com4_model.estimatedDocConcentration()

DenseVector([0.12, 0.2039, 0.0882, 0.0846, 0.0869, 0.0827, 0.0933, 0.0782])

In [184]:
# Save all model data
com4_model.save('./lda_data/com4_lda_model')
with open('./lda_data/com4_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com4_vocab, pklfile)
com4_df.write.parquet('./lda_data/com4_df.parquet')

# Choose model for Com 18

In [69]:
# Separate out Com 18 models
vocab18 = vocab_list[9:12]
df_features18 = df_features_list[9:12]
model18 = model_list[9:12]

In [70]:
# View top words for each k
for model, vocab in zip(model18, vocab18):
    print('Top Words for Community 18')
    print_top_words(model, vocab)
    print('\n')

Top Words for Community 18
Topic 0: epa, pruitt, scott, carbon, scientists, co2, chief, dioxide, denial, primary, 

Topic 1: science, trump, budget, latest, cut, research, real, house, cuts, us, 

Topic 2: exxon, tillerson, money, work, read, alias, believe, rex, email, blog, 

Topic 3: trump, water, energy, us, california, humans, paris, administration, future, agreement, 

Topic 4: great, reef, australia, scientists, large, dead, barrier, sections, peopleandnaturespeakerseries, temperature, 

Topic 5: york, robinson, 2140, stanley, novel, kim, amy, brady, interviews, book, 



Top Words for Community 18
Topic 0: pruitt, epa, scott, carbon, science, research, co2, chief, scientists, dioxide, 

Topic 1: science, trump, great, scientists, latest, epa, reef, cuts, arctic, budget, 

Topic 2: trump, people, us, science, security, energy, national, defense, secretary, challenge, 

Topic 3: york, robinson, 2140, stanley, oil, novel, kim, scientists, amy, brady, 

Topic 4: science, house, cut

In [None]:
# Get Com 18 k=8
top18_8 = get_top_tweet_ids(model18[2], df_features18[2])

In [84]:
# Best tweets
lda_tweets.filter('tweet_id in {0}'.format(tuple(top18_8[2,:10]))).select('screen_name', 'community', 'text') \
                    .show(100, truncate=False)

+---------------+---------+------------------------------------------------------------------------------------------------------------------------------------+
|screen_name    |community|text                                                                                                                                |
+---------------+---------+------------------------------------------------------------------------------------------------------------------------------------+
|alecola66      |18       |Sometime it is difficult to understand people. Sometime people make you feel very angry 😡 #Trump                                   |
|climatemedianet|18       |‘Shell knew’: oil giant's 1991 film warned of climate change danger | Environment | The Guardian                                    |
|BrianRJacobson |18       |‘Shell knew’: oil giant's 1991 film warned of climate change danger #energyhumanities                                               |
|guardianweekly |18       |‘Shell k

In [85]:
# Save best Com 18 models
com18_model = model18[2]
com18_df = df_features18[2]
com18_vocab = vocab18[2]

In [185]:
# Save all model data
com18_model.save('./lda_data/com18_lda_model')
with open('./lda_data/com18_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com18_vocab, pklfile)
com18_df.write.parquet('./lda_data/com18_df.parquet')

# Choose model for Com 2

In [47]:
# Separate out Com 2 models
vocab2 = vocab_list[6:9]
df_features2 = df_features_list[6:9]
model2 = model_list[6:9]

In [48]:
# View top words for each k
for model, vocab in zip(model2, vocab2):
    print('Top Words for Community 2')
    print_top_words(model, vocab)
    print('\n')

Top Words for Community 2
Topic 0: real, trump, us, people, believe, science, gop, world, hoax, water, 

Topic 1: epa, pruitt, scott, chief, tillerson, trump, exxon, science, email, carbon, 

Topic 2: trump, fight, republicans, scientists, 17, cuts, research, administration, join, high, 

Topic 3: trump, great, money, scientists, reef, environmental, dead, rules, curb, vehicle, 

Topic 4: bill, news, science, nye, tucker, carlson, early, guy, scientists, fox, 

Topic 5: trump, budget, science, house, cut, white, agency, leading, secretary, proposes, 



Top Words for Community 2
Topic 0: trump, believe, us, real, scientists, administration, want, science, everything, denial, 

Topic 1: epa, pruitt, scott, chief, tillerson, science, exxon, email, carbon, co2, 

Topic 2: trump, denying, bernie, sanders, cnn, state, leader, epa, storms, blast, 

Topic 3: trump, scientists, great, money, defense, reef, hoax, real, national, australia, 

Topic 4: bill, news, nye, guy, snow, science, tucker,

In [None]:
# Get Com 2 k=7
top2_7 = get_top_tweet_ids(model2[1], df_features2[1])

In [67]:
# Best tweets
lda_tweets.filter('tweet_id in {0}'.format(tuple(top2_7[1,:10]))).select('screen_name', 'community', 'text') \
                    .show(100, truncate=False)

+--------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------+
|screen_name   |community|text                                                                                                                                       |
+--------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------+
|lkherman      |2        |The EPA's Scott Pruitt says carbon dioxide isn't the main cause of global warming, but the EPA website says so.…                           |
|MariaShea55   |2        |Science! It's real! 
EPA chief Scott Pruitt says carbon dioxide is not a primary contributor to global warming                             |
|3X09          |2        |Can't we take "Scott Pruitt" to court and present evidence of climate change to make him enforce the EPA regulations? #pruitt #perjury #epa

In [68]:
# Save best Com 2 models
com2_model = model2[1]
com2_df = df_features2[1]
com2_vocab = vocab2[1]

In [186]:
# Save all model data
com2_model.save('./lda_data/com2_lda_model')
with open('./lda_data/com2_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com2_vocab, pklfile)
com2_df.write.parquet('./lda_data/com2_df.parquet')

# Choose model for Com 10

In [60]:
# Run community 10
com10_vocab, com10_df, com10_model = com_lda(lda_tweets, 10, 10)

print_top_words(com10_model, com10_vocab)

Topic 0: shell, 1991, oil, fossil, film, fuel, knew, sustainability, denial, environment, 

Topic 1: trump, energy, p2, latest, daily, tcot, environment, coal, 2a, us, 

Topic 2: tillerson, exxon, us, years, alias, email, impacts, world, already, itstimetochange, 

Topic 3: scientists, great, australia, reef, barrier, save, time, summer, auspol, money, 

Topic 4: canada, show, permafrost, people, bill, rules, trump, carbon, huge, science, 

Topic 5: making, oil, us, warn, doctors, investors, risk, comes, first, food, 

Topic 6: ice, sea, arctic, record, ocean, itstimetochange, news, join, world, antarctica, 

Topic 7: house, trump, science, budget, white, cut, agency, leading, administration, post, 

Topic 8: republicans, fight, planet, believe, 17, americans, facts, health, march, happening, 

Topic 9: epa, pruitt, trump, chief, scott, carbon, science, dioxide, co2, head, 



In [None]:
top10_10 = get_top_tweet_ids(com10_model, com10_df)

In [22]:
# Best tweets 
lda_tweets.filter('tweet_id in {0}'.format(tuple(top_tweets[0,:20]))).select('screen_name', 'community', 'text') \
                    .show(truncate=False)

+---------------+---------+------------------------------------------------------------------------------------------------------------------------------------------+
|screen_name    |community|text                                                                                                                                      |
+---------------+---------+------------------------------------------------------------------------------------------------------------------------------------------+
|symphorians1   |10       |Scott Pruitt is unfit to head the EPA. Via : EPA Chief Scott Pruitt Questions Basic Facts About Climate Change                            |
|boinerz        |10       |EPA chief Pruitt says carbon dioxide is not to blame for global warming  SLEAZEWHORE EPA Pruitt EVIL                                      |
|EdySemaan      |10       |Trump's EPA head Scott Pruitt denies that CO2 causes global warming. 
Trump's EPA head Scott Pruitt denies that CO2 causes global warming.

In [204]:
# Save all model data
com10_model.save('./lda_data/com10_lda_model')
with open('./lda_data/com10_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com10_vocab, pklfile)
com10_df.write.parquet('./lda_data/com10_df.parquet')

# Choose model for Com 9

In [116]:
# Run community 9
community = 9
ks = range(6,9)
vocab9=[]
model9=[]
df_features9=[]
for k in ks:
    vocab, com_df_features, model = com_lda(lda_tweets, community, k)
    vocab9.append(vocab)
    df_features9.append(com_df_features)
    model9.append(model)
    print('Top Words for Community {0}, k = {1}'.format(community, k), '\n')
    print_top_words(model, vocab)
    print('\n')

Top Words for Community 9, k = 6 

Topic 0: co2, real, us, send, dummies, drives, copy, trump, scientists, environment, 

Topic 1: science, epa, pruitt, scott, co2, chief, carbon, matter, opinion, dioxide, 

Topic 2: science, budget, cut, trump, gt, house, white, tip, bill, home, 

Topic 3: trump, real, money, research, people, want, world, hoax, believe, still, 

Topic 4: great, due, reef, jobs, dead, scientists, australia, carbon, february, large, 

Topic 5: hope, time, years, weather, 7cc, watch, greenhouse, last, gas, carbon, 



Top Words for Community 9, k = 7 

Topic 0: co2, send, dummies, drives, copy, trump, resist, oil, scientists, great, 

Topic 1: science, epa, pruitt, co2, scott, carbon, matter, opinion, chief, dioxide, 

Topic 2: budget, trump, science, cut, bill, house, white, cuts, agency, paris, 

Topic 3: trump, world, real, money, earth, us, research, people, want, waste, 

Topic 4: soon, due, back, rights, 2010, explained, sleds, toboggans, isis, mark, 

Topic 5: en

In [None]:
# Get Com 9 k=8 top tweets
top9_8 = get_top_tweet_ids(model9[2], df_features9[2])

In [174]:
# Best tweets
lda_tweets.filter('tweet_id in {0}'.format(tuple(top9_8[1,:100]))).select('screen_name', 'community', 'text') \
                    .show(100, truncate=False)

+---------------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|screen_name    |community|text                                                                                                                                           |
+---------------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------+
|Tedderman1     |9        |EPA Chief Dipshit Scott Pruitt Says Carbon Dioxide is Not a Primary Contributor to Global Warming  #UniteBlue                                  |
|GarethsMom     |9        | - science isnt a conspiracy 
On climate change, Scott Pruitt contradicts the EPA’s own website
                                               |
|DaviesNow      |9        |FixIt: New EPA boss Scott refuses to accept scientific concensus on #climatechange #EPA #science                 

In [178]:
# Save best Com 28 models
com9_model = model9[2]
com9_df = df_features9[2]
com9_vocab = vocab9[2]

In [179]:
com9_model.estimatedDocConcentration()

DenseVector([0.181, 0.1451, 0.0928, 0.1167, 0.0731, 0.075, 0.0886, 0.0757])

In [180]:
# Save all model data
com9_model.save('./lda_data/com9_lda_model')
with open('./lda_data/com9_lda_vocab.pkl', 'wb') as pklfile:
    pickle.dump(com9_vocab, pklfile)
com9_df.write.parquet('./lda_data/com9_df.parquet')

# Save top 100 tweets for all topics and communities to one DF for export

In [239]:
top_tweet_ids_save = [top10_10, top3_8, top4_8, top2_7, top18_8, top12_8, top28_7, top9_8]
n_keep_tweets = 100

 # Create empty DF to save to
top_tweets_df = lda_tweets.filter('tweet_id = "nonexistent id"').withColumn('topic', lit(None))

for com in top_tweet_ids_save:
    for topic_num in range(com.shape[0]):
        top_tweets_df = top_tweets_df.unionAll(
                                lda_tweets.filter('tweet_id in {0}'.format(tuple(com[topic_num,:n_keep_tweets]))) \
                                    .withColumn('topic', lit(topic_num)) \
                                    .select('screen_name', 'tweet_id', 'name', 'text', 'community', 'topic'))

top_tweets_df.persist(StorageLevel.MEMORY_AND_DISK)

In [244]:
top_tweets_df.show()

+---------------+------------------+-------------------+--------------------+---------+-----+
|    screen_name|          tweet_id|               name|                text|community|topic|
+---------------+------------------+-------------------+--------------------+---------+-----+
|   daphnewysham|836999249405435904|      Daphne Wysham|‘Shell knew’: oil...|       10|    0|
|         Q_petr|837028930607583232|        Quercuspetr|‘Shell knew’: oil...|       10|    0|
|  science_ooyuz|839009313582641152|       Science News|'Shell Knew': Oil...|       10|    0|
|   chinorubeitp|837165600925499392|               Rube|RT : ‘Shell knew’...|       10|    0|
| CLIMATECHANGE8|836557843641716740|     CLIMATE CHANGE|'Shell knew': oil...|       10|    0|
|    RosalindCPH|837044753162203143|   Rosalind Pearson|‘Shell knew’: oil...|       10|    0|
|     AtorElabor|837121530366681089|      Paul Mittwoch|‘Shell knew’: oil...|       10|    0|
|   StephenDanna|837004038403055616|       StephenDanna|Wow.

In [245]:
top_tweets_pd = top_tweets_df.toPandas()  

In [248]:
with open('./lda_data/lda_top_tweets.pkl', 'wb') as pklfile:
    pickle.dump(top_tweets_pd, pklfile, protocol=2)