In [1]:
from utils import load_embedding, parse_tup, cos_sim, cos_dist
from tqdm.auto import tqdm
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd

In [2]:
# from utils import CANDIDATE_SUBS
TIME_FRAME = "monthly"
left_candidates = ["JoeBiden","SandersForPresident","BaemyKlobaechar","ElizabethWarren","Pete_Buttigieg","YangForPresidentHQ"]
right_candidates = ["The_Donald"]
mapping = {}
for a,b in zip(left_candidates+right_candidates,px.colors.qualitative.Plotly):
    mapping[a] = b

In [4]:
import findspark
findspark.init("/h/224/cameron/spark-3.0.0-preview2-bin-hadoop2.7")
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType,StringType

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.getConf().getAll()

cols = ["author","subreddit","body"]
comments = spark.read.load("/comments_2019.parquet").select(*cols)
comments = comments.where((comments['subreddit'].rlike("|".join(["(" + cand + ")" for cand in ["ElizabethWarren"]]))) 
                          & (comments['body'] != "[removed]")
                          & (comments['body'] != "[deleted]")
                          & (comments["author"] != "AutoModerator")
                          & (comments["author"] != "[deleted]")
                          & ~comments["body"].contains("www.reddit.com"))

comments.show()

+--------------------+---------------+--------------------+
|              author|      subreddit|                body|
+--------------------+---------------+--------------------+
|       flamethrower2|ElizabethWarren|According to Cohe...|
|       rieslingatkos|ElizabethWarren|> *Without the pr...|
|    metroidcomposite|ElizabethWarren|>Military interve...|
|       flamethrower2|ElizabethWarren|We know Rich Dad ...|
|             IlikeJG|ElizabethWarren|What are you even...|
|        nyr11messier|ElizabethWarren|I was thinking th...|
|CarolinianRevolution|ElizabethWarren|                What|
|  RecallRethuglicans|ElizabethWarren|Bold would be pay...|
|       marshalgivens|ElizabethWarren|Family Fun Pack f...|
|      SANTA_OFFICIAL|ElizabethWarren|I think she shoul...|
|      idontevenwant2|ElizabethWarren|That isn't exactl...|
|               oiooo|ElizabethWarren|Gillibrand is my ...|
|   starspangledxunzi|ElizabethWarren|Elizabeth Warren ...|
|            flying87|ElizabethWarren|Me

In [4]:
comments.count()

2113252

## Preprocessing

In [22]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from pyspark.sql.types import StringType, ArrayType 
from pyspark.sql.functions import udf
import nltk
import re

lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

def clean_comment(comment):
    comment = comment.lower()
    # Remove urls
    comment = re.sub('http[s]?://\S+', '', comment)
    token_words = word_tokenize(comment)
    stem_comment=[]
    for token in token_words:
        token = re.sub("[,\.!?']", '', token)
        token = lemmatizer.lemmatize(token)
        if token not in stop_words and (len(token) > 3 or token=="joe" or token=="amy"):
            stem_comment.append(token)
            stem_comment.append(" ")
    return "".join(stem_comment)

udf_clean = udf(clean_comment,StringType())
# Drop comments that result in an empty string "" after cleaning. These are comments like "ok", "who?" etc. 
comments = comments.withColumn("clean_body", udf_clean(comments["body"])).filter("clean_body != ''")
comments.show()

+------------------+---------------+--------------------+--------------------+--------------------+
|            author|      subreddit|                body|          clean_body|     clean_tokenized|
+------------------+---------------+--------------------+--------------------+--------------------+
|     flamethrower2|ElizabethWarren|According to Cohe...|according cohen t...|[according, cohen...|
|     rieslingatkos|ElizabethWarren|> *Without the pr...|without proper fr...|[without, proper,...|
|  metroidcomposite|ElizabethWarren|>Military interve...|military interven...|[military, interv...|
|     flamethrower2|ElizabethWarren|We know Rich Dad ...|know rich poor re...|[know, rich, poor...|
|           IlikeJG|ElizabethWarren|What are you even...|even talking comm...|[even, talking, c...|
|      nyr11messier|ElizabethWarren|I was thinking th...|thinking thing al...|[thinking, thing,...|
|RecallRethuglicans|ElizabethWarren|Bold would be pay...|bold would paying...|[bold, would, pay...|


## Word clouds

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

for candidate in left_candidates:
    text = comments.where(comments["subreddit"] == candidate).select("clean_body").rdd.flatMap(lambda x: x).collect()
    word_cloud = WordCloud(max_font_size=200, background_color="white", max_words=80,
                        colormap="nipy_spectral", stopwords=STOPWORDS).generate(' '.join(text))
    plt.figure()
    plt.imshow(word_cloud, interpolation="hermite")
    plt.title("{} Subreddit Word Cloud\n".format(candidate))
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
    plt.savefig("visualizations/wordclouds/{}_subreddit_word_cloud".format(candidate))

## LDA Summarization 

Summarize the entire text corpora (all of the different subs together) + the individual candidate subreddits

In [23]:
import gensim
import gensim.corpora as corpora

def create_bow(data):
    # Create Dictionary
    word_dict = corpora.Dictionary(data)  # Create Corpus
    # Term Document Frequency
    corpus = [word_dict.doc2bow(text) for text in data]
    return corpus, word_dict

In [24]:
from pyspark.sql.functions import col, split 
comments = comments.withColumn("clean_tokenized",split(col("clean_body"), " "))
comments.show()

+------------------+---------------+--------------------+--------------------+--------------------+
|            author|      subreddit|                body|          clean_body|     clean_tokenized|
+------------------+---------------+--------------------+--------------------+--------------------+
|     flamethrower2|ElizabethWarren|According to Cohe...|according cohen t...|[according, cohen...|
|     rieslingatkos|ElizabethWarren|> *Without the pr...|without proper fr...|[without, proper,...|
|  metroidcomposite|ElizabethWarren|>Military interve...|military interven...|[military, interv...|
|     flamethrower2|ElizabethWarren|We know Rich Dad ...|know rich poor re...|[know, rich, poor...|
|           IlikeJG|ElizabethWarren|What are you even...|even talking comm...|[even, talking, c...|
|      nyr11messier|ElizabethWarren|I was thinking th...|thinking thing al...|[thinking, thing,...|
|RecallRethuglicans|ElizabethWarren|Bold would be pay...|bold would paying...|[bold, would, pay...|


In [25]:
print("--- finding bigrams ---")
text_data = comments.select("clean_tokenized").rdd.flatMap(lambda x: x).collect()
bigram = gensim.models.Phrases(text_data, min_count=8, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
text_data = [bigram_mod[comment] for comment in text_data]

--- finding bigrams ---


In [26]:
print("--- creating BoW model ---")
corpus, word_dict = create_bow(text_data)
lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=word_dict,num_topics=5,random_state=100,chunksize=100,passes=10,per_word_topics=True)

--- creating BoW model ---


In [27]:
lda_model.print_topics()

[(0,
  '0.035*"warren" + 0.020*"bernie" + 0.017*"think" + 0.015*"like" + 0.014*"candidate" + 0.011*"would" + 0.011*"people" + 0.009*"sander" + 0.009*"biden" + 0.008*"supporter"'),
 (1,
  '0.016*"poll" + 0.014*"state" + 0.010*"election" + 0.009*"would" + 0.009*"voter" + 0.008*"campaign" + 0.008*"money" + 0.008*"vote" + 0.008*"number" + 0.007*"primary"'),
 (2,
  '0.012*"post" + 0.010*"comment" + 0.010*"article" + 0.008*"question" + 0.008*"thanks" + 0.008*"read" + 0.007*"good" + 0.007*"thank" + 0.007*"know" + 0.006*"like"'),
 (3,
  '0.025*"people" + 0.011*"like" + 0.009*"think" + 0.008*"make" + 0.007*"thing" + 0.007*"would" + 0.007*"need" + 0.006*"right" + 0.006*"want" + 0.005*"policy"'),
 (4,
  '0.028*"plan" + 0.016*"would" + 0.009*"cost" + 0.009*"healthcare" + 0.008*"medicare" + 0.008*"year" + 0.008*"insurance" + 0.008*"bill" + 0.006*"system" + 0.006*"company"')]