In [3]:
from utils import load_embedding, parse_tup, cos_sim, cos_dist
from tqdm.auto import tqdm
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd

In [4]:
# from utils import CANDIDATE_SUBS
TIME_FRAME = "monthly"
left_candidates = ["JoeBiden","SandersForPresident","BaemyKlobaechar","ElizabethWarren","Pete_Buttigieg","YangForPresidentHQ"]
right_candidates = ["The_Donald"]
mapping = {}
for a,b in zip(left_candidates+right_candidates,px.colors.qualitative.Plotly):
    mapping[a] = b

In [3]:
import findspark
findspark.init("/h/224/cameron/spark-3.0.0-preview2-bin-hadoop2.7")
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType,StringType

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.getConf().getAll()

cols = ["author","subreddit","body"]
comments = spark.read.load("/comments_2019.parquet").select(*cols)
comments = comments.where((comments['subreddit'].rlike("|".join(["(" + cand + ")" for cand in left_candidates]))) 
                          & (comments['body'] != "[removed]")
                          & (comments['body'] != "[deleted]")
                          & (comments["author"] != "AutoModerator")
                          & (comments["author"] != "[deleted]")
                          & ~comments["body"].contains("www.reddit.com"))

comments.show()

+-------------------+-------------------+--------------------+
|             author|          subreddit|                body|
+-------------------+-------------------+--------------------+
|           firephly|SandersForPresident|                  ok|
| girl_introspective|SandersForPresident|This meme format ...|
|     skankhunt_4200|SandersForPresident|Dude if more than...|
|   stripes_by_proxy|SandersForPresident|That's still 8 mo...|
|            Zerkcs1|SandersForPresident|Like I said. Love...|
|  ThorVonHammerdong| YangForPresidentHQ|I'll be expanding...|
|     aintbutathing3|SandersForPresident|Agreed. The whole...|
|           lalaohhi|SandersForPresident|I completely disa...|
|       trevrichards|SandersForPresident|     Be gone, troll.|
|        EverWatcher|SandersForPresident|Amazing! 

The na...|
|    executivemonkey|SandersForPresident|AOC helps us in t...|
|       chiechie1979|SandersForPresident|Don't forget Nixo...|
|          rpaguirre|SandersForPresident|Also, maybe mo

In [9]:
comments.count()

2113252

## Preprocessing

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from pyspark.sql.types import StringType, ArrayType 
from pyspark.sql.functions import udf, col, split 
import nltk
import re

lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

def clean_comment(comment):
    comment = comment.lower()
    # Remove urls
    comment = re.sub('http[s]?://\S+', '', comment)
    token_words = word_tokenize(comment)
    stem_comment=[]
    for token in token_words:
        token = re.sub("[,\.!?']", '', token)
        token = lemmatizer.lemmatize(token)
        if token not in stop_words and (len(token) > 3 or token=="joe" or token=="amy"):
            stem_comment.append(token)
            stem_comment.append(" ")
    return "".join(stem_comment)

udf_clean = udf(clean_comment,StringType())
# Drop comments that result in an empty string "" after cleaning. These are comments like "ok", "who?" etc. 
comments = comments.withColumn("clean_body", udf_clean(comments["body"])).filter("clean_body != ''")
comments = comments.withColumn("clean_body_tokenized", split(col("clean_body"), " "))
comments = comments.withColumn("len", size("clean_body_tokenized"))
comments.show()

+-------------------+-------------------+--------------------+--------------------+--------------------+---+
|             author|          subreddit|                body|          clean_body|clean_body_tokenized|len|
+-------------------+-------------------+--------------------+--------------------+--------------------+---+
| girl_introspective|SandersForPresident|This meme format ...|  meme format never |[meme, format, ne...|  4|
|     skankhunt_4200|SandersForPresident|Dude if more than...|dude person sayin...|[dude, person, sa...| 24|
|   stripes_by_proxy|SandersForPresident|That's still 8 mo...|still month prima...|[still, month, pr...| 18|
|            Zerkcs1|SandersForPresident|Like I said. Love...|like said love ab...|[like, said, love...| 25|
|  ThorVonHammerdong| YangForPresidentHQ|I'll be expanding...|expanding album t...|[expanding, album...| 12|
|     aintbutathing3|SandersForPresident|Agreed. The whole...|agreed whole conv...|[agreed, whole, c...|  7|
|           lalaohh

## Word Distributions

Probability of a word $w_i$ appearing a subreddit $R$ is 

$P(w_i|R) = \frac{f(w_i,R)}{|R|}$

Aka what I've been putting off for too long

In [5]:
import pyspark.sql.functions as f
import os
def count_dict(dataframe,cand,normalize=True):
    isolated = dataframe.where(dataframe["subreddit"] == cand) 
    counts = isolated.select(f.explode('clean_body_tokenized').alias('col')).groupBy('col').count().collect()
    n = isolated.agg(f.sum("len")).collect()[0][0]
    if normalize:
        counts_dict = {row['col']: row['count']/n for row in counts if row['count']/n > 0.0001}
        return counts_dict
    return {row['col']: row['count'] for row in counts if row['count']/n > 0.0001}
normalized_fp = "dataframes/word_freq/comment_counts.csv"
fp = "dataframes/word_freq/NON_NORMALIZED-comment_counts.csv"

cand_perc_df = None
cand_count_df = None
try:
    cand_perc_df = pd.read_csv(normalized_fp,index_col=0)
    cand_count_df = pd.read_csv(fp,index_col=0)
except:
    cand_perc_df = pd.DataFrame.from_dict({cand: count_dict(comments,cand) for cand in left_candidates}).fillna(0).drop([''])
    cand_perc_df.to_csv(normalized_fp)
    cand_count_df = pd.DataFrame.from_dict({cand: count_dict(comments,cand,False) for cand in left_candidates}).fillna(0).drop([''])
    cand_count_df.to_csv(fp)
# cand_perc_df
cand_count_df

Unnamed: 0,JoeBiden,SandersForPresident,BaemyKlobaechar,ElizabethWarren,Pete_Buttigieg,YangForPresidentHQ
potential,55.0,2275.0,8.0,520.0,1470.0,4475.0
fell,28.0,0.0,0.0,0.0,0.0,0.0
whoever,51.0,1837.0,4.0,426.0,995.0,2263.0
private,164.0,11127.0,17.0,2096.0,4546.0,12529.0
value,79.0,5350.0,7.0,1126.0,4617.0,12315.0
...,...,...,...,...,...,...
compete,0.0,0.0,0.0,0.0,0.0,2140.0
graphic,0.0,0.0,0.0,0.0,0.0,2595.0
metric,0.0,0.0,0.0,0.0,0.0,1901.0
material,0.0,0.0,0.0,0.0,0.0,1947.0


### Log Odds Ratio

Source: ```Monroe, Burt L., Michael P. Colaresi, and Kevin M. Quinn. "Fightin'words: Lexical feature selection and evaluation for identifying the content of political conflict." Political Analysis 16, no. 4 (2008): 372-403.```

Odds for a word $w$ in a class of documents $i$ is $O^{(i)}_{w}=\frac{f^{(i)}_{w}}{(1-f^{(i)}_{w})}$

Odds ratio for two classes (candidates) would be $\theta^{(i_1 - i_2)}_{w} = \frac{O^{(i^1)}_{w}}{O^{(i^2)}_{w}}$

Log odds ratio would then be $log(\theta^{(i_1 - i_2)}_{w})$

By logging the odds ratio we can compare the odds ratio of words across classes

In [12]:
from itertools import combinations
from numpy.random import randint


to_odds = lambda x : (x/(1-x+0.001))
log_odds_ratio = lambda o1,o2 : np.log(o1/(o2+0.001)) 

# For each n choose 2 combinations of candidates
for cand1,cand2 in list(combinations(left_candidates,2)):
    # Calculate log odds ratio
    isolated_perc = cand_perc_df[[cand1,cand2]]
    word_odds = isolated_perc.applymap(lambda x : to_odds(x))
    word_odds["log_odds"] = word_odds.apply(lambda x : log_odds_ratio(*x),axis=1)
    # get absolute frequency of word usage
    word_odds["use"] = cand_count_df[cand1] + cand_count_df[cand2]
    word_odds = word_odds[np.isfinite(word_odds).all(1)]
    word_odds = word_odds.sort_values("log_odds",ascending=False)
    
    # visualize
    fig = px.scatter(word_odds, x="use", y="log_odds",hover_data=[word_odds.index],opacity=0.5)
    head_tail_slice = list(range(7))+list(range(-7,0))
    annotations = [dict(x = np.log10(row["use"]),
                        y = row["log_odds"],
                        ay = randint(-20,20),
                        text = row.name) 
                   for _,row in 
                   word_odds.iloc[head_tail_slice][["log_odds","use"]].iterrows()]
    xmax = word_odds["use"].max()*0.9
    ymax,ymin = word_odds["log_odds"].max()*1.2, word_odds["log_odds"].min()
    annotations += [dict(x= np.log10(xmax),y=ymax,showarrow=False,text=f"<b>{cand1}</b>"),dict(x= np.log10(xmax),y=ymin,showarrow=False,text=f"<b>{cand2}</b>")]
    args = {
        "template": "simple_white",
        "title": "Log Odds Ratio of Word Usage (/r/{}, /r/{})".format(cand1,cand2),
        "annotations": annotations
    }
    fig.update_layout(**args)
    fig.update_xaxes(type="log")
    fig.write_html("visualizations/content_analysis/{}_{}-log_odds_ratio.html".format(cand1,cand2))
    fig.show()
    


divide by zero encountered in log



## Word clouds

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
c = []
dataset = []
for candidate in left_candidates:
    comments_list = comments.where(comments["subreddit"] == candidate).select("clean_body").rdd.flatMap(lambda x: x).collect()
    c += ([candidate] * len(comments_list))
    dataset += comments_list
    text = ' '.join(comments_list)
    word_cloud = WordCloud(max_font_size=200, background_color="white", max_words=80,
                        colormap="nipy_spectral", stopwords=STOPWORDS).generate(text)
    plt.figure()
    plt.imshow(word_cloud, interpolation="hermite")
    plt.title("{} Subreddit Word Cloud\n".format(candidate))
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
#     plt.savefig("visualizations/content_analysis/test{}_subreddit_word_cloud".format(candidate))
print(len(dataset),len(c),len(left_candidates))


Bad key "text.kerning_factor" on line 4 in
/h/224/cameron/miniconda3/envs/pyspark_env/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


## TF-IDF 

Concat all comments from the same subreddits

### Individual Comments


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
print(len(dataset),len(left_candidates),len(c))
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(dataset)
# tf_idf = pd.DataFrame(vectors.T.todense(), index=vectorizer.get_feature_names(), columns=left_candidates)
# tf_idf = tf_idf.sort_values(left_candidates[1], ascending=False)
# tf_idf.head()

In [None]:
from sklearn.decomposition import TruncatedSVD
from MulticoreTSNE import MulticoreTSNE as TSNE

X_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(vectors)
X_embedded = TSNE(n_components=2,n_jobs=8, perplexity=40, verbose=2).fit_transform(X_reduced)

In [None]:
from matplotlib import pyplot as plt

vis_x = X_embedded[:, 0]
vis_y = X_embedded[:, 1]
plt.scatter(vis_x, vis_y, c=digits.target, c=[mapping[ca] for ca in c], marker='.')
plt.show()
savefig("visualizations/TSNE-candidate_comments")

In [None]:
print(X_reduced.shape)

### Subreddits as a whole

In [None]:
print(len(dataset),len(left_candidates),len(c))
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(dataset)
# tf_idf = pd.DataFrame(vectors.T.todense(), index=vectorizer.get_feature_names(), columns=left_candidates)
# tf_idf = tf_idf.sort_values(left_candidates[1], ascending=False)
# tf_idf.head()