In [13]:
from utils import load_embedding, parse_tup, cos_sim, cos_dist
from tqdm.auto import tqdm
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd

In [14]:
# (Alternative source for the subreddit list: `from utils import CANDIDATE_SUBS`.)
TIME_FRAME = "monthly"
left_candidates = [
    "JoeBiden",
    "SandersForPresident",
    "BaemyKlobaechar",
    "ElizabethWarren",
    "Pete_Buttigieg",
    "YangForPresidentHQ",
]
right_candidates = ["The_Donald"]
# Subreddit -> plot colour, pairing each subreddit with the next colour of the
# Plotly qualitative palette (zip stops at the shorter of the two sequences).
mapping = dict(zip(left_candidates + right_candidates, px.colors.qualitative.Plotly))

In [15]:
import findspark
# NOTE(review): absolute, user-specific Spark install path; this cell only
# works on a machine that has Spark unpacked at exactly this location.
findspark.init("/h/224/cameron/spark-3.0.0-preview2-bin-hadoop2.7")
from pyspark.sql import SparkSession
# The wildcard import is what makes bare names such as `size` (used by the
# preprocessing cells) available in this notebook.
# NOTE(review): pyspark.sql.functions also defines functions named like Python
# builtins (e.g. `sum`, `min`, `max`) — confirm no later cell expects the builtins.
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType,StringType

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()
# The configuration list returned here is discarded: it is not the last
# expression of the cell, so nothing is displayed.
spark.sparkContext.getConf().getAll()
# Columns selected from the comments parquet file.
cols = ["author","subreddit","body"]

In [4]:
# Read the comment dump, keeping only the columns listed in `cols`.
comments = spark.read.load("/comments_2019.parquet").select(*cols)

# Row predicates, named so that the final filter reads as one sentence.
in_candidate_sub = comments["subreddit"].isin(left_candidates)
body_is_kept = ~comments["body"].isin(["[removed]", "[deleted]"])
author_is_kept = ~comments["author"].isin(["AutoModerator", "[deleted]"])
links_to_reddit = comments["body"].contains("www.reddit.com")

comments = comments.where(
    in_candidate_sub & body_is_kept & author_is_kept & ~links_to_reddit
)

comments.show()

+-------------------+-------------------+--------------------+
|             author|          subreddit|                body|
+-------------------+-------------------+--------------------+
|           firephly|SandersForPresident|                  ok|
| girl_introspective|SandersForPresident|This meme format ...|
|     skankhunt_4200|SandersForPresident|Dude if more than...|
|   stripes_by_proxy|SandersForPresident|That's still 8 mo...|
|            Zerkcs1|SandersForPresident|Like I said. Love...|
|  ThorVonHammerdong| YangForPresidentHQ|I'll be expanding...|
|     aintbutathing3|SandersForPresident|Agreed. The whole...|
|           lalaohhi|SandersForPresident|I completely disa...|
|       trevrichards|SandersForPresident|     Be gone, troll.|
|        EverWatcher|SandersForPresident|Amazing! 

The na...|
|    executivemonkey|SandersForPresident|AOC helps us in t...|
|       chiechie1979|SandersForPresident|Don't forget Nixo...|
|          rpaguirre|SandersForPresident|Also, maybe mo

In [8]:
# Number of rows currently in `comments`.
# NOTE(review): this cell's execution counter (In [8]) is higher than those of
# the preprocessing cells further down (In [5], In [6]), so the recorded output
# may describe `comments` after the empty-comment filter, not before it.
comments.count()

2113252

## Preprocessing

In [5]:
from pyspark.sql.types import StringType, ArrayType 
from pyspark.sql.functions import udf, col, split 
from utils import clean_text,get_wordnet_pos

# Wrap the project's `clean_text` helper as a Spark UDF whose declared return
# type is an array of strings (one entry per kept token).
udf_clean = udf(clean_text,ArrayType(StringType()))
# Quick check of the cleaner on a sample sentence; the resulting token list is
# the cell output.
clean_text("Amy Klobuchar He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun.")

['amy',
 'klobuchar',
 'run',
 'eat',
 'time',
 'habit',
 'swimming',
 'play',
 'long',
 'hour']

In [6]:
# Tokenise each comment body, record how many tokens survive cleaning, and drop
# comments with no tokens left (comments like "ok", "who?" etc.).
comments = (
    comments
    .withColumn("clean_body_tokenized", udf_clean(comments["body"]))
    .withColumn("len", size("clean_body_tokenized"))
    .filter("len > 0")
)
comments.show()

+-------------------+-------------------+--------------------+--------------------+---+
|             author|          subreddit|                body|clean_body_tokenized|len|
+-------------------+-------------------+--------------------+--------------------+---+
| girl_introspective|SandersForPresident|This meme format ...|[meme, format, ne...|  3|
|     skankhunt_4200|SandersForPresident|Dude if more than...|[dude, person, sa...| 23|
|   stripes_by_proxy|SandersForPresident|That's still 8 mo...|[still, month, pr...| 17|
|            Zerkcs1|SandersForPresident|Like I said. Love...|[like, say, love,...| 25|
|  ThorVonHammerdong| YangForPresidentHQ|I'll be expanding...|[expand, album, t...| 11|
|     aintbutathing3|SandersForPresident|Agreed. The whole...|[agree, whole, co...|  6|
|           lalaohhi|SandersForPresident|I completely disa...|[completely, disa...| 68|
|       trevrichards|SandersForPresident|     Be gone, troll.|         [go, troll]|  2|
|        EverWatcher|SandersForP

## Word Distributions

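The probability of a word $w_i$ appearing in a subreddit $R$ is

$P(w_i|R) = \frac{f(w_i,R)}{|R|}$

where $f(w_i,R)$ is the number of times $w_i$ occurs in the cleaned comments of $R$ and $|R|$ is the total number of tokens in those comments.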

Aka what I've been putting off for too long

In [16]:
from collections import Counter 
from operator import add
import pyspark.sql.functions as f
from pyspark.ml.feature import CountVectorizer 
import os

# Vectorizer over the tokenised comment column; `count_dict` reads this object
# as a module-level name rather than taking it as a parameter.
cv = CountVectorizer(inputCol="clean_body_tokenized", outputCol="vectors")

def filter_dict(a_dict,thresh=0.0001,n=None):
    """Return the entries of ``a_dict`` whose value exceeds ``thresh``.

    Parameters
    ----------
    a_dict : dict
        Mapping of key -> numeric value (e.g. word -> count).
    thresh : float
        Exclusive lower bound an entry must exceed to be kept.
    n : number, optional
        When truthy, each value is divided by ``n`` before the comparison, so
        ``thresh`` acts as a relative threshold. When ``None`` or ``0``, the raw
        value is compared instead.

    Returns
    -------
    dict
        A new dict holding the surviving keys with their original values.
    """
    kept = {}
    for key, val in a_dict.items():
        score = val / n if n else val
        if score > thresh:
            kept[key] = val
    return kept
    

def count_dict(dataframe,cand=None,thresh=0.0001):
    """Count word occurrences in the tokenised comments of one subreddit.

    Parameters
    ----------
    dataframe : Spark DataFrame
        Must provide the columns ``subreddit``, ``clean_body_tokenized`` and
        ``len`` that are read below.
    cand : str, optional
        Subreddit to isolate. When falsy, the whole frame is used.
    thresh : float
        Relative-frequency threshold passed on to ``filter_dict``.

    Returns
    -------
    tuple
        ``(counts, n)``: the filtered word -> count dict and ``n``, the sum of
        the ``len`` column for the selected rows.

    Notes
    -----
    Uses the module-level ``cv`` vectorizer and ``filter_dict``.
    """
    if cand:
        isolated = dataframe.where(dataframe["subreddit"] == cand)
    else:
        isolated = dataframe

    vocab_model = cv.fit(isolated)

    # Collapse every token of the selection into one single-row "document" so
    # that one transform yields the counts for the whole selection.
    all_tokens = isolated.select(f.explode('clean_body_tokenized').alias('col'))
    single_doc = all_tokens.select(f.collect_list('col').alias('clean_body_tokenized'))
    rows = vocab_model.transform(single_doc).select('vectors').collect()

    # NOTE(review): pairing `vocabulary` with the vector's `.values` assumes the
    # stored values line up one-to-one with the vocabulary order — confirm.
    term_counts = rows[0]['vectors'].values
    counts = dict(zip(vocab_model.vocabulary, term_counts))

    n = isolated.select(f.sum("len")).collect()[0][0]
    return filter_dict(counts,thresh,n),n

# Per-candidate word counts and relative frequencies, cached on disk as CSV so
# that the expensive Spark counting only runs for candidates not yet cached.
fp = "dataframes/word_freq/comment_counts.csv"
normalized_fp = "dataframes/word_freq/normalized-comment_counts.csv"

# to_csv does not create missing directories.
os.makedirs(os.path.dirname(fp), exist_ok=True)

cand_count_df = pd.DataFrame()
cand_perc_df = pd.DataFrame()

# Load the cache only if BOTH files can be read; otherwise start from scratch.
# Only a missing or empty cache is tolerated, any other error surfaces.
try:
    cached = (pd.read_csv(normalized_fp,index_col=0), pd.read_csv(fp,index_col=0))
except (FileNotFoundError, pd.errors.EmptyDataError):
    pass
else:
    cand_perc_df, cand_count_df = cached

for cand in tqdm(left_candidates):
    # A candidate is done only when it is present in both frames; a run that
    # was interrupted between the two writes leaves it in just one of them.
    if cand in cand_perc_df and cand in cand_count_df:
        continue
    cand_vec,n = count_dict(comments,cand)
    print(cand,n)
    # Remove a half-written column for this candidate (prevents duplicate
    # column names) and any cached "total", which is stale once a column
    # changes; the totals are rebuilt below.
    cand_count_df = cand_count_df.drop(columns=[cand,"total"], errors="ignore")
    cand_perc_df = cand_perc_df.drop(columns=[cand,"total"], errors="ignore")

    cand_count_df = pd.concat([cand_count_df,pd.Series(cand_vec,name=cand)], axis=1)
    cand_count_df = cand_count_df.fillna(0)
    cand_count_df.to_csv(fp)

    normalized = pd.Series({key: val/n for key,val in cand_vec.items()},name=cand)
    cand_perc_df = pd.concat([cand_perc_df,normalized], axis=1)
    cand_perc_df = cand_perc_df.fillna(0)
    cand_perc_df.to_csv(normalized_fp)

# Pooled count of each word over all candidate subreddits.
if "total" not in cand_count_df:
    cand_count_df["total"] = cand_count_df.sum(axis=1)
    cand_count_df.to_csv(fp)

# Pooled relative frequency: pooled count over the total number of tokens.
if "total" not in cand_perc_df:
    n = comments.select(f.sum("len")).collect()[0][0]
    cand_perc_df["total"] = cand_count_df["total"]/n
    cand_perc_df = cand_perc_df.fillna(0)
    cand_perc_df.to_csv(normalized_fp)
cand_count_df.index.name = "word"
cand_perc_df.index.name = "word"
cand_perc_df.head()

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




Unnamed: 0_level_0,JoeBiden,SandersForPresident,BaemyKlobaechar,ElizabethWarren,Pete_Buttigieg,YangForPresidentHQ,total
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
biden,0.022383,0.003714,0.005204,0.004286,0.002912,0.001405,0.002656
like,0.010717,0.00953,0.010911,0.010042,0.010743,0.010559,0.010239
would,0.009754,0.009116,0.0094,0.009801,0.0082,0.01019,0.009498
think,0.00975,0.007483,0.011247,0.010498,0.011287,0.00934,0.009139
trump,0.009003,0.004816,0.006435,0.003733,0.003399,0.004131,0.004235


### Log Odds Ratio

Source: ```Monroe, Burt L., Michael P. Colaresi, and Kevin M. Quinn. "Fightin'words: Lexical feature selection and evaluation for identifying the content of political conflict." Political Analysis 16, no. 4 (2008): 372-403.```

The odds of a word $w$ in a class of documents $i$ are $O^{(i)}_{w}=\frac{f^{(i)}_{w}}{(1-f^{(i)}_{w})}$, where $f^{(i)}_{w}$ is the relative frequency of $w$ in class $i$.

The odds ratio for two classes (candidates) $i_1$ and $i_2$ is $\theta^{(i_1 - i_2)}_{w} = \frac{O^{(i_1)}_{w}}{O^{(i_2)}_{w}}$

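The log odds ratio is then $\log(\theta^{(i_1 - i_2)}_{w})$

Taking the logarithm makes the ratio symmetric around zero: positive values mean $w$ is more characteristic of $i_1$, negative values mean it is more characteristic of $i_2$, so words can be compared across classes on a common scale.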

In [18]:
from itertools import combinations
from numpy.random import randint


def to_odds(x):
    """Turn a probability ``x`` into smoothed odds: ``x / (1 - x + 0.001)``.

    The 0.001 in the denominator keeps the result finite when ``x`` is 1.
    """
    return x / (1 - x + 0.001)


def log_odds_ratio(o1, o2):
    """Return ``log((o1 + 0.001) / (o2 + 0.001))`` for two odds values.

    The 0.001 added to both terms keeps the ratio defined when either odds
    value is 0. A positive result means ``o1`` is the larger of the two.
    """
    return np.log((o1 + 0.001) / (o2 + 0.001))

# For each n choose 2 combinations of candidates
# For each n-choose-2 combination of candidate subreddits, plot the log odds
# ratio of every word against how often the word is used in the two subreddits.
for cand1,cand2 in combinations(left_candidates,2):
    # Odds and log odds ratio, computed on whole columns at once: the same
    # arithmetic as the element-wise version, without one Python call per cell.
    isolated_perc = cand_perc_df[[cand1,cand2]]
    word_odds = to_odds(isolated_perc)
    word_odds["log_odds"] = log_odds_ratio(word_odds[cand1],word_odds[cand2])
    # absolute frequency of word usage across the two subreddits
    word_odds["use"] = cand_count_df[cand1] + cand_count_df[cand2]
    word_odds = word_odds[np.isfinite(word_odds).all(1)]
    # Descending: cand1-leaning words first, cand2-leaning words last.
    word_odds = word_odds.sort_values("log_odds",ascending=False)
    if word_odds.empty:
        # Nothing to plot or label for this pair.
        continue
    # visualize
    fig = px.scatter(word_odds,
                     x="use",
                     y="log_odds",
                     hover_data=[word_odds.index],
                     labels={"use":"Word Frequency",
                             "index": "Word",
                             "log_odds": "Log Odds Ratio"},
                     opacity=0.5)
    # Label the 7 most cand1-leaning and the 7 most cand2-leaning words.
    # head/tail cannot run past the end of a short frame (positional iloc
    # raises there), and dropping repeated index entries avoids labelling a
    # word twice when the two slices overlap.
    extremes = pd.concat([word_odds.head(7),word_odds.tail(7)])
    extremes = extremes[~extremes.index.duplicated()]
    # The x axis is logarithmic, so annotation x positions are given in log10.
    annotations = [dict(x = np.log10(row["use"]),
                        y = row["log_odds"],
                        ay = randint(-20,20),
                        ax = randint(-20,20),
                        text = word)
                   for word,row in extremes[["log_odds","use"]].iterrows()]
    xmax = word_odds["use"].max()*0.9
    ymax,ymin = word_odds["log_odds"].max()*1.2, word_odds["log_odds"].min()
    # Corner captions: which subreddit each end of the y axis leans towards.
    annotations += [dict(x= np.log10(xmax),y=ymax,showarrow=False,text=f"<b>{cand1}</b>"),
                    dict(x= np.log10(xmax),y=ymin,showarrow=False,text=f"<b>{cand2}</b>")]
    args = {
        "template": "simple_white",
        "title": "Log Odds Ratio of Word Usage (/r/{}, /r/{})".format(cand1,cand2),
        "annotations": annotations
    }
    fig.update_layout(**args)
    fig.update_xaxes(type="log")
#     fig.write_html("visualizations/content_analysis/{}_{}-log_odds_ratio.html".format(cand1,cand2))
    fig.show()
    

### Visualize Against Master

In [None]:
# Log odds ratio of every word in each candidate subreddit against the pooled
# ("total") usage over all candidate subreddits. Computed on whole columns;
# to_odds returns a new frame, so cand_perc_df itself is left untouched.
master_log_odds = to_odds(cand_perc_df)
for cand in left_candidates:
    master_log_odds[cand] = log_odds_ratio(master_log_odds[cand],master_log_odds["total"])
# Long format: one row per (word, subreddit) with its log odds ratio, plus the
# pooled absolute count of the word for the x axis.
master_log_odds = master_log_odds.reset_index()
master_log_odds = master_log_odds.melt(id_vars=["word"], value_vars=left_candidates,var_name="subreddit",value_name="log_odds")
master_log_odds = master_log_odds.merge(cand_count_df["total"],how="left",left_on="word",right_index=True)
# Label the 3 most subreddit-specific words of every subreddit. The x axis is
# logarithmic, so annotation x positions are given in log10.
annotations = []
for cand in left_candidates:
    iso = master_log_odds[master_log_odds["subreddit"] == cand].sort_values("log_odds",ascending=False)
    annotations += [dict(x = np.log10(row["total"]),
                        y = row["log_odds"],
                        ay = randint(-20,0),
                        text = row["word"])
                   for _,row in
                   iso.head(3).iterrows()]
xmax = master_log_odds["total"].max()*0.9
ymax,ymin = master_log_odds["log_odds"].max()*1.1, master_log_odds["log_odds"].min()*1.1
# Captions explaining how to read the y axis (label typo "Subbredits" fixed).
annotations += [dict(x= np.log10(xmax),y=ymax,showarrow=False,text="<b>Subreddit Specific</b>"),
                dict(x= np.log10(xmax),y=0,showarrow=False,text="<b>General Usage</b>"),
                dict(x= np.log10(xmax),y=ymin,showarrow=False,text="<b>Other Subreddits</b>")]


fig = px.scatter(master_log_odds,
                 x="total",
                 y="log_odds",
                 color="subreddit",
                 hover_data=['word'],
                 opacity=0.5,
                 color_discrete_map=mapping,
                 labels={
                     "total": "Word Frequency (All Subreddits)",
                     "log_odds": "Log Odds Ratio Against All Subreddits",
                     "subreddit": "Subreddit",
                     "word": "Word"
                 })
args = {
    "template": "simple_white",
    "title": "Log Odds Ratio of Word Usage",
    "annotations": annotations,
    "height": 1000
}
fig.update_layout(**args)
fig.update_xaxes(type="log")
fig.write_html("visualizations/content_analysis/all_subs-log_odds_ratio.html")
fig.show()

In [None]:
# Print the `n` words with the highest log odds ratio for every candidate
# subreddit (i.e. the words most specific to it).
n = 20
top_words = pd.DataFrame()
for cand in left_candidates:
    belongs_to_cand = master_log_odds["subreddit"].eq(cand)
    iso = master_log_odds.loc[belongs_to_cand].sort_values("log_odds",ascending=False)
    print(iso.head(n))


## Word clouds

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
odds_dict = master_log_odds.to_dict()
# One word cloud per candidate subreddit, weighting each word by its log odds
# ratio against the pooled usage, saved as an image file.
for cand in left_candidates:
    iso = master_log_odds[master_log_odds["subreddit"] == cand]
    # word -> log odds ratio, handed to WordCloud as the word weights
    word_cloud_dict = dict(zip(iso["word"], iso["log_odds"]))

    cloud_maker = WordCloud(background_color="white",
                            colormap="nipy_spectral")
    wordcloud = cloud_maker.generate_from_frequencies(word_cloud_dict)

    plt.figure()
    plt.imshow(wordcloud, interpolation="hermite")
    plt.title("{} Subreddit Word Cloud\n".format(cand))
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.savefig("visualizations/content_analysis/word_clouds/{}_subreddit_word_cloud".format(cand))


## Log Odds Against Democratic Debates

How different are the comments on candidate subs from the words of the candidates themselves?

In [None]:
# Candidate subreddits and the matching debate speakers, in the same order, so
# that the two lists can be zipped together by the cells below.
left_candidates = ["JoeBiden","SandersForPresident","BaemyKlobaechar","ElizabethWarren","Pete_Buttigieg","YangForPresidentHQ"]
cands = ["Joe Biden","Bernie Sanders","Amy Klobuchar","Elizabeth Warren","Pete Buttigieg","Andrew Yang"]
cand_speeches = pd.read_csv("external_datasets/candidate_speeches/debates.csv")
# Keep only the tracked candidates and rows that actually contain a speech.
cand_speeches = cand_speeches[cand_speeches["speaker"].isin(cands)].dropna(subset=["speech"])
# Tokenise with `clean_text`, the cleaner this notebook imports from utils and
# applies to the Reddit comments. The previous name, `clean_comment`, is not
# imported or defined anywhere in the notebook and fails on a fresh kernel.
# NOTE(review): assumes `clean_text` is the intended replacement — confirm.
cand_speeches["clean_text"] = cand_speeches["speech"].apply(clean_text)
cand_speeches = cand_speeches[["speaker","gender","speech","clean_text"]].set_index("speaker")
cand_speeches.head()

In [None]:
# Word counts and relative word frequencies of each candidate's debate
# speeches, filtered with the same relative threshold as the subreddit counts.
debate_count = {}
debate_prob = {}
for cand in cands:
    # The list selector always returns a DataFrame, even when a speaker has
    # exactly one row; a scalar selector would collapse that case to a Series
    # and break the token concatenation below.
    iso = cand_speeches.loc[[cand]]
    # Flatten the per-speech token lists into one sequence and count the words.
    word_count = pd.Series(np.concatenate(list(iso["clean_text"]))).value_counts()
    n = word_count.sum()
    # Raw counts of the words whose relative frequency passes the threshold.
    debate_count[cand] = filter_dict(word_count.to_dict(),n=n)
    # Relative frequencies of those words.
    word_prob = filter_dict((word_count/n).to_dict())
    debate_prob[cand] = word_prob

debate_count = pd.DataFrame.from_dict(debate_count).fillna(0)
debate_count["total"] = debate_count.sum(axis=1)
debate_prob = pd.DataFrame.from_dict(debate_prob).fillna(0)

debate_count

In [None]:
# Odds of every word in the debates and in the candidate subreddits.
debate_odds = debate_prob.applymap(to_odds)
subreddit_odds = cand_perc_df.applymap(to_odds)
for subreddit,candidate in zip(left_candidates,cands):
    merged = pd.merge(subreddit_odds[subreddit],
                      debate_odds[candidate],
                      left_index=True,
                      right_index=True,
                      how="outer").fillna(0)
    # Keep only words with non-zero odds on both sides (avoid divide by 0 error)
    has_zero = (merged==0).any(axis=1)
    merged = merged.loc[~has_zero]
    # Column order is (subreddit, debate), so this is log(subreddit / debate).
    merged["log_odds"] = merged.apply(lambda pair : log_odds_ratio(*pair),axis=1)
    merged = merged.sort_values("log_odds")
    print(merged)


## Visualize Democratic Debates Word Freq

In [None]:
# For each (candidate subreddit, debate speaker) pair, scatter the word odds in
# the subreddit against the word odds in the debates and label the words with
# the most extreme log odds ratios.
# Relies on `subreddit_odds` and `debate_odds` computed in the previous cell.
for subreddit,candidate in zip(left_candidates,cands):
    merged = pd.merge(subreddit_odds[subreddit],debate_odds[candidate],left_index=True,right_index=True,how="outer").fillna(0)
    # Drop any row that contains a 0 (avoid divide by 0 error)
    merged = merged.loc[~(merged==0).any(axis=1)]
    # Column order is (subreddit, debate), so this is log(subreddit / debate).
    merged["log_odds"] = merged.apply(lambda x : log_odds_ratio(*x),axis=1)
    # Ascending: debate-leaning words first, subreddit-leaning words last.
    merged = merged.sort_values("log_odds")
    # NOTE(review): both axes show the odds themselves; log_odds is only used
    # to order the rows and pick the labelled words, although the title below
    # says "Log Odds".
    fig = px.scatter(merged,
                     x=candidate,
                     y=subreddit,
                     hover_data=[merged.index],
                     opacity=0.5,
                     labels={
                     subreddit: "Subreddit Usage",
                     candidate: "Debate Usage",
                     "word": "Word"
                     })
    # annotations
    # Positions of the first 7 and last 7 rows of the sorted frame.
    # NOTE(review): positional lookup raises IndexError when `merged` has fewer
    # than 7 rows and repeats rows when it has fewer than 14.
    head_tail_slice = list(range(7))+list(range(-7,0))
    annotations = [dict(x = row[candidate],
                        y = row[subreddit],
                        ay = randint(-20,20),
                        ax = randint(-20,20),
                        text = row.name) 
                   for _,row in 
                   merged.iloc[head_tail_slice][[subreddit,candidate]].iterrows()]
    args = {
        "template": "simple_white",
        "title": "{} Log Odds of Word Usage (Debates/Subreddit)".format(candidate),
        "annotations": annotations
    }
    fig.update_layout(**args)
    # Writing fails if the target directory does not exist.
    fig.write_html("visualizations/content_analysis/{}_debate_sub-log_odds.html".format(subreddit))
    fig.show()
    

## Progressive vs Moderate Subreddits

Top 8 Closest Subreddits to /r/demsocialist: /r/progressive (0.317), /r/dsa (0.324), /r/LeftWithoutEdge (0.337), /r/Political_Revolution (0.343), /r/chomsky (0.346), /r/AOC (0.356), /r/SocialDemocracy (0.359), /r/Liberal (0.387)

Top 8 Closest Subreddits to /r/ConservativeDemocrat: /r/centerleftpolitics (0.314), /r/neoliberal (0.365), /r/Enough_AOC_Spam (0.368), /r/moderatepolitics (0.379), /r/Enough_Sanders_Spam (0.389), /r/tuesday (0.396), /r/PoliticalDiscussion (0.413), /r/neoconNWO (0.415)

In [None]:
# Subreddits treated as "progressive" and as "moderate" in this section.
prog_subs = [
    "demsocialist",
    "progressive",
    "dsa",
    "LeftWithoutEdge",
    "Political_Revolution",
    "chomsky",
    "AOC",
    "EnoughLibertarianSpam",
    "SocialDemocracy",
    "Liberal",
]
mod_subs = [
    "ConservativeDemocrat",
    "centerleftpolitics",
    "neoliberal",
    "Enough_AOC_Spam",
    "moderatepolitics",
    "Enough_Sanders_Spam",
    "tuesday",
    "PoliticalDiscussion",
    "neoconNWO",
]

cols = ["author","subreddit","body"]
pol_comments = spark.read.load("/comments_2019.parquet").select(*cols)

# Placeholder bodies and accounts whose comments are excluded.
excluded_bodies = ["[removed]", "[deleted]"]
excluded_authors = ["AutoModerator", "groupbot", "jobautomator", "userpinger", "[deleted]"]

pol_comments = pol_comments.where(
    pol_comments["subreddit"].isin(prog_subs+mod_subs)
    & ~pol_comments["body"].isin(excluded_bodies)
    & ~pol_comments["author"].isin(excluded_authors)
    & ~pol_comments["body"].contains("www.reddit.com")
)

In [None]:
def assign_sub(x):
    """Return "progressive_sub" if ``x`` is listed in ``prog_subs``, else "moderate_sub"."""
    if x in prog_subs:
        return "progressive_sub"
    return "moderate_sub"


udf_assign_sub = udf(assign_sub,StringType())

# Tokenise the bodies, replace each subreddit name by its ideology label (kept
# under the column name "subreddit"), and drop comments that have no tokens
# left after cleaning. These are comments like "ok", "who?" etc.
pol_comments = (
    pol_comments
    .withColumn("clean_body_tokenized", udf_clean(pol_comments["body"]))
    .withColumn("sub_type", udf_assign_sub(pol_comments["subreddit"]))
    .drop('subreddit')
    .withColumnRenamed('sub_type', 'subreddit')
    .withColumn("len", size("clean_body_tokenized"))
    .filter("len > 0")
)
pol_comments.show()

In [None]:
from collections import Counter 
from operator import add
import pyspark.sql.functions as f
from pyspark.ml.feature import CountVectorizer 
import os

cv = CountVectorizer(inputCol="clean_body_tokenized", outputCol="vectors")


# Per-ideology word counts and relative frequencies, cached on disk as CSV so
# that the expensive Spark counting only runs for groups not yet cached.
fp = "dataframes/word_freq/pol_comment_counts.csv"
normalized_fp = "dataframes/word_freq/normalized-pol_comment_counts.csv"

# to_csv does not create missing directories.
os.makedirs(os.path.dirname(fp), exist_ok=True)

ideology_count_df = pd.DataFrame()
ideology_perc_df = pd.DataFrame()

# Load the cache only if BOTH files can be read; otherwise start from scratch.
# Only a missing or empty cache is tolerated, any other error surfaces.
try:
    cached = (pd.read_csv(normalized_fp,index_col=0), pd.read_csv(fp,index_col=0))
except (FileNotFoundError, pd.errors.EmptyDataError):
    pass
else:
    ideology_perc_df, ideology_count_df = cached

for ideology in tqdm(["moderate_sub","progressive_sub"]):
    # A group is done only when it is present in both frames; a run that was
    # interrupted between the two writes leaves it in just one of them.
    if ideology in ideology_perc_df and ideology in ideology_count_df:
        continue
    ideology_vec,n = count_dict(pol_comments,ideology)
    print(ideology,n)
    # Remove a half-written column for this group (prevents duplicate column
    # names) and any cached "total", which is stale once a column changes; the
    # totals are rebuilt below.
    ideology_count_df = ideology_count_df.drop(columns=[ideology,"total"], errors="ignore")
    ideology_perc_df = ideology_perc_df.drop(columns=[ideology,"total"], errors="ignore")

    ideology_count_df = pd.concat([ideology_count_df,pd.Series(ideology_vec,name=ideology)], axis=1)
    ideology_count_df = ideology_count_df.fillna(0)
    ideology_count_df.to_csv(fp)

    normalized = pd.Series({key: val/n for key,val in ideology_vec.items()},name=ideology)
    ideology_perc_df = pd.concat([ideology_perc_df,normalized], axis=1)
    ideology_perc_df = ideology_perc_df.fillna(0)
    ideology_perc_df.to_csv(normalized_fp)

# Pooled count of each word over both groups.
if "total" not in ideology_count_df:
    ideology_count_df["total"] = ideology_count_df.sum(axis=1)
    ideology_count_df.to_csv(fp)

# Pooled relative frequency: pooled count over the total number of tokens.
if "total" not in ideology_perc_df:
    n = pol_comments.select(f.sum("len")).collect()[0][0]
    ideology_perc_df["total"] = ideology_count_df["total"]/n
    ideology_perc_df = ideology_perc_df.fillna(0)
    ideology_perc_df.to_csv(normalized_fp)

ideology_count_df.index.name = "word"
ideology_perc_df.index.name = "word"
ideology_perc_df.head()

In [None]:
## TODO make this work
# Log odds ratio of word usage, moderate vs progressive subreddits, plotted
# against the combined absolute word count.
# NOTE(review): this cell overwrites `ideology_perc_df` in place (relative
# frequencies -> odds, plus `log_odds` and `use` columns), so it is not
# idempotent: running it a second time applies `to_odds` to values that are
# already odds. The cells after this one read the overwritten frame.
ideology_perc_df = ideology_perc_df[["moderate_sub","progressive_sub"]]
ideology_perc_df = ideology_perc_df.applymap(lambda x : to_odds(x))

# Column order is (moderate, progressive), so this is log(moderate / progressive):
# positive values lean moderate, negative values lean progressive.
ideology_perc_df["log_odds"] = ideology_perc_df.apply(lambda x : log_odds_ratio(*x),axis=1)
# get absolute frequency of word usage
ideology_perc_df["use"] = ideology_count_df["moderate_sub"] + ideology_count_df["progressive_sub"]
# Keep only rows in which every column is finite.
ideology_perc_df = ideology_perc_df[np.isfinite(ideology_perc_df).all(1)]
# Ascending: progressive-leaning words first, moderate-leaning words last.
ideology_perc_df = ideology_perc_df.sort_values("log_odds")
# visualize
fig = px.scatter(ideology_perc_df,
                 x="use",
                 y="log_odds",
                 hover_data=[ideology_perc_df.index],
                 labels={"use":"Word Frequency",
                         "index": "Word",
                         "log_odds": "Log Odds Ratio"},
                 opacity=0.5)
# annotations
# Positions of the first 7 and last 7 rows of the sorted frame.
# NOTE(review): positional lookup raises IndexError when the frame has fewer
# than 7 rows and repeats rows when it has fewer than 14.
head_tail_slice = list(range(7))+list(range(-7,0))
# The x axis is set to log scale below, so annotation x positions are in log10.
annotations = [dict(x = np.log10(row["use"]),
                    y = row["log_odds"],
                    ay = randint(-20,20),
                    ax = randint(-20,20),
                    text = row.name) 
               for _,row in 
               ideology_perc_df.iloc[head_tail_slice][["log_odds","use"]].iterrows()]
xmax = ideology_perc_df["use"].max()*0.9
ymax,ymin = ideology_perc_df["log_odds"].max()*1.2, ideology_perc_df["log_odds"].min()
# Corner captions: which group each end of the y axis leans towards.
annotations += [dict(x= np.log10(xmax),y=ymax,showarrow=False,text="<b>Moderate Subreddits</b>"),dict(x= np.log10(xmax),y=ymin,showarrow=False,text="<b>Progressive Subreddits</b>")]
args = {
    "template": "simple_white",
    "title": "Log Odds Ratio of Word Usage ({}, {})".format("Moderate Subreddits","Progressive Subreddits"),
    "annotations": annotations
}
fig.update_layout(**args)
fig.update_xaxes(type="log")
fig.write_html("visualizations/content_analysis/{}_{}-log_odds_ratio.html".format("moderate_subs","progressive_subs"))
fig.show()

In [None]:
# The 20 words with the lowest log odds ratio, i.e. the rows at the
# progressive-leaning end of the moderate-vs-progressive ordering.
lowest_log_odds = ideology_perc_df.sort_values("log_odds").head(20)
progressive_words = lowest_log_odds.index.tolist()
progressive_words

## Progressive Word Usage

Plotting word odds for progressive subreddits against word odds for candidate subs

In [None]:
# Long-format table of word odds: one row per (word, candidate subreddit),
# indexed by word.
melted = (
    cand_perc_df[left_candidates]
    .applymap(to_odds)
    .reset_index()
    .melt(id_vars=["word"],
          value_vars=left_candidates,
          var_name="subreddit",
          value_name="word_odds")
    .set_index("word",drop=True)
)
# Attach the progressive-subreddit value of each word; words that are missing
# on the progressive side get 0.
melted = pd.merge(ideology_perc_df[["progressive_sub"]],
                  melted,
                  left_index=True,
                  right_index=True,
                  how="right").fillna(0)
melted


In [None]:
from sklearn.metrics import r2_score

# Scatter of word usage in the candidate subreddits (y) against word usage in
# the progressive subreddits (x), with one fitted line per candidate.
# visualize
fig = px.scatter(melted,
                 x="progressive_sub",
                 y="word_odds",
                 color="subreddit",
                 color_discrete_map=mapping,
                 hover_data=[melted.index],
                 opacity=0.4,
                 labels={"progressive_sub":"Word Usage in Progressive Subreddits",
                         "index": "Word",
                         "subreddit": "Candidate",
                         "word_odds": "Word Usage in Candidate Subreddits"})
args = {
    "template": "simple_white",
    "title": "Word Distribution Relative to Progressive Subreddits",
}

# Two end points on the x axis are enough to draw a straight line.
xmin,xmax = melted["progressive_sub"].min(),melted["progressive_sub"].max()
x = np.linspace(xmin,xmax,num=2)
for cand in left_candidates:
    iso = melted[melted["subreddit"]==cand]
    # Fit word_odds as a linear function of progressive_sub, i.e. in the same
    # orientation as the plot (x = progressive_sub, y = word_odds). Fitting the
    # other way round and then evaluating the model on x values draws the
    # wrong line.
    model = np.polyfit(iso["progressive_sub"], iso["word_odds"], 1)
    predict = np.poly1d(model)
    r2 = r2_score(iso["word_odds"], predict(iso["progressive_sub"]))
    print(cand,r2)
    fig.add_scatter(x=x,
                    y=predict(x),
                    mode='lines',
                    line=dict(dash="dash",width=2,color=mapping[cand]),
                    name=f"{cand} Linear Regression (r<sup>2</sup>={r2:.2f})",
                    legendgroup=cand,
                   )

fig.update_layout(**args)
fig.write_html("visualizations/content_analysis/word_dist_vs_prog_subs.html")

fig.show()