In [1]:
from utils import load_embedding, parse_tup, cos_sim, cos_dist
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import pandas as pd

## Comments over time

In [39]:
TIME_FRAME = "weekly"
left_candidates = ["SandersForPresident","ElizabethWarren","YangForPresidentHQ","Pete_Buttigieg",
                   "Kamala","BetoORourke","JoeBiden","BaemyKlobaechar","bidenbro"]
right_candidates = ["The_Donald"]

In [40]:
subreddit_comments = pd.read_csv("/h/224/cameron/Political-Subreddit-Embedding/temp/temporal/{}_wv.txt".format(TIME_FRAME),header=None,sep=' ')
subreddit_comments[0], subreddit_comments["week"] = subreddit_comments[0].apply(parse_tup).str
subreddit_comments = subreddit_comments.rename({0:"subreddit",1:"count"},axis=1)
subreddit_comments = subreddit_comments[subreddit_comments["subreddit"].isin(left_candidates) | subreddit_comments["subreddit"].isin(right_candidates)].reset_index(drop=True)
idx = pd.MultiIndex.from_product([subreddit_comments['week'].unique(), subreddit_comments['subreddit'].unique()],
                                 names=['week', 'subreddit'])

# In the case that there isn't a vector for a specific week/subreddit we backfill the vector from the previous
subreddit_comments = subreddit_comments.set_index(['week', 'subreddit']).reindex(idx).reset_index().sort_values('week').fillna(1)
subreddit_comments


Columnar iteration over characters will be deprecated in future releases.



Unnamed: 0,week,subreddit,count
466,2018-12-23,JoeBiden,1.0
459,2018-12-23,SandersForPresident,3088.0
460,2018-12-23,YangForPresidentHQ,17.0
461,2018-12-23,ElizabethWarren,92.0
462,2018-12-23,The_Donald,124357.0
...,...,...,...
425,2019-12-22,ElizabethWarren,745.0
424,2019-12-22,YangForPresidentHQ,22812.0
431,2019-12-22,Pete_Buttigieg,5102.0
426,2019-12-22,The_Donald,60746.0


In [50]:
args = {
    "x": "week",
    "y": "count",
    "color": "subreddit",
    "line_shape": "spline",
    "title":  "Comments Over Time",
    "template": "simple_white",
    "labels": {"count": "# Comments", "week": "Two Week Window Starting", "subreddit": "Subreddit"},
}

fig = px.line(subreddit_comments,**args)
fig.update_layout(
    annotations=[
        dict(
            x="2019-02-24",
            y=np.log10(6000),
            text="Yang AMA on /r/politics",
        ),
        dict(
            x="2019-12-1",
            y=np.log10(1),
            text="Harris drops out of presidential race",
        ),
        dict(
            x="2019-11-1",
            y=np.log10(30),
            text="O'Rourke drops out of presidential race",
        )
    ]
)
fig.update_yaxes(type="log")

fig.write_html("visualizations/temporal/{}_subreddit_counts.html".format(TIME_FRAME))
fig.show()

In [15]:
subreddits, vectors  = load_embedding("/h/224/cameron/Political-Subreddit-Embedding/trained_embeddings/adapted_vecs.txt")
filt = subreddits.isin(left_candidates) | subreddits.isin(right_candidates)
subreddits, vectors = subreddits[filt], vectors[filt]

In [16]:
fig = go.Figure()
for candidate in left_candidates+right_candidates:
    can_ind = subreddits.index[subreddits == candidate][0]
    can_vector = vectors.loc[can_ind]
    can_dists = vectors.apply(lambda r : cos_sim(can_vector,r),axis=1).tolist()
    axis = subreddits.tolist()
    axis.append(axis[0])
    can_dists.append(can_dists[0])     
    fig.add_trace(
        go.Scatterpolar(
            r=can_dists, 
            theta=axis,            
            name=candidate, # name to be exibited on legend and on hover
            hoverinfo='name+r', # what to show on hover (name + data point)
            hovertemplate='Similarity %{r:0.2f}', # Format of data point
            opacity= 0.25,
            line_shape='spline',
            line_smoothing=0.75,
            line_width=2       
        )
    )

layout = {
    "title": "Similarity of Political Subreddits",
    "polar_bgcolor": 'white',
    "polar_radialaxis_visible": True,
    "polar_radialaxis_showticklabels": True,
    "polar_radialaxis_tickfont_color": 'darkgrey',
    "polar_angularaxis_color": 'grey',
    "polar_angularaxis_showline": False,
    "polar_radialaxis_showline": False,
    "polar_radialaxis_layer": 'below traces',
    "polar_radialaxis_gridcolor": '#F2F2F2',
    "polar_radialaxis_range": (0,1.1),
    "polar_radialaxis_tickvals": [0,0.25,0.5,0.75,0.92],
    "polar_radialaxis_ticktext": ["Different","","","", "Similar"],
    "polar_radialaxis_tickmode": 'array' 
}
fig.update_layout(**layout)
fig.write_html("visualizations/2019_radial.html")
fig.show()

In [57]:
subreddits, vectors  = load_embedding("/h/224/cameron/Political-Subreddit-Embedding/trained_embeddings/temporal/{}/{}_vecs_0.0043_35.txt".format(TIME_FRAME,TIME_FRAME))
subreddits = pd.DataFrame(subreddits.apply(parse_tup).tolist())
subreddits.columns = ["subreddit","week"]
filt = subreddits["subreddit"].isin(left_candidates) #| subreddits["subreddit"].isin(right_candidates)
subreddits, vectors = subreddits[filt], vectors[filt]
embedding = pd.concat([subreddits, vectors], axis=1).reset_index(drop=True)
del subreddits, vectors
idx = pd.MultiIndex.from_product([embedding['week'].unique(), embedding['subreddit'].unique()],
                                 names=['week', 'subreddit'])

# In the case that there isn't a vector for a specific week/subreddit we backfill the vector from the previous
embedding = embedding.set_index(['week', 'subreddit']).reindex(idx).reset_index().sort_values(['subreddit','week']).bfill()
embedding = embedding.sort_values('week')
embedding

Unnamed: 0,week,subreddit,1,2,3,4,5,6,7,8,...,141,142,143,144,145,146,147,148,149,150
415,2018-12-23,BaemyKlobaechar,0.091692,-0.029751,0.148638,-0.149702,0.096647,0.033364,0.027113,0.025403,...,-0.009281,0.004107,0.060580,-0.128254,0.016328,0.161756,0.009749,0.147253,-0.109042,-0.110427
408,2018-12-23,YangForPresidentHQ,0.037001,0.024873,0.045705,-0.166485,-0.007079,-0.035158,0.037638,0.018895,...,0.051165,0.080786,-0.013446,-0.103149,-0.006241,0.075474,-0.013396,0.116566,-0.111486,-0.133578
414,2018-12-23,BetoORourke,0.074438,0.016745,0.117814,0.032809,-0.097617,-0.030849,-0.053858,0.087476,...,-0.063615,-0.095274,-0.058502,0.140099,0.055988,0.044100,0.123268,-0.003409,-0.120328,0.065192
410,2018-12-23,Pete_Buttigieg,0.114011,-0.033956,0.148011,-0.155364,0.079942,0.028795,0.013205,0.016937,...,-0.037102,-0.002533,0.064184,-0.118704,0.009817,0.152981,0.007901,0.115117,-0.105391,-0.110438
409,2018-12-23,SandersForPresident,0.142286,-0.032217,0.006050,-0.044465,-0.105778,-0.099155,-0.056903,-0.083968,...,0.032751,0.032462,0.108065,0.011459,-0.006112,0.163671,0.053668,-0.029038,-0.016965,-0.111375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,2019-12-22,ElizabethWarren,0.124212,-0.027952,0.108921,-0.121044,0.017996,-0.033680,0.062444,-0.009715,...,-0.052400,-0.122181,0.115840,-0.102288,0.026115,-0.082577,0.050019,-0.031042,-0.047535,-0.068858
206,2019-12-22,BetoORourke,0.136946,-0.045365,0.130785,-0.127449,0.067563,0.023679,-0.010132,0.046265,...,-0.075356,-0.013665,0.070400,-0.092108,0.017821,0.121303,0.038829,0.093688,-0.092182,-0.112988
207,2019-12-22,BaemyKlobaechar,0.131278,-0.016642,0.099783,-0.151858,0.104746,0.036428,0.021752,0.059008,...,-0.014360,-0.020193,0.073870,-0.065614,0.010109,0.089343,-0.028489,0.046814,-0.096203,-0.071017
201,2019-12-22,SandersForPresident,0.212718,0.076833,-0.008305,-0.077936,-0.035532,0.032725,-0.025301,0.019405,...,-0.154282,-0.001838,0.051827,-0.021436,-0.006791,0.090611,0.071635,-0.021111,-0.010458,-0.026878


In [63]:
def avg_distance(subreddit,week):
    filter_week = embedding[embedding["week"] == week]
    sub_vec = filter_week[filter_week["subreddit"] == subreddit].drop(["subreddit","week"],axis=1).to_numpy().flatten()
    other_vecs = filter_week[filter_week["subreddit"] != subreddit].drop(["subreddit","week"],axis=1).to_numpy()    
    dists = np.apply_along_axis(lambda a : 1 - cos_sim(sub_vec,a), 1, other_vecs)
    return np.mean(dists)
# embedding = embedding[~embedding["subreddit"].isin(right_candidates)]
embedding["avg_dist"] = np.vectorize(avg_distance)(embedding['subreddit'], embedding['week'])
embedding

Unnamed: 0,week,subreddit,1,2,3,4,5,6,7,8,...,144,145,146,147,148,149,150,avg_dist,similarity,avg_distance
415,2018-12-23,BaemyKlobaechar,0.091692,-0.029751,0.148638,-0.149702,0.096647,0.033364,0.027113,0.025403,...,-0.128254,0.016328,0.161756,0.009749,0.147253,-0.109042,-0.110427,0.273921,0.352051,0.299432
408,2018-12-23,YangForPresidentHQ,0.037001,0.024873,0.045705,-0.166485,-0.007079,-0.035158,0.037638,0.018895,...,-0.103149,-0.006241,0.075474,-0.013396,0.116566,-0.111486,-0.133578,0.287870,0.374011,0.315706
414,2018-12-23,BetoORourke,0.074438,0.016745,0.117814,0.032809,-0.097617,-0.030849,-0.053858,0.087476,...,0.140099,0.055988,0.044100,0.123268,-0.003409,-0.120328,0.065192,0.625768,0.916736,0.707761
410,2018-12-23,Pete_Buttigieg,0.114011,-0.033956,0.148011,-0.155364,0.079942,0.028795,0.013205,0.016937,...,-0.118704,0.009817,0.152981,0.007901,0.115117,-0.105391,-0.110438,0.269633,0.343870,0.294124
409,2018-12-23,SandersForPresident,0.142286,-0.032217,0.006050,-0.044465,-0.105778,-0.099155,-0.056903,-0.083968,...,0.011459,-0.006112,0.163671,0.053668,-0.029038,-0.016965,-0.111375,0.382042,0.513989,0.423548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,2019-12-22,ElizabethWarren,0.124212,-0.027952,0.108921,-0.121044,0.017996,-0.033680,0.062444,-0.009715,...,-0.102288,0.026115,-0.082577,0.050019,-0.031042,-0.047535,-0.068858,0.244663,0.269417,0.255781
206,2019-12-22,BetoORourke,0.136946,-0.045365,0.130785,-0.127449,0.067563,0.023679,-0.010132,0.046265,...,-0.092108,0.017821,0.121303,0.038829,0.093688,-0.092182,-0.112988,0.235548,0.259385,0.246242
207,2019-12-22,BaemyKlobaechar,0.131278,-0.016642,0.099783,-0.151858,0.104746,0.036428,0.021752,0.059008,...,-0.065614,0.010109,0.089343,-0.028489,0.046814,-0.096203,-0.071017,0.204771,0.219209,0.211540
201,2019-12-22,SandersForPresident,0.212718,0.076833,-0.008305,-0.077936,-0.035532,0.032725,-0.025301,0.019405,...,-0.021436,-0.006791,0.090611,0.071635,-0.021111,-0.010458,-0.026878,0.323142,0.364370,0.341248


In [72]:
 args = {
    "x": "week",
    "y": "avg_dist",
    "color": "subreddit",
    "line_shape": "spline",
    "title":  "Cosine Distance of Political Subreddit Embeddings Over Time",
    "template": "simple_white",
    "labels": {"avg_dist": "Cosine Distance", "week": "Two Week Window Starting", "subreddit": "Subreddit"},
    "range_y": [0,1]
}

fig = px.line(embedding,**args)
# fig.update_layout(
#     annotations=[
#         dict(
#             x="2019-07-28",
#             y=0.89,
#             text="Beto O'Rourke: Donald Trump is a white nationalist",
#         )
#     ]
# )

fig.write_html("visualizations/temporal/{}_cos_dist_over_time.html".format(TIME_FRAME))
fig.show()

### Correlation Matrix

In [65]:
pivot = embedding.pivot(index='week', columns='subreddit', values='avg_dist')
correlation = np.corrcoef(pivot.to_numpy())

In [66]:
labels = pivot.columns
data = go.Heatmap(z=correlation, y=labels, x=labels)

layout = {
    "title": "Candidate Distance Over Time - Pearson Correlation ({})".format(TIME_FRAME.title()),
    "xaxis": {"title": "Candidate"},
    "yaxis": {"title": "Candidate"},
}
fig = go.Figure(data=data,layout=layout)
# fig.write_html("visualizations/temporal/{}_dist_correlation.html".format(TIME_FRAME))
fig.show()

In [67]:
from sklearn.decomposition import PCA
pca =  PCA(n_components = 1)
one_dim =  pd.DataFrame(pca.fit_transform(embedding.iloc[:,2:]))
one_dim.columns = ["embedding"]
one_dim[["subreddit","week"]] = embedding[["subreddit","week"]]
one_dim

Unnamed: 0,embedding,subreddit,week
0,-0.461033,YangForPresidentHQ,2019-09-01
1,-0.281310,SandersForPresident,2019-09-01
2,0.726603,Pete_Buttigieg,2019-09-01
3,-0.459964,ElizabethWarren,2019-09-01
4,0.260726,Kamala,2019-09-01
...,...,...,...
419,0.125508,ElizabethWarren,2019-01-06
420,-0.444485,Kamala,2019-01-06
421,-0.337921,JoeBiden,2019-01-06
422,0.216252,BetoORourke,2019-01-06


In [69]:
pivot = one_dim.pivot(index='week', columns='subreddit', values="embedding")
correlation = np.corrcoef(pivot.to_numpy())

In [70]:
labels = pivot.columns
data = go.Heatmap(z=correlation, y=labels, x=labels)

layout = {
    "title": "Candidate Vector Correlation Over Time - 1-Dim PCA ({})".format(TIME_FRAME.title()),
    "xaxis": {"title": "Candidate"},
    "yaxis": {"title": "Candidate"},
}
fig = go.Figure(data=data,layout=layout)
fig.write_html("visualizations/temporal/{}_1_dim_correlation.html".format(TIME_FRAME))
fig.show()