In [4]:
import findspark
findspark.init("/h/224/cameron/spark-3.0.0-preview2-bin-hadoop2.7")
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import date_sub
from glob import glob
from datetime import datetime
import numpy as np
import pandas as pd
import tempfile

In [5]:
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.getConf().getAll()
TIME_FRAME = "weekly"
window = {
    "weekly": 7,
    "biweekly": 14,
}

In [3]:
# Load the Parquet data
comments = spark.read.load("/comments_2019.parquet").fillna("")
subreddits = spark.read.load("dataframes/subreddits.parquet")
comments = comments.join(subreddits, ['subreddit'], 'leftsemi')
cols = ['author','subreddit','created_utc']
comments = comments.select(*cols)
comments.printSchema()

root
 |-- author: string (nullable = false)
 |-- subreddit: string (nullable = false)
 |-- created_utc: integer (nullable = true)



In [4]:
# Add date column
# Create a function that returns the desired UDF from a timestamp 
to_udf = udf(lambda ts: datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:00:00"))

comments = comments.withColumn("timestamp", to_udf(comments["created_utc"]))
# Add column that aggregates by week 
comments = comments.withColumn("week",date_sub(next_day(col("timestamp"),"sunday"),window[TIME_FRAME]))
comments = comments.withColumn('subreddit', concat(lit('('),col('subreddit'),lit(','),col('week'),lit(')')))
comments = comments.drop(*["created_utc","timestamp","week"])
comments.show()

+--------------------+--------------------+
|              author|           subreddit|
+--------------------+--------------------+
|         andrewmyles|(NintendoSwitch,2...|
|             SeeDeez| (nyjets,2019-02-17)|
|       Serious_Sam_2|(RocketLeagueExch...|
|           [deleted]|(worldnews,2019-0...|
|      RanietsSharvas|(AnthemTheGame,20...|
|            Nole2424|(apexlegends,2019...|
|         mrfuckhead1|(synthesizers,201...|
|       thelucidvegan| (movies,2019-02-17)|
|IShouldWashTheDishes|(apexlegends,2019...|
|           tardis_11|(apexlegends,2019...|
|        Hamburtle666|(relationships,20...|
|     browneyedgirl79|(Random_Acts_Of_A...|
|           TumNarDok|(politics,2019-02...|
|   mywaterlooaccount|  (memes,2019-02-17)|
|        winterman666|(todayilearned,20...|
|disposableaccountass|(politics,2019-02...|
|             VM_1701|(MortalKombat,201...|
|       brave_pumpkin|(KotakuInAction,2...|
|           JibberGXP|(FortNiteBR,2019-...|
|            Reisz618|(AskReddit

## Word2Vecf Files
[Word2vecf](https://github.com/BIU-NLP/word2vecf/blob/master/README.md) requires three inputs
* training_data: text file of word-context pairs (space delimited)
* word_vocabulary: file mapping subreddits (strings) to their counts
* count_vocabulary: file mapping users (contexts -> subreddit commenters) to their counts

### Training Data

We want to avoid having to load the raw data as there are 1 billion+ rows. Working with aggregates from the start makes things much easier. 

*Since this is the temporal embedding we've already added the week into the subreddit name. Each subreddit/week combo is considered a new word with possible different contexts*

In [5]:
training_data = comments.groupBy(["subreddit","author"]).count().cache()
training_data.show()

+--------------------+------------------+-----+
|           subreddit|            author|count|
+--------------------+------------------+-----+
|  (memes,2019-02-17)| mywaterlooaccount|    1|
|(AnthemTheGame,20...|      Snow56border|   15|
|(MortalKombat,201...|      Frandaman760|   29|
|(AskReddit,2019-0...|          stefan5b|   97|
| (sports,2019-02-17)|         roaragami|    1|
|(AskReddit,2019-0...|    sweetbabygreen|   29|
|(intermittentfast...|        frevernewb|   15|
| (NHLHUT,2019-02-17)|      BX_Islanders|  105|
|(melbourne,2019-0...|      Sys6473eight|   37|
|(relationship_adv...|        _____i____|   40|
|    (nba,2019-02-17)|         EandT1003|    3|
|(JordanPeterson,2...|humanoidxincognito|    2|
|(specializedtools...|    MuddyScroll360|    1|
|(entitledparents,...|     AutoModerator| 3833|
|(starcraft,2019-0...|         [deleted]|  489|
|(The_Donald,2019-...|   mimefortheblind|    9|
|  (memes,2019-02-17)|         Daneiiiil|    1|
|(chicago,2019-02-17)|villagethriftidiot

In [6]:
training_data.count()

301431081

### Word Vocabulary

In [7]:
from pyspark.sql.functions import sum as _sum
word_vocabulary = training_data.groupBy("subreddit").agg(_sum('count').alias('count')).cache()
word_vocabulary.show()

In [8]:
word_vocabulary.count()

52243

### Context Vocabulary

In [9]:
context_vocabulary = training_data.groupBy("author").agg(_sum('count').alias('count')).cache()
context_vocabulary.show()

+--------------------+------+
|              author| count|
+--------------------+------+
|   sillygaythrowaway|   197|
|             Faulkal|    99|
|        Tittypookaka|   753|
|       DankMemesMods|207100|
|        aeonianvibes|   116|
|                 n67|   254|
|     Jason_Whorehees|  1413|
|     bookmovietvworm|  1991|
|         andreagassi|   604|
|        Beebeemybaby|   248|
|WestCoastBestCoast01|  1826|
|    NipDrunkChipmunk|    24|
|     angrykeyboarder|  1098|
|     Nameless_king69|   258|
|            nuhtalia|    55|
|          RhysRuther|    14|
|       Spocks-Nephew|  4064|
|              nshane|   455|
|        SansasAgency|   512|
|           Minic3211|   303|
+--------------------+------+
only showing top 20 rows



In [10]:
context_vocabulary.count()

15283648

## Write Vocabularies and Training Data to File

In [2]:
# Create a temp context for the word and context vocabulary files (which get passed to the word2vecf script)
import subprocess
import sys
import os
temp_dir = "/h/224/cameron/Political-Subreddit-Embedding/temp/temporal/"
subprocess.run("mkdir -p {}".format(temp_dir), shell=True)

CompletedProcess(args='mkdir -p /h/224/cameron/Political-Subreddit-Embedding/temp/temporal/', returncode=0)

In [8]:
# Create temp files
file_data = os.path.join(temp_dir, '{}_data.txt'.format(TIME_FRAME))
file_wv = os.path.join(temp_dir, '{}_wv.txt'.format(TIME_FRAME))
file_cv = os.path.join(temp_dir, '{}_cv.txt'.format(TIME_FRAME))

In [None]:
print("Writing training data to {}...".format(file_data))
training_data.toPandas().to_csv(file_data, header=False, index=False, sep=' ')
training_data.unpersist()

In [None]:
print("Writing word vocab data to {}...".format(file_wv))
word_vocabulary.toPandas().to_csv(file_wv, header=False, index=False, sep=' ')
word_vocabulary.unpersist()

In [None]:
print("Writing context vocab data to {}...".format(file_cv))
context_vocabulary.toPandas().to_csv(file_cv, header=False, index=False, sep=' ')
context_vocabulary.unpersist()

## Train Embedding

In [None]:
# Word2vec parameters, using negative sampling
# -alpha 0.18 -negative 35 -sample 0.0043 -size 150
from utils import generate_embedding, load_embedding
embedding_args = {
                    "param1": "sample", 
                    "p1": 0.0043, 
                    "param2": "negative", 
                    "p2": 35, 
                    "file_data": file_data , 
                    "file_wv": file_wv, 
                    "file_cv": file_cv,
                    "size": 150,
                    "alpha": 0.18
                 }
embedding = generate_embedding(embedding_args)
embedding

In [None]:
subreddits, vectors = load_embedding(embedding)
subreddits

### Parse Out Subreddit from Week Again

Since we've already trained all of the seperate emebeddings there isn't a need for them to be in the same column anymore. This will make animating the emebedding over time easier.

In [None]:
from utils import parse_tup

sub_df = pd.DataFrame(subreddits.apply(parse_tup).tolist())
sub_df.columns = ["subreddit","week"]
sub_df

## Visualize/Animate
1. Reduce to 3/2 dimensions
2. Add subreddit/week columns to factored dataframe
3. Visualize

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px
subprocess.run("mkdir -p visualizations/temporal", shell=True)
left_subreddits = ["JoeBiden","Pete_Buttigieg","Kamala",
                        "SandersForPresident","BetoORourke","ElizabethWarren",
                        "BaemyKlobaechar","YangForPresidentHQ","politics","progressive",
                        "demsocialist","SocialDemocracy","centerleftpolitics",
                        "ConservativeDemocrat","moderatepolitics","ChapoTrapHouse"]
right_subreddits = ["The_Donald","Conservative","ShitPoliticsSays","progun","Republican","Capitalism"]

In [None]:
# PCA Dim Reduction -> 2 dimensions
pca =  PCA(n_components = 2)
two_dim =  pd.DataFrame(pca.fit_transform(vectors))
two_dim[["subreddit","week"]] = sub_df
idx = pd.MultiIndex.from_product([two_dim['week'].unique(), two_dim['subreddit'].unique()],
                                 names=['week', 'subreddit'])

# In the case that there isn't a vector for a specific week/subreddit we bacfill the vector from the previous
two_dim = two_dim.set_index(['week', 'subreddit']).reindex(idx).reset_index().sort_values('week').bfill()
two_dim = two_dim[two_dim["subreddit"].isin(left_subreddits) | two_dim["subreddit"].isin(right_subreddits)]
two_dim["partisan"] = np.where(two_dim["subreddit"].isin(left_subreddits), 'left', 'right')
two_dim

In [1]:
(max_x, max_y), (min_x, min_y) = two_dim[[0,1]].max(axis=0), two_dim[[0,1]].min(axis=0)
args = {
    "x": 0,
    "y": 1,
    "hover_name": "subreddit",
    "text": "subreddit",
    "opacity": 0.7,
    "color": "partisan",
    "animation_frame": two_dim.week.astype(str),
    "animation_group": "subreddit",
    "range_x": [min_x-3,max_x+3],
    "range_y": [min_y-3,min_y+3],
}
fig = px.scatter(two_dim,**args)
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.write_html("visualizations/temporal/{}_2d_scatter.html".format(TIME_FRAME))
fig.show()

NameError: name 'two_dim' is not defined

In [2]:
# PCA Dim Reduction -> 3 dimensions
pca =  PCA(n_components = 3)
three_dim =  pd.DataFrame(pca.fit_transform(vectors))
three_dim[["subreddit","week"]] = sub_df
idx = pd.MultiIndex.from_product([three_dim['week'].unique(), three_dim['subreddit'].unique()],
                                 names=['week', 'subreddit'])

# In the case that there isn't a vector for a specific week/subreddit we bacfill the vector from the previous
three_dim = three_dim.set_index(['week', 'subreddit']).reindex(idx).reset_index().sort_values('week').bfill()
three_dim = three_dim[three_dim["subreddit"].isin(left_subreddits) | three_dim["subreddit"].isin(right_subreddits)]
three_dim["partisan"] = np.where(three_dim["subreddit"].isin(left_subreddits), 'left', 'right')
three_dim

NameError: name 'PCA' is not defined

In [None]:
(max_x, max_y, max_z), (min_x, min_y, min_z) = three_dim[[0,1,2]].max(axis=0), three_dim[[0,1,2]].min(axis=0)
args = {
    "x": 0,
    "y": 1,
    "z": 2,
    "hover_name": "subreddit",
#     "text": "subreddit",
    "opacity": 0.7,
    "color": "partisan",
    "animation_frame": three_dim.week.astype(str),
    "animation_group": "subreddit",
    "range_x": [min_x-3,max_x+3],
    "range_y": [min_y-3,min_y+3],
    "range_z": [min_z-3,max_z+1]

}
fig = px.scatter_3d(three_dim,**args)
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.write_html("visualizations/temporal/{}_3d_scatter.html".format(TIME_FRAME))
fig.show()