In [1]:
import os
import tempfile

import findspark
findspark.init("/h/224/cameron/spark-3.0.0-preview2-bin-hadoop2.7")
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# Last expression of the cell: shows the session's configuration as (key, value) pairs.
spark.sparkContext.getConf().getAll()

[('spark.driver.host', 'adavm1.ais.sandbox'),
 ('spark.driver.memory', '400g'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'PySparkShell'),
 ('spark.app.id', 'local-1590801054061'),
 ('spark.executor.memory', '400g'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '45557'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.executor.cores', '16'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
# Load the Parquet data.
# NOTE: the restriction to political subreddits is currently disabled, so despite its
# name `political_comments` holds every row of /comments_2019.parquet.
# The disabled filter was:
#   select * from comments
#   where subreddit in
#       ("JoeBiden","Pete_Buttigieg","Kamala",
#        "SandersForPresident","BetoORourke","ElizabethWarren",
#        "BaemyKlobaechar","YangForPresidentHQ","politics","progressive",
#        "demsocialist","SocialDemocracy","centerleftpolitics","ConservativeDemocrat",
#        "moderatepolitics")
political_comments = (
    spark.read.load("/comments_2019.parquet")
    .fillna("")  # nulls in string columns become empty strings
    .select("subreddit", "author", "created_utc")  # only the columns used below
)
political_comments.printSchema()
# Last expression of the cell: the total number of rows (triggers a scan of the data).
political_comments.count()

root
 |-- subreddit: string (nullable = false)
 |-- author: string (nullable = false)
 |-- created_utc: integer (nullable = true)



1663587081

## Word2Vecf Files
[Word2vecf](https://github.com/BIU-NLP/word2vecf/blob/master/README.md) requires three inputs:
* word_vocabulary: file mapping each word (here, a subreddit name) to its count
* context_vocabulary: file mapping each context (here, a user who commented in a subreddit) to its count
* training_data: text file of space-delimited word-context pairs, one pair per line


### Word Vocabulary

In [5]:
# Word vocabulary: one row per subreddit with the number of comment rows it has.
word_vocabulary = political_comments.groupBy("subreddit").count()
# Preview the first rows of the result.
word_vocabulary.show()

+-------------------+-------+
|          subreddit|  count|
+-------------------+-------+
|              anime|3303822|
|       gentlefemdom|  62808|
|           Goldfish|  27526|
|         MLBTheShow| 587781|
|             travel| 386688|
|         costa_rica|  11099|
|       SaltLakeCity| 125313|
|UnresolvedMysteries| 332677|
|     TrueOffMyChest| 726501|
|         traderjoes|  69021|
|             AdPorn|   3857|
|         MensRights| 503714|
|            Amateur| 114799|
|  BeautyGuruChatter| 564832|
|         NHLStreams|  67022|
|          GemsofWar|  11396|
|      gastricsleeve|  31746|
|                NIU|   3486|
|    NewLondonCounty|  22403|
|    Notakeonlythrow|   6424|
+-------------------+-------+
only showing top 20 rows



### Context Vocabulary

In [None]:
# Context vocabulary: one row per author with the number of comment rows they wrote.
# Null authors were replaced by "" when the data was loaded, so they are grouped under "".
context_vocabulary = political_comments.groupBy("author").count()
# Preview the first rows of the result.
context_vocabulary.show()

### Training Data

In [None]:
# Training pairs: one row per distinct (subreddit, author) combination, plus its comment count.
# NOTE(review): the count column is dropped when the training file is written, so each
# pair contributes a single line there, while the two vocabulary files carry per-comment
# counts - confirm that this mismatch is intended.
training_data = political_comments.groupBy("subreddit","author").count()
# Preview the first rows of the result.
training_data.show()

In [None]:
# Bar chart of the comment count of every subreddit in the word vocabulary,
# drawn on a log-scaled y axis with vertical subreddit labels.
wdf = word_vocabulary.toPandas()
fig, ax = plt.subplots()
ax.set_yscale('log')
ax.set_title('Number Comments per Political Subreddit')
ax.set_ylabel('Number of Comments')
ax.bar(wdf["subreddit"], wdf["count"])
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation('vertical')
fig.tight_layout()

## Hyperparameter Sweep

In [None]:
# Create a temp context for the word and context vocabulary files (which get passed to the word2vecf script)
# The directory and its contents are removed when temp_dir.cleanup() is called or the
# object is finalized, so keep `temp_dir` bound for as long as the files are needed.
temp_dir = tempfile.TemporaryDirectory()

In [None]:
# Create temp files
# Build the paths of the three word2vecf inputs inside the temporary directory
# (training pairs, word vocabulary, context vocabulary). Nothing is written yet.
# Relies on `os` being imported in the notebook's import cell, so the cell also
# works on a fresh kernel run top to bottom.
file_data, file_wv, file_cv = (
    os.path.join(temp_dir.name, basename)
    for basename in ('data.txt', 'wv.txt', 'cv.txt')
)

In [None]:
# Dump the three word2vecf inputs as space-separated text without header or index.
# Each frame is collected to the driver with toPandas() right before it is written.
exports = [
    ("training", training_data.select("subreddit", "author"), file_data),
    ("word vocab", word_vocabulary, file_wv),
    ("context vocab", context_vocabulary, file_cv),
]
for label, frame, path in exports:
    print("Writing {} data to {}...".format(label, path))
    frame.toPandas().to_csv(path, header=False, index=False, sep=' ')

In [None]:
# Word2vec parameters, using negative sampling.
# Each (flag name, value) pair is later passed to word2vecf as "-<flag> <value>".
# NOTE(review): `lr` is the value given to the "-sample" flag, not a learning rate;
# the "-alpha" flag is fixed at 0.26 inside generate_embedding.
sample, lr = "sample", 0.0082
# Training algorithm: hierarchical softmax or negative sampling
training_alg, negative = "negative", 23

In [None]:
import subprocess
import sys
import os

def generate_embedding(p1, p2, param1, param2, file_data, file_wv, file_cv):
    """Train a word2vecf embedding, or reuse the output of an earlier run.

    The output file is named ``vecs_<p1>_<p2>.txt`` in the current working
    directory. If that file already exists, training is skipped.

    Args:
        p1: Value passed for the ``-<param1>`` flag.
        p2: Value passed for the ``-<param2>`` flag.
        param1: Name of the first swept word2vecf flag (without the dash).
        param2: Name of the second swept word2vecf flag (without the dash).
        file_data: Path of the training file of word-context pairs.
        file_wv: Path of the word vocabulary file.
        file_cv: Path of the context vocabulary file.

    Returns:
        The path of the embedding file.

    Raises:
        OSError: If the word2vecf binary cannot be started.
        subprocess.CalledProcessError: If word2vecf exits with a non-zero status.
    """
    output = "vecs_{}_{}.txt".format(p1, p2)
    if os.path.exists(output):
        # Cached result of a previous run with the same two values.
        return output

    # Argument list instead of a shell string: no shell parsing of paths or values.
    command = [str(arg) for arg in (
        "./word2vecf/word2vecf",
        "-train", file_data,
        "-wvocab", file_wv,
        "-cvocab", file_cv,
        "-output", output,
        "-threads", 180,
        "-alpha", 0.26,
        "-size", 200,
        "-{}".format(param1), p1,
        "-{}".format(param2), p2,
    )]
    print(" ".join(command))
    try:
        # check=True: a failed training run must not look like a success.
        subprocess.run(command, check=True)
    except BaseException:
        # Also covers an interrupted run. A partial file would otherwise be
        # picked up as a cached embedding by the existence check above.
        if os.path.exists(output):
            os.remove(output)
        raise

    return output

In [None]:
# Keyword arguments for generate_embedding: the two swept flags with their
# values, followed by the three input files written earlier.
embedding_args = dict(
    param1=sample,
    p1=lr,
    param2=training_alg,
    p2=negative,
    file_data=file_data,
    file_wv=file_wv,
    file_cv=file_cv,
)
generated_embeddings = generate_embedding(**embedding_args)

In [None]:
def load_embedding(filename):
    """Load a space-separated embedding file and L2-normalise its vectors.

    The first line of the file is skipped as a header. On every following
    line the first field is the label and the remaining fields are the
    vector components.

    Args:
        filename: Path of the embedding text file.

    Returns:
        A ``(subreddits, vectors)`` tuple: a Series of labels named
        ``'subreddits'`` and a DataFrame with one unit-length row per label.
    """
    embedding = pd.read_csv(filename, sep=' ', header=None, skiprows=1)
    embedding = embedding.rename(columns={0: 'subreddits'})
    subreddits = embedding.iloc[:, 0]
    # Take every column after the label rather than a hard-coded 1:200 slice,
    # which silently dropped the last of 200 dimensions. A trailing separator
    # on each line parses as an extra all-NaN column, so drop such columns.
    vectors = embedding.iloc[:, 1:].dropna(axis=1, how='all')
    # Scale every row to unit Euclidean length.
    vectors = vectors.divide(np.linalg.norm(vectors, axis=1), axis=0)
    return subreddits, vectors

In [None]:
# Load the labels and unit-length vectors from the embedding file produced above.
subreddits, vectors = load_embedding(generated_embeddings)
# Last expression of the cell: displays the vectors frame.
vectors

### Reduce to 3 dimensions

In [None]:
from sklearn.decomposition import PCA
# Project the embedding vectors onto their first three principal components
# (columns 0, 1, 2) and attach the matching subreddit label to every row.
pca = PCA(n_components=3)
three_dim = pd.DataFrame(pca.fit_transform(vectors)).assign(subreddit=subreddits)

In [None]:
import plotly.express as px
# 3D scatter plot: one point per row of three_dim, placed by columns 0, 1 and 2
# and labelled with the text of the "subreddit" column.
fig = px.scatter_3d(three_dim, x=0, y=1, z=2,text="subreddit")
fig.show()

### Reduce to 2 dimensions

In [None]:
# PCA Dim Reduction
# Same projection as the 3D case, but onto two components (columns 0 and 1),
# shown as a flat scatter plot with each point labelled by its subreddit.
pca = PCA(n_components=2)
two_dim = pd.DataFrame(pca.fit_transform(vectors)).assign(subreddit=subreddits)
fig = px.scatter(two_dim, x=0, y=1, text="subreddit")
fig.show()