In [None]:
import sys
import os
sys.path.append("..")
from commembed.jupyter import *
import commembed.linalg as linalg
import commembed.dimens as dimens
import numpy as np
import pandas as pd
import matplotlib as matplotlib
import matplotlib.pyplot as plt
import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
import seaborn as sns
import matplotlib.gridspec as gridspec
#import commembed.data as data
#spark = data.spark_context()

%config InlineBackend.figure_format = 'retina'
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

%load_ext autoreload
%autoreload 2

In [9]:
# Load the `all_objects` table through the `data` helper; later cells query a
# table of the same name via spark.sql.
# NOTE(review): neither `data` nor `spark` is bound by a visible statement (the
# explicit import / context lines in the first cell are commented out);
# presumably both come from `from commembed.jupyter import *` -- confirm.
all_objects = data.load("all_objects")
# all comments before 2019
# NOTE(review): this disabled query cuts off at 1546369200, whereas the live
# summary query in the next cell uses 1546300800 -- confirm which is intended.
#spark.sql("select count(*) from all_objects where id like 't1_%' and created_utc < 1546369200").show()

(Freshly loading table all_objects)


In [11]:
# Headline statistics over `all_objects` rows whose id matches 't1_%' and whose
# created_utc is below 1546300800: row count, distinct authors, and the
# earliest / latest creation timestamps formatted as 'yyyy-MM-dd HH:mm:ss'.
comment_summary_sql = """select count(*) as total_comments,
    count(distinct author) as distinct_authors,
    from_unixtime(min(created_utc), 'yyyy-MM-dd HH:mm:ss') as first_date,
    from_unixtime(max(created_utc), 'yyyy-MM-dd HH:mm:ss') as last_date from all_objects
    where id like 't1_%' and created_utc < 1546300800"""
comment_summary = spark.sql(comment_summary_sql)
comment_summary.show()

+--------------+----------------+-------------------+-------------------+
|total_comments|distinct_authors|         first_date|          last_date|
+--------------+----------------+-------------------+-------------------+
|    5075175045|        34665723|2005-12-12 00:26:28|2018-12-31 18:59:59|
+--------------+----------------+-------------------+-------------------+



In [2]:
# Load the embedding identified by ('reddit', 'master').
# NOTE(review): `load_embedding` is not imported explicitly; presumably it is
# provided by `from commembed.jupyter import *` -- confirm.
embedding = load_embedding('reddit', 'master')
# Load the dimension list named 'final'. A later cell treats every entry as a
# (name, definition) pair via d[0] / d[1].
dimen_list = dimens.load_dimen_list('final')
# Score the embedding against those dimensions. Frames derived from `scores`
# are later looked up by community name with `.loc[...]`.
scores = dimens.score_embedding(embedding, dimen_list)

In [3]:
def _percentile_bin(column):
    """Bin each value of `column` against that column's own 1st..99th percentiles."""
    cut_points = np.percentile(column, np.arange(1, 100))
    return np.digitize(column, cut_points)


def _standardize(column):
    """Subtract the column mean, then divide by the column standard deviation."""
    centred = column - np.mean(column)
    return centred / np.std(column)


# Both transforms run independently on every column of `scores` (axis=0).
scores_pctl = scores.apply(_percentile_bin, axis=0)
scores_z = scores.apply(_standardize, axis=0)

In [None]:
# Spot-check a few communities: z-scores for the first two, percentile bins for
# the third.
for _frame, _community in (
    (scores_z, "progressive"),
    (scores_z, "LesbianGamers"),
    (scores_pctl, "cycling"),
):
    print(_frame.loc[_community])

# total numbers / embedding cutoff numbers

In [12]:
# Load the `all_objects_monthly_user_counts` table through the `data` helper;
# the following cells query a table of that name via spark.sql.
monthly_user_counts = data.load("all_objects_monthly_user_counts")

In [14]:
# Verify integrity: total comment count and the first / last month present in
# the monthly table for months before 2019-01.
# NOTE(review): the recorded total (5075996316) is not equal to the
# total_comments figure recorded for the all_objects query (5075175045) --
# confirm whether that gap is expected.
integrity_sql = "select sum(num_comments), min(month), max(month) from all_objects_monthly_user_counts where month < '2019-01'"
spark.sql(integrity_sql).show()

+-----------------+----------+----------+
|sum(num_comments)|min(month)|max(month)|
+-----------------+----------+----------+
|       5075996316|   2005-12|   2018-12|
+-----------------+----------+----------+



In [15]:
# Comment totals per month (months before 2019-01), collected into pandas.
monthly_totals_sql = """
    select month, sum(num_comments) as total_comments
    from all_objects_monthly_user_counts
    where month < '2019-01'
    group by 1
"""
monthly_totals = spark.sql(monthly_totals_sql).toPandas()

In [17]:

# Publish the pandas score table (index turned into a column) to Spark SQL
# under the view name `scores`.
scores_df = spark.createDataFrame(scores.reset_index())
scores_df.createOrReplaceTempView("scores")

# Flag every count row with in_embedding = 1 when the left join on
# scores.community = subreddit yields a non-null `gender` value, else 0.
counts_w_sub_sql = """
    select author, subreddit, cast((gender is not null) as int) as in_embedding, num_comments
    from all_objects_monthly_user_counts
    
    left join scores
    on scores.community = subreddit
    
    where month < '2019-01'
"""
counts_w_sub = spark.sql(counts_w_sub_sql)
counts_w_sub.createOrReplaceTempView("counts_w_sub")

In [18]:
# Share of comments included in the embedding: the in_embedding flag weighted
# by num_comments, divided by the total num_comments.
pct_comments_sql = "select (sum(in_embedding*num_comments)/sum(num_comments)) as pct_comments_in_embedding from counts_w_sub"
spark.sql(pct_comments_sql).show()

+-------------------------+
|pct_comments_in_embedding|
+-------------------------+
|       0.9537859152378471|
+-------------------------+



In [19]:
# Share of users included in the embedding (at least 1 comment): take each
# author's maximum in_embedding flag, then average those flags over authors.
pct_users_any_sql = "select avg(in_embedding) as pct_users_in_embedding from (select max(in_embedding) as in_embedding from counts_w_sub group by author)"
spark.sql(pct_users_any_sql).show()

+----------------------+
|pct_users_in_embedding|
+----------------------+
|    0.9316604452842877|
+----------------------+



In [20]:
# Share of users included in the embedding (all comments): take each author's
# minimum in_embedding flag, then average those flags over authors.
pct_users_all_sql = "select avg(in_embedding) as pct_users_in_embedding from (select min(in_embedding) as in_embedding from counts_w_sub group by author)"
spark.sql(pct_users_all_sql).show()

+----------------------+
|pct_users_in_embedding|
+----------------------+
|    0.6895200859253661|
+----------------------+



In [21]:
# Distinct-subreddit count per author for months before 2019-01. The result is
# cached and registered as the temp view `user_sub_counts`.
user_sub_counts_sql = """
    select author, count(distinct subreddit) as sub_count
    from all_objects_monthly_user_counts
    where month < '2019-01'
    group by 1
"""
user_sub_counts = spark.sql(user_sub_counts_sql).cache()
user_sub_counts.createOrReplaceTempView("user_sub_counts")

In [22]:
# First the mean sub_count per author, then the proportion of authors whose
# sub_count is greater than 1.
for _summary_sql in (
    "select avg(sub_count) from user_sub_counts",
    "select avg(cast(sub_count > 1 as int)) as proportion_gt_1 from user_sub_counts",
):
    spark.sql(_summary_sql).show()

+-----------------+
|   avg(sub_count)|
+-----------------+
|9.613033611732035|
+-----------------+

+------------------+
|   proportion_gt_1|
+------------------+
|0.5285568330137629|
+------------------+



# appendix: input pairs and discovered pairs

In [7]:
def format_pair(a, b):
    """Render a community pair as three LaTeX table cells.

    Underscores in both names are escaped as ``\\_``. The cells are emitted in
    the order ``r/<b>``, arrow, ``r/<a>`` -- note that ``b`` is printed first.
    """
    escaped_a, escaped_b = (name.replace("_", "\\_") for name in (a, b))
    left_cell = "\\textsf{r/" + escaped_b + "}"
    arrow_cell = "$\\,\\to\\,$"
    right_cell = "\\textsf{r/" + escaped_a + "}"
    return left_cell + "& " + arrow_cell + " & " + right_cell

# Build the appendix table of seed pairs and discovered pairs, one section per
# dimension, and write it (plus a glossary of every community mentioned) to
# ../paper_resources/.

# Dimensions to include, in output order.
to_print = [
    "age","gender","partisan","affluence","age_b","gender_b","partisan_b",
    'sociality', 'edginess', 'time'
]

# Number of formatted pairs placed on each table row.
PAIRS_PER_ROW = 3

glossary = []
sections = []

# dimen_list entries are indexed as (name, definition): d[0] / d[1].
axes_by_name = {d[0]:d[1] for d in dimen_list}
for name in to_print:
    # Deliberately not named `data`: other cells call `data.load(...)`, and
    # rebinding that global here would break them when re-run afterwards.
    axis = axes_by_name[name]

    seeds = axis['seeds']
    positive_comms = axis['positive_comms']
    negative_comms = axis['negative_comms']
    glossary.extend(positive_comms)
    glossary.extend(negative_comms)

    pairs = [format_pair(a, b) for a, b in zip(positive_comms, negative_comms)]
    pairs = pairs[1:] # chop off seed

    # Lay the pairs out PAIRS_PER_ROW to a row. Chunking (rather than unpacking
    # three fixed slices) gives the same rows for nine pairs and no longer
    # raises when a dimension has a different number of pairs.
    rows = [
        " & ".join(pairs[start:start + PAIRS_PER_ROW]) + " \\\\"
        for start in range(0, len(pairs), PAIRS_PER_ROW)
    ]
    tbody = "\n".join(rows)

    # Section = dimension title + highlighted seed pair (seeds[0][0] -> seeds[0][1],
    # assumed to be two community names) followed by the remaining rows.
    sections.append(r"""
    
\multicolumn{6}{c}{\normalsize{\dimension{%s}}} &
 \cellcolor{blue!25} r/%s & \cellcolor{blue!25}$\,\to\,$ & \cellcolor{blue!25} r/%s \\
 
%s
    """ % (axis_name(name), seeds[0][0].replace("_", "\\_"), seeds[0][1].replace("_", "\\_"), tbody))

# An \hline separates consecutive sections; none precedes the first.
out = "\\hline".join(sections)

with open("../paper_resources/seeds.tex", "w") as text_file:
    text_file.write(out)
with open("../paper_resources/glossary_seeds.txt", "w") as text_file:
    text_file.write("\n".join(glossary))