In [1]:
import paths
from commembed.jupyter import *
import commembed.linalg as linalg
import commembed.dimens as dimens
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
%config InlineBackend.figure_format = 'retina'
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
from pyspark.sql.functions import *

import commembed.data as data
# Shared Spark handle obtained from the project helper; the cells below use it
# for spark.sql(...), spark.read.* and spark.createDataFrame(...).
spark = data.spark_context()

# Data prep

In [None]:
# Real ('master') Reddit embedding and the 'final' dimension list. Both are
# read as globals inside create_nullhyp_counts() below.
embedding = load_embedding('reddit', 'master')
# NOTE(review): a later cell calls dimens.load_dimen_list('final', embedding)
# with the embedding as a second argument — confirm which call form is intended.
dimen_list = dimens.load_dimen_list('final')

def create_nullhyp_counts():
    """Build per-(author, subreddit) comment counts for the null-hypothesis data.

    Steps:
      1. Read the space-separated (subreddit, author) CSV and write the
         unfiltered per-(author, subreddit) counts to parquet.
      2. Score the real and the null-hypothesis embeddings and z-score every
         dimension column.
      3. For each of "partisan" and "partisan_b": select the real communities
         whose `<dimen>_neutral` z-score exceeds the cutoff, take the same
         number of top-ranked null-hypothesis communities, plot both score
         distributions, and write the counts restricted to those
         null-hypothesis communities to
         `all_objects_nullhyp_user_counts_filt_<dimen>.parquet`.

    Reads the notebook globals `spark`, `embedding`, `dimen_list` and
    `neutral_cutoffs`. Returns nothing; the results are the parquet files
    written and the figures shown.
    """
    # NOTE(review): Spark may not expand '~' in paths — confirm this resolves
    # to the intended home-directory location.
    nullhyp_data_path = "~/reddit_nullhyp/train_set_unshuffled.csv.cat"
    nullhyp_data = spark.read.csv(nullhyp_data_path, schema='subreddit STRING, author STRING', sep=' ')
    # The SQL statements below refer to the data by table name, so the
    # DataFrame has to be registered as a temp view before they can run.
    nullhyp_data.createOrReplaceTempView("nullhyp_data")

    # No write mode is given, so this fails if the output already exists.
    spark.sql("""select author, subreddit, count(*) as num_comments
          from nullhyp_data group by 1, 2""") \
        .write.parquet("/ada/data/reddit/parquet/all_objects_nullhyp_user_counts.parquet")

    nullhyp_embedding = load_embedding('reddit', 'nullhyp')
    copycat_dimen_list = dimens.load_copycat_dimen_list('final', nullhyp_embedding)

    def zscore(column):
        # Standardize one dimension column (population std, numpy default).
        return (column - np.mean(column)) / np.std(column)

    def plot_all_vs_political(all_scores, political_scores, title):
        # Overlay the selected subset on the full distribution of one dimension.
        plt.hist(all_scores, bins=100, label='all')
        plt.hist(political_scores, bins=100, label='political')
        plt.title(title)
        plt.legend()  # the labels above are invisible without a legend
        plt.show()

    scores = dimens.score_embedding(embedding, dimen_list).apply(zscore, axis=0)
    nullhyp_scores = dimens.score_embedding(nullhyp_embedding, copycat_dimen_list).apply(zscore, axis=0)

    for partisan_dimen in ["partisan", "partisan_b"]:
        neutral_col = partisan_dimen + '_neutral'

        # NOTE(review): neutral_cutoffs is not defined in this notebook;
        # presumably it comes from `from commembed.jupyter import *` — confirm.
        scores_subset = scores[scores[neutral_col] > neutral_cutoffs[partisan_dimen]]
        print("%d political subreddits selected" % len(scores_subset))

        # Match the real subset's size by taking the top-ranked nullhyp communities.
        nullhyp_scores_subset = nullhyp_scores.sort_values(neutral_col, ascending=False).iloc[:len(scores_subset)]
        print("%d nullhyp political subreddits selected" % len(nullhyp_scores_subset))
        display(nullhyp_scores_subset.sort_values(partisan_dimen).head(5))

        plot_all_vs_political(nullhyp_scores[partisan_dimen], nullhyp_scores_subset[partisan_dimen], partisan_dimen)
        plot_all_vs_political(scores[partisan_dimen], scores_subset[partisan_dimen], partisan_dimen)

        # Expose the selected communities to SQL so the counts can be filtered by join.
        spark.createDataFrame(nullhyp_scores_subset.reset_index()) \
            .createOrReplaceTempView("scores")

        result = spark.sql("""

            select author, subreddit, count(*) as num_comments

            from nullhyp_data

            inner join scores
            on subreddit = community

            group by 1, 2
        """)
        result.write.parquet(data.DATA_PATH+"/all_objects_nullhyp_user_counts_filt_%s.parquet" % partisan_dimen, mode='overwrite')
    
#create_nullhyp_counts()

In [None]:
# Run configuration for the data-prep cells:
#   partisan_dimen -- dimension whose filtered counts are used, or None for the
#                     unfiltered table
#   nullhyp        -- True to use the null-hypothesis counts instead of the real ones
partisan_dimen = "partisan"
nullhyp = True

# Filtered tables carry a "_filt_<dimen>" suffix; unfiltered tables have none.
if partisan_dimen is None:
    data_suffix = ""
else:
    data_suffix = f"_filt_{partisan_dimen}"

if nullhyp:
    user_counts_table = "all_objects_nullhyp_user_counts" + data_suffix
else:
    user_counts_table = "all_objects_user_counts" + data_suffix

# NOTE(review): a later cell queries this table by name in SQL; presumably
# data.load also registers it as a Spark view — confirm.
df = data.load(user_counts_table)

In [None]:
# Produce scores_z, the per-community z-scores that get binned in the next cell.
if partisan_dimen is None:
    # No partisan filter: score the chosen embedding against its dimension list
    # and standardize each dimension column.
    embedding = load_embedding('reddit', 'nullhyp' if nullhyp else 'master')
    load_dimens = dimens.load_copycat_dimen_list if nullhyp else dimens.load_dimen_list
    dimen_list = load_dimens('final', embedding)

    scores = dimens.score_embedding(embedding, dimen_list)
    scores_z = scores.apply(lambda column: (column - np.mean(column)) / np.std(column), axis=0)
else:
    # Partisan filter active: use the project helper's z-scored frame; its
    # second return value is not needed here.
    scores_z, _ = load_politics_z_df(partisan_dimen, "all")

In [None]:

# Discretize every z-score column into `num_bins` labelled bins.
#   num_bins -- total number of bins; odd so that one bin is centred on 0
#   breadth  -- |z| at which the outermost bin edge sits
num_bins = 5
breadth = 2

# A string cannot be raised in Python 3 (it produces a TypeError), so raise a
# real exception. num_bins == 1 would also divide by zero below.
if num_bins < 3 or (num_bins % 2) != 1:
    raise ValueError("num_bins must be an odd number >= 3 to accommodate the center bin")

bins_per_side = (num_bins - 1) // 2
bin_size = breadth / bins_per_side
positive_edges = np.arange(1, bins_per_side + 1) * bin_size
negative_edges = np.flip(positive_edges * -1)

# There is no edge at 0, so the center bin spans [-bin_size, bin_size) and is
# twice as wide as the others. np.digitize returns an index in 0..len(bins),
# which selects one of the len(bins) + 1 labels; values beyond the outermost
# edges get the outermost labels.
bin_labels = np.concatenate([negative_edges, [0], positive_edges])
bins = np.concatenate([negative_edges, positive_edges])

scores_bin = scores_z.apply(lambda x: bin_labels[np.digitize(x, bins)], axis=0)

# Keep a handle on the real-data bins; a later cell overwrites scores_bin.
scores_bin_orig = scores_bin

In [None]:
# Null-hypothesis runs only: build a bin assignment for nullhyp communities
# that has exactly the same bin sizes as the real data.
if nullhyp and partisan_dimen is not None:

    nullhyp_embedding = load_embedding('reddit', 'nullhyp')
    copycat_dimen_list = dimens.load_copycat_dimen_list('final', nullhyp_embedding)
    nullhyp_scores = dimens.score_embedding(nullhyp_embedding, copycat_dimen_list)

    # Take the n highest-ranked nullhyp communities on the "<dimen>_neutral"
    # score, n being the number of communities in the real binned frame, then
    # order them by their score on the partisan dimension.
    n_political = len(scores_bin)
    nullhyp_ranked = (
        nullhyp_scores
        .sort_values(partisan_dimen + "_neutral", ascending=False)
        .iloc[:n_political][[partisan_dimen]]
        .rename(columns={partisan_dimen: "partisan_dimen"})
        .sort_values("partisan_dimen")
        .reset_index()
    )

    # Pair rows positionally with the real bins sorted the same way, so the
    # i-th nullhyp community inherits the i-th real bin label.
    actual_ranked = scores_bin_orig.sort_values("partisan_dimen").reset_index()
    nullhyp_scores_bin = nullhyp_ranked.join(actual_ranked, rsuffix='_actual')

    plt.hist(nullhyp_scores_bin["partisan_dimen_actual"])

    # From here on scores_bin maps each nullhyp community to its borrowed bin.
    scores_bin = (
        nullhyp_scores_bin[['community', 'partisan_dimen_actual']]
        .rename(columns={"partisan_dimen_actual": "partisan_bin"})
        .set_index("community")
    )

In [None]:
# Reshape the bin assignments to long form (community, dimen, bin) and expose
# them to SQL as the temp view "scores_bin".
scores_bin_long = scores_bin.reset_index().melt(id_vars=["community"], value_name="bin", var_name="dimen")
spark.createDataFrame(scores_bin_long) \
    .createOrReplaceTempView("scores_bin")

# Total comments per (author, dimen, bin): each author's counts are joined to
# the bin of the community they were made in and summed.
# NOTE(review): assumes a table/view named after user_counts_table is already
# registered with Spark — confirm.
user_bin_freq = spark.sql(f"""

    select author, dimen, bin, sum(num_comments) as num_comments
    
    from {user_counts_table}
    
    inner join scores_bin
    on subreddit = community

    group by 1, 2, 3
""")

# The analysis cell reads "user_dists[_nullhyp]_bins=..._breadth=..._<suffix>",
# so both variants must start with "user_dists". The real-data variant was
# previously written as "user_dist", which the reader never finds.
base = "user_dists_nullhyp" if nullhyp else "user_dists"
user_bin_freq.write.parquet("/ada/data/reddit/parquet/echo_chambers/%s_bins=%s_breadth=%s_%s.parquet" % (base, num_bins, breadth, data_suffix),
                           mode='overwrite')

# Analysis

In [None]:
# Analysis configuration: which user-distribution file to analyse.
data_suffix = "_filt_partisan"
nullhyp = False

if nullhyp:
    addtl = "_nullhyp"
else:
    addtl = ""
# The bin count and breadth are fixed in the name here (bins=5, breadth=2).
user_dists_table = f"user_dists{addtl}_bins=5_breadth=2_" + data_suffix

# Load the per-(author, dimen, bin) counts, drop the "[deleted]" author and
# register the result as the temp view "user_dists".
user_dists_path = "/ada/data/reddit/parquet/echo_chambers/" + user_dists_table + ".parquet"
user_dists = spark.read.parquet(user_dists_path).filter(col("author") != "[deleted]")
user_dists.createOrReplaceTempView("user_dists")

# Comments per (author, dimen), summed over all bins.
totals = spark.sql("select author, dimen, sum(num_comments) as total_comments from user_dists group by 1, 2").cache()
totals.createOrReplaceTempView("totals")

# Comments per (bin, dimen), summed over all authors.
bin_totals = spark.sql("select bin, dimen, sum(num_comments) as total_comments from user_dists group by 1, 2").cache()
bin_totals.createOrReplaceTempView("bin_totals")

# Save the bin sizes next to the notebook.
bin_sizes_csv = 'echo_chamber_bin_sizes_%s.csv' % user_dists_table
bin_totals.toPandas().to_csv(bin_sizes_csv)

In [None]:
# Bin-to-bin activity, weighted by comments.
#
# Innermost query (fracs): self-joins user_dists on (author, dimen), giving one
#   row per author and pair of bins (bin1, bin2) the author commented in, with
#   the author's comment counts in each bin and bin2_frac = the author's bin2
#   comments divided by the author's total comments for that dimen.
# Middle query (fracs2): per (dimen, bin1, bin2), sums bin2_frac and
#   bin2_comments over authors, each weighted by the author's bin1 comments.
# Outer query: divides those sums by bin1's total comments, yielding
#   comment-weighted averages over the authors active in bin1.
#
# The f-prefix has no placeholders to fill here. Reads the temp views
# "user_dists", "totals" and "bin_totals"; the result is collected to pandas
# and written to a CSV named after user_dists_table.
result = spark.sql(f"""
    select fracs2.dimen, bin1, bin2, weighted_bin2_frac/bin_totals.total_comments as bin1_avg_frac_bin2,
        weighted_bin2_comments/bin_totals.total_comments as bin1_avg_bin2_comments
    from (
    select dimen, bin1, bin2, sum(bin1_comments*bin2_frac) as weighted_bin2_frac,
        sum(bin1_comments*bin2_comments) as weighted_bin2_comments
    
    from (
        select s1.author, s1.dimen, s1.bin as bin1, s2.bin as bin2,
            s1.num_comments as bin1_comments,
            s2.num_comments as bin2_comments,
            (s2.num_comments/total_comments) as bin2_frac
            
        from user_dists as s1

        inner join user_dists as s2
        on s1.author = s2.author and s1.dimen = s2.dimen

        inner join totals
        on s1.author = totals.author and s1.dimen = totals.dimen
    ) fracs
    
    group by 1, 2, 3
    ) fracs2
    
    inner join bin_totals
    on bin_totals.bin = bin1 and bin_totals.dimen = fracs2.dimen
""").toPandas()
result.to_csv("echo_chamber_results_%s.csv" % user_dists_table)

In [None]:
# Per community version
# Total comments per subreddit, cached and exposed to SQL as "sub_totals".
# NOTE(review): no cell visible in this notebook registers a view named
# "user_counts" — confirm it is created elsewhere before this cell runs.
sub_totals = spark.sql("select subreddit, sum(num_comments) as total_comments from user_counts group by 1").cache()
sub_totals.createOrReplaceTempView("sub_totals")

In [None]:
# Per-subreddit counterpart of the bin-to-bin query.
#
# Innermost query (fracs): joins each (author, subreddit) count to that
#   author's rows in user_dists (on author only, so one row per dimen and bin
#   the author has), with bin2_frac = the author's comments in bin2 divided by
#   the author's total comments for that dimen.
# Middle query (fracs2): per (dimen, subreddit, bin2), sums bin2_frac over
#   authors, weighted by the author's comments in the subreddit.
# Outer query: divides by the subreddit's total comments from "sub_totals".
#
# Overwrites the `result` variable from the bin-level query; the trailing bare
# `result` displays the frame as the cell output.
result = spark.sql(f"""
    select fracs2.dimen, fracs2.subreddit, bin2, weighted_bin2_frac/sub_totals.total_comments as subreddit_avg_frac_bin2
    from (
    select dimen, subreddit, bin2, sum(sub_comments*bin2_frac) as weighted_bin2_frac
    
    from (
        select user_counts.author, s2.dimen, subreddit, s2.bin as bin2,
            user_counts.num_comments as sub_comments, (s2.num_comments/total_comments) as bin2_frac
            
        from user_counts

        inner join user_dists as s2
        on user_counts.author = s2.author

        inner join totals
        on s2.author = totals.author and s2.dimen = totals.dimen
    ) fracs
    
    group by 1, 2, 3
    ) fracs2
    
    inner join sub_totals
    on sub_totals.subreddit = fracs2.subreddit
""").toPandas()
result

In [None]:
# Attach each subreddit's total comment count to the per-community results,
# keep only subreddits present in the 'master' embedding, and save to parquet.
largest_subs = sub_totals.toPandas().set_index("subreddit")

# `result` is rebound in place, so this cell is meant to run once per query run.
result = result.join(largest_subs, on="subreddit", how="inner")

embedding = load_embedding('reddit', 'master')
in_embedding = result["subreddit"].isin(embedding.vectors.index)
result = result[in_embedding]

community_results_path = "echo_chamber_community_results_%s.parquet" % user_dists_table
result.to_parquet(community_results_path)