In [1]:
import os

# Record the directory the kernel started in exactly once, so that re-running
# this cell does not keep climbing further up the directory tree (two relative
# chdir("..") calls move two more levels up on every execution).
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Work from two levels above the notebook's directory.
# NOTE(review): presumably this is the repository root holding the local
# `convokit` package that the following cells import -- confirm.
os.chdir(os.path.abspath(os.path.join(NOTEBOOK_START_DIR, "..", "..")))
os.getcwd()

'/Users/macuser/Desktop/TriadMotif'

In [2]:
import convokit

In [3]:
from convokit.threadRandomizer import randomize_thread

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

First we download the reddit corpus:

In [5]:
convokit.__file__

'/Users/macuser/Desktop/TriadMotif/convokit/__init__.py'

In [6]:
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus"))

We will compute features over only the first 10 comments that occur in a thread, for threads which are at least 10 comments long (controlling for thread length in this way):

In [7]:
threads = corpus.utterance_threads(prefix_len=10)

In [8]:
def display_thread_helper(thread, root, indent=0):
    """Print the reply tree below `root`, one author name per line.

    thread: mapping of utterance id -> utterance; each utterance is read for
        its `.user.name` and `.reply_to` attributes.
    root:   id of the utterance to print first.
    indent: number of leading spaces for `root`; each reply level adds 4.
    """
    padding = " " * indent
    print(padding + thread[root].user.name)
    # Direct replies are the utterances whose `reply_to` points at `root`;
    # recurse into each one, in the mapping's iteration order.
    for utt_id, utt in thread.items():
        if utt.reply_to == root:
            display_thread_helper(thread, utt_id, indent=indent + 4)
def display_thread(threads, root):
    """Print the reply tree of the thread stored under `root` in `threads`.

    threads: mapping of root utterance id -> thread (itself a mapping of
        utterance id -> utterance).
    root:    id of the thread's root utterance; used both to look the thread
        up and as the starting node for printing.
    """
    thread = threads[root]
    return display_thread_helper(thread, root)

The following are threads that come from the /r/canada subreddit. 

In [10]:
demo_threads = [
    # https://www.reddit.com/r/canada/comments/mmyld/polygamypolyamoury_laws_upheld_in_supreme_court/c32bcq8/
                't1_c32bcq8', 
    # https://www.reddit.com/r/canada/comments/mmd20/look_what_i_found_today_yes_and_it_is_in_canada/c322oa7/
                't1_c322oa7',
    # https://www.reddit.com/r/canada/comments/mo0lt/happy_thursday/c32hv0h/
                't1_c32hv0h'
               ]

We print the structure of the thread: the reply-structure (where subsequent replies in the comment tree are indented), and the authors of each comment. There are some high-level qualitative differences that you might immediately see; our goal is to automatically extract a superset of these intuitive distinctions. For instance, we notice that thread t1_c32bcq8 has a very linear structure, while t1_c322oa7 is flatter; to capture this difference, we might compute statistics on the distribution of in-degrees of nodes in the reply tree. We also intuit that some conversations involve a few people replying repeatedly to each other, whereas others may involve users stopping by to chime in once and then leaving; we'll later codify this difference via statistics on the indegrees and outdegrees of hypernodes (users).

In [11]:
for thread in demo_threads:
    print(thread)
    print('---')
    display_thread(threads, thread)
    print()

t1_c32bcq8
---
drays
    QuirkyGroundhog
        Sajentine
            QuirkyGroundhog
                drays
                    QuirkyGroundhog
                        drays
                            QuirkyGroundhog
                                drays
                                    QuirkyGroundhog

t1_c322oa7
---
[deleted-t1_c322oa7]
    Aneeid
    veedubbin86
        figureskatingaintgay
            sirspate
        lovevolcano
    bunny1979
        veedubbin86
            bunny1979
    SoFaKiNg42

t1_c32hv0h
---
Can-eh-dian
    h00pla
        Switchbladeannie
        lengthynewt
            h00pla
                lengthynewt
                    h00pla
    thoriginal
    lengthynewt
    andrewmp



Let's randomize these threads and see the (random) results:

In [12]:
for thread in demo_threads:
    randomized = randomize_thread(thread, threads[thread])
    print(thread)
    print('---')
    display_thread({thread: randomized}, thread)
    print()

t1_c32bcq8
---
drays
    QuirkyGroundhog
    Sajentine
        QuirkyGroundhog
            drays
                QuirkyGroundhog
        drays
            QuirkyGroundhog
                drays
                    QuirkyGroundhog

t1_c322oa7
---
[deleted-t1_c322oa7]
    Aneeid
        veedubbin86
        bunny1979
            veedubbin86
                SoFaKiNg42
                    figureskatingaintgay
        sirspate
    bunny1979
        lovevolcano

t1_c32hv0h
---
Can-eh-dian
    h00pla
        Switchbladeannie
        thoriginal
            lengthynewt
            h00pla
        lengthynewt
            h00pla
                lengthynewt
                andrewmp



Let's randomize all threads to create a random baseline:

In [13]:
randomized = {root:randomize_thread(root, threads[root]) for root in threads}  

Construct a new Corpus using randomized threads:

In [14]:
utts = [utt for t in randomized for utt in randomized[t].values()]

In [15]:
c = convokit.model.Corpus(utterances=utts)

We extract hypergraph features for this new Corpus, as well as for the original Corpus.

In [16]:
random_hc = convokit.HyperConvo(c)

In [17]:
random_feats = random_hc.fit_transform()

In [18]:
random_feats['t1_c32bcq8']

{'count[NO_EDGE_TRIADS]': 0,
 'count[SINGLE_EDGE_TRIADS]': 0,
 'count[INCOMING_TRIADS]': 0,
 'count[OUTGOING_TRIADS]': 0,
 'count[DYADIC_TRIADS]': 0,
 'count[UNIDIRECTIONAL_TRIADS]': 0,
 'count[INCOMING_2TO3_TRIADS]': 0,
 'count[INCOMING_1TO3_TRIADS]': 0,
 'count[DIRECTED_CYCLE_TRIADS]': 0,
 'count[OUTGOING_3TO1_TRIADS]': 0,
 'count[INCOMING_RECIPROCAL_TRIADS]': 0,
 'count[OUTGOING_RECIPROCAL_TRIADS]': 0,
 'count[DIRECTED_CYCLE_1TO3_TRIADS]': 0,
 'count[DIRECIPROCAL_TRIADS]': 0,
 'count[DIRECIPROCAL_2TO3_TRIADS]': 1,
 'count[TRIRECIPROCAL_TRIADS]': 0,
 'count[LATENT_NO_EDGE_TRIADS]': 1,
 'count[LATENT_SINGLE_EDGE_TRIADS]': 1,
 'count[LATENT_INCOMING_TRIADS]': 0,
 'count[LATENT_OUTGOING_TRIADS]': 0,
 'count[LATENT_DYADIC_TRIADS]': 0,
 'count[LATENT_UNIDIRECTIONAL_TRIADS]': 1,
 'count[LATENT_INCOMING_2TO3_TRIADS]': 0,
 'count[LATENT_INCOMING_1TO3_TRIADS]': 1,
 'count[LATENT_DIRECTED_CYCLE_TRIADS]': 0,
 'count[LATENT_OUTGOING_3TO1_TRIADS]': 0,
 'count[LATENT_INCOMING_RECIPROCAL_TRIADS]': 

In [19]:
type(random_feats)

dict

original corpus:

In [20]:
# create a hyperconvo object and use it to extract features
hc = convokit.HyperConvo(corpus)
threads_feats = hc.fit_transform()

In [21]:
def clean_value(x):
    """Return `x` unchanged if it is finite, otherwise the sentinel -1.

    Non-finite means +inf, -inf or NaN.
    """
    return x if np.isfinite(x) else -1

In [22]:
feat_names = list(next(iter(threads_feats.values())).keys())

In [23]:
feat_names[:10]

['count[NO_EDGE_TRIADS]',
 'count[SINGLE_EDGE_TRIADS]',
 'count[INCOMING_TRIADS]',
 'count[OUTGOING_TRIADS]',
 'count[DYADIC_TRIADS]',
 'count[UNIDIRECTIONAL_TRIADS]',
 'count[INCOMING_2TO3_TRIADS]',
 'count[INCOMING_1TO3_TRIADS]',
 'count[DIRECTED_CYCLE_TRIADS]',
 'count[OUTGOING_3TO1_TRIADS]']

In [24]:
# Thread ids and cleaned feature rows for the randomized corpus, kept in the
# same order; every row lists its values in `feat_names` order.
random_thread_ids = list(random_feats.keys())
r_feats = [
    [clean_value(thread_feats[name]) for name in feat_names]
    for thread_feats in random_feats.values()
]

In [25]:
len(r_feats)

99145

In [26]:
# Thread ids and cleaned feature rows for the original corpus, kept in the
# same order; every row lists its values in `feat_names` order.
thread_ids = list(threads_feats.keys())
feats = [
    [clean_value(thread_feats[name]) for name in feat_names]
    for thread_feats in threads_feats.values()
]

For later convenience we will store feature values in a dataframe:

In [27]:
random_feat_df = pd.DataFrame(data=r_feats, index=random_thread_ids, columns=feat_names)

In [28]:
feat_df = pd.DataFrame(data=feats, index=thread_ids, columns=feat_names)

Here are some examples of features computed over the three example threads from before:

In [29]:
motif_count_feats = [x for x in feat_names if ('count' in x) and ('mid' not in x) and ('present' not in x)]
prob_feats = [x for x in feat_names if ('trans' in x)]

Let's get an aggregate statistic:

In [30]:
display_thread(threads, 't1_c0odlio')

[deleted-t1_c0odlio]
    geekologist
        dlogan3344
    _greg
        [deleted-t1_c0odlio]
    billmeyersriggs
        [deleted-t1_c0odlio]
            billmeyersriggs
                [deleted-t1_c0odlio]
    dlogan3344


# Deviations

In [52]:
overall_mean_diff = feat_df.mean() - random_feat_df.mean()
overall_mean_diff[:10]

count[NO_EDGE_TRIADS]           1.866852
count[SINGLE_EDGE_TRIADS]      -1.903929
count[INCOMING_TRIADS]          0.046921
count[OUTGOING_TRIADS]         -0.624792
count[DYADIC_TRIADS]            0.847526
count[UNIDIRECTIONAL_TRIADS]   -1.000565
count[INCOMING_2TO3_TRIADS]    -0.311150
count[INCOMING_1TO3_TRIADS]     0.466963
count[DIRECTED_CYCLE_TRIADS]   -0.006546
count[OUTGOING_3TO1_TRIADS]     0.061738
dtype: float64

In [53]:
mean_deviation = overall_mean_diff / random_feat_df.std()
mean_deviation[:10]

count[NO_EDGE_TRIADS]           0.130412
count[SINGLE_EDGE_TRIADS]      -0.149408
count[INCOMING_TRIADS]          0.017588
count[OUTGOING_TRIADS]         -0.510938
count[DYADIC_TRIADS]            0.657844
count[UNIDIRECTIONAL_TRIADS]   -0.392988
count[INCOMING_2TO3_TRIADS]    -0.404613
count[INCOMING_1TO3_TRIADS]     0.298046
count[DIRECTED_CYCLE_TRIADS]   -0.077132
count[OUTGOING_3TO1_TRIADS]     0.080939
dtype: float64

In [37]:
mean_deviation[motif_count_feats].sort_values()

count[LATENT_OUTGOING_TRIADS]              -0.540036
count[OUTGOING_TRIADS]                     -0.510938
count[LATENT_INCOMING_2TO3_TRIADS]         -0.490490
count[LATENT_DIRECTED_CYCLE_1TO3_TRIADS]   -0.439643
count[INCOMING_2TO3_TRIADS]                -0.404613
count[UNIDIRECTIONAL_TRIADS]               -0.392988
count[LATENT_UNIDIRECTIONAL_TRIADS]        -0.382501
count[DIRECTED_CYCLE_1TO3_TRIADS]          -0.367873
count[LATENT_DIRECIPROCAL_2TO3_TRIADS]     -0.346667
count[DIRECIPROCAL_2TO3_TRIADS]            -0.330094
count[LATENT_INCOMING_RECIPROCAL_TRIADS]   -0.243870
count[LATENT_OUTGOING_RECIPROCAL_TRIADS]   -0.235564
count[INCOMING_RECIPROCAL_TRIADS]          -0.230173
count[OUTGOING_RECIPROCAL_TRIADS]          -0.195603
count[LATENT_DIRECTED_CYCLE_TRIADS]        -0.188403
count[LATENT_SINGLE_EDGE_TRIADS]           -0.153322
count[SINGLE_EDGE_TRIADS]                  -0.149408
count[TRIRECIPROCAL_TRIADS]                -0.086083
count[LATENT_TRIRECIPROCAL_TRIADS]         -0.

- All the paths leading up to DIRECIPROCAL TRIADS, except for UNIDIRECTIONAL TRIADS, occur in the real dataset at rates higher than chance. 

- All the triads with closure (and OUTGOING TRIADS) occur at rates less than chance.

In fact, if you think about it, UNIDIRECTIONAL and OUTGOING triads (the two types without closure) are still triads that represent an interaction 'beyond the dyadic relationship'.  And they both occur at rates less than chance.

This perhaps implies that triad motifs with closure are some kind of 'anti-phenomenon'. Are they still 'real' then?

## Specific subreddits

At the aggregate level, it might seem that triad motifs with closure simply do not happen. But perhaps at the level of specific subreddits, we would see a different trend unfold.

Let's group by subreddit then use cosine similarity to find the subreddit that is most unlike the aggregate means.

In [75]:
# Take an explicit copy of the motif-count columns: a 'subreddit' column is
# added to motif_feat_df further down, and assigning into a bare column slice
# of feat_df is the chained-assignment pattern pandas warns about (the warning
# is hidden here because all warnings are filtered out at the top).
motif_feat_df = feat_df[motif_count_feats].copy()
# Corpus-wide per-feature mean and standard deviation over all threads; these
# are the reference values the per-subreddit means are compared against.
motif_feat_df_mean = motif_feat_df.mean()
motif_feat_df_sd = motif_feat_df.std()

Getting subreddit labels:

In [39]:
def get_subreddit(threads, thread_id):
    """Return the subreddit recorded on a thread's root utterance.

    threads:   mapping of root utterance id -> thread (mapping of utterance
        id -> utterance).
    thread_id: id of the thread's root utterance.

    Returns the value stored at `other["user-info"]["subreddit"]` on the root
    utterance, or None when `thread_id` is not a key of `threads`.
    """
    if thread_id in threads:
        root_utt = threads[thread_id][thread_id]
        return root_utt.other["user-info"]["subreddit"]
    return None

subreddits = [get_subreddit(threads, thread_id) for thread_id in threads]
# 99145 threads, from 100 subreddits. Roughly 1000 threads per subreddit

In [40]:
# Label every row through its own thread id (the DataFrame index) instead of
# assigning the `subreddits` list positionally: that list follows the
# iteration order of `threads`, while the rows follow the order of the feature
# dict the frame was built from, and nothing guarantees the two orders match.
motif_feat_df['subreddit'] = [get_subreddit(threads, thread_id) for thread_id in motif_feat_df.index]

In [41]:
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
subreddit_means = motif_feat_df.groupby('subreddit').mean()

In [43]:
subreddit_means.loc['AdviceAnimals'].shape

(32,)

In [44]:
import numpy as np

In [45]:
def cosine_sim(X, Y):
    """Cosine similarity of two equal-length vectors.

    Computed as dot(X, Y) / (||X|| * ||Y||). Raises AssertionError when the
    two inputs differ in length.
    """
    assert len(X) == len(Y)
    x_arr = np.array(X)
    y_arr = np.array(Y)
    norm_product = np.linalg.norm(X) * np.linalg.norm(Y)
    return x_arr.dot(y_arr) / norm_product

In [89]:
# Cosine similarity between each subreddit's mean motif-count vector and the
# corpus-wide mean vector (motif_feat_df_mean).
#
# The feature columns are selected by name rather than with a positional
# `[:-1]`. The positional slice is only right if a trailing 'sim' column has
# already been added to subreddit_means; on a first top-to-bottom run it would
# drop a real feature instead and make the vectors differ in length. Selecting
# by name gives the same result however often the cell is run.
cosine_sims = []
for subreddit_name in subreddit_means.index:
    subreddit_vector = subreddit_means.loc[subreddit_name, motif_count_feats]
    cosine_sims.append(cosine_sim(subreddit_vector, motif_feat_df_mean))
cosine_sims[:10]

[0.9987792749814556,
 0.9998391335684169,
 0.9942445255348041,
 0.9928461839808349,
 0.9913286616738011,
 0.9962510131831028,
 0.9993992071230523,
 0.9918657641789929,
 0.9773257537373765,
 0.9997914800345411]

In [90]:
subreddit_means['sim'] = cosine_sims

Top 15 most dissimilar (compared to mean) subreddits:

In [92]:
subreddit_means.sort_values(by='sim')[:15]

Unnamed: 0_level_0,count[NO_EDGE_TRIADS],count[SINGLE_EDGE_TRIADS],count[INCOMING_TRIADS],count[OUTGOING_TRIADS],count[DYADIC_TRIADS],count[UNIDIRECTIONAL_TRIADS],count[INCOMING_2TO3_TRIADS],count[INCOMING_1TO3_TRIADS],count[DIRECTED_CYCLE_TRIADS],count[OUTGOING_3TO1_TRIADS],...,count[LATENT_INCOMING_1TO3_TRIADS],count[LATENT_DIRECTED_CYCLE_TRIADS],count[LATENT_OUTGOING_3TO1_TRIADS],count[LATENT_INCOMING_RECIPROCAL_TRIADS],count[LATENT_OUTGOING_RECIPROCAL_TRIADS],count[LATENT_DIRECTED_CYCLE_1TO3_TRIADS],count[LATENT_DIRECIPROCAL_TRIADS],count[LATENT_DIRECIPROCAL_2TO3_TRIADS],count[LATENT_TRIRECIPROCAL_TRIADS],sim
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Random_Acts_Of_Amazon,0.534,0.215,0.156,0.026,0.151,0.029,0.007,0.396,0.0,0.05,...,0.972,0.003,0.085,0.014,0.04,0.036,0.56,0.074,0.019,0.901357
pokemontrades,0.367,0.095,0.153,0.01,0.045,0.015,0.002,0.18,0.0,0.022,...,0.329,0.001,0.036,0.009,0.006,0.015,0.144,0.014,0.002,0.952305
MLPLounge,1.746,1.33,0.414,0.052,0.687,0.215,0.014,0.975,0.004,0.189,...,1.735,0.026,0.269,0.019,0.049,0.093,0.754,0.106,0.017,0.962696
friendsafari,9.835,1.572,3.697,0.161,0.502,0.181,0.071,1.887,0.0,0.096,...,3.116,0.003,0.137,0.026,0.078,0.045,1.183,0.085,0.017,0.965928
SteamGameSwap,2.932,0.691,1.125,0.045,0.381,0.113,0.021,0.877,0.0,0.154,...,1.536,0.011,0.222,0.023,0.018,0.054,0.686,0.066,0.016,0.971538
DebateReligion,3.402,2.209,0.731,0.075,0.997,0.307,0.018,1.525,0.0,0.26,...,2.475,0.004,0.334,0.017,0.043,0.046,0.968,0.062,0.006,0.977326
electronic_cigarette,3.179,2.304,0.693,0.092,1.113,0.454,0.024,1.312,0.0,0.359,...,2.225,0.024,0.485,0.034,0.058,0.14,0.923,0.127,0.019,0.978158
Dota2Trade,2.259,1.932,0.527,0.098,0.767,0.39,0.038,0.963,0.002,0.348,...,1.618,0.034,0.479,0.028,0.044,0.142,0.673,0.112,0.012,0.978175
POLITIC,2.413793,1.57931,0.558621,0.055172,0.572414,0.227586,0.006897,0.77931,0.0,0.275862,...,1.544828,0.006897,0.37931,0.02069,0.027586,0.075862,0.786207,0.048276,0.006897,0.982029
techsupport,2.337,1.879,0.501,0.068,0.697,0.328,0.02,0.948,0.0,0.261,...,1.551,0.02,0.359,0.02,0.05,0.097,0.614,0.088,0.02,0.982347


Let's ignore subreddits based on exchanges / swaps in favour of those that are discussion-based.

In the above list, this would be MLPLounge, DebateReligion, electronic_cigarette, POLITIC (smaller sample), MensRights, conspiracy, teenagers. We exclude POLITIC because it has a smaller sample size than the rest.

### MLPLounge

In [80]:
((subreddit_means.loc['MLPLounge'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

count[LATENT_SINGLE_EDGE_TRIADS]           -0.798222
count[LATENT_NO_EDGE_TRIADS]               -0.726247
count[LATENT_UNIDIRECTIONAL_TRIADS]        -0.700190
count[SINGLE_EDGE_TRIADS]                  -0.628700
count[NO_EDGE_TRIADS]                      -0.617084
count[UNIDIRECTIONAL_TRIADS]               -0.588574
count[LATENT_INCOMING_TRIADS]              -0.579787
count[INCOMING_TRIADS]                     -0.528848
count[DYADIC_TRIADS]                       -0.452157
count[LATENT_DYADIC_TRIADS]                -0.438796
count[INCOMING_1TO3_TRIADS]                -0.397341
count[OUTGOING_3TO1_TRIADS]                -0.361446
count[LATENT_OUTGOING_3TO1_TRIADS]         -0.338098
count[LATENT_INCOMING_1TO3_TRIADS]         -0.285218
count[OUTGOING_TRIADS]                     -0.266160
count[LATENT_OUTGOING_TRIADS]              -0.241042
count[INCOMING_2TO3_TRIADS]                -0.215259
count[LATENT_INCOMING_2TO3_TRIADS]         -0.185364
count[INCOMING_RECIPROCAL_TRIADS]          -0.

Interestingly, the features that occur more often in MLPLounge than in the corpus overall are mainly those involving triadic closure (direciprocal triads being an exception). That said, some of the features involving triadic closure occur less often as well.

### DebateReligion

In [81]:
((subreddit_means.loc['DebateReligion'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

count[LATENT_SINGLE_EDGE_TRIADS]           -0.641983
count[LATENT_UNIDIRECTIONAL_TRIADS]        -0.594834
count[LATENT_NO_EDGE_TRIADS]               -0.591328
count[SINGLE_EDGE_TRIADS]                  -0.549932
count[UNIDIRECTIONAL_TRIADS]               -0.538704
count[NO_EDGE_TRIADS]                      -0.510320
count[LATENT_INCOMING_TRIADS]              -0.450072
count[INCOMING_TRIADS]                     -0.449476
count[DYADIC_TRIADS]                       -0.304201
count[OUTGOING_3TO1_TRIADS]                -0.265185
count[LATENT_OUTGOING_3TO1_TRIADS]         -0.255638
count[OUTGOING_TRIADS]                     -0.221137
count[LATENT_OUTGOING_TRIADS]              -0.217812
count[INCOMING_2TO3_TRIADS]                -0.202154
count[LATENT_INCOMING_2TO3_TRIADS]         -0.193007
count[LATENT_DYADIC_TRIADS]                -0.174106
count[DIRECTED_CYCLE_1TO3_TRIADS]          -0.130024
count[LATENT_DIRECTED_CYCLE_1TO3_TRIADS]   -0.124823
count[INCOMING_1TO3_TRIADS]                -0.

### electronic_cigarette

In [82]:
((subreddit_means.loc['electronic_cigarette'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

count[LATENT_SINGLE_EDGE_TRIADS]           -0.621810
count[LATENT_NO_EDGE_TRIADS]               -0.588435
count[SINGLE_EDGE_TRIADS]                  -0.541419
count[NO_EDGE_TRIADS]                      -0.524697
count[LATENT_INCOMING_TRIADS]              -0.474439
count[UNIDIRECTIONAL_TRIADS]               -0.459020
count[INCOMING_TRIADS]                     -0.458990
count[LATENT_UNIDIRECTIONAL_TRIADS]        -0.455166
count[DYADIC_TRIADS]                       -0.248836
count[INCOMING_1TO3_TRIADS]                -0.217829
count[OUTGOING_TRIADS]                     -0.187859
count[INCOMING_2TO3_TRIADS]                -0.182496
count[LATENT_DYADIC_TRIADS]                -0.174459
count[LATENT_OUTGOING_TRIADS]              -0.165993
count[OUTGOING_3TO1_TRIADS]                -0.130963
count[LATENT_INCOMING_1TO3_TRIADS]         -0.066229
count[LATENT_OUTGOING_3TO1_TRIADS]         -0.064076
count[LATENT_INCOMING_2TO3_TRIADS]         -0.060522
count[DIRECTED_CYCLE_TRIADS]               -0.

### MensRights

In [83]:
((subreddit_means.loc['MensRights'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

count[LATENT_SINGLE_EDGE_TRIADS]           -0.550491
count[LATENT_NO_EDGE_TRIADS]               -0.543074
count[NO_EDGE_TRIADS]                      -0.507419
count[SINGLE_EDGE_TRIADS]                  -0.484426
count[UNIDIRECTIONAL_TRIADS]               -0.401562
count[LATENT_UNIDIRECTIONAL_TRIADS]        -0.392823
count[INCOMING_TRIADS]                     -0.363843
count[LATENT_INCOMING_TRIADS]              -0.357862
count[DYADIC_TRIADS]                       -0.275564
count[LATENT_DYADIC_TRIADS]                -0.263633
count[INCOMING_1TO3_TRIADS]                -0.193326
count[LATENT_INCOMING_1TO3_TRIADS]         -0.118965
count[OUTGOING_3TO1_TRIADS]                -0.067241
count[LATENT_OUTGOING_3TO1_TRIADS]         -0.060270
count[OUTGOING_TRIADS]                     -0.005808
count[INCOMING_2TO3_TRIADS]                 0.000978
count[LATENT_OUTGOING_TRIADS]               0.003761
count[DIRECTED_CYCLE_TRIADS]                0.021649
count[DIRECTED_CYCLE_1TO3_TRIADS]           0.

### conspiracy

In [93]:
((subreddit_means.loc['conspiracy'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

count[LATENT_SINGLE_EDGE_TRIADS]           -0.554655
count[LATENT_NO_EDGE_TRIADS]               -0.539748
count[NO_EDGE_TRIADS]                      -0.496846
count[SINGLE_EDGE_TRIADS]                  -0.469282
count[LATENT_UNIDIRECTIONAL_TRIADS]        -0.399106
count[UNIDIRECTIONAL_TRIADS]               -0.392347
count[LATENT_INCOMING_TRIADS]              -0.392262
count[INCOMING_TRIADS]                     -0.380119
count[DYADIC_TRIADS]                       -0.308496
count[LATENT_DYADIC_TRIADS]                -0.283096
count[INCOMING_1TO3_TRIADS]                -0.211970
count[LATENT_INCOMING_1TO3_TRIADS]         -0.134607
count[OUTGOING_3TO1_TRIADS]                -0.107914
count[LATENT_OUTGOING_3TO1_TRIADS]         -0.084374
count[OUTGOING_TRIADS]                     -0.082152
count[INCOMING_2TO3_TRIADS]                -0.067825
count[LATENT_OUTGOING_TRIADS]              -0.033764
count[DIRECTED_CYCLE_TRIADS]               -0.022026
count[LATENT_DIRECTED_CYCLE_TRIADS]         0.

### teenagers

In [95]:
((subreddit_means.loc['teenagers'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

count[LATENT_SINGLE_EDGE_TRIADS]           -0.604241
count[LATENT_NO_EDGE_TRIADS]               -0.556809
count[LATENT_UNIDIRECTIONAL_TRIADS]        -0.519443
count[SINGLE_EDGE_TRIADS]                  -0.511847
count[NO_EDGE_TRIADS]                      -0.480793
count[UNIDIRECTIONAL_TRIADS]               -0.469320
count[LATENT_INCOMING_TRIADS]              -0.389634
count[INCOMING_TRIADS]                     -0.376363
count[DYADIC_TRIADS]                       -0.328065
count[LATENT_DYADIC_TRIADS]                -0.272834
count[OUTGOING_3TO1_TRIADS]                -0.255695
count[LATENT_OUTGOING_3TO1_TRIADS]         -0.223922
count[INCOMING_1TO3_TRIADS]                -0.182140
count[OUTGOING_TRIADS]                     -0.180029
count[LATENT_OUTGOING_TRIADS]              -0.173140
count[LATENT_INCOMING_2TO3_TRIADS]         -0.119122
count[INCOMING_2TO3_TRIADS]                -0.107140
count[LATENT_INCOMING_1TO3_TRIADS]         -0.089916
count[DIRECTED_CYCLE_1TO3_TRIADS]          -0.

These subreddits are all **similarly** different from the average subreddit. They all have:
- Slightly higher inclusion of direciprocal, direciprocal_2to3, trireciprocal triads, suggesting high discussion engagement for particular users.
- Much lower no_edge, single_edge triads, implying fewer drive-by commenters, i.e. these are not expansionary high-activity threads but threads for a small group of participants.

In [59]:
tgts = ["DebateReligion", "electronic_cigarette", "MensRights", "conspiracy", "teenagers"]

In [64]:

# Raw (not SD-normalised) difference between each target subreddit's mean
# feature values and the corpus-wide means, sorted ascending, one Series per
# subreddit in the order of `tgts`.
# The list is created here so the cell runs on a fresh kernel and does not
# accumulate duplicate entries when it is re-executed.
a = []
for tgt in tgts:
    a.append((subreddit_means.loc[tgt] - motif_feat_df_mean).sort_values())

In [65]:
a

[count[LATENT_NO_EDGE_TRIADS]               -17.781192
 count[LATENT_SINGLE_EDGE_TRIADS]            -9.865647
 count[NO_EDGE_TRIADS]                       -7.915545
 count[SINGLE_EDGE_TRIADS]                   -6.136867
 count[LATENT_INCOMING_TRIADS]               -1.884047
 count[INCOMING_TRIADS]                      -1.795128
 count[LATENT_UNIDIRECTIONAL_TRIADS]         -1.230825
 count[UNIDIRECTIONAL_TRIADS]                -0.993802
 count[DYADIC_TRIADS]                        -0.637364
 count[LATENT_DYADIC_TRIADS]                 -0.492012
 count[LATENT_OUTGOING_3TO1_TRIADS]          -0.201509
 count[INCOMING_1TO3_TRIADS]                 -0.195934
 count[OUTGOING_3TO1_TRIADS]                 -0.195595
 count[LATENT_OUTGOING_TRIADS]               -0.121895
 count[OUTGOING_TRIADS]                      -0.112967
 count[LATENT_INCOMING_2TO3_TRIADS]          -0.075755
 count[INCOMING_2TO3_TRIADS]                 -0.061701
 count[LATENT_DIRECTED_CYCLE_1TO3_TRIADS]    -0.035820
 count[DIR

In [None]:
subreddit_means.index

In [None]:
# Label every row through its own thread id (the DataFrame index) instead of
# assigning the `subreddits` list positionally: the two frames were built from
# two different feature dicts, and neither is guaranteed to share the
# iteration order of `threads` that `subreddits` follows.
random_feat_df['subreddit'] = [get_subreddit(threads, thread_id) for thread_id in random_feat_df.index]
feat_df['subreddit'] = [get_subreddit(threads, thread_id) for thread_id in feat_df.index]

In [None]:
motif_prob_feats = motif_count_feats + prob_feats

In [None]:
random_subreddit_means = random_feat_df.groupby('subreddit').mean()


In [None]:
#random_subreddit_means[motif_prob_feats].to_csv("subreddits_x_motifs_random.csv")

In [None]:
#random_subreddit_means[motif_prob_feats].T.to_csv("motifs_x_subreddits_random.csv")

In [None]:
random_subreddit_sd = random_feat_df.groupby('subreddit').std()

In [None]:
random_subreddit_sd.loc['AdviceAnimals'][motif_count_feats]

In [None]:
random_subreddit_sd[motif_count_feats].T.sort_index()

In [None]:
random_subreddit_sd[prob_feats].T.sort_index()

Let's examine how the empirical values deviate from the random thread statistics:

In [None]:
def get_deviations(threads, thread_id, feats):
    """
    Returns a pandas Series with one entry per feature in `feats`: the
    thread's observed value minus the randomized-baseline mean of the thread's
    subreddit, divided by that subreddit's randomized-baseline standard
    deviation (i.e. the deviation expressed in baseline SD units).

    Reads the notebook-level frames `feat_df`, `random_subreddit_means` and
    `random_subreddit_sd`.
    """
    group = get_subreddit(threads, thread_id)
    observed = feat_df.loc[thread_id][feats]

    baseline_mean = random_subreddit_means.loc[group][feats]
    baseline_sd = random_subreddit_sd.loc[group][feats]
    return (observed - baseline_mean) / baseline_sd

As an example:

In [None]:
get_deviations(threads, 't1_c32bcq8', motif_prob_feats)

With this, let's create a dataframe where each thread is represented as a series of deviations / non-deviations. We can then group by subreddit and see which subreddit has the greatest percentage of deviations for various feats.

In [None]:
deviation_df = pd.DataFrame(dtype=float)

In [None]:
# Build the whole frame in one constructor call (one column per thread, one
# row per feature) rather than inserting columns one at a time: repeated
# single-column insertion into a DataFrame gets slower as the frame grows and
# leaves it heavily fragmented.
deviation_df = pd.DataFrame(
    {thread: get_deviations(threads, thread, motif_prob_feats) for thread in threads}
)

In [None]:
deviation_df.dtypes

In [None]:
deviation_df

In [None]:
deviation_df_T = deviation_df.T

In [None]:
for k in deviation_df_T:
    deviation_df_T[k] = pd.to_numeric(deviation_df_T[k])

In [None]:
deviation_df_T['subreddit'] = subreddits

In [None]:
dev_means = deviation_df_T.groupby('subreddit').mean()

In [None]:
#dev_means.to_csv("deviation_means.csv")

In [None]:
#random_feat_df.mean(axis=0).to_csv("total_avg_feats.csv")