In [1]:
import os
# Switch the working directory two levels up from wherever the kernel started.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart climbs another two levels -- run it exactly once per session.
os.chdir("../..")

In [2]:
import convokit

In [3]:
from convokit import Corpus

In [4]:
# Hard-coded absolute locations of the input/output data on the analysis machine.
# Only `donald_corpus` is read by active code below; `fake_news_dir` appears only in
# commented-out load/dump calls and `reddit_dir` is not referenced in this part of the notebook.
fake_news_dir = '/sauna/fake-news'
# politics_dir = '/sauna/reddit_201810_raw/corpus/pokemontrades_banlist~-~politics/politics'
reddit_dir = '/sauna/fake-news/reddit-corpus'
donald_corpus = '/sauna/reddit_201810_raw/corpus/TheTwoBeerQueers~-~The_Donald/The_Donald/'

In [5]:
# Load the corpus stored at `donald_corpus`. The commented alternative loads a
# previously dumped subset from `fake_news_dir` instead.
corpus = Corpus(filename=donald_corpus)
# corpus = Corpus(filename=os.path.join(fake_news_dir, 'donald_basic_predictive_valid_convos'))

In [6]:
# Full reply threads. The cells below use this as a dict: top-level comment id -> dict of
# that thread's utterances (each exposing .id, .reply_to, .root and .user.name).
# include_root=False presumably leaves the post itself out of each thread -- confirm against convokit.
threads = corpus.utterance_threads(include_root=False)

In [7]:
# Thread prefixes: same call as for `threads`, but with prefix_len=5
# (an earlier version of this comment said "first 10 comments", which no longer matches the code).
# Presumably this keeps only the first 5 comments of each thread -- TODO confirm against convokit.
thread_pfxs = corpus.utterance_threads(prefix_len=5, include_root=False)

In [8]:
from collections import defaultdict
# Group top-level comments by the post they hang off (the `.root` of the thread's
# first utterance). Threads with fewer than 5 utterances are left out.
thread_roots_by_self_post = defaultdict(list)
for top_level_comment, thread in threads.items():
    if len(thread) >= 5:
        rt = next(iter(thread.values())).root
        thread_roots_by_self_post[rt].append(top_level_comment)

In [9]:
def get_thread_depth(utts): # List of utts
    """Return the length, in utterances, of the longest reply chain in ``utts``.

    Args:
        utts: iterable of utterance-like objects exposing ``.id`` and ``.reply_to``.
            It is consumed exactly once, so dict views and generators are fine.

    Returns:
        int: depth of the deepest utterance. An utterance whose parent is not part
        of ``utts`` counts as depth 1. An empty input yields 0.

    The result does not depend on the order of ``utts``: a reply may appear before
    the utterance it answers. (The previous single-pass version silently
    under-counted in that case.)
    """
    parent_of = {utt.id: utt.reply_to for utt in utts}
    depth = {}
    for utt_id in parent_of:
        # Climb towards the thread start until we reach an utterance whose depth is
        # already known, or leave the thread. `seen` guards against malformed data
        # (self-replies / reply cycles) so the walk always terminates.
        chain = []
        seen = set()
        cur = utt_id
        while cur in parent_of and cur not in depth and cur not in seen:
            seen.add(cur)
            chain.append(cur)
            cur = parent_of[cur]
        level = depth.get(cur, 0)
        # Unwind: the last utterance collected is the shallowest one.
        for node in reversed(chain):
            level += 1
            depth[node] = level
    return max(depth.values(), default=0)

In [10]:
def get_thread_width(utts): # List of utts
    """Return the largest number of direct replies any single parent receives.

    Args:
        utts: iterable of utterance-like objects exposing ``.reply_to``.

    Returns:
        int: the maximum count of utterances sharing the same ``reply_to`` value.

    Raises:
        ValueError: if ``utts`` is empty (raised by ``max`` on an empty sequence).
    """
    replies_per_parent = {}
    for utt in utts:
        parent = utt.reply_to
        replies_per_parent[parent] = replies_per_parent.get(parent, 0) + 1
    return max(replies_per_parent.values())

In [11]:
def get_interaction_specificity(utts):
    """Return the depth-to-width ratio of a thread.

    ``utts`` is materialised into a list first because it is traversed twice:
    once by ``get_thread_depth`` and once by ``get_thread_width``.
    """
    materialised = list(utts)
    depth = get_thread_depth(materialised)
    width = get_thread_width(materialised)
    return depth / width

In [12]:
# first generate positive and negative examples based on task
import random
def _count_distinct_users(thread):
    """Return how many distinct ``user.name`` values occur among ``thread``'s utterances."""
    return len({utt.user.name for utt in thread.values()})


def _make_thread_labeler(task, threads, thread_pfxs):
    """Build ``label(thread_id) -> True | False | None`` for ``task``.

    True marks a positive example, False a negative one, None a thread that is
    not used for this task. Raises ValueError for an unknown task name.
    """
    if task == "comment-growth":
        def label(thread_id):
            n_comments = len(threads[thread_id])
            if n_comments >= 15:
                return True
            if n_comments == 10:
                return False
            return None
        return label

    if task == "commenter-growth":
        def label(thread_id):
            n_full = _count_distinct_users(threads[thread_id])
            n_prefix = _count_distinct_users(thread_pfxs[thread_id])
            if n_full >= n_prefix * 2:
                return True
            if n_full == n_prefix:
                return False
            return None
        return label

    # The remaining tasks share one shape: compare a structural metric of the full
    # thread against the same metric of its prefix, scaled by a growth factor.
    # The metric names are resolved only inside the selected branch.
    if task == "graph-depth-growth":
        metric, factor = get_thread_depth, 2
    elif task == "graph-width-growth":
        metric, factor = get_thread_width, 2
    elif task == 'interaction-specificity':
        metric, factor = get_interaction_specificity, 1.8
    else:
        raise ValueError(
            "unknown task {!r}; expected one of: comment-growth, commenter-growth, "
            "graph-depth-growth, graph-width-growth, interaction-specificity".format(task)
        )

    def label(thread_id):
        # Only threads that reached at least 20 utterances take part in these tasks.
        if len(threads[thread_id]) < 20:
            return None
        full_value = metric(threads[thread_id].values())
        prefix_value = metric(thread_pfxs[thread_id].values())
        return full_value >= prefix_value * factor
    return label


def generate_pos_neg(task: str, post_to_thread_obj, threads, thread_pfxs, seed=None):
    """Sample one positive and one negative thread per post for a prediction task.

    Args:
        task: one of "comment-growth", "commenter-growth", "graph-depth-growth",
            "graph-width-growth", "interaction-specificity".
        post_to_thread_obj: mapping post id -> list of thread ids under that post.
        threads: mapping thread id -> dict of the thread's utterances (full thread).
        thread_pfxs: mapping thread id -> dict of the thread's utterances (prefix only).
        seed: optional seed for a private random generator. With the default
            (None) the module-level ``random`` state is used, as before.

    Labelling rules:
        comment-growth: positive if the thread has >= 15 utterances, negative if it
            has exactly 10. NOTE(review): the prefixes in this notebook are built with
            prefix_len=5, so "exactly 10" may be a leftover from a 10-comment prefix
            -- confirm the intended threshold.
        commenter-growth: positive if the full thread has at least twice as many
            distinct users as its prefix, negative if it has exactly as many.
        graph-depth-growth / graph-width-growth: among threads with >= 20 utterances,
            positive if depth / width is at least 2x that of the prefix, else negative.
        interaction-specificity: same, with depth/width ratio and factor 1.8.

    Returns:
        (pos, neg): two equally long lists of thread ids; ``pos[i]`` and ``neg[i]``
        always come from the same post. Posts lacking either kind are skipped.

    Raises:
        ValueError: if ``task`` is not one of the names above (previously this
            silently produced two empty lists).

    Also prints a one-line summary of how many pairs were produced.
    """
    label = _make_thread_labeler(task, threads, thread_pfxs)
    rng = random if seed is None else random.Random(seed)

    pos, neg = [], []
    for thread_ids in post_to_thread_obj.values():
        has_pos, has_neg = [], []
        for thread_id in thread_ids:
            verdict = label(thread_id)
            if verdict is None:
                continue
            (has_pos if verdict else has_neg).append(thread_id)
        # Pair up only posts offering both kinds, so each pair shares its post.
        if has_pos and has_neg:
            pos.append(rng.choice(has_pos))
            neg.append(rng.choice(has_neg))
    print("- {} positive, {} negative pts for {} task".format(len(pos), len(neg), task))

    return pos, neg

In [13]:
# Thread pairs for the comment-growth task.
pos_comment_growth, neg_comment_growth = generate_pos_neg(
    task="comment-growth",
    post_to_thread_obj=thread_roots_by_self_post,
    threads=threads,
    thread_pfxs=thread_pfxs,
)

- 22719 positive, 22719 negative pts for comment-growth task


In [14]:
# Thread pairs for the commenter-growth task.
pos_commenter_growth, neg_commenter_growth = generate_pos_neg(
    task="commenter-growth",
    post_to_thread_obj=thread_roots_by_self_post,
    threads=threads,
    thread_pfxs=thread_pfxs,
)

- 80311 positive, 80311 negative pts for commenter-growth task


In [15]:
# Thread pairs for the graph-depth-growth task.
pos_depth_growth, neg_depth_growth = generate_pos_neg(
    task="graph-depth-growth",
    post_to_thread_obj=thread_roots_by_self_post,
    threads=threads,
    thread_pfxs=thread_pfxs,
)

- 14817 positive, 14817 negative pts for graph-depth-growth task


In [16]:
# Thread pairs for the graph-width-growth task.
pos_width_growth, neg_width_growth = generate_pos_neg(
    task="graph-width-growth",
    post_to_thread_obj=thread_roots_by_self_post,
    threads=threads,
    thread_pfxs=thread_pfxs,
)

- 10021 positive, 10021 negative pts for graph-width-growth task


In [17]:
# Thread pairs for the interaction-specificity task.
pos_int_spec, neg_int_spec = generate_pos_neg(
    task="interaction-specificity",
    post_to_thread_obj=thread_roots_by_self_post,
    threads=threads,
    thread_pfxs=thread_pfxs,
)

- 9000 positive, 9000 negative pts for interaction-specificity task


In [18]:
# import json
# with open('thread_ids_growth.json', 'w') as f:
#     ids = {"pos-comment-growth": pos_comment_growth,
#            "neg-comment-growth": neg_comment_growth,
#            "pos-commenter-growth": pos_commenter_growth,
#            "neg-commenter-growth": neg_commenter_growth
#           }
#     json.dump(ids, f)

In [19]:
# import json
# with open('thread_ids_growth.json', 'r') as f:
#     ids = json.load(f)
    
# pos_comment_growth = ids['pos-comment-growth']
# neg_comment_growth = ids['neg-comment-growth']
# pos_commenter_growth = ids['pos-commenter-growth']
# neg_commenter_growth = ids['neg-commenter-growth']

In [20]:
# Invert thread_roots_by_self_post: thread id (top-level comment) -> id of the post it belongs to.
thread_to_convo = {
    thread_id: convo_id
    for convo_id, thread_ids in thread_roots_by_self_post.items()
    for thread_id in thread_ids
}

In [21]:
# Posts that contributed a pair to the commenter-growth task.
commenter_growth_convos = {thread_to_convo[thread_id] for thread_id in pos_commenter_growth}

In [22]:
# Posts that contributed a pair to the comment-growth task.
comment_growth_convos = {thread_to_convo[thread_id] for thread_id in pos_comment_growth}

In [23]:
# Posts that contributed a pair to the graph-depth-growth task.
depth_growth_convos = {thread_to_convo[thread_id] for thread_id in pos_depth_growth}

In [24]:
# Posts that contributed a pair to the graph-width-growth task.
width_growth_convos = {thread_to_convo[thread_id] for thread_id in pos_width_growth}

In [25]:
# Every post that contributed a pair to at least one task. Using only the positive
# thread of each pair is enough: its negative partner always hangs off the same post.
# The interaction-specificity positives are included as well -- that task is evaluated
# further down, so its posts must survive the corpus filtering that uses this set.
paired_convos = set().union(
    comment_growth_convos,
    commenter_growth_convos,
    depth_growth_convos,
    width_growth_convos,
    {thread_to_convo[thread_id] for thread_id in pos_int_spec},
)

In [26]:
# Restrict the corpus to conversations whose id is in `paired_convos`.
# NOTE(review): the return value is discarded, so this presumably filters `corpus`
# in place (re-running earlier cells would then see the reduced corpus) -- confirm against convokit.
corpus.filter_conversations_by(lambda convo: convo.id in paired_convos)

In [27]:
# corpus.dump('donald_basic_predict_2', base_path=fake_news_dir)

In [28]:
# Two feature extractors configured identically (prefix_len=5, min_thread_len=5, no root):
# `hc` supplies the motif, dyadic-motif and motif-pathway features below;
# `orig_hc` (HyperConvo_0) supplies the baseline hyperconvo features.
hc = convokit.HyperConvo(prefix_len=5, min_thread_len=5, include_root=False)
orig_hc = convokit.HyperConvo_0(prefix_len=5, min_thread_len=5, include_root=False)


In [29]:
import pandas as pd

In [30]:
# Baseline hyperconvo features from the HyperConvo_0 extractor; the thread settings are
# passed again explicitly here, with the same values used when `orig_hc` was constructed.
# The result is turned into a DataFrame with orient='index' further down.
hyperconvo_feats = orig_hc.retrieve_feats(corpus, prefix_len=5, min_thread_len=5, include_root=False)

  "norm.max": lambda l: np.max(l) / np.sum(l),
  if len(l) > 1 else np.nan,
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  pk = 1.0*pk / np.sum(pk, axis=0)
  if len(l) > 1 else np.nan


In [31]:
# Features from the `hc` extractor, relying on the settings given at construction.
# Referred to as the "motif" feature set in the evaluation below.
motif_feats = hc.retrieve_feats(corpus)

In [32]:
# One row per key of `motif_feats`; the column labels double as the "motif" feature-set names.
motif_feats_df = pd.DataFrame.from_dict(
    motif_feats,
    orient='index',
)
motif_feat_names = [column for column in motif_feats_df.columns]

In [33]:
# Dyadic motif counts from the `hc` extractor; converted to a DataFrame with orient='index' next.
dyadic_feats = hc.retrieve_dyadic_motif_counts(corpus)

In [34]:
# One row per key of `dyadic_feats`; the column labels double as the "dyadic" feature-set names.
dyadic_df = pd.DataFrame.from_dict(
    dyadic_feats,
    orient='index',
)
dyadic_feat_names = [column for column in dyadic_df.columns]

In [35]:
# One row per key of `hyperconvo_feats`; the column labels double as the "hyperconvo" feature-set names.
hyperconv_df = pd.DataFrame.from_dict(
    hyperconvo_feats,
    orient='index',
)
hyperconv_feat_names = [column for column in hyperconv_df.columns]

In [36]:
# Motif-pathway statistics from the `hc` extractor, one row per key of the returned dict.
path_stats = hc.retrieve_motif_pathway_stats(corpus)
path_stats_df = pd.DataFrame.from_dict(path_stats, orient='index')
# Flatten every column label into a single 'PATH-a, b, c' string built from its
# string components only (non-string components of a label are dropped).
columns = ['PATH-' + ', '.join(part for part in col if isinstance(part, str)).strip()
           for col in path_stats_df.columns.values]
path_stats_df.columns = columns

# Two features per pathway: a presence flag (value > 0) and the raw value.
# All columns are collected first and the frame is built in one go, rather than
# inserting them one at a time into an initially empty DataFrame.
enum_columns = {}
for path_stat in columns:
    enum_columns['is-present[{}]'.format(path_stat)] = path_stats_df[path_stat] > 0
    enum_columns['count[{}]'.format(path_stat)] = path_stats_df[path_stat]
path_stats_enum_df = pd.DataFrame(enum_columns)

path_feat_names = list(path_stats_enum_df.columns)

In [37]:
def get_num_users(thread):
    """Return the number of distinct ``user.name`` values among the utterances in ``thread``.

    ``thread`` is a mapping whose values are utterance-like objects.
    """
    distinct_names = {utt.user.name for utt in thread.values()}
    return len(distinct_names)

# Distinct-user count of every thread prefix holding at least 5 utterances,
# stored as a one-entry dict per thread so it converts straight into a DataFrame row.
thread_to_usercount = {
    thread_id: {"num_users": get_num_users(prefix)}
    for thread_id, prefix in thread_pfxs.items()
    if len(prefix) >= 5
}

In [38]:
# One row per thread id with a single 'num_users' column; `num_users_feat` is the
# matching feature-set name list used in the evaluation loop.
num_users_df = pd.DataFrame.from_dict(thread_to_usercount, orient='index')
num_users_feat = ['num_users']

In [39]:
# Combine all feature families column-wise, aligning rows on the frame indexes.
# The indexes are not identical, so rows missing from a frame become NaN there.
# `sort=True` spells out the row ordering this cell was originally run with, instead of
# relying on a default that differs between pandas versions (see the FutureWarning
# recorded in this cell's output).
feats_df = pd.concat(
    [hyperconv_df, motif_feats_df, dyadic_df, path_stats_enum_df, num_users_df],
    axis=1,
    sort=True,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [40]:
# threads = {k: v for k, v in corpus.utterance_threads(include_root=False).items() if k in valid_threads}

In [41]:
# Use only the first 10 comments in each thread
# thread_pfxs = {k: v for k, v in corpus.utterance_threads(prefix_len=10, include_root=False).items() if k in valid_threads}

In [42]:
# Cast every column (including the boolean is-present flags) to float64.
feats_df = feats_df.astype('float64')

In [43]:
# Replace missing feature values with the sentinel -1.
# NOTE(review): the classifier pipeline below standardises features, so -1 is scaled
# together with the real values -- confirm this sentinel is intended.
feats_df = feats_df.fillna(-1)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 
from sklearn.model_selection import cross_val_score
import numpy as np

# Positive / negative thread ids per prediction task (built by the generate_pos_neg calls above).
# A list of tuples keeps the evaluation order explicit.
task_pairs = [
    ("comment-growth", pos_comment_growth, neg_comment_growth),
    ("commenter-growth", pos_commenter_growth, neg_commenter_growth),
    ("graph-depth-growth", pos_depth_growth, neg_depth_growth),
    ("graph-width-growth", pos_width_growth, neg_width_growth),
    ("interaction-specificity", pos_int_spec, neg_int_spec),
]  #, "post-deleted", "user-deleted"

# Feature-set ablations: (column names, label). These do not depend on the task,
# so the list is built once rather than once per task.
feature_sets = [
    (hyperconv_feat_names, "hyperconvo"),
    (motif_feat_names, "motif"),
    (dyadic_feat_names, "dyadic"),
    (path_feat_names, "motifpaths"),
    (motif_feat_names + path_feat_names, "motif-all"),
    (hyperconv_feat_names + motif_feat_names, "hyperconv-motif"),
    (hyperconv_feat_names + path_feat_names, "hyperconv-paths"),
    (hyperconv_feat_names + motif_feat_names + path_feat_names, "hyperconvo-motifall"),
    (num_users_feat, "usercount"),
#     (hyperconv_feat_names + num_users_feat, "hyperconv-usercount"),
#     (motif_feat_names + num_users_feat, "motif-usercount"),
#     (path_feat_names + num_users_feat, "motifpaths-usercount"),
#     (motif_feat_names + path_feat_names + num_users_feat, "motifsall+usercount"),
    (hyperconv_feat_names + motif_feat_names + path_feat_names + num_users_feat, "hyperconvo-motifall+usercount"),
]

for task, pos, neg in task_pairs:
    print("TASK: {}\n".format(task))

#     pos, neg = generate_pos_neg(task, thread_roots_by_self_post, threads, thread_pfxs)
    for feature_set, name in feature_sets:
        # Standardise, then fit a logistic regression; a fresh pipeline per feature set.
        clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
        pp = convokit.PairedPrediction()
        # NOTE(review): this calls a private PairedPrediction helper to build the paired
        # design matrix from the selected feature columns and the pos/neg thread ids.
        X, y = pp._generate_paired_X_y(feats_df[feature_set], pos, neg)

        # 20-fold cross-validated accuracy, averaged over folds.
        scores = cross_val_score(clf, X, y, cv=20)
        print("- {}, cv_accuracy: {:.4f}".format(name, scores.mean()))


#         print("Feature set: {}".format(name))
#         pp.fit_predict(feats_df[feature_set], pos, neg, test_size=0.2)
#         pp.print_extreme_coefs(feature_set, num_features=5)

TASK: comment-growth

- hyperconvo, cv_accuracy: 0.5693
- motif, cv_accuracy: 0.5666
- dyadic, cv_accuracy: 0.5583
- motifpaths, cv_accuracy: 0.5641
- motif-all, cv_accuracy: 0.5643
- hyperconv-motif, cv_accuracy: 0.5661
- hyperconv-paths, cv_accuracy: 0.5661
- hyperconvo-motifall, cv_accuracy: 0.5660
- usercount, cv_accuracy: 0.5585
- hyperconvo-motifall+usercount, cv_accuracy: 0.5662
TASK: commenter-growth

- hyperconvo, cv_accuracy: 0.6099
- motif, cv_accuracy: 0.6013
- dyadic, cv_accuracy: 0.5921
- motifpaths, cv_accuracy: 0.6038
- motif-all, cv_accuracy: 0.6036
- hyperconv-motif, cv_accuracy: 0.6100
- hyperconv-paths, cv_accuracy: 0.6110
- hyperconvo-motifall, cv_accuracy: 0.6109
- usercount, cv_accuracy: 0.5886
