In [1]:
import os
# Move the working directory two levels up; the recorded output shows this lands in the
# repository checkout on the author's machine.
# NOTE(review): the path is relative, so re-running this cell climbs two more levels.
os.getcwd()  # result is discarded: only a cell's last expression is displayed
os.chdir("../..")
os.getcwd()

'/Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit'

We can use Hypergraph features for various predictive tasks:

In [3]:
from convokit import PairedPrediction

In [4]:
import pickle
import random
import warnings

import numpy as np
import pandas as pd

import convokit
warnings.filterwarnings('ignore')

In [5]:
# Step up one more directory level, relative to wherever the previous chdir left the kernel.
os.chdir('..')

In [None]:
# Enter the local 'reddit-corpus' directory.
# NOTE(review): this cell has no execution count, and the corpus is loaded further down via
# convokit.download rather than from the working directory — confirm it is still needed.
os.chdir('reddit-corpus')

In [19]:
# Show the contents of the current working directory.
os.listdir()

['reddit-corpus']

In [10]:
# Load the Reddit corpus, passing the result of convokit.download as the corpus filename
# (the recorded output indicates an already-downloaded copy is reused).
corpus = convokit.Corpus(filename=convokit.download('reddit-corpus'))
# Alternative kept for reference: load from a local 'reddit-corpus' path instead.
# corpus = convokit.Corpus(filename='reddit-corpus')

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus


In [11]:
# Configure HyperConvo (prefix_len=10, min_thread_len=10, roots excluded) and run it over
# the corpus; later cells read the results back through the hc.retrieve_* calls.
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)
hc.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x11f7371d0>

In [22]:
# Cache the hyperconvo features stored in each conversation's metadata so that they can
# be reloaded from disk instead of being rebuilt.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a cache
# that this notebook wrote itself.
remake_cache = True
hyperconvo_cache_path = "hyperconvo_feats.p"
if remake_cache:
    # Collect everything *before* opening the file: "wb" mode truncates it immediately,
    # so a failure while iterating would otherwise leave an empty, unloadable cache.
    hyperconvo_feats = {}
    for convo in corpus.iter_conversations():
        hyperconvo_feats.update(convo.meta["hyperconvo"])
    with open(hyperconvo_cache_path, "wb") as f:
        pickle.dump(hyperconvo_feats, f)
else:
    with open(hyperconvo_cache_path, "rb") as f:
        hyperconvo_feats = pickle.load(f)

In [12]:
# Full utterance threads with roots excluded; later cells look threads up by their
# top-level comment id.
threads = corpus.utterance_threads(include_root=False)

In [16]:
# Thread prefixes: the same call as for `threads`, but with prefix_len=10 so that only
# the first 10 comments of each thread are kept.
thread_pfxs = corpus.utterance_threads(prefix_len=10, include_root=False)

In [17]:
# Sanity check: number of threads extracted.
len(threads)

100000

In [18]:
# Print corpus-level counts (users, utterances, conversations).
corpus.print_summary_stats()

Number of Users: 521777
Number of Utterances: 2004262
Number of Conversations: 84979


In [19]:
from collections import defaultdict
# Group top-level comment ids by the `root` of the first utterance in their thread.
thread_roots_by_self_post = defaultdict(list)
for top_level_comment, thread in threads.items():
    first_utt = next(iter(thread.values()))
    thread_roots_by_self_post[first_utt.root].append(top_level_comment)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Read the hyperconvo features back out of the corpus (presumably keyed by thread id —
# TODO confirm).
# NOTE(review): this rebinds `hyperconvo_feats`, replacing the value produced by the
# cache cell above.
hyperconvo_feats = hc.retrieve_feats(corpus)

In [21]:
# Motif pathway statistics from HyperConvo (presumably keyed by thread id — TODO confirm).
path_stats = hc.retrieve_motif_pathway_stats(corpus)

In [22]:
# Motif counts from HyperConvo (presumably keyed by thread id — TODO confirm).
motif_counts = hc.retrieve_motif_counts(corpus)

In [23]:
# Motifs from HyperConvo; the next cell iterates this as thread_id -> motif dict.
threads_motifs = hc.retrieve_motifs(corpus)

In [24]:
# For every thread keep the first element of what hc._latent_motif_count returns for
# its motif dict (called with trans=False).
latent_motif_count = {}
for thread_id, motif_dict in threads_motifs.items():
    latent_counts = hc._latent_motif_count(motif_dict, trans=False)
    latent_motif_count[thread_id] = latent_counts[0]

In [399]:
# One row per key of hyperconvo_feats (orient='index'); remember the column names so this
# feature group can be selected from the combined table later.
hyperconv_df = pd.DataFrame.from_dict(hyperconvo_feats, orient='index')
hyperconv_feat_names = list(hyperconv_df.columns)

In [405]:
path_stats_df = pd.DataFrame.from_dict(path_stats, orient='index')
# Flatten each column label: keep only its string parts and join them with ", "
# (labels are presumably tuples — TODO confirm).
columns = [
    ', '.join(part for part in col if type(part) == str).strip()
    for col in path_stats_df.columns.values
]
path_stats_df.columns = columns
path_feat_names = list(path_stats_df.columns)

In [454]:
# One row per key of motif_counts (orient='index'); keep the column names for later
# feature-group selection.
motif_counts_df = pd.DataFrame.from_dict(motif_counts, orient='index')
motif_feat_names = list(motif_counts_df.columns)

In [459]:
# One row per key of latent_motif_count. Every column gets a 'LATENT_' prefix
# (presumably to keep these names distinct from the motif-count columns — TODO confirm).
latentmotif_df = pd.DataFrame.from_dict(latent_motif_count, orient='index')
latentmotif_df.columns = ['LATENT_'+c for c in latentmotif_df.columns]
latent_motif_feat_names = list(latentmotif_df.columns)

In [None]:
def get_num_users(thread):
    """Return how many distinct user names appear among a thread's utterances.

    Args:
        thread: mapping whose values are utterance objects exposing ``.user.name``.

    Returns:
        The number of unique ``utt.user.name`` values (0 for an empty thread).
    """
    unique_names = {utt.user.name for utt in thread.values()}
    return len(unique_names)

# Distinct-user count for every thread prefix, wrapped in a one-key dict per thread so
# that it converts directly into a single-column table.
thread_to_usercount = {
    thread_id: {"num_users": get_num_users(thread_pfx)}
    for thread_id, thread_pfx in thread_pfxs.items()
}

In [473]:
# Single-column table ('num_users') with one row per thread id.
num_users_df = pd.DataFrame.from_dict(thread_to_usercount, orient='index')
num_users_feat = ['num_users']

In [472]:
# Combine all feature groups side by side (axis=1), aligned on the row index.
# With pd.concat's default outer join, rows absent from one table get NaN in its columns.
feats_df = pd.concat([hyperconv_df, path_stats_df, motif_counts_df, latentmotif_df, num_users_df], axis=1)

In [28]:
# first generate positive and negative examples based on task

def generate_pos_neg(task: str, post_to_thread_obj, threads, thread_pfxs):
    """Sample one positive and one negative thread per post for a prediction task.

    For each post that has at least one positive and at least one negative
    candidate thread, one of each is drawn with ``random.choice``. The two
    returned lists are therefore aligned pair-by-pair and have equal length.

    Supported tasks:
        "comment-growth": a thread is positive if it has >= 15 utterances and
            negative if it has exactly 10.
        "commenter-growth": only threads with >= 20 utterances are considered.
            A thread is positive if the number of distinct user names in the
            full thread is at least twice the number in its prefix, and
            negative otherwise.

    Args:
        task: name of the task (see above).
        post_to_thread_obj: mapping of post id -> collection of thread root ids.
        threads: mapping of thread root id -> mapping of that thread's utterances.
        thread_pfxs: mapping of thread root id -> mapping of the utterances in
            the thread's prefix; only read for "commenter-growth".

    Returns:
        A ``(pos, neg)`` tuple of equally long lists of thread root ids.

    Raises:
        ValueError: if ``task`` is not one of the supported tasks. Without this
            check an unknown task would silently produce two empty lists.
    """
    if task not in ("comment-growth", "commenter-growth"):
        raise ValueError(
            "Unknown task {!r}; expected 'comment-growth' or 'commenter-growth'".format(task)
        )

    def _num_users(thread):
        # Distinct user names among a thread's utterances.
        return len(set(c.user.name for c in thread.values()))

    pos, neg = [], []
    for thread_roots in post_to_thread_obj.values():
        if task == "comment-growth":
            has_pos = [root for root in thread_roots if len(threads[root]) >= 15]
            has_neg = [root for root in thread_roots if len(threads[root]) == 10]
        else:  # "commenter-growth"
            has_pos, has_neg = [], []
            for root in thread_roots:
                if len(threads[root]) < 20:
                    continue
                if _num_users(threads[root]) >= _num_users(thread_pfxs[root]) * 2:
                    has_pos.append(root)
                else:
                    has_neg.append(root)

        # Only posts offering both kinds of thread contribute a pair. The positive
        # draw always precedes the negative one, so seeded runs stay repeatable.
        if has_pos and has_neg:
            pos.append(random.choice(has_pos))
            neg.append(random.choice(has_neg))

    print("- {} positive, {} negative pts for {} task".format(len(pos), len(neg), task))

    return pos, neg


In [29]:
# Sample matched positive/negative threads for the comment-growth task.
# NOTE(review): this runs before random.seed(2019) is set in a later cell, so the
# sampled pairs are not reproducible from here.
pos_comment_growth, neg_comment_growth = generate_pos_neg("comment-growth", 
                                                          thread_roots_by_self_post,
                                                          threads,
                                                          thread_pfxs
                                                         )

- 1827 positive, 1827 negative pts for comment-growth task


In [30]:
# Sample matched positive/negative threads for the commenter-growth task.
# NOTE(review): this runs before random.seed(2019) is set in a later cell, so the
# sampled pairs are not reproducible from here.
pos_commenter_growth, neg_commenter_growth = generate_pos_neg("commenter-growth", 
                                                          thread_roots_by_self_post,
                                                          threads,
                                                          thread_pfxs
                                                         )

- 849 positive, 849 negative pts for commenter-growth task


In [69]:
# Paired-prediction experiment: for each task, sample matched positive/negative threads
# and evaluate every feature subset on those same pairs.
random.seed(2019)  # fixes the random.choice draws made inside generate_pos_neg
for task in ["comment-growth", "commenter-growth"]: #, "post-deleted", "user-deleted"
    print("TASK: {}\n".format(task))

    pos, neg = generate_pos_neg(task, thread_roots_by_self_post, threads, thread_pfxs)

    pp = PairedPrediction()
    # Baseline run on the raw hyperconvo features with fit_predict's default settings.
    pp.fit_predict(hyperconvo_feats, pos, neg)

    for feature_set, name in [(hyperconv_feat_names, "hyperconvo"),
                        (hyperconv_feat_names + num_users_feat, "hyperconv-usercount"),
                        (latent_motif_feat_names, "latentmotif"),
                        (latent_motif_feat_names + num_users_feat, "latentmotif-usercount"),
                        (path_feat_names, "motifpaths"),
                        (path_feat_names + num_users_feat, "motifpaths-usercount"),
                        (hyperconv_feat_names + motif_feat_names, "hyperconv-motif"),
                        (hyperconv_feat_names + path_feat_names, "hyperconv-paths"),
                        (hyperconv_feat_names + latent_motif_feat_names, "hyperconv-latent"),
                        (hyperconv_feat_names + motif_feat_names + path_feat_names + latent_motif_feat_names, "hyperconvo-motifall"),
                        (num_users_feat, "usercount")
                       ]:
        print("Feature set: {}".format(name))
        # feats_df is the combined table that holds every column named in the feature
        # sets above; select just this subset's columns from it.
        pp.fit_predict(feats_df[feature_set], pos, neg, test_size=0.2)
        pp.print_extreme_coefs(feature_set, num_features=5)

TASK: comment-growth

- 1827 positive, 1827 negative pts for comment-growth task
Excluded {} data point(s) that contained NaN values.
Test accuracy of 0.5506
Feature set: hyperconvo
Excluded {} data point(s) that contained NaN values.
Test accuracy of 0.5506
TOP 5 FEATURES
prop-nonzero[indegree over C->C mid-thread responses]: 1.245
mean[outdegree over C->c responses]: 0.351
mean[outdegree over C->C responses]: 0.351
mean[indegree over C->C responses]: 0.351
norm.max[outdegree over C->c mid-thread responses]: 0.335

BOTTOM 5 FEATURES
max[indegree over C->C responses]: -0.375
entropy[indegree over C->C mid-thread responses]: -0.376
entropy[indegree over C->C responses]: -0.404
mean-nonzero[indegree over C->C responses]: -0.520
prop-nonzero[indegree over C->C responses]: -1.022

Feature set: hyperconv-usercount
Excluded {} data point(s) that contained NaN values.
Test accuracy of 0.5506
TOP 5 FEATURES
prop-nonzero[indegree over C->C mid-thread responses]: 1.246
mean[outdegree over C->c r

Test accuracy of 0.4940
TOP 5 FEATURES
entropy[outdegree over C->c mid-thread responses]: 0.578
entropy[outdegree over C->C mid-thread responses]: 0.578
norm.max[outdegree over c->c mid-thread responses]: 0.540
norm.2nd-largest[outdegree over c->c mid-thread responses]: 0.534
entropy[indegree over C->C mid-thread responses]: 0.442

BOTTOM 5 FEATURES
2nd-largest / max[indegree over C->c responses]: -0.458
max[indegree over C->C mid-thread responses]: -0.527
2nd-largest / max[indegree over C->C mid-thread responses]: -0.546
prop-multiple[indegree over c->c mid-thread responses]: -0.619
prop-multiple[indegree over C->c mid-thread responses]: -0.619

Feature set: latentmotif
Test accuracy of 0.4824
TOP 5 FEATURES
INCOMING_1TO3_TRIADS: 0.453
SINGLE_EDGE_TRIADS: 0.324
DIRECTED_CYCLE_TRIADS: 0.117
INCOMING_2TO3_TRIADS: 0.077
INCOMING_TRIADS: 0.026

BOTTOM 5 FEATURES
OUTGOING_3TO1_TRIADS: -0.078
TRIRECIPROCAL_TRIADS: -0.098
OUTGOING_TRIADS: -0.155
NO_EDGE_TRIADS: -0.238
DIRECIPROCAL_TRIADS: -0

In [302]:
# Re-sample comment-growth pairs for the bag-of-words experiment below.
# NOTE(review): generate_pos_neg draws with random.choice and no seed is set in this cell.
pos, neg = generate_pos_neg("comment-growth", thread_roots_by_self_post, threads, thread_pfxs)

- 1827 positive, 1827 negative pts for comment-growth task


In [None]:
# Concatenate (space-separated) the text of every utterance in each sampled thread's
# prefix, stored under a single "text" key per thread.
threads_text = {
    thread_id: {"text": " ".join(utt.text for utt in thread_pfxs[thread_id].values())}
    for thread_id in pos + neg
}

In [319]:
# from_dict puts thread ids in the columns; transpose so that each thread is a row with a
# single 'text' column.
text_df = pd.DataFrame.from_dict(threads_text).T

In [320]:
# First split the (pos, neg) pairs into train and test sets

In [321]:
from sklearn.model_selection import train_test_split
# Split at the level of (pos, neg) pairs so that both members of a pair land in the same
# partition; 20% of pairs are held out and random_state=42 fixes the shuffle.
pos_neg_train, pos_neg_test = train_test_split(list(zip(pos, neg)), test_size=0.2, random_state=42)

In [322]:
# Unzip the (pos, neg) pairs back into parallel id lists for each partition.
pos_train = [pos_id for pos_id, _ in pos_neg_train]
neg_train = [neg_id for _, neg_id in pos_neg_train]
pos_test = [pos_id for pos_id, _ in pos_neg_test]
neg_test = [neg_id for _, neg_id in pos_neg_test]

In [323]:
# Fit a bag-of-words (BOW) vectorizer on the training set

In [325]:
# Text rows of the training threads: all positives first, then all negatives.
train_text_df = text_df.loc[pos_train + neg_train]

In [326]:
# Text rows of the held-out threads: all positives first, then all negatives.
test_text_df = text_df.loc[pos_test + neg_test]

In [332]:
# Sanity check: number of held-out threads (positives plus negatives).
len(test_text_df)

732

In [379]:
# Bag-of-words vectorizer. Float min_df/max_df are document-frequency proportions, so only
# terms appearing in at least 15% and at most 70% of the training threads are kept.
cv = CountVectorizer(min_df=0.15, max_df=0.7)

In [380]:
# Learn the vocabulary from the training text only and build its term-count matrix.
train_text_arr = cv.fit_transform(train_text_df['text'])

In [381]:
# Densify the count matrix into a table indexed by thread id, with one column per term.
# NOTE(review): CountVectorizer.get_feature_names was removed in newer scikit-learn
# releases in favour of get_feature_names_out — confirm against the installed version.
train_text_transform_df = pd.DataFrame(train_text_arr.toarray(), columns=cv.get_feature_names(), index=train_text_df.index)

In [382]:
# Preview the first rows of the training term-count table.
train_text_transform_df.head()

Unnamed: 0,about,actually,after,again,all,already,also,always,am,an,...,will,without,work,would,wouldn,wrong,yeah,years,yes,your
dsmhsbk,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e3d0msv,0,2,0,0,1,0,1,0,0,6,...,0,0,0,0,0,0,1,2,0,0
dxn8zba,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dz19xqz,1,0,0,0,4,0,1,2,1,2,...,1,0,0,1,0,0,0,0,0,1
e47jq7n,0,1,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0


In [383]:
# Apply the vocabulary fitted on the training text (transform only, no re-fit) to the
# held-out text and build the matching thread-id-indexed table.
# NOTE(review): CountVectorizer.get_feature_names was removed in newer scikit-learn
# releases in favour of get_feature_names_out — confirm against the installed version.
test_text_arr = cv.transform(test_text_df['text'])
test_text_transform_df = pd.DataFrame(test_text_arr.toarray(), columns=cv.get_feature_names(), index=test_text_df.index)   


In [384]:
from pandas import DataFrame
def _generate_paired_X_y(feats: DataFrame, pos_ids, neg_ids):

    X, y = [], []
    flip = True

    excluded = 0
    for idx in range(len(pos_ids)):
        pos_feats = np.array(feats.loc[pos_ids[idx]])
        neg_feats = np.array(feats.loc[neg_ids[idx]])

        if (np.isnan(pos_feats).any() or np.isnan(neg_feats).any()):
            excluded += 1
            continue

        if flip:
            y.append(1)
            diff = pos_feats - neg_feats
        else:
            y.append(0)
            diff = neg_feats - pos_feats

        X.append(diff)
        flip = not flip

    if excluded > 0:
        print("Excluded {} data point(s) that contained NaN values.".format(excluded))

    return np.array(X), np.array(y)

In [385]:
# Paired difference vectors and labels for the training pairs.
X_train, y_train = _generate_paired_X_y(train_text_transform_df, pos_train, neg_train)

In [386]:
# Paired difference vectors and labels for the held-out pairs.
X_test, y_test = _generate_paired_X_y(test_text_transform_df, pos_test, neg_test)

In [387]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Standardise the features, then fit a logistic regression (liblinear solver). The step
# name "logreg" is what print_extreme_coefs uses to look up the coefficients.
clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])       

In [388]:
# Shape of the training matrix: (number of pairs, number of vocabulary terms).
X_train.shape

(1461, 192)

In [389]:
# Fit the pipeline on the paired training differences.
clf = clf.fit(X_train, y_train)

In [390]:
# Accuracy on the training pairs and on the held-out pairs, reported under the label
# "cumulative_bow".
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- {}: {:.4f} train, {:.4f} test".format("cumulative_bow", train_acc, test_acc))

- cumulative_bow: 0.6489 train, 0.5000 test


In [397]:
def print_extreme_coefs(clf, feature_names, num_features: int = 5):
    """Print the features with the largest and smallest logistic-regression weights.

    Args:
        clf: fitted pipeline exposing ``named_steps['logreg'].coef_``; the first
            row of ``coef_`` is used.
        feature_names: names aligned with the coefficients (same length, same order).
        num_features: how many features to list at each end of the ranking.
            Zero or a negative value lists none.

    Raises:
        AssertionError: if ``feature_names`` and the coefficients differ in length.
    """
    coefs = clf.named_steps['logreg'].coef_[0].tolist()

    assert len(feature_names) == len(coefs)

    # Rank from the most positive to the most negative coefficient.
    feats_coefs = sorted(zip(feature_names, coefs), key=lambda x: x[1], reverse=True)

    # A plain feats_coefs[-num_features:] would return the *whole* list when
    # num_features is 0 (because -0 == 0), so compute the start index explicitly.
    shown = max(num_features, 0)
    top = feats_coefs[:shown]
    bottom = feats_coefs[max(len(feats_coefs) - shown, 0):]

    print()
    print("TOP {} FEATURES".format(num_features))
    for ft, coef in top:
        print("{}: {:.3f}".format(ft, coef))
    print()
    print("BOTTOM {} FEATURES".format(num_features))
    for ft, coef in bottom:
        print("{}: {:.3f}".format(ft, coef))
    print()

In [398]:
# Show the 20 most positive and 20 most negative term weights of the BOW model.
# NOTE(review): CountVectorizer.get_feature_names was removed in newer scikit-learn
# releases in favour of get_feature_names_out — confirm against the installed version.
print_extreme_coefs(clf, cv.get_feature_names(), num_features=20)


TOP 20 FEATURES
my: 0.310
like: 0.248
both: 0.203
right: 0.169
don: 0.166
any: 0.160
going: 0.157
from: 0.156
only: 0.155
he: 0.153
how: 0.140
also: 0.136
which: 0.134
had: 0.126
else: 0.123
find: 0.120
everyone: 0.113
every: 0.111
thought: 0.111
them: 0.109

BOTTOM 20 FEATURES
maybe: -0.114
know: -0.118
actually: -0.118
few: -0.119
your: -0.120
through: -0.123
want: -0.126
one: -0.127
wrong: -0.131
would: -0.147
him: -0.151
got: -0.158
com: -0.175
who: -0.175
man: -0.179
does: -0.183
up: -0.199
can: -0.265
re: -0.294
was: -0.295



In [None]:
import random
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

# NOTE(review): this cell has no execution count and refers to names that are not defined
# anywhere in the visible notebook (generate_paired_features, hyperconv_motif,
# hyperconv_paths, hyperconv_latent, hyperconv_motifall). It looks like an earlier version
# of the paired-prediction experiment — confirm before running or remove.
random.seed(2019)

for task in ["comment-growth", "commenter-growth"]: #, "post-deleted", "user-deleted"
    print("TASK: {}\n".format(task))
    
    pos, neg = generate_pos_neg(task, thread_roots_by_self_post, threads, thread_pfxs)

    # One (X, y) dataset per feature source, all built from the same sampled pairs.
    X, y = generate_paired_features(hyperconvo_feats, pos, neg)
    X_motifcnt, y_motifcnt = generate_paired_features(motif_counts, pos, neg)
    X_latent, y_latent = generate_paired_features(latent_motif_count, pos, neg)
    X_path, y_path = generate_paired_features(path_stats, pos, neg)
    X_hcmotif, y_hcmotif = generate_paired_features(hyperconv_motif, pos, neg)
    X_hcpath, y_hcpath = generate_paired_features(hyperconv_paths, pos, neg)
    X_hclatent, y_hclatent = generate_paired_features(hyperconv_latent, pos, neg)
    X_all, y_all = generate_paired_features(hyperconv_motifall, pos, neg)
    # NOTE(review): the loop variables X and y rebind the X, y assigned just above; the
    # list of tuples is built before the first iteration, so the first entry still sees
    # the original values.
    for X, y, feats, name in [(X, y, hyperconvo_feats, "hyperconv"),
                       (X_motifcnt, y_motifcnt, motif_counts, "motifcount"),
                       (X_latent, y_latent, latent_motif_count, "latentmotif"),
                       (X_path, y_path, path_stats, "motifpaths"),
                       (X_hcmotif, y_hcmotif, hyperconv_motif, "hyperconv-motif"),
                       (X_hcpath, y_hcpath, hyperconv_paths, "hyperconv-paths"),
                       (X_hclatent, y_hclatent, hyperconv_latent, "hyperconv-latent"),
                       (X_all, y_all, hyperconv_motifall, "hyperconvo-motifall")
                      ]:
        # Hold out 20% of the paired examples; random_state=42 fixes the shuffle.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
        clf.fit(X_train, y_train)

        train_acc = clf.score(X_train, y_train)
        test_acc = clf.score(X_test, y_test)
        print("- {}: {:.4f} train, {:.4f} test".format(name, train_acc, test_acc))
        # NOTE(review): `feats` is the raw feature container here, while print_extreme_coefs
        # asserts that len(feature_names) equals the number of coefficients — verify that
        # the intended argument is a list of feature names.
        print_extreme_coefs(clf, feats)