In [1]:
import os
os.getcwd()
os.chdir("..")
os.chdir("..")
os.getcwd()

'/Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit'

In [2]:
import convokit
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

In [3]:
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus-small


In [4]:
threads = corpus.utterance_threads(prefix_len=10, include_root=False)

In [5]:
# 'e58slx0'

In [6]:
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)


In [7]:
threads_motifs = hc.retrieve_motifs(corpus)

In [20]:
threads_motif_path_stats = hc.retrieve_motif_pathway_stats(corpus)

In [12]:
threads_paths = hc.retrieve_motif_paths(corpus)

In [23]:
from typing import List

In [24]:
from convokit import TriadMotif

In [29]:
def validate_motif(motif: TriadMotif):
    utts_replied_to = [edge_set[0]['reply_to'] for edge_set in motif.edges]
    return max(Counter(utts_replied_to).values()) == 2

In [30]:
incoming_id = ('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS')
incoming_2to3_id = ('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS', 'INCOMING_2TO3_TRIADS')

In [32]:
from random import choice

In [33]:
neg = []
pos = []
for thread_id, motif_paths in threads_paths.items():
    valid_incoming = [motif for motif in motif_paths[incoming_id] if validate_motif(motif)]
    valid_2to3 = [motif for motif in motif_paths[incoming_2to3_id] if validate_motif(motif)]

    if valid_incoming and valid_2to3:
        neg.append(choice(valid_incoming))
        pos.append(choice(valid_2to3))

In [34]:
len(neg)

378

In [35]:
len(pos)

378

378 / 408 of the pairs satisfy the criteria

In [47]:
pos_bow_feats = dict()
neg_bow_feats = dict()

In [48]:
def get_tlc(motif: TriadMotif):
    return motif.edges[0][0]['top_level_comment']

In [294]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

pos_text = []
neg_text = []
for motif in pos:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif.edges], key=lambda x: x['timestamp'])
    text1 = " ".join(["1_"+w.strip() for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join(["2_"+w.strip() for w in time_sorted_edges[1]['text'].split(" ")])
    pos_text.append(text1 + " " + text2)
    
for motif in neg:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif.edges], key=lambda x: x['timestamp'])
    text1 = " ".join(["1_"+w.strip() for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join(["2_"+w.strip() for w in time_sorted_edges[1]['text'].split(" ")])
    neg_text.append(text1 + " " + text2)

# pos_train, pos_test, neg_train, neg_test = train_test_split(pos, neg, test_size=0.2, random_state=42)
pos_ids, neg_ids = [get_tlc(motif) for motif in pos], [get_tlc(motif) for motif in neg]


In [295]:
pos_id_to_text = {pos_ids[i]: pos_text[i] for i in range(len(pos_ids))}
neg_id_to_text = {neg_ids[i]: neg_text[i] for i in range(len(neg_ids))}



In [296]:
train_ids, test_ids = train_test_split(list(pos_id_to_text), test_size=0.2, random_state=42)

In [297]:
pos_train = [pos_id_to_text[id] for id in train_ids]
neg_train = [neg_id_to_text[id] for id in train_ids]
pos_test = [pos_id_to_text[id] for id in test_ids]
neg_test = [neg_id_to_text[id] for id in test_ids]

In [346]:
cv = CountVectorizer(min_df=0.05, max_df=0.8, ngram_range=(1, 3)) # excluding stop_words field improves performance
cv.fit(pos_train + neg_train)



CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=0.05,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [347]:
pos_data = cv.transform(pos_train + pos_test).toarray()
neg_data = cv.transform(neg_train + neg_test).toarray()
cols = cv.get_feature_names()
pos_df = pd.DataFrame(pos_data, index=train_ids + test_ids, columns=cols)
neg_df = pd.DataFrame(neg_data, index=train_ids + test_ids, columns=cols)

In [348]:
pos_df.shape

(374, 159)

In [349]:
def generate_paired_X_y(pos_df, neg_df):
    df = pd.DataFrame(columns=pos_df.columns)
    y = []
    for idx in range(pos_df.shape[0]):
        if idx % 2 == 0:
            df = df.append(pos_df.iloc[idx] - neg_df.iloc[idx])
            y.append(1)
        else:
            df = df.append(neg_df.iloc[idx] - pos_df.iloc[idx])
            y.append(0)
    y = pd.DataFrame(y, index=train_ids+test_ids)
    return df, y

In [350]:
X, y = generate_paired_X_y(pos_df, neg_df)

In [351]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import normalize, StandardScaler, Normalizer



In [352]:
from sklearn.linear_model import LogisticRegression
clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
X_train = X.loc[train_ids]
y_train = y.loc[train_ids]
X_test = X.loc[test_ids]
y_test = y.loc[test_ids]
clf.fit(X.loc[train_ids], y.loc[train_ids])

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [353]:
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- BOW: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

print()

- BOW: 0.8696 train, 0.5333 test



In [354]:
def print_extreme_coefs(clf, feats, k):
    coefs = clf.named_steps['logreg'].coef_[0].tolist()
    
    assert len(feats) == len(coefs)
    feats_coefs = sorted(list(zip(feats, coefs)), key=lambda x: x[1], reverse=True)
    
    print("TOP {} FEATURES".format(k))
    for ft, coef in feats_coefs[:k]:
        print("{}: {:.3f}".format(ft, coef))
    print()
    print("BOTTOM {} FEATURES".format(k))
    for ft, coef in feats_coefs[-k:]:
        print("{}: {:.3f}".format(ft, coef))
    print()

In [355]:
print_extreme_coefs(clf, list(cv.get_feature_names()), k=20)

TOP 20 FEATURES
1_don: 1.090
1_at: 0.920
1_of 1_the: 0.904
1_he: 0.856
1_i: 0.840
1_because: 0.742
1_the: 0.740
2_ gt: 0.714
2_with: 0.699
2_there: 0.677
2_a: 0.667
1_your: 0.658
1_how: 0.639
2_not: 0.609
2_to: 0.602
2_people: 0.594
2_in 2_the: 0.568
1_like: 0.564
com: 0.559
1_than: 0.551

BOTTOM 20 FEATURES
1_are: -0.527
1_people: -0.536
1_they: -0.538
1_as: -0.568
2_about: -0.573
1_has: -0.578
2_know: -0.582
2_the: -0.596
2_even: -0.616
2_it: -0.694
2_in: -0.712
2_would: -0.740
the: -0.784
1_more: -0.847
1_of: -0.878
ve: -0.881
1_in 1_the: -0.987
re: -1.122
2_first: -1.151
1_was: -1.164



### Time diff between first/second edge, length of first edge text, length of second edge text

In [308]:
def get_features_from_motif(motif_inst):
    time_sorted_edges = sorted([e[0] for e in motif_inst.edges], key=lambda x: x['timestamp'])
    time_diff = time_sorted_edges[1]['timestamp'] - time_sorted_edges[0]['timestamp']
    num_words_1 = len(list(time_sorted_edges[0]['text'].split(" ")))
    num_words_2 = len(list(time_sorted_edges[1]['text'].split(" ")))
    return [time_diff, num_words_1, num_words_2]

In [309]:
pos_feats = {get_tlc(motif): get_features_from_motif(motif) for motif in pos}
neg_feats = {get_tlc(motif): get_features_from_motif(motif) for motif in neg}

In [310]:
pos_feats_df = pd.DataFrame.from_dict(pos_feats).T
neg_feats_df = pd.DataFrame.from_dict(neg_feats).T

In [311]:
pos_feats_df.columns = ['time_diff', 'first_utt_len', 'second_utt_len']
neg_feats_df.columns = ['time_diff', 'first_utt_len', 'second_utt_len']

In [282]:
X, y = generate_paired_X_y(pos_feats_df, neg_feats_df)

In [312]:
X_train2 = X.loc[train_ids]
y_train2 = y.loc[train_ids]
X_test2 = X.loc[test_ids]
y_test2 = y.loc[test_ids]

clf2 = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
clf2.fit(X_train2, y_train2)

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [313]:
y_train2.shape

(299, 1)

In [314]:
X_train2.shape

(299, 159)

In [315]:
train_acc = clf2.score(X_train2, y_train2)
test_acc = clf2.score(X_test2, y_test2)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

- Basic features: 0.8696 train, 0.5333 test


### BOW + Basic features

In [327]:
X_train_combined = pd.concat([X_train, X_train2], axis=1)

In [328]:
clf3 = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
clf3.fit(X_train_combined, y_train)

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [329]:
X_test_combined = pd.concat([X_test, X_test2], axis=1)

In [330]:
train_acc = clf3.score(X_train_combined, y_train)
test_acc = clf3.score(X_test_combined, y_test)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

- Basic features: 0.8696 train, 0.5067 test
