In [1]:
import os
# Move the working directory three levels up from the current working directory
# (presumably to the repository root so the local package is importable -- TODO confirm).
# NOTE: the path is relative, so re-running this cell without restarting the kernel
# moves up three more levels each time.
os.chdir('../../..')

In [2]:
from convokit import Corpus, download



In [3]:
# Obtain the 'subreddit-Cornell' dataset via download() and load it into a Corpus.
corpus = Corpus(filename=download('subreddit-Cornell'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/subreddit-Cornell


In [4]:
corpus.print_summary_stats()

Number of Users: 7568
Number of Utterances: 74467
Number of Conversations: 10744


## Bag-of-words prediction of whether an utterance has a positive score

In [5]:
from convokit import BoWClassifier

In [6]:
# An utterance is a positive example when its 'score' metadata is strictly greater than 0.
# (The `bow_classifer` spelling is kept because later cells refer to this exact name.)
bow_classifer = BoWClassifier(
    obj_type="utterance",
    labeller=lambda utterance: utterance.meta['score'] > 0,
)

Initializing default unigram CountVectorizer...
Initializing default classification model (standard scaled logistic regression)


In [7]:
# Fit on the entire corpus; no held-out split is made anywhere in this notebook.
bow_classifer.fit(corpus)

In [8]:
# Annotate the same corpus with the fitted model's output. Judging by the utterance
# metadata displayed in the next cell, this appears to add 'prediction' and
# 'pred_score' entries to each utterance's meta -- TODO confirm against the ConvoKit docs.
bow_classifer.transform(corpus)

<convokit.model.corpus.Corpus at 0x136d1fd68>

In [9]:
next(corpus.iter_utterances()).meta

{'score': 2,
 'top_level_comment': None,
 'retrieved_on': -1,
 'gilded': -1,
 'gildings': None,
 'subreddit': 'Cornell',
 'stickied': False,
 'permalink': '/r/Cornell/comments/nyx4d/so_i_was_away_this_past_semester_whats_going_on/',
 'author_flair_text': '',
 'bow_vector': <1x9340 sparse matrix of type '<class 'numpy.int64'>'
 	with 42 stored elements in Compressed Sparse Row format>,
 'prediction': True,
 'pred_score': 0.9999998491105527}

In [10]:
next(corpus.iter_utterances()).meta['bow_vector'].toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [37]:
bow_classifer.summarize(corpus).head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
doubowe,False,3.5142449999999996e-48
2mx7u9,False,8.327791e-35
15enm8,False,7.272589000000001e-28
2pl46k,False,3.508764e-23
e8pd1us,False,1.208424e-22


In [42]:
corpus.get_utterance('15enm8').text

'One, just to get this out of the way: I\'m only a sophomore in high school. In spite of this, my high school is one of the top public schools in New Jersey (and to put it bluntly it\'s a very affluent area... although I\'m not necessarily affluent like my classmates). The point of telling you guys that is kids start talking about all these amazing schools they want to go to in like eighth grade, so I know quite a bit about colleges. As stated in the title, I really want to go to Cornell, and I just was hoping that some of you guys and girls on here would be awesome enough to give out some SAT scores, ACT scores (if you took them), and extra curricular activities you guys got/did? My unweighted GPA is a 3.8 (weighted is a 4.2), and my first PSAT was an overall 1900, and from taking that I (not to sound cocky here) *know* that I\'m going to get that score up a *lot*. I\'m in all the highest level classes I can be in, and I\'m looking to take multiple AP courses next year (junior). Do yo

In [12]:
bow_classifer.get_coefs().head()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
child,1.270001
appealing,1.11569
whereas,1.109702
messed,1.081836
focusing,1.071464


In [13]:
bow_classifer.get_coefs().tail()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
criteria,-0.667785
entered,-0.722992
bathroom,-0.742234
korea,-0.764383
goal,-0.784593


In [14]:
from convokit import Classifier

In [15]:
# Classifier object used by the following cells to compute evaluation metrics.
# No feature list is supplied (pred_feats=None); the labelling rule is the same
# "score > 0" rule used for the bag-of-words classifier.
c = Classifier(
    pred_feats=None,
    obj_type="utterance",
    labeller=lambda utterance: utterance.meta['score'] > 0,
)

In [16]:
y_true, y_pred = c.get_y_true_pred(corpus)

In [17]:
# In the recorded run this value (0.92795...) equals 69102 / 74467, i.e. the share of
# utterances whose label is True according to the classification report further down.
c.base_accuracy(corpus)

0.9279546644822538

In [18]:
# NOTE(review): the model was fit on this same corpus and no held-out split exists,
# so this is accuracy on the training data, not a generalisation estimate.
# Presumably the labels are compared against the stored 'prediction' metadata -- TODO confirm.
c.accuracy(corpus)

0.9491452589737737

In [19]:
print(c.classification_report(corpus))

              precision    recall  f1-score   support

       False       0.88      0.34      0.49      5365
        True       0.95      1.00      0.97     69102

    accuracy                           0.95     74467
   macro avg       0.91      0.67      0.73     74467
weighted avg       0.95      0.95      0.94     74467



## Bag-of-words prediction of whether a comment thread doubles in length (at least 10 utterances) or stays the same length (exactly 5 utterances), based on its first 5 utterances

In [20]:
# Collect the ids of utterances that are their own top-level comment, i.e. whose
# 'top_level_comment' metadata equals the utterance's own id.
top_level_comment_ids = []
for utterance in corpus.iter_utterances():
    if utterance.id == utterance.meta['top_level_comment']:
        top_level_comment_ids.append(utterance.id)

In [21]:
corpus.print_summary_stats()

Number of Users: 7568
Number of Utterances: 74467
Number of Conversations: 10744


In [22]:
len(top_level_comment_ids)

32893

In [23]:
# Build a new corpus whose conversation roots are the top-level comments collected above,
# so that each comment thread is treated as a conversation.
# NOTE(review): the recorded run prints five ids and yields 32888 conversations from
# 32893 root ids; presumably those five roots could not be used -- verify.
threads_corpus = corpus.reindex_conversations(new_convo_roots=top_level_comment_ids)


['c3oyf4d', 'c3od15i', 'c3ocsyl', 'c3p8bze', 'c3p1rn8']


In [24]:
threads_corpus.print_summary_stats()

Number of Users: 6160
Number of Utterances: 63697
Number of Conversations: 32888


In [25]:
# Attach a 'thread_doubles' label to every thread:
#   True  -> the thread has at least 10 utterances
#   False -> the thread has exactly 5 utterances
#   None  -> any other length (these threads are filtered out by the selectors used later)
for convo in threads_corpus.iter_conversations():
    n_utterances = sum(1 for _utt in convo.iter_utterances())
    if n_utterances >= 10:
        doubles = True
    elif n_utterances == 5:
        doubles = False
    else:
        doubles = None
    convo.meta['thread_doubles'] = doubles

In [26]:
def _first_five_utterances_text(convo):
    """Join the text of the first five entries of the conversation's chronological utterance list with spaces."""
    texts = [utt.text for utt in convo.get_chronological_utterance_list()]
    return " ".join(texts[:5])


def _has_thread_label(convo):
    """Select only conversations whose 'thread_doubles' metadata is not None."""
    return convo.meta['thread_doubles'] is not None


def _thread_label(convo):
    """Use the conversation's 'thread_doubles' metadata value as its class label."""
    return convo.meta['thread_doubles']


# Conversation-level bag-of-words classifier over the first five utterances of each labelled thread.
bow = BoWClassifier(
    obj_type="conversation",
    text_func=_first_five_utterances_text,
    selector=_has_thread_label,
    labeller=_thread_label,
)

Initializing default unigram CountVectorizer...
Initializing default classification model (standard scaled logistic regression)


In [27]:
bow.fit(threads_corpus)

In [28]:
bow.transform(threads_corpus)

<convokit.model.corpus.Corpus at 0x136c6b400>

In [29]:
bow.summarize(threads_corpus).head()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
d8y9akn,False,1.55807e-16
dwqaa06,False,2.766371e-16
dxfib8r,False,2.728401e-15
dl7q7n2,False,8.207078e-14
drduxx1,False,2.465283e-12


In [30]:
bow.summarize(threads_corpus).tail()

Unnamed: 0_level_0,prediction,pred_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
e70wjy3,True,1.0
dsldpxg,True,1.0
dwa6k96,True,1.0
dandio0,True,1.0
dt05qyf,True,1.0


In [31]:
# Classifier object used by the following cells to compute metrics for the thread-doubling task.
# Only conversations with a label (thread_doubles is not None) are selected.
# NOTE: this rebinds `c`, which previously held the utterance-level Classifier.
c = Classifier(
    pred_feats=None,
    obj_type="conversation",
    selector=lambda convo: convo.meta['thread_doubles'] is not None,
    # Identity test rather than `== True`: the label is only ever set to True, False or None.
    labeller=lambda convo: convo.meta['thread_doubles'] is True,
)

In [32]:
# NOTE(review): in the recorded run this value (0.3238...) equals 408 / 1260, the share of
# True labels in the classification report below. True is the minority class here, so a
# majority-class baseline would be 852 / 1260 (about 0.676); interpret this number with care.
c.base_accuracy(threads_corpus)

0.3238095238095238

In [33]:
# NOTE(review): `bow` was fit and applied on this same threads_corpus with no held-out split,
# so a near-perfect score here reflects fit to the training data rather than
# generalisation. Presumably the labels are compared against the stored 'prediction'
# metadata -- TODO confirm.
c.accuracy(threads_corpus)

0.9992063492063492

In [34]:
print(c.classification_report(threads_corpus))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00       852
        True       1.00      1.00      1.00       408

    accuracy                           1.00      1260
   macro avg       1.00      1.00      1.00      1260
weighted avg       1.00      1.00      1.00      1260



In [35]:
bow.get_coefs()

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
tried,0.611836
sad,0.513460
basically,0.508508
runs,0.481761
talked,0.463195
describe,0.456155
degree,0.441427
content,0.417373
china,0.414063
imagine,0.390082
