In [1]:
import os

# Move the working directory two levels up from where the kernel started.
# The start directory is recorded the first time this cell runs, so re-running
# the cell always lands in the same place; a bare relative os.chdir('../..')
# would climb two more levels on every execution.
NOTEBOOK_START_DIR = globals().get('NOTEBOOK_START_DIR', os.getcwd())
os.chdir(os.path.join(NOTEBOOK_START_DIR, '..', '..'))

In [2]:
import convokit
from convokit import Corpus, PairedPrediction, download



In [3]:
# Build the Corpus from the value returned by download('friends-corpus')
# (presumably the local dataset path; the output below shows it was already cached).
corpus = Corpus(filename=download('friends-corpus'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/friends-corpus


In [4]:
# Print the number of users, utterances and conversations in the corpus.
corpus.print_summary_stats()

Number of Users: 700
Number of Utterances: 67373
Number of Conversations: 3107


In the friends-corpus, each Conversation is a scene, and each Utterance is a line spoken by a character in that scene.

Let's do a basic paired prediction: predicting whether or not a scene goes on to have all six friends participating, based on features from its first 5 utterances.

## Adding features for first 5 utterances and label for whether all 6 Friends eventually participate

In [5]:
def add_convo_features(convo, num_utts=5, main_chars=None):
    """Annotate a conversation with early-speaker features and the prediction label.

    The conversation's utterances are ordered by utterance id and the first
    ``num_utts`` of them are inspected. The following metadata is written
    through ``convo.add_meta`` (``N`` stands for ``num_utts``):

    * ``has_<character>_N`` -- 1 if the character speaks within the first N
      utterances, else 0 (one entry per character in ``main_chars``).
    * ``num_main_characters_N`` -- how many of ``main_chars`` speak within the
      first N utterances.
    * ``all_friends_present`` -- True if every character in ``main_chars``
      speaks at some point in the conversation, else False.

    Args:
        convo: object exposing ``iter_utterances()`` and ``add_meta(key, value)``;
            each utterance must expose ``id`` and ``user.id``.
        num_utts: number of leading utterances used for the features (default 5).
        main_chars: iterable of speaker ids to track; defaults to the six
            main Friends characters.

    Returns:
        None. The conversation's metadata is updated in place.
    """
    if main_chars is None:
        main_chars = ['Monica Geller', 'Joey Tribbiani', 'Chandler Bing',
                      'Phoebe Buffay', 'Rachel Green', 'Ross Geller']
    main_chars = list(main_chars)

    # Utterance ids are used as the ordering key for the conversation.
    utts = sorted(convo.iter_utterances(), key=lambda utt: utt.id)
    early_speakers = {utt.user.id for utt in utts[:num_utts]}

    num_present = 0
    for char in main_chars:
        present = int(char in early_speakers)
        convo.add_meta('has_{}_{}'.format(char, num_utts), present)
        num_present += present
    convo.add_meta('num_main_characters_{}'.format(num_utts), num_present)

    # Label: does every tracked character speak somewhere in the conversation?
    # Using issubset avoids hard-coding the number of tracked characters.
    all_speakers = {utt.user.id for utt in utts}
    convo.add_meta('all_friends_present', set(main_chars).issubset(all_speakers))

In [6]:
# Annotate every conversation in the corpus with the metadata written by
# add_convo_features (early-speaker features and the all_friends_present label).
for convo in corpus.iter_conversations():
    add_convo_features(convo)

In [7]:
# Count the conversations whose 'all_friends_present' label is set
num_convos_with_all_present = sum(
    1
    for scene in corpus.iter_conversations()
    if scene.meta['all_friends_present'] == 1
)
print(num_convos_with_all_present)

311


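We will pair scenes by episode, so that each positive scene (all six friends eventually participate) is matched with a negative scene from the same season and episode: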

In [8]:
# The six main characters, identified by their speaker ids in the corpus
main_chars = [
    'Monica Geller',
    'Joey Tribbiani',
    'Chandler Bing',
    'Phoebe Buffay',
    'Rachel Green',
    'Ross Geller',
]

# Prediction features: one presence flag per character, followed by the count
pred_feats = ['has_{}_5'.format(name) for name in main_chars] + ['num_main_characters_5']

In [9]:
def _episode_key(convo):
    """Pairing key: the conversation's season and episode codes, concatenated."""
    return convo.meta['season'] + convo.meta['episode']


def _is_positive(convo):
    """Positive example: all six friends take part in the conversation."""
    return convo.meta['all_friends_present'] == 1


def _is_negative(convo):
    """Negative example: at least one friend never takes part."""
    return convo.meta['all_friends_present'] == 0


pp = PairedPrediction(
    obj_type="conversation",
    pairing_func=_episode_key,
    pred_feats=pred_feats,
    pos_label_func=_is_positive,
    neg_label_func=_is_negative,
)

In [10]:
# Add the pairing information to the conversations: each conversation's meta
# gains 'label', 'pair_id' and 'pair_orientation' entries (left as None for
# conversations that end up without a pair).
pp.transform(corpus)

<convokit.model.corpus.Corpus at 0x132ae0c18>

In [11]:
# Take the first conversation yielded by the corpus so its metadata can be inspected.
convo = next(corpus.iter_conversations())

In [12]:
# Most conversations will have None in the paired prediction fields
# ('label', 'pair_id', 'pair_orientation') -- presumably because they were
# not matched into any pair.

convo.meta

{'season': 's01',
 'episode': 'e01',
 'scene': 'c01',
 'has_Monica Geller_5': 1,
 'has_Joey Tribbiani_5': 1,
 'has_Chandler Bing_5': 1,
 'has_Phoebe Buffay_5': 1,
 'has_Rachel Green_5': 0,
 'has_Ross Geller_5': 0,
 'num_main_characters_5': 4,
 'all_friends_present': True,
 'label': None,
 'pair_id': None,
 'pair_orientation': None}

In [13]:
# Find the first conversation that was actually placed in a pair.
# The generator uses its own loop variable, so the global `convo` inspected in
# an earlier cell is not silently overwritten, and a missing match fails loudly
# here instead of as a NameError in a later cell.
sample_convo = next(
    (paired for paired in corpus.iter_conversations()
     if paired.meta['pair_id'] is not None),
    None,
)
if sample_convo is None:
    raise ValueError("No conversation has a non-None 'pair_id'.")

In [14]:
# Metadata of a paired conversation: 'label', 'pair_id' and 'pair_orientation' are populated.
sample_convo.meta

{'season': 's01',
 'episode': 'e01',
 'scene': 'c04',
 'has_Monica Geller_5': 0,
 'has_Joey Tribbiani_5': 1,
 'has_Chandler Bing_5': 1,
 'has_Phoebe Buffay_5': 0,
 'has_Rachel Green_5': 0,
 'has_Ross Geller_5': 1,
 'num_main_characters_5': 3,
 'all_friends_present': False,
 'label': 'neg',
 'pair_id': 's01e01',
 'pair_orientation': 'neg'}

In [15]:
import numpy as np
# Average the values returned by pp.summarize(corpus) into a single number
# (presumably per-pair or per-fold accuracies -- TODO confirm against the ConvoKit docs).
np.mean(pp.summarize(corpus))

Found 149 valid pairs.


0.7315436241610739

In [16]:
# Tabulate the coefficient associated with each prediction feature.
pp.get_coefs(feature_names=pred_feats)

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
num_main_characters_5,0.991779
has_Phoebe Buffay_5,0.710907
has_Chandler Bing_5,0.373066
has_Monica Geller_5,0.291678
has_Rachel Green_5,0.239632
has_Ross Geller_5,0.192501
has_Joey Tribbiani_5,0.184483


In [17]:
# Print the 2 features with the largest coefficients and the 2 with the smallest.
pp.print_extreme_coefs(feature_names=pred_feats, num_features=2)


TOP 2 FEATURES
num_main_characters_5: 0.992
has_Phoebe Buffay_5: 0.711

BOTTOM 2 FEATURES
has_Ross Geller_5: 0.193
has_Joey Tribbiani_5: 0.184



DISCLAIMER: This is just a demo. The results here are **not rigorous** and should not be treated as meaningful findings.