In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from convokit import Corpus, download

In [2]:
# Download the dataset
corpus = Corpus(filename=download("friends-corpus"))

Dataset already exists at /home/enrico/.convokit/downloads/friends-corpus


In [3]:
df = corpus.get_utterances_dataframe().reset_index()

In [4]:
# Get the utterances from only the main characters
main_characters = ['Monica Geller', 'Ross Geller', 'Phoebe Buffay', 'Joey Tribbiani', 'Chandler Bing', 'Rachel Green']
df_main_char = df[df['speaker'].isin(main_characters)]
df_main_char['num_words'] = df_main_char['text'].apply(lambda x: len(x.split()))
df_main_char.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_main_char['num_words'] = df_main_char['text'].apply(lambda x: len(x.split()))


Unnamed: 0,id,timestamp,text,speaker,reply_to,conversation_id,meta.tokens,meta.character_entities,meta.emotion,meta.caption,meta.transcript_with_note,meta.tokens_with_note,vectors,num_words
0,s01_e01_c01_u001,,There's nothing to tell! He's just some guy I ...,Monica Geller,,s01_e01_c01_u001,"[[There, 's, nothing, to, tell, !], [He, 's, j...","[[], [[0, 1, Paul the Wine Guy], [4, 5, Paul t...",,,,,[],11
1,s01_e01_c01_u002,,"C'mon, you're going out with the guy! There's ...",Joey Tribbiani,s01_e01_c01_u001,s01_e01_c01_u001,"[[C'mon, ,, you, 're, going, out, with, the, g...","[[[2, 3, Monica Geller], [8, 9, Paul the Wine ...",,,,,[],14
2,s01_e01_c01_u003,,"All right Joey, be nice. So does he have a hum...",Chandler Bing,s01_e01_c01_u002,s01_e01_c01_u001,"[[All, right, Joey, ,, be, nice, .], [So, does...","[[[2, 3, Joey Tribbiani]], [[2, 3, Paul the Wi...",,,,,[],16
3,s01_e01_c01_u004,,"Wait, does he eat chalk?",Phoebe Buffay,s01_e01_c01_u003,s01_e01_c01_u001,"[[Wait, ,, does, he, eat, chalk, ?]]","[[[3, 4, Paul the Wine Guy]]]",,,,,[],5
5,s01_e01_c01_u006,,"Just, 'cause, I don't want her to go through w...",Phoebe Buffay,s01_e01_c01_u005,s01_e01_c01_u001,"[[Just, ,, ', cause, ,, I, do, n't, want, her,...","[[[5, 6, Phoebe Buffay], [9, 10, Monica Geller...",,,,,[],16


In [5]:
# First, slower implementation of apperances count of characters
character_appearances = {}
conversations = {}
df_main_char.reset_index()
for i, row in df_main_char.iterrows():
    character = row['speaker']
    conversation_id = row['conversation_id']
    if conversation_id in conversations:
        if not (character in conversations[conversation_id]):
            conversations[conversation_id].append(character)
    else:
        conversations[conversation_id] = [character]
for id in conversations.keys():
    for character in conversations[id]:
        if character in character_appearances:
            character_appearances[character] += 1
        else:
            character_appearances[character] = 1
for character in character_appearances:
    print(f"{character}'s appearances = {character_appearances[character]}")

with open('character_appearances.json', 'w') as fp:
    json.dump(character_appearances, fp)

Monica Geller's appearances = 1442
Joey Tribbiani's appearances = 1454
Chandler Bing's appearances = 1513
Phoebe Buffay's appearances = 1342
Ross Geller's appearances = 1416
Rachel Green's appearances = 1461


In [6]:
# Second, faster implementation of apperances count of characters
character_appearances = {}

for i, row in df_main_char.iterrows():
    character = row['speaker']
    conversation_id = row['conversation_id']
    if character in character_appearances:
        if not (conversation_id in character_appearances[character]):
            character_appearances[character].append(conversation_id)
    else:
        character_appearances[character] = [conversation_id]

for character in character_appearances:
    character_appearances[character] = len(character_appearances[character])
    print(f"{character}'s appearances = {character_appearances[character]}")

Monica Geller's appearances = 1442
Joey Tribbiani's appearances = 1454
Chandler Bing's appearances = 1513
Phoebe Buffay's appearances = 1342
Ross Geller's appearances = 1416
Rachel Green's appearances = 1461


In [7]:
# Last implementation of apperances count of characters with per season separation
character_appearances = {}
prec = {}
for character in main_characters:
    prec[character] = 's01'

for i, row in df_main_char.iterrows():

    character = row['speaker']
    conversation_id = row['conversation_id']
    season = conversation_id.split('_')[0]
    if character in character_appearances:
        if prec[character] != season:
            character_appearances[character][prec[character]] = len(character_appearances[character][prec[character]])
            character_appearances[character][season] = [conversation_id]
        elif not (conversation_id in character_appearances[character][season]):
            character_appearances[character][season].append(conversation_id)
    else:
        character_appearances[character] = {}
        character_appearances[character][season] = [conversation_id]
    prec[character] = season

for character in main_characters:
    character_appearances[character][prec[character]] = len(character_appearances[character][prec[character]])
    count = 0
    for season in character_appearances[character]:
        count += character_appearances[character][season]
    character_appearances[character]['all'] = count
print(character_appearances)

with open('character_appearances.json', 'w') as fp:
    json.dump(character_appearances, fp)

{'Monica Geller': {'s01': 170, 's02': 150, 's03': 153, 's04': 149, 's05': 166, 's06': 156, 's07': 151, 's08': 116, 's09': 130, 's10': 101, 'all': 1442}, 'Joey Tribbiani': {'s01': 166, 's02': 140, 's03': 152, 's04': 159, 's05': 168, 's06': 163, 's07': 148, 's08': 129, 's09': 125, 's10': 104, 'all': 1454}, 'Chandler Bing': {'s01': 182, 's02': 158, 's03': 163, 's04': 166, 's05': 175, 's06': 171, 's07': 140, 's08': 119, 's09': 139, 's10': 100, 'all': 1513}, 'Phoebe Buffay': {'s01': 152, 's02': 139, 's03': 149, 's04': 138, 's05': 138, 's06': 142, 's07': 149, 's08': 116, 's09': 117, 's10': 102, 'all': 1342}, 'Ross Geller': {'s01': 171, 's02': 157, 's03': 159, 's04': 148, 's05': 156, 's06': 145, 's07': 131, 's08': 132, 's09': 116, 's10': 101, 'all': 1416}, 'Rachel Green': {'s01': 172, 's02': 149, 's03': 150, 's04': 164, 's05': 157, 's06': 160, 's07': 151, 's08': 137, 's09': 117, 's10': 104, 'all': 1461}}


In [8]:
# Implementation of line count on every character's appearance
lines_counts = {}
lines_counts_file = {}
for character in main_characters:
    lines_counts[character] = {}
    lines_counts_file[character] = {}

for i, row in df_main_char.iterrows():
    character = row['speaker']
    conversation_id = row['conversation_id']
    id = row['id']
    tokens = row['meta.tokens']
    id_to_tokens = {"id":(id,tokens)}
    if conversation_id in lines_counts[character]:
        lines_counts[character][conversation_id].append(id_to_tokens)
        lines_counts_file[character][conversation_id].append(id_to_tokens)
    else:
        lines_counts[character][conversation_id] = [id_to_tokens]
        lines_counts_file[character][conversation_id] = [id_to_tokens]
for character in lines_counts:
    for conversation_id in lines_counts[character]:
        lines_counts_file[character][conversation_id] = len(lines_counts[character][conversation_id])
        print(f"{conversation_id}'s # of lines = {len(lines_counts[character][conversation_id])}")

with open('lines_counts.json', 'w') as fp:
    json.dump(lines_counts_file, fp)

s01_e01_c01_u001's # of lines = 12
s01_e01_c02_u001's # of lines = 15
s01_e01_c05_u001's # of lines = 5
s01_e01_c08_u001's # of lines = 7
s01_e01_c11_u001's # of lines = 12
s01_e01_c12_u001's # of lines = 4
s01_e01_c13_u001's # of lines = 6
s01_e01_c14_u001's # of lines = 9
s01_e01_c15_u001's # of lines = 3
s01_e02_c01_u001's # of lines = 2
s01_e02_c03_u001's # of lines = 12
s01_e02_c04_u001's # of lines = 8
s01_e02_c05_u001's # of lines = 2
s01_e02_c11_u001's # of lines = 4
s01_e03_c01_u001's # of lines = 2
s01_e03_c03_u001's # of lines = 10
s01_e03_c04_u001's # of lines = 2
s01_e03_c05_u001's # of lines = 9
s01_e03_c06_u001's # of lines = 4
s01_e03_c09_u001's # of lines = 1
s01_e03_c10_u001's # of lines = 5
s01_e03_c12_u001's # of lines = 14
s01_e03_c13_u001's # of lines = 2
s01_e03_c14_u001's # of lines = 3
s01_e04_c01_u001's # of lines = 3
s01_e04_c02_u001's # of lines = 8
s01_e04_c05_u001's # of lines = 7
s01_e04_c07_u001's # of lines = 4
s01_e04_c10_u001's # of lines = 12
s01_e04

In [9]:
# Computation of number of words used and of number of usages of each word, still per each character
words_usages = {}
words_usages_file = {}
for character in main_characters:
    words_usages[character] = {}
    words_usages_file[character] = {}

for character in lines_counts:
    for conversation_id in lines_counts[character]:
        for line in lines_counts[character][conversation_id]:
            id, tokens = line['id']
            count = 0
            for sentence in tokens:
                for token in sentence:
                    if token in words_usages[character]:
                        words_usages[character][token] += 1
                    else:
                        words_usages[character][token] = 1
                count += len(sentence)
            print(f"{id}'s # of words = {count}")
            words_usages_file[character][id] = count
            
with open("words_per_line.json",'w') as fp:
    json.dump(words_usages_file,fp)

s01_e01_c01_u001's # of words = 15
s01_e01_c01_u007's # of words = 27
s01_e01_c01_u018's # of words = 9
s01_e01_c01_u023's # of words = 6
s01_e01_c01_u026's # of words = 7
s01_e01_c01_u028's # of words = 7
s01_e01_c01_u034's # of words = 5
s01_e01_c01_u040's # of words = 39
s01_e01_c01_u048's # of words = 2
s01_e01_c01_u051's # of words = 37
s01_e01_c01_u055's # of words = 17
s01_e01_c01_u057's # of words = 8
s01_e01_c02_u001's # of words = 22
s01_e01_c02_u014's # of words = 14
s01_e01_c02_u017's # of words = 18
s01_e01_c02_u021's # of words = 29
s01_e01_c02_u023's # of words = 13
s01_e01_c02_u028's # of words = 11
s01_e01_c02_u031's # of words = 21
s01_e01_c02_u033's # of words = 2
s01_e01_c02_u035's # of words = 8
s01_e01_c02_u037's # of words = 16
s01_e01_c02_u039's # of words = 2
s01_e01_c02_u043's # of words = 19
s01_e01_c02_u046's # of words = 26
s01_e01_c02_u048's # of words = 10
s01_e01_c02_u054's # of words = 5
s01_e01_c05_u001's # of words = 4
s01_e01_c05_u003's # of words = 

In [10]:
for character in words_usages:
    for word in words_usages[character]:
        print(f"{word}'s count = {words_usages[character][word]}")
with open("words_usages.json",'w') as fp:
    json.dump(words_usages,fp)

There's count = 68
's's count = 1918
nothing's count = 43
to's count = 1708
tell's count = 165
!'s count = 3726
He's count = 120
just's count = 657
some's count = 156
guy's count = 138
I's count = 4255
work's count = 113
with's count = 525
Okay's count = 508
,'s count = 6536
everybody's count = 41
relax's count = 6
.'s count = 6511
This's count = 125
is's count = 882
not's count = 549
even's count = 96
a's count = 1486
date's count = 27
It's count = 390
two's count = 73
people's count = 89
going's count = 236
out's count = 349
dinner's count = 43
and's count = 1029
-'s count = 752
having's count = 46
sex's count = 50
And's count = 248
they's count = 221
were's count = 185
n't's count = 1373
looking's count = 27
at's count = 310
you's count = 3405
before's count = 57
?!'s count = 338
Are's count = 88
okay's count = 169
sweetie's count = 38
?'s count = 2834
Carol's count = 6
moved's count = 9
her's count = 251
stuff's count = 71
today's count = 35
Let's count = 77
me's count = 702
get's 

In [11]:
words_usages_file = {}
prec = {}
for character in main_characters:
    words_usages_file[character] = {}
    for i in range(1,11):
        if i < 10:
            words_usages_file[character][f"s0{i}"] = {}
        else:
            words_usages_file[character][f"s{i}"] = {}
    words_usages_file[character]["all"] = words_usages[character]

for i,row in df_main_char.iterrows():
    character = row['speaker']
    conversation_id = row['conversation_id']
    season = conversation_id.split('_')[0]
    sentences = row['meta.tokens']
    for sentence in sentences:
        for token in sentence:
            if token in words_usages_file[character][season]:
                words_usages_file[character][season][token] += 1
            else:
                words_usages_file[character][season][token] = 1
print(words_usages_file)
with open("words_usages.json",'w') as fp:
    json.dump(words_usages_file,fp)




In [12]:
episodes_per_season = {}
for i, row in df_main_char.iterrows():
    conversation_id = row['conversation_id']
    split = conversation_id.split('_')
    season = split[0]
    episode = split[1]
    if season in episodes_per_season:
        if not (episode in episodes_per_season[season]):
            episodes_per_season[season].append(episode)
    else:
        episodes_per_season[season] = [episode]
for season in episodes_per_season:
    episodes_per_season[season] = len(episodes_per_season[season]) 
    print(f"season {season} has {episodes_per_season[season]} episodes")

with open('episodes_per_season.json', 'w') as fp:
    json.dump(episodes_per_season, fp)

season s01 has 24 episodes
season s02 has 24 episodes
season s03 has 25 episodes
season s04 has 24 episodes
season s05 has 24 episodes
season s06 has 25 episodes
season s07 has 24 episodes
season s08 has 24 episodes
season s09 has 24 episodes
season s10 has 18 episodes


In [29]:
tokens = {}
for character in main_characters:
    tokens[character] = {}
for i, row in df_main_char.iterrows():
    character = row['speaker']
    line = row['text']
    id = row['id']
    split = id.split('_')
    season = split[0]
    episode = split[1]
    conversation = split[2]
    utt = split[3]
    if season in tokens[character]:
        if episode in tokens[character][season]:
            if conversation in tokens[character][season][episode]:
                tokens[character][season][episode][conversation].append(line)
            else:
                tokens[character][season][episode][conversation] = [line]
        else:
            tokens[character][season][episode] = {}
            tokens[character][season][episode][conversation] = [line]
    else:
        tokens[character][season] = {}
        tokens[character][season][episode] = {}
        tokens[character][season][episode][conversation] = [line]

with open('lines_per_conversation.json', 'w') as fp:
    json.dump(tokens, fp)

In [27]:
tokens = {}
for character in main_characters:
    tokens[character] = {}
for i, row in df_main_char.iterrows():
    character = row['speaker']
    line = row['text']
    # flattened_line = [item for sublist in line for item in sublist]
    id = row['id']
    split = id.split('_')
    season = split[0]
    episode = split[1]
    conversation = split[2]
    utt = split[3]
    if season in tokens[character]:
        if episode in tokens[character][season]:
            if conversation in tokens[character][season][episode]:
                tokens[character][season][episode][conversation][utt] = line
            else:
                tokens[character][season][episode][conversation] = {}
                tokens[character][season][episode][conversation][utt] = line
        else:
            tokens[character][season][episode] = {}
            tokens[character][season][episode][conversation] = {}
            tokens[character][season][episode][conversation][utt] = line
    else:
        tokens[character][season] = {}
        tokens[character][season][episode] = {}
        tokens[character][season][episode][conversation] = {}
        tokens[character][season][episode][conversation][utt] = line


with open('separated_lines_per_conversation.json', 'w') as fp:
    json.dump(tokens, fp)