In [1]:
import pandas as pd

raw_data_1 = pd.read_json('./ppg-episode-transcripts-1.json', encoding='utf-8')
raw_data_2 = pd.read_json('./ppg-episode-transcripts-2.json', encoding='utf-8')

In [2]:
raw_data = pd.concat([raw_data_1, raw_data_2])

raw_data.head()

Unnamed: 0,episode,dialog
0,'Twas the Fight Before Christmas,"[Narrator: 'Twas the city of Townsville, and a..."
1,15 Minutes of Fame,[Angela: And over here is our newest acquisiti...
2,A Comedy of Terrors,[Kaoru Matsubara: Check him out.]
3,A Documentary,[Brian Larsen: The city of Townsville. This is...
4,A Made Up Story,[Narrator: The city of Townsville! Where some...


In [3]:
raw_data.describe()

Unnamed: 0,episode,dialog
count,252,251
unique,252,237
top,'Twas the Fight Before Christmas,[]
freq,1,15


In [4]:
# Drop all
# - Animated shorts
# - Movies
# - PPG Z episodes
episodes_to_drop = [
   'a comedy of terrors', # powerpuff girls Z
    'enter the entourage',  # powerpuff girls Z
    "it's all because of him", # powerpuff girls Z
    'powerpuff girls to the rescue', # powerpuff girls Z

   'air buttercup', # Animated short, 2016
   "bubbles' beauty blog", # Animated short, 2016
   'ping pong z', # Animated short, 2016
   'run, blossom, run!', # Animated short, 2016
    "i'll be bake", # Animated short, 2017
    'mojo builds a shelf', # Animated short, 2017
    'the powerpuff girls rule!!!', # TV short, 2008

   'crime 101', # Pilot, 1995/1996
   'meat fuzzy lumkins', # Pilot, 1995

   'whoopass stew' # Movie, their first-ever appearance, 1992
   'the powerpuff girls movie', # Movie, 2002
   "'twas the fight before christmas",  # Movie, 2003
   'powerpuff girls special: dance pantsed', # Movie (weird style), 2014

   'ttg v ppg', # Teen titans, not ppg 
]

raw_data = raw_data[~raw_data['episode'].str.lower().isin(episodes_to_drop)]

In [5]:
all_lines = raw_data.explode(column='dialog')
all_lines = all_lines.reset_index(drop=True)
all_lines.head()

Unnamed: 0,episode,dialog
0,15 Minutes of Fame,Angela: And over here is our newest acquisitio...
1,15 Minutes of Fame,"Blossom: Bubbles, you can calm down now. You d..."
2,15 Minutes of Fame,"Buttercup: Yeah, dude! Check it out!"
3,15 Minutes of Fame,Cat: Meow
4,15 Minutes of Fame,"Blossom: That's right, cats can always get up ..."


In [6]:
all_lines[all_lines['dialog'].isna()]['episode']

2494                      Bye Bye, Bellum
2873                        Cheep Thrills
6668                     Horn, Sweet Horn
11011                         Night Mayor
11709              Once Upon a Townsville
14907                        Sideline Dad
14908                    Silent Treatment
15316    Small World: Heart to Heartstone
16126                        Strong-Armed
17210         Take Your Kids to Dooms Day
17477                       The Big Sleep
17930                             The Fog
18623                     The Squashening
18624                        The Stayover
19471           Total Eclipse of the Kart
Name: episode, dtype: object

In [7]:
speakers = all_lines['dialog'].str.extract("^([.,*#-'/\w\s]*):")

all_lines['speaker'] = speakers
all_lines['speaker'] = all_lines['speaker'].str.strip()
all_lines['speaker'] = all_lines['speaker'].str.lower()
speakers.nunique()

0    1114
dtype: int64

In [8]:
all_lines['dialog'].head(50)

0     Angela: And over here is our newest acquisitio...
1     Blossom: Bubbles, you can calm down now. You d...
2                  Buttercup: Yeah, dude! Check it out!
3                                             Cat: Meow
4     Blossom: That's right, cats can always get up ...
5                                      Cat: Meow, Meow.
6     Brian Larsen: The city of Townsville. This is ...
7     Brian: Oh, you might know them as those three ...
8     Brian: I'm Brian, and this is my documentary. ...
9         Brian: Sugar... spice... and everything nice.
10    Brian: Yep, those are pretty normal ingredient...
11    Brian: The city of Townsville. A great city wi...
12    Brian: The newly renovated and revitalized Old...
13    Brian: Although this bus looks better than tha...
14    Brian: There are other reminders and countless...
15    Brian: Their photos have graced newspapers and...
16    Brian: They have merchandising... and even the...
17    Brian: My first visit was unannounced... a

In [9]:
dialog_lines = all_lines['dialog']
dialog_lines = dialog_lines.reset_index(drop=True)
dialog_lines

0        Angela: And over here is our newest acquisitio...
1        Blossom: Bubbles, you can calm down now. You d...
2                     Buttercup: Yeah, dude! Check it out!
3                                                Cat: Meow
4        Blossom: That's right, cats can always get up ...
                               ...                        
20406    Mojo: But it is I who planned it! I did it, no...
20407    Mojo: Iam responsible for trying to destroy you! 
20408                   Blossom: Okay. Thenyougo to jail. 
20409                                  Mojo: That's right!
20410    Narrator: So once again the day is saved-thank...
Name: dialog, Length: 20411, dtype: object

In [10]:
def process_dialog(dialog):
    if isinstance(dialog, str):
        parts = dialog.split(': ')
        if len(parts) > 1:
            return ' '.join(parts[1:])
        elif len(parts) == 1:
            return parts[0]
    else:
        return None
    
dialog_without_speaker = dialog_lines.apply(process_dialog)

In [11]:
dialog_without_speaker.head(50)

0     And over here is our newest acquisition to the...
1           Bubbles, you can calm down now. You did it!
2                             Yeah, dude! Check it out!
3                                                  Meow
4     That's right, cats can always get up a tree......
5                                           Meow, Meow.
6     The city of Townsville. This is where the Powe...
7     Oh, you might know them as those three cute li...
8     I'm Brian, and this is my documentary. And you...
9                Sugar... spice... and everything nice.
10    Yep, those are pretty normal ingredients, all ...
11    The city of Townsville. A great city with many...
12    The newly renovated and revitalized Old Townsv...
13    Although this bus looks better than that corpo...
14    There are other reminders and countless monume...
15    Their photos have graced newspapers and tabloi...
16    They have merchandising... and even their own ...
17    My first visit was unannounced... and just

In [12]:
all_lines['dialog'] = dialog_without_speaker
all_lines['dialog']

0        And over here is our newest acquisition to the...
1              Bubbles, you can calm down now. You did it!
2                                Yeah, dude! Check it out!
3                                                     Meow
4        That's right, cats can always get up a tree......
                               ...                        
20406    But it is I who planned it! I did it, not them...
20407          Iam responsible for trying to destroy you! 
20408                            Okay. Thenyougo to jail. 
20409                                        That's right!
20410    So once again the day is saved-thanks to the P...
Name: dialog, Length: 20411, dtype: object

In [13]:
unique_speakers = all_lines['speaker'].unique()

for speaker in unique_speakers:
    if isinstance(speaker, str) and ',' in speaker:
        print(speaker)

bubbles, buttercup
blossom, buttercup
blossom, bubbles and buttercup
kids, blossom
slim, junior
mojo, princess, boys
blossom, bubbles, buttercup and the professor
blossom, bubbles, and buttercup
professor, blossom, buttercup
brick, butch
blossom, bubbles
teen blossom, teen buttercup
girl 2, boy 1
blossom/ms. bellum, buttercup/professor
mayor, ms. bellum
mojo, 'him'
bossman, bossman 2
slim, slim 2
junior, juniors 2 and 3
bubbles, buttercup, random boy, random girl, another random boy, robin snyder, another random boy, two random boys, blossom, jared, and barry
random girl , random boy , random boy , buttercup  on the left, jared , and robin , blossom , and two boys
blossom, buttercup, jared, barry, and robin
chad, man with glasses, and pink haired lady
driver, robbers
girls, duplicates
kids, parents
stan, sandra
fields, anthropist, blind
class, ms. keane
girls, professor
professor, ms. keane
ace, billy, arturo
fuzzy, princess, 'him'
kim, jeff, mary
joey, kim
necktie wearing monster, wai

In [14]:
# Save one as second speaker
double_speakers = [    
    "blossom and buttercup",
    "blossom and bubbles",
    "the professor and blossom",
    "both blossom and buttercup",
    "buttercup and blossom",
    "blossom and the professor",
    "bubbles and blossom",
    "bubbles and buttercup",
    "buttercup and bubbles",
    "professor and buttercup",
    "professor utonium and javier xavier",
    "man and woman",
    "girls and erica",
    "momoko and miyako",
    "kaoru and miyako",
    "ms. bellumold woman 1",
    "ms. keaneold woman 2",
    "mayor and ms. bellum",
    "blossom/ms. bellum",
    "ms. bellum/blossom",
    "buttercup/green batman",
    "bubbles/bubbs wonder",
    "buttercup/professor",
    "professor/buttercup",
    "blossom/ms. bellum",
    "bubbles/mayor",
    "mayor/bubbles",
    "ms. bellum/blossom",
    "girl 1/ace",
    "girl 2/grubber",
    "girl 3/snake",
    "old woman/mojo",
    "mojo/old woman",
    "blossom/dachshund",
    "bubbles/centipede",
    "buttercup/cobra",
    "professor/older girl",
    "mojo/fish",
    "professor/ugly baby",
    "professor/nurse",
    "professor/astronaut",
    "professor/pink creature",
    "professor/morbid woman",
    "bubbles/narrator",
    "narrator/bubbles",
    "blossom/bubbles",
    "octi/him",
    "buttercup/bubblecup",
    "bubbles/children",
    "old man/janitor",
    "eyes/blonde hair covered boy",
    "blossom/liberty belle",
    "bubbles/harmony bunny",
    "buttercup/mange",
    "blossom/buttercup",
    "bubbles, buttercup",
"girls, professor",
"blossom, buttercup",
"blossom, bubbles and buttercup",
"kids, blossom",
"slim, junior",
"mojo, princess, boys",
"blossom, bubbles, buttercup and the professor",
"blossom, bubbles, and buttercup",
"professor, blossom, buttercup",
"brick, butch",
"blossom, bubbles",
"teen blossom, teen buttercup",
"girl 2, boy 1",
"skinny, tiny",
"blossom/ms. bellum, buttercup/professor",
"mayor, ms. bellum",
"mojo, 'him'",
"bossman, bossman 2",
"slim, slim 2",
"junior, juniors 2 and 3",
"bubbles, buttercup, random boy, random girl, another random boy, robin snyder, another random boy, two random boys, blossom, jared, and barry",
"random girl , random boy , random boy , buttercup  on the left, jared , and robin , blossom , and two boys",
"blossom, buttercup, jared, barry, and robin",
"chad, man with glasses, and pink haired lady",
"driver, robbers",
"girls, duplicates",
"kids, parents",
"stan, sandra",
"fields, anthropist, blind",
"class, ms. keane",
"pug faced paulie, scared",
"momoko, miyako, and kaoru",
"professor, ms. keane",
"ace, billy, arturo",
"fuzzy, princess, 'him'",
"kim, jeff, mary",
"joey, kim",
"man 9, bass monster",
"gang, fuzzy, 'him'",
"bertha, beatrice",
"lloyd, floyd",
"ace, snake, lil' arturo, big billy",
"002, 017",
"ace, snake, arturo, billy",
"ace, snake",
"boys 2, 3, 4",
"monty, slim",
"bubbles, buttercup and the professor",
"girls, robin",
"barry, bubbles, and buttercup",
"bubbles, blossom",
"many monkeys, chimpanzees, gorillas, orangutans, and even an uakari",
"crowd, professor, ms. bellum",
"governor, yokel, lummox",
"octi/'him'",
"'him'/ms. keane",
"blossom & buttercup",
"blossom & bubbles",
"bubbles & blossom",
"blossom & professor utonium",
"bubbles & buttercup",
"buttercup & bubbles",
"spikey haired boy & eyes/blonde hair covered boy",
"donny & chelsea"
]

blossom_values = [ 
    "blossom #2",
    "blosoom",
    "all blossoms",
    "blossom  and blossom",
    "the blossoms",
    "siren blossom",
    "teen blossom",
    "blossom as she fights the bunnies",
    "knockoff blossom",
    "survivor reject blossom",
    "angel blossom",
    "robot blossom",
    "robo blossom",
    "lice blossom",
    "french renaissance blossom",
    "old blossom",
    "blossom, in the shop",
    "blossom, to bubbles",
    "blossom's thoughts",
    "blossom ",
    "both blossom"
]

bubbles_values = [
    "bubbles",
    "bubbes",
    "dark bubbles",
    "teen bubbles",
    "knockoff bubbles",
    "robo bubbles",
    "french renaissance bubbles",
    "bubbles and doubles",
    "old bubbles",
    "bubblecup",
    "bubbles' voice",
    "bubbles #2"
]

buttercup_values = [
    "buttercup double",
    "buttecup",
    "real buttercup",
    "buttercup on video",
    "teen buttercup",
    "knockoff buttercup",
    "reject buttercup",
    "angel buttercup",
    "robo buttercup",
    "flashback buttercup",
    "old buttercup",
    "buttercup, to herself",
    "buttercup, disguised",
    "buttercup, as she accidentaly reveals herself",
    "buttercup, scared",
    "and buttercup",
    "buttercup  on the left",
    "buttercup's thoughts",
    "buttercup #2"
]

mojo_values = [
    "mojo",
    "jojo",
    "'mojo'",
    "'mojo' ",
    "mojo'",
    "mojo's thoughts"
]

professor_values = [
    "professor",
    "dream professor",
    "memory professor",
    "professor utonium",
    "kid professor",
    "young prof",
    "student prof",
    "the professor",
    "flashback professor",
]

mayor_values = [
    "mayor",
    "the mayor",
    "younger mayor",
    "mayor mayer",
    "mayor of new townsville",
    "lice mayor",
    "citiesville mayor",
]

him_values = [
    "him",
    "'him'",
    "memory him",
    "black spored him"
]

narrator_values = [
    "narrator",
    "male narrator",
    "man voiceover",
]

princess_values = [
    "princess morebucks",
    "princess",
    "princess morbucks",
    "morbucks",
    "morbucks' heart"
]

fuzzy_values = [
    "fuzzy",
    "fuzzy lumpkins"
]

brick_values = [
    "brick",
    "teen brick",
    "brickowski"
]

woman_values = [
    "old woman",
    "woman",
    "muscle woman",
    "woman in the suit",
    "policewoman",
    "woman 2",
    "woman 3",
    "brown skinned woman",
    "woman 1",
    "young woman",
    "adult teenage woman",
    "woman 4",
    "woman 5",
    "woman 6",
    "woman 7",
    "iss woman",
    "cowgirl woman",
    "backgrounder woman",
    "fat woman",
    "fainting woman",
    "nature woman",
    "jewel owner woman",
    "jamaican woman",
    "curler woman",
    "woman 8",
    "woman 9",
    "repairwoman",
    "morbid woman",
    "woman #2",
    "woman #3",
    "woman #1",
    "woman's voice",
    "lunch lady",
    "and pink haired lady",
    "rocket a.i. lady",
    "cleaning lady",
    "video lady",
    "lady" #1,
    "old lady",
    "banklady",
    "news lady",
    "elderly lady",
    "anchor lady",
    "lady",
    "baby lady",
    "random lady",
]

man_values = [
    "man",
    "policeman",
    "old man",
    "swan boat man",
    "barber man",
    "man 1",
    "man 2",
    "man 3",
    "man in court",
    "man in a blue long sleeved robe",
    "old man 1",
    "old man 2",
    "old man 3",
    "man with glasses",
    "policeman 1",
    "policeman 2",
    "security man 1",
    "security man 2",
    "postman",
    "renaissance man",
    "delivery man",
    "man with a green hat",
    "man wit a green hat",
    "man in the space suit",
    "running man",
    "pointing man",
    "manager",
    "french man with a french hat",
    "brown haired man with glasses",
    "man 4",
    "man 5",
    "hotdog man",
    "man 6",
    "man 7",
    "man 8",
    "man 9",
    "man 10",
    "man 11",
    "abs man",
    "commercial man",
    "prison man",
    "salesman",
    "henchman",
    "mailman",
    "backgrounder man",
    "bank manager",
    "pizza man",
    "rainbow man",
    "bagman",
    "gunman",
    "short man",
    "newsman",
    "jewel owner man",
    "smooth man",
    "anchorman",
    "businessman",
    "french renaissance man",
    "fireman 1",
    "man 12",
    "man 13",
    "man 14",
    "fireman 2",
    "man being stung by bees",
    "foam finger man",
    "man in overalls",
    "man in trough",
]

bliss_values = [
    "memory bliss",
    "bliss",
    "memory teenage bliss",
    "young memory bliss"
]

monster_values = [
    "monster",
    "pink tentacled monster",
    "green monster",
    "bass monster",
    "female fish monster",
    "red lizard monster",
    "lawyer monster",
    "monsters",
    "fancy monster",
    "purple octopus",
    "dragon",
    "pincered monster",
    "necktie wearing monster",
    "oneeyed monster",
    "singledout monster",
]

boy_values = [
    "boy",
    "huge boy",
    "pimple faced boy",
    "boy 1",
    "boy 2",
    "boy 3",
    "boy 4",
    "nerd boy",
    "bigger boy",
    "smaller boy",
    "boy with brown spiky hair",
    "man boy",
    "manboy",
    "spikey haired boy",
    "freckle faced boy",
    "boy 5",
    "boy 6",
    "boy 7",
    "boy nerd",
    "green haired boy with rapper glasses",
    "french renaissance orange haired boy",
    "redhead boy",
    "'injured' boy",
    "brownskinned boy",
    "random boy",
    "another random boy",
    "two random boys",
    "random boy ",
    "and two boys",
    "spikey haired boy & eyes/blonde hair covered boy",
    "blonde hair covered boy",
    "boys 2",
    "boy #1",
]

ppg_values = [
    "all 3 girls",
    "all three girls",
    "girls",
    "the girls",
    "old girls",
    "the 3 girls",
    "all girls",
    "powerpuff girls z",
    "the 4 girls"
]

rrb_values = [
    "boys"
]

girl_values = [
    "girl",
    "cool girl",
    "girl with braces",
    "girl 1",
    "girl 2",
    "girl 3",
    "blonde teenage girl",
    "blonde girl",
    "girl with glasses",
    "girl without glasses",
    "2 girls",
    "cool girl 1",
    "cool girl 3",
    "cool girl 2",
    "all 3 cool girls",
    "girl with brown hair",
    "brown haired girl",
    "cowgirl",
    "friendly girl",
    "derbytante girls",
    "spirit girl",
    "mail girl",
    "flower girl",
    "girl student",
    "girl 4",
    "2 unnamed girls",
    "unnamed girl with a pink shirt",
    "teenage girl 1",
    "teenage girl 3",
    "teenage girls",
    "pimple girl",
    "older girl",
    "random girl",
    "random girl ",
    "girl #1",
    "girl #2",
    "girl #3",
]

miyako_values = [
    "memory miyako",
    "miyako",
    "flashback miyako",
]

sapna_values = [
    "sapna,"
    "dream sapna",
    "spider sapna"
]

kaoru_values = [
    "kaoru matsubara"
]

announcer_values = [
    "space towtruck and the infinite jumper cables movie announcer",
    "announcer",
    "announcer 1",
    "announcer 2",
    "another announcer",
    "radio announcer",
    "tv announcer",
    "space towtruck announcer",
    "male pageant announcer",
    "video game announcer",
]

keane_values = [
    "ms. keane",
    "young keane",
    "keanes",
    "keane 2",
    "keane 1",
    "miss keane",
    "french renaissance ms. keane"
]

bellum_values = [
    "ms. bellum",
    "young bellum",
    "miss bellum"
]

billy_values = [
    "billy",
    "big billy"
]

butch_values = [
    "butch",
    "teen butch"
]

boomer_values = [
    "boomer",
    "teen boomer"
]

talking_dog_values = [
    "talking dog",
    "talking  dog",
    "talking dog billboard"
]

junior_values = [
    "junior",
    "junior 4"
]

bossman_values = [
    "bossman",
    "bossman 3",
    "bossman 2"
]

dick_values = [
    "dick",
    "student dick"
]

marianne_values = [
    'mrs. smith'
]

harold_values = [
    'mr. smith'
]

sedusa_values = [
    'ima'
]

In [15]:
import re

# All double speakers in consistent format (comma-separated)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: re.sub(r'(/|\sand\s|,\s|\s&\s)', ',', x) if x in double_speakers else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: re.sub('/', ',', x) if x in double_speakers else x)

all_lines.loc[all_lines['speaker'] == 'ms. bellumold woman 1', 'speaker'] = 'ms. bellum,woman'
all_lines.loc[all_lines['speaker'] == 'ms. keaneold woman 2', 'speaker'] = 'ms. keane,woman'

double_speaker_items = all_lines[all_lines['speaker'].isin(double_speakers)]
double_speaker_items['speaker']

Series([], Name: speaker, dtype: object)

In [16]:
len(all_lines['speaker'])

20411

In [17]:
new_double_speakers = all_lines[all_lines['speaker'].str.contains(',', na=False)]
len(new_double_speakers['speaker'].value_counts())

100

In [18]:
all_lines['speaker'] = all_lines['speaker'].str.split(',')
all_lines = all_lines.explode('speaker')
all_lines = all_lines.reset_index(drop=True)
all_lines

Unnamed: 0,episode,dialog,speaker
0,15 Minutes of Fame,And over here is our newest acquisition to the...,angela
1,15 Minutes of Fame,"Bubbles, you can calm down now. You did it!",blossom
2,15 Minutes of Fame,"Yeah, dude! Check it out!",buttercup
3,15 Minutes of Fame,Meow,cat
4,15 Minutes of Fame,"That's right, cats can always get up a tree......",blossom
...,...,...,...
21049,"You Snooze, You Lose","But it is I who planned it! I did it, not them...",mojo
21050,"You Snooze, You Lose",Iam responsible for trying to destroy you!,mojo
21051,"You Snooze, You Lose",Okay. Thenyougo to jail.,blossom
21052,"You Snooze, You Lose",That's right!,mojo


In [19]:
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'blossom' if x in blossom_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'bubbles' if x in bubbles_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'buttercup' if x in buttercup_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'professor' if x in professor_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'mayor' if x in mayor_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'mojo jojo' if x in mojo_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'narrator' if x in narrator_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'him' if x in him_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'princess morbucks' if x in princess_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'fuzzy' if x in fuzzy_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'woman' if x in woman_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'man' if x in man_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'brick' if x in brick_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'bliss' if x in bliss_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'monster' if x in monster_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'boy' if x in boy_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'girl' if x in girl_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'powerpuff girls' if x in ppg_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'rowdyruff boys' if x in rrb_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'sapna' if x in sapna_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'miyako' if x in miyako_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'kaoru' if x in kaoru_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'announcer' if x in announcer_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'ms. keane' if x in keane_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'ms. bellum' if x in bellum_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'billy' if x in billy_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'butch' if x in butch_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'boomer' if x in boomer_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'talking dog' if x in talking_dog_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'junior' if x in junior_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'bossman' if x in bossman_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'marianne' if x in marianne_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'harold' if x in harold_values else x)
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: 'sedusa' if x in sedusa_values else x)

In [20]:
unique_speakers = all_lines['speaker'].unique()

for speaker in unique_speakers:
    if isinstance(speaker, str) and "femme" in speaker:
        print(speaker)

femme fatale


In [21]:
all_lines['speaker'].value_counts().head(60)

blossom              3997
buttercup            3211
bubbles              2890
professor            1682
narrator              966
mojo jojo             834
mayor                 726
powerpuff girls       436
ms. keane             370
princess morbucks     361
him                   246
man                   237
woman                 174
ms. bellum            173
brick                 165
bliss                 108
ace                   103
announcer              85
boy                    82
sedusa                 77
girl                   73
bossman                68
talking dog            67
monster                65
billy                  64
barry                  63
dick                   62
fuzzy                  61
bernie                 60
maylyn                 60
marianne               53
boomer                 50
butch                  50
sapna                  50
robin                  46
brian                  46
major glory            45
mitch                  42
donny       

In [22]:
# Identify episodes with only 1 line, probably we need to filter them out
all_lines['episode'].value_counts()

Criss Cross Crisis                            281
Superfriends                                  240
Power-Noia                                    227
Members Only                                  225
Film Flam                                     222
                                             ... 
Man Up 3: The Good, the Bad, and the Manly      1
The Buttercup Job                               1
Horn, Sweet Horn                                1
The Oct-Father                                  1
The Fog                                         1
Name: episode, Length: 236, dtype: int64

In [23]:
import numpy as np

# Create a villain column
conditions = [
    all_lines['speaker'] == 'mojo jojo',
    all_lines['speaker'] == 'him',
    all_lines['speaker'] == 'talking dog',
    all_lines['speaker'] == 'sedusa',
    all_lines['speaker'] == 'princess morbucks',
    all_lines['speaker'] == 'dick',
    all_lines['speaker'] == 'bernie',
    all_lines['speaker'] == 'sandman',
    all_lines['speaker'] == 'lenny',
    all_lines['speaker'] == 'femme fatale',
    all_lines['speaker'] == 'gnome',
    all_lines['speaker'] == 'fuzzy',
    all_lines['speaker'].isin(['brick', 'boomer', 'butch']), 
    all_lines['speaker'].isin(['ace', 'grubber', 'billy', 'snake']),
    all_lines['speaker'].isin(['bossman', 'junior', 'slim']),
    all_lines['speaker'].isin(['marianne', 'harold', 'julie', 'bud']),
]
values = [
    'mojo jojo', 
    'him',
    'talking dog',
    'sedusa',
    'princess morbucks',
    'dick',
    'bernie',
    'sandman',
    'lenny',
    'femme fatale',
    'gnome',
    'fuzzy',
    'gangreen gang',
    'rowdyruff boys',
    'amoeba boys',
    'smith family'
]

# Use numpy's where function to create the new column
all_lines['villain'] = np.select(conditions, values, default=None)
all_lines['villain'].head()

0    None
1    None
2    None
3    None
4    None
Name: villain, dtype: object

In [24]:
all_lines['villain'].value_counts()

mojo jojo            834
princess morbucks    361
gangreen gang        265
him                  246
rowdyruff boys       200
amoeba boys          116
smith family         105
sedusa                77
talking dog           67
dick                  62
fuzzy                 61
bernie                60
lenny                 25
femme fatale          23
sandman               14
gnome                  7
Name: villain, dtype: int64

In [25]:
seasons_1998_raw = pd.read_json('./ppg-seasons-1998.json', encoding='utf-8')
seasons_2016_raw = pd.read_json('./ppg-seasons-2016.json', encoding='utf-8')

seasons_2016_raw.head()

Unnamed: 0,year,season,episode_nr,title,description
0,2016,1,1,Escape from Monster Island,While Bubbles tries to decide whether to take ...
1,2016,1,2,Princess Buttercup,"Buttercup starts hanging out with a bad crowd,..."
2,2016,1,3,The Stayover,"Recovering from a candy hangover, Blossom and ..."
3,2016,1,4,Painbow,A rainbow makes everyone in Townsville unnatur...
4,2016,1,5,"Horn, Sweet Horn","At Bubbles' request, the Professor tries to ma..."


In [26]:
seasons_raw = pd.concat([seasons_1998_raw, seasons_2016_raw])
seasons_raw.head(10)

Unnamed: 0,year,season,episode_nr,title,description
0,1998,1,0,A Sticky Situation!,Begins with the same opening sequence as the s...
1,1998,1,1,"Monkey See, Doggie Do/Mommy Fearest","In ""Monkey See, Doggie Do"", Mojo Jojo swipes s..."
2,1998,1,2,Insect Inside/Powerpuff Bluff,"""Insect Inside"" The city of Townsville becomes..."
3,1998,1,3,Octi Evil/Geshundfight,"""Octi-Evil"" Him is out to break up the Powerpu..."
4,1998,1,4,Buttercrush/Fuzzy Logic,"""Buttercrush"" Buttercup develops a crush on Ac..."
5,1998,1,5,Boogie Frights/Abracadaver,"In ""Boogie Frights"", the Boogie Man comes up w..."
6,1998,1,6,Telephonies/Tough Love,"""Telephonies"" The Gangreen Gang gets hold of t..."
7,1998,1,7,Major Competition/Mr. Mojo's Rising,"""Major Competition"" The Powerpuff Girl's place..."
8,1998,1,8,Paste Makes Waste/Ice Sore,"In ""Paste Makes Waste"", one of the Pokey Oaks ..."
9,1998,1,9,Bubblevicious/The Bare Facts,Bubbles is tired of being the cute and wants t...


In [27]:
seasons_raw['title'].head(50)

0                                   A Sticky Situation!
1                   Monkey See, Doggie Do/Mommy Fearest
2                         Insect Inside/Powerpuff Bluff
3                                Octi Evil/Geshundfight
4                               Buttercrush/Fuzzy Logic
5                            Boogie Frights/Abracadaver
6                                Telephonies/Tough Love
7                   Major Competition/Mr. Mojo's Rising
8                            Paste Makes Waste/Ice Sore
9                          Bubblevicious/The Bare Facts
10                              Cat Man Do/Impeach Fuzz
11            Just Another Manic Mojo/Mime for a Change
12                                   The Rowdyruff Boys
13                                         Uh Oh Dynamo
14            Stuck Up, Up, and Away/Schoolhouse Rocked
15                           Collect Her/Supper Villain
16                     Birthday Bash/Too Pooped to Puff
17                       Beat Your Greens/Down '

In [28]:
seasons = seasons_raw.copy()

seasons['title'] = seasons['title'].str.lower()
seasons['title'] = seasons['title'].str.split('/')
seasons['title_1'] = seasons['title'].str[0]
seasons['title_2'] = seasons['title'].str[1]
all_lines['episode'] = all_lines['episode'].str.lower()
seasons.head(10)

Unnamed: 0,year,season,episode_nr,title,description,title_1,title_2
0,1998,1,0,[a sticky situation!],Begins with the same opening sequence as the s...,a sticky situation!,
1,1998,1,1,"[monkey see, doggie do, mommy fearest]","In ""Monkey See, Doggie Do"", Mojo Jojo swipes s...","monkey see, doggie do",mommy fearest
2,1998,1,2,"[insect inside, powerpuff bluff]","""Insect Inside"" The city of Townsville becomes...",insect inside,powerpuff bluff
3,1998,1,3,"[octi evil, geshundfight]","""Octi-Evil"" Him is out to break up the Powerpu...",octi evil,geshundfight
4,1998,1,4,"[buttercrush, fuzzy logic]","""Buttercrush"" Buttercup develops a crush on Ac...",buttercrush,fuzzy logic
5,1998,1,5,"[boogie frights, abracadaver]","In ""Boogie Frights"", the Boogie Man comes up w...",boogie frights,abracadaver
6,1998,1,6,"[telephonies, tough love]","""Telephonies"" The Gangreen Gang gets hold of t...",telephonies,tough love
7,1998,1,7,"[major competition, mr. mojo's rising]","""Major Competition"" The Powerpuff Girl's place...",major competition,mr. mojo's rising
8,1998,1,8,"[paste makes waste, ice sore]","In ""Paste Makes Waste"", one of the Pokey Oaks ...",paste makes waste,ice sore
9,1998,1,9,"[bubblevicious, the bare facts]",Bubbles is tired of being the cute and wants t...,bubblevicious,the bare facts


In [29]:
lines_with_seasons = pd.merge(all_lines, seasons, how='left', left_on='episode', right_on='title_1')
lines_with_seasons = pd.merge(lines_with_seasons, seasons, how='left', left_on='episode', right_on='title_2')

In [30]:
lines_with_seasons.head()

Unnamed: 0,episode,dialog,speaker,villain,year_x,season_x,episode_nr_x,title_x,description_x,title_1_x,title_2_x,year_y,season_y,episode_nr_y,title_y,description_y,title_1_y,title_2_y
0,15 minutes of fame,And over here is our newest acquisition to the...,angela,,2016.0,2.0,3.0,[15 minutes of fame],Green Wing: When volunteering at a retirement ...,15 minutes of fame,,,,,,,,
1,15 minutes of fame,"Bubbles, you can calm down now. You did it!",blossom,,2016.0,2.0,3.0,[15 minutes of fame],Green Wing: When volunteering at a retirement ...,15 minutes of fame,,,,,,,,
2,15 minutes of fame,"Yeah, dude! Check it out!",buttercup,,2016.0,2.0,3.0,[15 minutes of fame],Green Wing: When volunteering at a retirement ...,15 minutes of fame,,,,,,,,
3,15 minutes of fame,Meow,cat,,2016.0,2.0,3.0,[15 minutes of fame],Green Wing: When volunteering at a retirement ...,15 minutes of fame,,,,,,,,
4,15 minutes of fame,"That's right, cats can always get up a tree......",blossom,,2016.0,2.0,3.0,[15 minutes of fame],Green Wing: When volunteering at a retirement ...,15 minutes of fame,,,,,,,,


In [31]:
lines_with_seasons.groupby(['season_y'])['episode'].nunique()

season_y
1.0     7
2.0    13
3.0    11
4.0     1
5.0    10
6.0    12
Name: episode, dtype: int64

In [32]:
lines_with_seasons.groupby(['season_x'])['episode'].nunique()

season_x
1.0    48
2.0    46
3.0    48
4.0    12
5.0    12
6.0    15
Name: episode, dtype: int64

In [33]:
lines_with_seasons.columns

Index(['episode', 'dialog', 'speaker', 'villain', 'year_x', 'season_x',
       'episode_nr_x', 'title_x', 'description_x', 'title_1_x', 'title_2_x',
       'year_y', 'season_y', 'episode_nr_y', 'title_y', 'description_y',
       'title_1_y', 'title_2_y'],
      dtype='object')

In [34]:
lines_with_seasons['season'] = np.where(lines_with_seasons['season_x'].notna(), lines_with_seasons['season_x'], lines_with_seasons['season_y'])
lines_with_seasons['episode_nr'] = np.where(lines_with_seasons['episode_nr_x'].notna(), lines_with_seasons['episode_nr_x'], lines_with_seasons['episode_nr_y'])
lines_with_seasons['description'] = np.where(lines_with_seasons['description_x'].notna(), lines_with_seasons['description_x'], lines_with_seasons['description_y'])
lines_with_seasons['title_1'] = np.where(lines_with_seasons['title_1_x'].notna(), lines_with_seasons['title_1_x'], lines_with_seasons['title_1_y'])
lines_with_seasons['title_2'] = np.where(lines_with_seasons['title_2_x'].notna(), lines_with_seasons['title_2_x'], lines_with_seasons['title_2_y'])
lines_with_seasons['year'] = np.where(lines_with_seasons['year_x'].notna(), lines_with_seasons['year_x'], lines_with_seasons['year_y'])
lines_with_seasons['season'].value_counts()

2.0    4877
3.0    4525
1.0    4370
4.0    2471
5.0    2407
6.0    2404
Name: season, dtype: int64

In [35]:
lines_with_seasons['season'].isna().sum()

0

In [36]:
lines_with_seasons.drop(columns=[
    'season_x', 'episode_nr_x',
       'title_x', 'description_x', 'title_1_x', 'title_2_x', 'season_y',
       'episode_nr_y', 'title_y', 'description_y', 'title_1_y', 'title_2_y', 'year_x', 'year_y' 
], inplace=True)

In [37]:
lines_with_seasons['episode_nr'] = np.where(lines_with_seasons['title_2'] == lines_with_seasons['episode'], lines_with_seasons['episode_nr'] + 0.2, lines_with_seasons['episode_nr'])
lines_with_seasons['episode_nr'] = np.where((lines_with_seasons['title_1'] == lines_with_seasons['episode']) & (lines_with_seasons['title_2'].notna()), lines_with_seasons['episode_nr'] + 0.1, lines_with_seasons['episode_nr'])

In [38]:
lines_with_seasons[lines_with_seasons['season'].isna()]['episode'].unique()

array([], dtype=object)

In [39]:
# NB They are all 2016, so for now I'm not too worried
lines_with_seasons[lines_with_seasons['dialog'].isna()]

Unnamed: 0,episode,dialog,speaker,villain,season,episode_nr,description,title_1,title_2,year
2564,"bye bye, bellum",,,,1.0,7.0,The girls are overworked helping the Mayor whe...,"bye bye, bellum",,2016.0
2947,cheep thrills,,,,1.0,24.0,Cheep Thrills: After finding a baby monster in...,cheep thrills,,2016.0
6975,"horn, sweet horn",,,,1.0,5.0,"At Bubbles' request, the Professor tries to ma...","horn, sweet horn",,2016.0
11383,night mayor,,,,6.0,6.2,"""Little Miss Interprets"" When the Girls hear t...",little miss interprets,night mayor,1998.0
12096,once upon a townsville,,,,1.0,17.0,"Chased by a dragon, Princess Bluebell accident...",once upon a townsville,,2016.0
15351,sideline dad,,,,3.0,36.0,"When the Girls start playing soccer, the Profe...",sideline dad,,2016.0
15352,silent treatment,,,,5.0,14.1,The girls are pulled into a silent movie by an...,silent treatment,sweet 'n sour,1998.0
15764,small world: heart to heartstone,,,,3.0,16.0,The Powerpuff Girls are sent to the arctic on ...,small world: heart to heartstone,,2016.0
16597,strong-armed,,,,1.0,9.0,Bubbles wears a robotic cast for her broken ar...,strong-armed,,2016.0
17753,take your kids to dooms day,,,,2.0,12.0,The Girls spend the day with the Professor at ...,take your kids to dooms day,,2016.0


In [40]:
lines_with_seasons.dropna(subset=['dialog'], inplace=True)

In [41]:
lines_with_seasons[lines_with_seasons['dialog'].isna()]

Unnamed: 0,episode,dialog,speaker,villain,season,episode_nr,description,title_1,title_2,year


In [42]:
lines_with_seasons.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21039 entries, 0 to 21053
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   episode      21039 non-null  object 
 1   dialog       21039 non-null  object 
 2   speaker      20877 non-null  object 
 3   villain      2523 non-null   object 
 4   season       21039 non-null  float64
 5   episode_nr   21039 non-null  float64
 6   description  21039 non-null  object 
 7   title_1      21039 non-null  object 
 8   title_2      11008 non-null  object 
 9   year         21039 non-null  float64
dtypes: float64(3), object(7)
memory usage: 1.8+ MB


In [43]:
lines_with_seasons.head()

Unnamed: 0,episode,dialog,speaker,villain,season,episode_nr,description,title_1,title_2,year
0,15 minutes of fame,And over here is our newest acquisition to the...,angela,,2.0,3.0,Green Wing: When volunteering at a retirement ...,15 minutes of fame,,2016.0
1,15 minutes of fame,"Bubbles, you can calm down now. You did it!",blossom,,2.0,3.0,Green Wing: When volunteering at a retirement ...,15 minutes of fame,,2016.0
2,15 minutes of fame,"Yeah, dude! Check it out!",buttercup,,2.0,3.0,Green Wing: When volunteering at a retirement ...,15 minutes of fame,,2016.0
3,15 minutes of fame,Meow,cat,,2.0,3.0,Green Wing: When volunteering at a retirement ...,15 minutes of fame,,2016.0
4,15 minutes of fame,"That's right, cats can always get up a tree......",blossom,,2.0,3.0,Green Wing: When volunteering at a retirement ...,15 minutes of fame,,2016.0


In [44]:
# Fill all lines without a speaker with the speaker before. CHECK!!
no_speakers = lines_with_seasons[lines_with_seasons['speaker'].isna()]
no_speakers.head()

Unnamed: 0,episode,dialog,speaker,villain,season,episode_nr,description,title_1,title_2,year
3514,cop out,"Back to you there, Kevin.",,,3.0,5.2,"""Gettin' Twiggy With It"" Mitch is told to take...",gettin' twiggy with it,cop out,1998.0
4769,dream scheme,It is the setting of this little story.,,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4771,dream scheme,Where people are dressed in PJ and robe.,,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4772,dream scheme,"They kick off their slippers and climb into beds,",,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4773,dream scheme,Fluff up their pillows and lay down their heads.,,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0


In [45]:
no_speakers.count()

episode        162
dialog         162
speaker          0
villain          0
season         162
episode_nr     162
description    162
title_1        162
title_2        125
year           162
dtype: int64

In [46]:
lines_with_seasons['speaker'].fillna(method='ffill', inplace=True)

In [47]:
lines_with_seasons.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21039 entries, 0 to 21053
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   episode      21039 non-null  object 
 1   dialog       21039 non-null  object 
 2   speaker      21039 non-null  object 
 3   villain      2523 non-null   object 
 4   season       21039 non-null  float64
 5   episode_nr   21039 non-null  float64
 6   description  21039 non-null  object 
 7   title_1      21039 non-null  object 
 8   title_2      11008 non-null  object 
 9   year         21039 non-null  float64
dtypes: float64(3), object(7)
memory usage: 1.8+ MB


In [48]:
previous_no_speakers = lines_with_seasons[lines_with_seasons.index.isin(no_speakers.index)]
previous_no_speakers.count()

episode        162
dialog         162
speaker        162
villain          0
season         162
episode_nr     162
description    162
title_1        162
title_2        125
year           162
dtype: int64

In [49]:
previous_no_speakers.iloc[:10, :]

Unnamed: 0,episode,dialog,speaker,villain,season,episode_nr,description,title_1,title_2,year
3514,cop out,"Back to you there, Kevin.",female reporter,,3.0,5.2,"""Gettin' Twiggy With It"" Mitch is told to take...",gettin' twiggy with it,cop out,1998.0
4769,dream scheme,It is the setting of this little story.,narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4771,dream scheme,Where people are dressed in PJ and robe.,narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4772,dream scheme,"They kick off their slippers and climb into beds,",narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4773,dream scheme,Fluff up their pillows and lay down their heads.,narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4775,dream scheme,Enter the Sandman to do them up right.,narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4777,dream scheme,He pulls out a magical brown grain of sand.,narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4779,dream scheme,"And out goes the victim, asleep for the night.",narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4781,dream scheme,"The Sandman does his job, singing his ditty.",narrator,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0
4783,dream scheme,Putting you to sleep is what I'm about.,sandman,,2.0,5.1,"""Dream Scheme"" The Sandman would like a little...",dream scheme,"you snooze, you lose",1998.0


In [50]:
lines_with_seasons['word_count_for_line'] = lines_with_seasons['dialog'].str.split(' ')
lines_with_seasons['word_count_for_line'] = lines_with_seasons['word_count_for_line'].apply(len)
lines_with_seasons[['dialog', 'word_count_for_line']].head(15)

Unnamed: 0,dialog,word_count_for_line
0,And over here is our newest acquisition to the...,49
1,"Bubbles, you can calm down now. You did it!",9
2,"Yeah, dude! Check it out!",5
3,Meow,1
4,"That's right, cats can always get up a tree......",15
5,"Meow, Meow.",2
6,The city of Townsville. This is where the Powe...,11
7,"Oh, you might know them as those three cute li...",121
8,"I'm Brian, and this is my documentary. And you...",31
9,Sugar... spice... and everything nice.,5


In [51]:
lines_1998 = lines_with_seasons[lines_with_seasons['year'] == 1998]

summed_speakers_1998 = lines_1998.groupby('speaker')['word_count_for_line'].sum().reset_index()
summed_speakers_1998 = summed_speakers_1998.sort_values('word_count_for_line', ascending=False).reset_index(drop=True)
summed_speakers_1998.head(50)

Unnamed: 0,speaker,word_count_for_line
0,blossom,18626
1,professor,13171
2,buttercup,11452
3,narrator,11423
4,bubbles,11101
5,mojo jojo,9180
6,mayor,6673
7,ms. keane,2992
8,princess morbucks,2861
9,him,2334


In [52]:
lines_2016 = lines_with_seasons[lines_with_seasons['year'] == 2016]

summed_speakers_2016 = lines_2016.groupby('speaker')['word_count_for_line'].sum().reset_index()
summed_speakers_2016.sort_values('word_count_for_line', ascending=False).head(50).reset_index(drop=True)

Unnamed: 0,speaker,word_count_for_line
0,blossom,12830
1,buttercup,12047
2,bubbles,10564
3,professor,4322
4,princess morbucks,1865
5,bliss,1082
6,mayor,832
7,mojo jojo,795
8,man,753
9,sapna,667


In [53]:
summed_villains = lines_1998.groupby('villain')['word_count_for_line'].sum().reset_index()
summed_villains = summed_villains.sort_values('word_count_for_line', ascending=False).reset_index(drop=True)
summed_villains.head(50)

Unnamed: 0,villain,word_count_for_line
0,mojo jojo,9180
1,princess morbucks,2861
2,him,2334
3,gangreen gang,1833
4,rowdyruff boys,1348
5,smith family,970
6,bernie,915
7,sedusa,817
8,dick,802
9,talking dog,793


In [54]:
lines_with_seasons['villain'].value_counts().head(50)

mojo jojo            834
princess morbucks    361
gangreen gang        265
him                  246
rowdyruff boys       200
amoeba boys          116
smith family         105
sedusa                77
talking dog           67
dick                  62
fuzzy                 61
bernie                60
lenny                 25
femme fatale          23
sandman               14
gnome                  7
Name: villain, dtype: int64

In [55]:
# Good: First 8 good guys (the lower range tend either to be one-epsiode characters, or an (unrelated) aggregate)
good_aggregate_1998 = summed_speakers_1998.loc[:10, :]
good_aggregate_1998

Unnamed: 0,speaker,word_count_for_line
0,blossom,18626
1,professor,13171
2,buttercup,11452
3,narrator,11423
4,bubbles,11101
5,mojo jojo,9180
6,mayor,6673
7,ms. keane,2992
8,princess morbucks,2861
9,him,2334


In [56]:
# All villains that are in top 50 most speaking people that are not an (unrelated) aggregate
villains_aggregate_1998 = summed_villains

In [57]:
good_aggregate_1998.to_json('./good_all.json', orient='records')
villains_aggregate_1998.to_json('./villains_all.json', orient='records')

In [58]:
good_aggregate_seasons_1998 = lines_1998.groupby(['speaker', 'season'])['word_count_for_line'].sum().reset_index()
good_aggregate_seasons_1998 = good_aggregate_seasons_1998.sort_values(['season', 'word_count_for_line'], ascending=[True, False]).reset_index(drop=True)
good_aggregate_seasons_1998 = good_aggregate_seasons_1998[
    (good_aggregate_seasons_1998['speaker'].isin([
        "blossom",
        "professor",
        "buttercup",
        "narrator",
        "bubbles",
        "mayor",
        "ms. keane",
        "ms. bellum"
    ])) | (good_aggregate_seasons_1998['word_count_for_line'] > 250)
]
good_aggregate_seasons_1998.head(50)

Unnamed: 0,speaker,season,word_count_for_line
0,blossom,1.0,2112
1,mojo jojo,1.0,2088
2,narrator,1.0,1778
3,bubbles,1.0,1772
4,buttercup,1.0,1365
5,mayor,1.0,1292
6,professor,1.0,1271
7,ms. bellum,1.0,336
8,sedusa,1.0,325
9,253,1.0,251


In [59]:
villains_aggregate_seasons_1998 = lines_1998.groupby(['villain', 'season'])['word_count_for_line'].sum().reset_index()
villains_aggregate_seasons_1998 = villains_aggregate_seasons_1998.sort_values(['season', 'word_count_for_line'], ascending=[True, False]).reset_index(drop=True)
villains_aggregate_seasons_1998.head(50)

Unnamed: 0,villain,season,word_count_for_line
0,mojo jojo,1.0,2088
1,sedusa,1.0,325
2,him,1.0,181
3,gangreen gang,1.0,134
4,fuzzy,1.0,132
5,talking dog,1.0,50
6,smith family,1.0,6
7,mojo jojo,2.0,1524
8,smith family,2.0,944
9,princess morbucks,2.0,905


In [60]:
good_aggregate_seasons_1998.to_json('./good_seasons.json', orient='records')
villains_aggregate_seasons_1998.to_json('./villains_seasons.json', orient='records')

In [61]:
good_aggregate_episodes_1998 = lines_1998.groupby(['speaker', 'season', 'episode_nr', 'episode'])['word_count_for_line'].sum().reset_index()
good_aggregate_episodes_1998 = good_aggregate_episodes_1998.sort_values(['season', 'episode_nr', 'episode', 'word_count_for_line'], ascending=[True, True, True, False]).reset_index(drop=True)
good_aggregate_episodes_1998 = good_aggregate_episodes_1998[
    (good_aggregate_episodes_1998['speaker'].isin([
        "blossom",
        "professor",
        "buttercup",
        "narrator",
        "bubbles",
        "mayor",
        "ms. keane",
        "ms. bellum"
    ])) | (good_aggregate_episodes_1998['word_count_for_line'] > 100)
]
good_aggregate_episodes_1998.head(50)

Unnamed: 0,speaker,season,episode_nr,episode,word_count_for_line
0,mojo jojo,1.0,1.1,"monkey see, doggie do",292
1,narrator,1.0,1.1,"monkey see, doggie do",169
2,blossom,1.0,1.1,"monkey see, doggie do",139
3,professor,1.0,1.1,"monkey see, doggie do",71
9,mayor,1.0,1.1,"monkey see, doggie do",11
11,bubbles,1.0,1.1,"monkey see, doggie do",5
14,sedusa,1.0,1.2,mommy fearest,325
15,professor,1.0,1.2,mommy fearest,193
16,narrator,1.0,1.2,mommy fearest,160
17,blossom,1.0,1.2,mommy fearest,120


In [62]:
villains_aggregate_episodes_1998 = lines_1998.groupby(['villain', 'season', 'episode_nr', 'episode'])['word_count_for_line'].sum().reset_index()
villains_aggregate_episodes_1998 = villains_aggregate_episodes_1998.sort_values(['season', 'episode_nr', 'episode', 'word_count_for_line'], ascending=[True, True, True, False]).reset_index(drop=True)
villains_aggregate_episodes_1998.head(50)

Unnamed: 0,villain,season,episode_nr,episode,word_count_for_line
0,mojo jojo,1.0,1.1,"monkey see, doggie do",292
1,sedusa,1.0,1.2,mommy fearest,325
2,him,1.0,3.1,octi evil,181
3,talking dog,1.0,7.1,major competition,7
4,mojo jojo,1.0,7.2,mr. mojo's rising,431
5,smith family,1.0,8.1,paste makes waste,6
6,mojo jojo,1.0,9.1,bubblevicious,113
7,talking dog,1.0,9.1,bubblevicious,24
8,mojo jojo,1.0,9.2,the bare facts,103
9,fuzzy,1.0,10.2,impeach fuzz,132


In [63]:
good_aggregate_episodes_1998.to_json('./good_episodes.json', orient='records')
villains_aggregate_episodes_1998.to_json('./villains_episodes.json', orient='records')

In [64]:
all_episodes = lines_1998.groupby(['season', 'episode_nr', 'episode', 'title_1', 'title_2', 'description'])['word_count_for_line'].sum().reset_index()
all_episodes = all_episodes.sort_values(['season', 'episode_nr', 'episode'], ascending=[True, True, True]).reset_index(drop=True)
all_episodes.head(50)

Unnamed: 0,season,episode_nr,episode,title_1,title_2,description,word_count_for_line
0,1.0,1.1,"monkey see, doggie do","monkey see, doggie do",mommy fearest,"In ""Monkey See, Doggie Do"", Mojo Jojo swipes s...",810
1,1.0,1.2,mommy fearest,"monkey see, doggie do",mommy fearest,"In ""Monkey See, Doggie Do"", Mojo Jojo swipes s...",1079
2,1.0,2.2,powerpuff bluff,insect inside,powerpuff bluff,"""Insect Inside"" The city of Townsville becomes...",1271
3,1.0,3.1,octi evil,octi evil,geshundfight,"""Octi-Evil"" Him is out to break up the Powerpu...",869
4,1.0,7.1,major competition,major competition,mr. mojo's rising,"""Major Competition"" The Powerpuff Girl's place...",940
5,1.0,7.2,mr. mojo's rising,major competition,mr. mojo's rising,"""Major Competition"" The Powerpuff Girl's place...",981
6,1.0,8.1,paste makes waste,paste makes waste,ice sore,"In ""Paste Makes Waste"", one of the Pokey Oaks ...",764
7,1.0,8.2,ice sore,paste makes waste,ice sore,"In ""Paste Makes Waste"", one of the Pokey Oaks ...",962
8,1.0,9.1,bubblevicious,bubblevicious,the bare facts,Bubbles is tired of being the cute and wants t...,854
9,1.0,9.2,the bare facts,bubblevicious,the bare facts,Bubbles is tired of being the cute and wants t...,1634


In [65]:
all_episodes.to_json('./all_episodes.json', orient='records')