In [1]:
import pandas as pd

# Load the two transcript JSON files into separate frames.
transcript_paths = ('./ppg-episode-transcripts-1.json', './ppg-episode-transcripts-2.json')
raw_data_1, raw_data_2 = (pd.read_json(path, encoding='utf-8') for path in transcript_paths)

In [2]:
# Stack both transcript frames on top of each other (row labels are kept as they are).
raw_data = pd.concat(objs=[raw_data_1, raw_data_2], axis=0)

raw_data.head(5)

Unnamed: 0,episode,dialog
0,'Twas the Fight Before Christmas,"[Narrator: 'Twas the city of Townsville, and a..."
1,15 Minutes of Fame,[Angela: And over here is our newest acquisiti...
2,A Comedy of Terrors,[Kaoru Matsubara: Check him out.]
3,A Documentary,[Brian Larsen: The city of Townsville. This is...
4,A Made Up Story,[Narrator: The city of Townsville! Where some...


In [3]:
# Summary statistics (count / unique / most frequent value) for the combined frame.
raw_data.describe()

Unnamed: 0,episode,dialog
count,258,257
unique,258,247
top,'Twas the Fight Before Christmas,[]
freq,1,11


In [4]:
# Titles (lower-cased) that are excluded from the analysis:
# - Powerpuff Girls Z episodes
# - Animated / TV shorts
# - Pilots
# - Movies
# - Crossovers
# NOTE: every entry needs its own trailing comma; two adjacent string literals
# are silently concatenated into a single (never matching) title.
episodes_to_drop = [
    'a comedy of terrors',  # powerpuff girls Z
    'enter the entourage',  # powerpuff girls Z
    "it's all because of him",  # powerpuff girls Z
    'powerpuff girls to the rescue',  # powerpuff girls Z

    'air buttercup',  # Animated short, 2016
    "bubbles' beauty blog",  # Animated short, 2016
    'ping pong z',  # Animated short, 2016
    'run, blossom, run!',  # Animated short, 2016
    "i'll be bake",  # Animated short, 2017
    'mojo builds a shelf',  # Animated short, 2017
    'the powerpuff girls rule!!!',  # TV short, 2008

    'crime 101',  # Pilot, 1995/1996
    'meat fuzzy lumkins',  # Pilot, 1995

    'whoopass stew',  # Movie, their first-ever appearance, 1992
    'the powerpuff girls movie',  # Movie, 2002
    "'twas the fight before christmas",  # Movie, 2003
    'powerpuff girls special: dance pantsed',  # Movie (weird style), 2014

    'ttg v ppg',  # Teen titans, not ppg
]

# Keep only the episodes whose lower-cased title is not on the exclusion list.
is_excluded = raw_data['episode'].str.lower().isin(episodes_to_drop)
raw_data = raw_data[~is_excluded]

In [5]:
# One row per dialog line: unpack each episode's list of lines and renumber the rows.
all_lines = raw_data.explode(column='dialog').reset_index(drop=True)
all_lines.head()

Unnamed: 0,episode,dialog
0,15 Minutes of Fame,Angela: And over here is our newest acquisitio...
1,15 Minutes of Fame,"Blossom: Bubbles, you can calm down now. You d..."
2,15 Minutes of Fame,"Buttercup: Yeah, dude! Check it out!"
3,15 Minutes of Fame,Cat: Meow
4,15 Minutes of Fame,"Blossom: That's right, cats can always get up ..."


In [6]:
# Episodes that have a row with a missing dialog entry.
all_lines.loc[all_lines['dialog'].isna(), 'episode']

2494                      Bye Bye, Bellum
11790              Once Upon a Townsville
14988                        Sideline Dad
15477    Small World: Heart to Heartstone
16287                        Strong-Armed
17371         Take Your Kids to Dooms Day
17638                       The Big Sleep
18091                             The Fog
18784                     The Squashening
18785                        The Stayover
19632           Total Eclipse of the Kart
Name: episode, dtype: object

In [7]:
# Everything before the first colon is taken as the speaker label, provided it
# consists only of word characters, whitespace or one of  . , * # $ % & ' /
# (the previous `#-'` inside the class was a character *range* covering exactly
# # $ % & ' -- it is spelled out here so the accepted characters are explicit).
# A raw string is used so that \w and \s are not treated as (invalid) string escapes.
speakers = all_lines['dialog'].str.extract(r"^([.,*#$%&'/\w\s]*):")

# Normalise the label: trim surrounding whitespace and lower-case it.
# `speakers` has a single column (label 0) holding the captured group.
all_lines['speaker'] = speakers[0].str.strip().str.lower()
speakers.nunique()

AttributeError: 'StringMethods' object has no attribute 'trim'

In [None]:
# Eyeball the first 50 raw dialog lines (speaker prefix still included).
all_lines['dialog'].head(50)

In [None]:
# Stand-alone Series of the dialog column with a fresh 0..n-1 index.
dialog_lines = all_lines['dialog'].reset_index(drop=True)
dialog_lines

In [None]:
def process_dialog(dialog):
    """Return the spoken text of a transcript line without its speaker prefix.

    The prefix is everything up to and including the first ``": "``. Only that
    first separator is removed, so later occurrences of ``": "`` inside the
    spoken text are kept intact.

    Parameters
    ----------
    dialog : str or any
        A raw transcript line such as ``"Blossom: Let's go!"``.

    Returns
    -------
    str or None
        The text after the first ``": "``; the unchanged line when it contains
        no ``": "``; ``None`` when ``dialog`` is not a string (e.g. NaN).
    """
    if not isinstance(dialog, str):
        return None

    _speaker, separator, spoken = dialog.partition(': ')
    # No separator found -> there is no speaker prefix to strip.
    return spoken if separator else dialog
    
# Strip the speaker prefix from every line, element by element.
dialog_without_speaker = dialog_lines.map(process_dialog)

In [None]:
# Check the first 50 lines after removing the speaker prefix.
dialog_without_speaker.head(50)

In [None]:
# Replace the raw dialog column with the speaker-less text.
# Both objects carry a 0..n-1 index (each was reset with drop=True), so rows align.
all_lines['dialog'] = dialog_without_speaker
all_lines['dialog']

In [None]:
# Print every distinct speaker label that contains a comma.
unique_speakers = all_lines['speaker'].unique()

for speaker in (label for label in unique_speakers if isinstance(label, str) and ',' in label):
    print(speaker)

In [None]:
# Speaker labels that name more than one speaker. Labels in this list are later
# rewritten to a comma-separated form and split into one row per speaker.
double_speakers = [
    "blossom and buttercup",
    "blossom and bubbles",
    "the professor and blossom",
    "both blossom and buttercup",
    "buttercup and blossom",
    "blossom and the professor",
    "bubbles and blossom",
    "bubbles and buttercup",
    "buttercup and bubbles",
    "professor and buttercup",
    "professor utonium and javier xavier",
    "man and woman",
    "girls and erica",
    "momoko and miyako",
    "kaoru and miyako",
    "ms. bellumold woman 1",
    "ms. keaneold woman 2",
    "mayor and ms. bellum",
    "blossom/ms. bellum",
    "ms. bellum/blossom",
    "buttercup/green batman",
    "bubbles/bubbs wonder",
    "buttercup/professor",
    "professor/buttercup",
    "blossom/ms. bellum",
    "bubbles/mayor",
    "mayor/bubbles",
    "ms. bellum/blossom",
    "girl 1/ace",
    "girl 2/grubber",
    "girl 3/snake",
    "old woman/mojo",
    "mojo/old woman",
    "blossom/dachshund",
    "bubbles/centipede",
    "buttercup/cobra",
    "professor/older girl",
    "mojo/fish",
    "professor/ugly baby",
    "professor/nurse",
    "professor/astronaut",
    "professor/pink creature",
    "professor/morbid woman",
    "bubbles/narrator",
    "narrator/bubbles",
    "blossom/bubbles",
    "octi/him",
    "buttercup/bubblecup",
    "bubbles/children",
    "old man/janitor",
    "eyes/blonde hair covered boy",
    "blossom/liberty belle",
    "bubbles/harmony bunny",
    "buttercup/mange",
    "blossom/buttercup",
    "bubbles, buttercup",
    "girls, professor",
    "blossom, buttercup",
    "blossom, bubbles and buttercup",
    "kids, blossom",
    "slim, junior",
    "mojo, princess, boys",
    "blossom, bubbles, buttercup and the professor",
    "blossom, bubbles, and buttercup",
    "professor, blossom, buttercup",
    "brick, butch",
    "blossom, bubbles",
    "teen blossom, teen buttercup",
    "girl 2, boy 1",
    "skinny, tiny",
    "blossom/ms. bellum, buttercup/professor",
    "mayor, ms. bellum",
    "mojo, 'him'",
    "bossman, bossman 2",
    "slim, slim 2",
    "junior, juniors 2 and 3",
    "bubbles, buttercup, random boy, random girl, another random boy, robin snyder, another random boy, two random boys, blossom, jared, and barry",
    "random girl , random boy , random boy , buttercup  on the left, jared , and robin , blossom , and two boys",
    "blossom, buttercup, jared, barry, and robin",
    "chad, man with glasses, and pink haired lady",
    "driver, robbers",
    "girls, duplicates",
    "kids, parents",
    "stan, sandra",
    "fields, anthropist, blind",
    "class, ms. keane",
    "pug faced paulie, scared",
    "momoko, miyako, and kaoru",
    "professor, ms. keane",
    "ace, billy, arturo",
    "fuzzy, princess, 'him'",
    "kim, jeff, mary",
    "joey, kim",
    "man 9, bass monster",
    "gang, fuzzy, 'him'",
    "bertha, beatrice",
    "lloyd, floyd",
    "ace, snake, lil' arturo, big billy",
    "002, 017",
    "ace, snake, arturo, billy",
    "ace, snake",
    "boys 2, 3, 4",
    "monty, slim",
    "bubbles, buttercup and the professor",
    "girls, robin",
    "barry, bubbles, and buttercup",
    "bubbles, blossom",
    "many monkeys, chimpanzees, gorillas, orangutans, and even an uakari",
    "crowd, professor, ms. bellum",
    "governor, yokel, lummox",
    "octi/'him'",
    "'him'/ms. keane",
    "blossom & buttercup",
    "blossom & bubbles",
    "bubbles & blossom",
    "blossom & professor utonium",
    "bubbles & buttercup",
    "buttercup & bubbles",
    "spikey haired boy & eyes/blonde hair covered boy",
    "donny & chelsea"
]

# Raw speaker labels that are folded into 'blossom' when speakers are normalised.
blossom_values = [ 
    "blossom #2",
    "blosoom",
    "all blossoms",
    "blossom  and blossom",
    "the blossoms",
    "siren blossom",
    "teen blossom",
    "blossom as she fights the bunnies",
    "knockoff blossom",
    "survivor reject blossom",
    "angel blossom",
    "robot blossom",
    "robo blossom",
    "lice blossom",
    "french renaissance blossom",
    "old blossom",
    "blossom, in the shop",
    "blossom, to bubbles",
    "blossom's thoughts",
    "blossom ",
    "both blossom"
]

# Raw speaker labels that are folded into 'bubbles'.
bubbles_values = [
    "bubbles",
    "bubbes",
    "dark bubbles",
    "teen bubbles",
    "knockoff bubbles",
    "robo bubbles",
    "french renaissance bubbles",
    "bubbles and doubles",
    "old bubbles",
    "bubblecup",
    "bubbles' voice",
    "bubbles #2"
]

# Raw speaker labels that are folded into 'buttercup'.
buttercup_values = [
    "buttercup double",
    "buttecup",
    "real buttercup",
    "buttercup on video",
    "teen buttercup",
    "knockoff buttercup",
    "reject buttercup",
    "angel buttercup",
    "robo buttercup",
    "flashback buttercup",
    "old buttercup",
    "buttercup, to herself",
    "buttercup, disguised",
    "buttercup, as she accidentaly reveals herself",
    "buttercup, scared",
    "and buttercup",
    "buttercup  on the left",
    "buttercup's thoughts",
    "buttercup #2"
]

# Raw speaker labels that are folded into 'mojo jojo'.
mojo_values = [
    "mojo",
    "jojo",
    "'mojo'",
    "'mojo' ",
    "mojo'",
    "mojo's thoughts"
]

# Raw speaker labels that are folded into 'professor'.
professor_values = [
    "professor",
    "dream professor",
    "memory professor",
    "professor utonium",
    "kid professor",
    "young prof",
    "student prof",
    "the professor",
    "flashback professor",
]

# Raw speaker labels that are folded into 'mayor'.
mayor_values = [
    "mayor",
    "the mayor",
    "younger mayor",
    "mayor mayer",
    "mayor of new townsville",
    "lice mayor",
    "citiesville mayor",
    "dream mayor"
]

# Raw speaker labels that are folded into 'him'.
him_values = [
    "him",
    "'him'",
    "memory him",
    "black spored him"
]

# Raw speaker labels that are folded into 'narrator'.
narrator_values = [
    "narrator",
    "male narrator",
    "man voiceover",
]

# Raw speaker labels that are folded into 'princess morbucks'.
princess_values = [
    "princess morebucks",
    "princess",
    "princess morbucks",
    "morbucks",
    "morbucks' heart"
]

# Raw speaker labels that are folded into 'fuzzy'.
fuzzy_values = [
    "fuzzy",
    "fuzzy lumpkins"
]

# Raw speaker labels that are folded into 'brick'.
brick_values = [
    "brick",
    "teen brick",
    "brickowski"
]

# Raw speaker labels that are folded into the generic speaker 'woman'.
# Every entry must be followed by a comma *outside* of any comment: adjacent
# string literals are concatenated, which previously merged "lady" and
# "old lady" into the never-matching label "ladyold lady".
woman_values = [
    "old woman",
    "woman",
    "muscle woman",
    "woman in the suit",
    "policewoman",
    "woman 2",
    "woman 3",
    "brown skinned woman",
    "woman 1",
    "young woman",
    "adult teenage woman",
    "woman 4",
    "woman 5",
    "woman 6",
    "woman 7",
    "iss woman",
    "cowgirl woman",
    "backgrounder woman",
    "fat woman",
    "fainting woman",
    "nature woman",
    "jewel owner woman",
    "jamaican woman",
    "curler woman",
    "woman 8",
    "woman 9",
    "repairwoman",
    "morbid woman",
    "woman #2",
    "woman #3",
    "woman #1",
    "woman's voice",
    "lunch lady",
    "and pink haired lady",
    "rocket a.i. lady",
    "cleaning lady",
    "video lady",
    # NOTE(review): the original line read `"lady" #1,`; presumably the label
    # "lady #1" was meant (cf. "woman #1", "girl #1") -- confirm against the data.
    "lady #1",
    "old lady",
    "banklady",
    "news lady",
    "elderly lady",
    "anchor lady",
    "lady",
    "baby lady",
    "random lady",
]

# Raw speaker labels that are folded into the generic speaker 'man'.
# NOTE(review): also contains job titles such as "manager" / "bank manager" --
# presumably intentional, confirm.
man_values = [
    "man",
    "policeman",
    "old man",
    "swan boat man",
    "barber man",
    "man 1",
    "man 2",
    "man 3",
    "man in court",
    "man in a blue long sleeved robe",
    "old man 1",
    "old man 2",
    "old man 3",
    "man with glasses",
    "policeman 1",
    "policeman 2",
    "security man 1",
    "security man 2",
    "postman",
    "renaissance man",
    "delivery man",
    "man with a green hat",
    "man wit a green hat",
    "man in the space suit",
    "running man",
    "pointing man",
    "manager",
    "french man with a french hat",
    "brown haired man with glasses",
    "man 4",
    "man 5",
    "hotdog man",
    "man 6",
    "man 7",
    "man 8",
    "man 9",
    "man 10",
    "man 11",
    "abs man",
    "commercial man",
    "prison man",
    "salesman",
    "henchman",
    "mailman",
    "backgrounder man",
    "bank manager",
    "pizza man",
    "rainbow man",
    "bagman",
    "gunman",
    "short man",
    "newsman",
    "jewel owner man",
    "smooth man",
    "anchorman",
    "businessman",
    "french renaissance man",
    "fireman 1",
    "man 12",
    "man 13",
    "man 14",
    "fireman 2",
    "man being stung by bees",
    "foam finger man",
    "man in overalls",
    "man in trough",
]

# Raw speaker labels that are folded into 'bliss'.
bliss_values = [
    "memory bliss",
    "bliss",
    "memory teenage bliss",
    "young memory bliss"
]

# Raw speaker labels that are folded into the generic speaker 'monster'.
monster_values = [
    "monster",
    "pink tentacled monster",
    "green monster",
    "bass monster",
    "female fish monster",
    "red lizard monster",
    "lawyer monster",
    "monsters",
    "fancy monster",
    "purple octopus",
    "dragon",
    "pincered monster",
    "necktie wearing monster",
    "oneeyed monster",
    "singledout monster",
]

# Raw speaker labels that are folded into the generic speaker 'boy'.
boy_values = [
    "boy",
    "huge boy",
    "pimple faced boy",
    "boy 1",
    "boy 2",
    "boy 3",
    "boy 4",
    "nerd boy",
    "bigger boy",
    "smaller boy",
    "boy with brown spiky hair",
    "man boy",
    "manboy",
    "spikey haired boy",
    "freckle faced boy",
    "boy 5",
    "boy 6",
    "boy 7",
    "boy nerd",
    "green haired boy with rapper glasses",
    "french renaissance orange haired boy",
    "redhead boy",
    "'injured' boy",
    "brownskinned boy",
    "random boy",
    "another random boy",
    "two random boys",
    "random boy ",
    "and two boys",
    "spikey haired boy & eyes/blonde hair covered boy",
    "blonde hair covered boy",
    "boys 2",
    "boy #1",
]

# Raw speaker labels that are folded into 'powerpuff girls' (the girls speaking as a group).
ppg_values = [
    "all 3 girls",
    "all three girls",
    "girls",
    "the girls",
    "old girls",
    "the 3 girls",
    "all girls",
    "powerpuff girls z",
    "the 4 girls"
]

# Raw speaker labels that are folded into 'rowdyruff boys'.
rrb_values = [
    "boys"
]

# Raw speaker labels that are folded into the generic speaker 'girl'.
girl_values = [
    "girl",
    "cool girl",
    "girl with braces",
    "girl 1",
    "girl 2",
    "girl 3",
    "blonde teenage girl",
    "blonde girl",
    "girl with glasses",
    "girl without glasses",
    "2 girls",
    "cool girl 1",
    "cool girl 3",
    "cool girl 2",
    "all 3 cool girls",
    "girl with brown hair",
    "brown haired girl",
    "cowgirl",
    "friendly girl",
    "derbytante girls",
    "spirit girl",
    "mail girl",
    "flower girl",
    "girl student",
    "girl 4",
    "2 unnamed girls",
    "unnamed girl with a pink shirt",
    "teenage girl 1",
    "teenage girl 3",
    "teenage girls",
    "pimple girl",
    "older girl",
    "random girl",
    "random girl ",
    "girl #1",
    "girl #2",
    "girl #3",
]

# Raw speaker labels that are folded into 'miyako'.
miyako_values = [
    "memory miyako",
    "miyako",
    "flashback miyako",
]

# Raw speaker labels that are folded into 'sapna'.
# The separating comma has to sit outside the quotes; with `"sapna,"` followed
# directly by `"dream sapna"` the two literals were concatenated into the single
# label "sapna,dream sapna", so "dream sapna" was never normalised.
sapna_values = [
    "sapna",
    "dream sapna",
    "spider sapna"
]

# Raw speaker labels that are folded into 'kaoru'.
kaoru_values = [
    "kaoru matsubara"
]

# Raw speaker labels that are folded into the generic speaker 'announcer'.
announcer_values = [
    "space towtruck and the infinite jumper cables movie announcer",
    "announcer",
    "announcer 1",
    "announcer 2",
    "another announcer",
    "radio announcer",
    "tv announcer",
    "space towtruck announcer",
    "male pageant announcer",
    "video game announcer",
]

# Raw speaker labels that are folded into 'ms. keane'.
keane_values = [
    "ms. keane",
    "young keane",
    "keanes",
    "keane 2",
    "keane 1",
    "miss keane",
    "french renaissance ms. keane",
    "ms. keane"
]

# Raw speaker labels that are folded into 'ms. bellum'.
bellum_values = [
    "ms. bellum",
    "young bellum",
    "miss bellum",
    "dream ms. bellum",
    "ms.bellum",
    "ms. bellum"
]

# Raw speaker labels that are folded into 'billy' (leading blanks are deliberate:
# they are left over when a multi-speaker label is split on commas).
billy_values = [
    "billy",
    "big billy",
    " billy"
]

# Raw speaker labels that are folded into 'arturo'.
arturo_values = [
    "lil arturo",
    " lil arturo",
    " arturo"
]

# Raw speaker labels that are folded into 'butch'.
butch_values = [
    "butch",
    "teen butch"
]

# Raw speaker labels that are folded into 'boomer'.
boomer_values = [
    "boomer",
    "teen boomer"
]

# Raw speaker labels that are folded into 'talking dog'.
talking_dog_values = [
    "talking dog",
    "talking  dog",
    "talking dog billboard"
]

# Raw speaker labels that are folded into 'junior'.
junior_values = [
    "junior",
    "junior 4"
]

# Raw speaker labels that are folded into 'bossman'.
bossman_values = [
    "bossman",
    "bossman 3",
    "bossman 2"
]

# Raw speaker labels that refer to the character 'dick'.
dick_values = [
    "dick",
    "student dick"
]

# Raw speaker labels that are folded into 'marianne'.
marianne_values = [
    'mrs. smith'
]

# Raw speaker labels that are folded into 'harold'.
harold_values = [
    'mr. smith'
]

# Raw speaker labels that are folded into 'sedusa'.
sedusa_values = [
    'ima'
]

In [None]:
import re

# Bring all recognised multi-speaker labels into one consistent, comma-separated
# format: "/", " and ", ", " and " & " are each replaced by a bare ",".
# (A second pass that replaced "/" again was removed: the first pass already
# replaces every "/" in the labels it touches, so it could never change anything.)
speaker_separator = re.compile(r'(/|\sand\s|,\s|\s&\s)')
double_speaker_lookup = set(double_speakers)  # O(1) membership instead of a list scan per row
all_lines['speaker'] = all_lines['speaker'].apply(
    lambda x: speaker_separator.sub(',', x) if x in double_speaker_lookup else x
)

# Two labels were fused without any separator in the transcripts; fix them by hand.
all_lines.loc[all_lines['speaker'] == 'ms. bellumold woman 1', 'speaker'] = 'ms. bellum,woman'
all_lines.loc[all_lines['speaker'] == 'ms. keaneold woman 2', 'speaker'] = 'ms. keane,woman'

# Rows whose speaker still equals one of the original multi-speaker labels.
double_speaker_items = all_lines[all_lines['speaker'].isin(double_speakers)]
double_speaker_items['speaker']

In [None]:
# Number of rows before multi-speaker labels are split into separate rows.
len(all_lines['speaker'])

In [None]:
# Rows whose speaker label contains a comma, and how many distinct such labels exist.
has_comma = all_lines['speaker'].str.contains(',', na=False)
new_double_speakers = all_lines[has_comma]
new_double_speakers['speaker'].nunique()

In [None]:
# Give every speaker of a multi-speaker line a row of its own.
#
# Only labels that were recognised as multi-speaker labels are split. Splitting
# *every* label on "," would also tear apart single-speaker labels that merely
# contain a comma (e.g. "buttercup, to herself"), which duplicates the line under
# a junk speaker (" to herself") and prevents the alias lists from matching.
#
# The set below reproduces the comma-separated form the multi-speaker labels were
# rewritten to, plus the two labels that were repaired by hand.
multi_speaker_labels = {
    re.sub(r'(/|\sand\s|,\s|\s&\s)', ',', label) for label in double_speakers
} | {'ms. bellum,woman', 'ms. keane,woman'}

all_lines['speaker'] = all_lines['speaker'].apply(
    lambda label: label.split(',') if label in multi_speaker_labels else label
)
# explode() turns list entries into one row each and leaves scalars (and NaN) untouched.
all_lines = all_lines.explode('speaker')
all_lines = all_lines.reset_index(drop=True)
all_lines

In [None]:
# Canonical speaker name -> raw labels that are folded into it.
# The order mirrors the former step-by-step replacement. 'dick' was missing from
# those steps although `dick_values` is defined and the villain column matches on
# the canonical name 'dick'; it is included here.
alias_groups = {
    'blossom': blossom_values,
    'bubbles': bubbles_values,
    'buttercup': buttercup_values,
    'professor': professor_values,
    'mayor': mayor_values,
    'mojo jojo': mojo_values,
    'narrator': narrator_values,
    'him': him_values,
    'princess morbucks': princess_values,
    'fuzzy': fuzzy_values,
    'woman': woman_values,
    'man': man_values,
    'brick': brick_values,
    'bliss': bliss_values,
    'monster': monster_values,
    'boy': boy_values,
    'girl': girl_values,
    'powerpuff girls': ppg_values,
    'rowdyruff boys': rrb_values,
    'sapna': sapna_values,
    'miyako': miyako_values,
    'kaoru': kaoru_values,
    'announcer': announcer_values,
    'ms. keane': keane_values,
    'ms. bellum': bellum_values,
    'billy': billy_values,
    'arturo': arturo_values,
    'butch': butch_values,
    'boomer': boomer_values,
    'talking dog': talking_dog_values,
    'junior': junior_values,
    'bossman': bossman_values,
    'dick': dick_values,
    'marianne': marianne_values,
    'harold': harold_values,
    'sedusa': sedusa_values,
}

# Flatten into a single raw-label -> canonical-name lookup. Should a label ever
# appear in two lists, the first list wins (same outcome as the sequential version).
canonical_by_alias = {}
for canonical, aliases in alias_groups.items():
    for alias in aliases:
        canonical_by_alias.setdefault(alias, canonical)

# One pass over the column; labels without an alias entry (and NaN) are kept unchanged.
all_lines['speaker'] = all_lines['speaker'].apply(lambda x: canonical_by_alias.get(x, x))

In [None]:
# Print every distinct speaker label that still contains a space after normalisation.
unique_speakers = all_lines['speaker'].unique()

for speaker in (label for label in unique_speakers if isinstance(label, str) and " " in label):
    print(speaker)

In [None]:
# The 60 speakers with the most rows.
all_lines['speaker'].value_counts().head(60)

In [None]:
# Rows per episode. Episodes with only a single line are probably incomplete
# transcripts and may need to be filtered out.
all_lines['episode'].value_counts()

In [None]:
import numpy as np

# Create a villain column: villain (group) -> normalised speaker names belonging to it.
# Keeping label and members side by side prevents the two from drifting apart
# (previously the labels 'gangreen gang' and 'rowdyruff boys' were swapped
# relative to their member lists).
villain_members = {
    'mojo jojo': ['mojo jojo'],
    'him': ['him'],
    'talking dog': ['talking dog'],
    'sedusa': ['sedusa'],
    'princess morbucks': ['princess morbucks'],
    'dick': ['dick'],
    'bernie': ['bernie'],
    'sandman': ['sandman'],
    'lenny': ['lenny'],
    'femme fatale': ['femme fatale'],
    'gnome': ['gnome'],
    'fuzzy': ['fuzzy'],
    'rowdyruff boys': ['brick', 'boomer', 'butch'],
    'gangreen gang': ['ace', 'grubber', 'billy', 'snake', 'arturo'],
    'amoeba boys': ['bossman', 'junior', 'slim', 'amoeba boys'],
    'smith family': ['marianne', 'harold', 'julie', 'bud'],
}
conditions = [all_lines['speaker'].isin(members) for members in villain_members.values()]
values = list(villain_members)

# np.select picks the label of the first matching condition; rows spoken by
# anyone else get None.
all_lines['villain'] = np.select(conditions, values, default=None)
all_lines['villain'].head()

In [None]:
# Rows per villain (rows without a villain are not counted).
all_lines['villain'].value_counts()

In [None]:
# Load the season/episode metadata of both series.
season_paths = ('./ppg-seasons-1998.json', './ppg-seasons-2016.json')
seasons_1998_raw, seasons_2016_raw = (pd.read_json(path, encoding='utf-8') for path in season_paths)

seasons_2016_raw.head()

In [None]:
# Stack the metadata of both series into one frame (row labels are kept as they are).
seasons_raw = pd.concat(objs=[seasons_1998_raw, seasons_2016_raw], axis=0)
seasons_raw.head(n=10)

In [None]:
# Look at the raw titles to see how they are formatted.
seasons_raw['title'].head(50)

In [None]:
# Work on a copy so the raw metadata stays untouched.
seasons = seasons_raw.copy()

# Titles are lower-cased and split on "/"; the first two parts become title_1 and
# title_2 (title_2 is missing when a title has no "/").
# NOTE(review): the parts are not stripped -- confirm the titles carry no blanks around "/".
title_parts = seasons['title'].str.lower().str.split('/')
seasons['title'] = title_parts
seasons['title_1'] = title_parts.str[0]
seasons['title_2'] = title_parts.str[1]

# Episode names are lower-cased as well so they can be matched against the titles.
all_lines['episode'] = all_lines['episode'].str.lower()
seasons.head(10)

In [None]:
# Attach the metadata twice: once matching the episode on the first title part,
# once on the second. Overlapping columns of the second merge get the _x / _y suffixes.
matched_on_first_title = all_lines.merge(seasons, how='left', left_on='episode', right_on='title_1')
lines_with_seasons = matched_on_first_title.merge(seasons, how='left', left_on='episode', right_on='title_2')

In [None]:
# Inspect the merged frame.
lines_with_seasons.head()

In [None]:
# Distinct episodes per season among rows matched on the second title part (season_y).
lines_with_seasons.groupby(['season_y'])['episode'].nunique()

In [None]:
# Distinct episodes per season among rows matched on the first title part (season_x).
lines_with_seasons.groupby(['season_x'])['episode'].nunique()

In [None]:
# List the columns produced by the two merges (including the _x / _y duplicates).
lines_with_seasons.columns

In [None]:
# Collapse each _x / _y pair into one column: take the _x value when present,
# otherwise fall back to the _y value.
for merged_column in ['season', 'episode_nr', 'description', 'title_1', 'title_2', 'year']:
    from_first_merge = lines_with_seasons[merged_column + '_x']
    from_second_merge = lines_with_seasons[merged_column + '_y']
    lines_with_seasons[merged_column] = np.where(from_first_merge.notna(), from_first_merge, from_second_merge)

lines_with_seasons['season'].value_counts()

In [None]:
# Number of rows that could not be matched to any season.
lines_with_seasons['season'].isna().sum()

In [None]:
# The suffixed merge columns are no longer needed once they have been collapsed.
first_merge_columns = ['season_x', 'episode_nr_x', 'title_x', 'description_x', 'title_1_x', 'title_2_x', 'year_x']
second_merge_columns = ['season_y', 'episode_nr_y', 'title_y', 'description_y', 'title_1_y', 'title_2_y', 'year_y']
lines_with_seasons = lines_with_seasons.drop(columns=first_merge_columns + second_merge_columns)

In [None]:
# Distinguish the two segments that share one episode number:
# the segment named by title_2 gets +0.2, the one named by title_1 gets +0.1
# (the latter only when a second title exists at all).
is_second_segment = lines_with_seasons['title_2'] == lines_with_seasons['episode']
is_first_segment = (lines_with_seasons['title_1'] == lines_with_seasons['episode']) & (lines_with_seasons['title_2'].notna())

lines_with_seasons['episode_nr'] = np.where(
    is_second_segment,
    lines_with_seasons['episode_nr'] + 0.2,
    lines_with_seasons['episode_nr'],
)
lines_with_seasons['episode_nr'] = np.where(
    is_first_segment,
    lines_with_seasons['episode_nr'] + 0.1,
    lines_with_seasons['episode_nr'],
)

In [None]:
# Episodes for which no season metadata was found.
lines_with_seasons[lines_with_seasons['season'].isna()]['episode'].unique()

In [None]:
# Rows without any dialog text.
# NB: they all belong to the 2016 series, so this is not a concern for now.
lines_with_seasons[lines_with_seasons['dialog'].isna()]

In [None]:
# Remove the rows that have no dialog text.
lines_with_seasons = lines_with_seasons.dropna(subset=['dialog'])

In [None]:
# Sanity check: after the drop no row with a missing dialog should be left.
lines_with_seasons[lines_with_seasons['dialog'].isna()]

In [None]:
# Column dtypes and non-null counts.
lines_with_seasons.info()

In [None]:
# Inspect the cleaned frame.
lines_with_seasons.head()

In [None]:
# Collect the rows without a speaker; further down they are filled with the
# speaker of the line before. TODO: verify that this attribution is correct.
no_speakers = lines_with_seasons[lines_with_seasons['speaker'].isna()]
no_speakers.head()

In [None]:
# Non-null values per column among the speaker-less rows.
no_speakers.count()

In [None]:
# Lines without a recognised speaker inherit the speaker of the preceding line.
# - The fill is done per episode, so a speaker never leaks from the last line of
#   one episode into the first lines of the next; a speaker-less line at the very
#   start of an episode stays empty.
# - The result is assigned back explicitly: an in-place fillna on a column
#   selection is chained assignment (it does not update the frame under pandas'
#   Copy-on-Write), and `fillna(method=...)` is deprecated.
lines_with_seasons['speaker'] = lines_with_seasons.groupby('episode', sort=False)['speaker'].ffill()

In [None]:
# Check the non-null count of the speaker column after filling.
lines_with_seasons.info()

In [None]:
# The same rows that had no speaker before the fill, looked up by their row labels.
previous_no_speakers = lines_with_seasons[lines_with_seasons.index.isin(no_speakers.index)]
previous_no_speakers.count()

In [None]:
# First ten of the formerly speaker-less rows, to spot-check the filled-in speaker.
previous_no_speakers.head(10)

In [None]:
# Words per line. Splitting on any run of whitespace (no separator argument)
# avoids phantom empty "words": with split(' ') a double space, a leading or
# trailing blank, or an empty line each added to the count.
lines_with_seasons['word_count_for_line'] = lines_with_seasons['dialog'].str.split().str.len()
lines_with_seasons[['dialog', 'word_count_for_line']].head(15)

In [None]:
# Rows whose year is 1998.
lines_1998 = lines_with_seasons[lines_with_seasons['year'] == 1998]

# Total words per speaker, most talkative first.
summed_speakers_1998 = (
    lines_1998
    .groupby('speaker')['word_count_for_line']
    .sum()
    .reset_index()
    .sort_values('word_count_for_line', ascending=False)
    .reset_index(drop=True)
)
summed_speakers_1998.head(50)

In [None]:
# Rows whose year is 2016.
lines_2016 = lines_with_seasons[lines_with_seasons['year'] == 2016]

# Total words per speaker (kept unsorted); the display shows the 50 most talkative.
words_per_speaker_2016 = lines_2016.groupby('speaker')['word_count_for_line'].sum()
summed_speakers_2016 = words_per_speaker_2016.reset_index()
(
    summed_speakers_2016
    .sort_values('word_count_for_line', ascending=False)
    .head(50)
    .reset_index(drop=True)
)

In [None]:
# Total words per villain among the 1998 rows, most talkative first.
summed_villains = (
    lines_1998
    .groupby('villain')['word_count_for_line']
    .sum()
    .reset_index()
    .sort_values('word_count_for_line', ascending=False)
    .reset_index(drop=True)
)
summed_villains.head(50)

In [None]:
# Rows per villain across both series.
lines_with_seasons['villain'].value_counts().head(50)

In [None]:
# Good: the most talkative speakers (the lower ranks tend to be either one-episode
# characters or an (unrelated) aggregate).
# NOTE(review): `.loc[:10, :]` is label-based and inclusive, so this keeps the rows
# labelled 0 through 10 -- not 8 as originally noted -- and does not filter by
# speaker, so villains among the top speakers are included. Confirm this is intended.
good_aggregate_1998 = summed_speakers_1998.loc[:10, :]
good_aggregate_1998

In [None]:
# All villains with their total word count. This is the complete `summed_villains`
# frame under another name (no top-50 cut is applied here).
villains_aggregate_1998 = summed_villains

In [None]:
# Write the overall aggregates as JSON arrays of records.
for aggregate, target_path in (
    (good_aggregate_1998, './good_all.json'),
    (villains_aggregate_1998, './villains_all.json'),
):
    aggregate.to_json(target_path, orient='records')

In [None]:
# Words per speaker and season, ordered by season and then by word count (descending).
words_per_speaker_and_season = (
    lines_1998
    .groupby(['speaker', 'season'])['word_count_for_line']
    .sum()
    .reset_index()
    .sort_values(['season', 'word_count_for_line'], ascending=[True, False])
    .reset_index(drop=True)
)

# Keep the recurring main cast in every season, plus anyone with more than 250 words in a season.
main_cast = [
    "blossom",
    "professor",
    "buttercup",
    "narrator",
    "bubbles",
    "mayor",
    "ms. keane",
    "ms. bellum"
]
is_main_cast = words_per_speaker_and_season['speaker'].isin(main_cast)
is_talkative = words_per_speaker_and_season['word_count_for_line'] > 250
good_aggregate_seasons_1998 = words_per_speaker_and_season[is_main_cast | is_talkative]
good_aggregate_seasons_1998.head(50)

In [None]:
# Words per villain and season, ordered by season and then by word count (descending).
villains_aggregate_seasons_1998 = (
    lines_1998
    .groupby(['villain', 'season'])['word_count_for_line']
    .sum()
    .reset_index()
    .sort_values(['season', 'word_count_for_line'], ascending=[True, False])
    .reset_index(drop=True)
)
villains_aggregate_seasons_1998.head(50)

In [None]:
# Write the per-season aggregates as JSON arrays of records.
for aggregate, target_path in (
    (good_aggregate_seasons_1998, './good_seasons.json'),
    (villains_aggregate_seasons_1998, './villains_seasons.json'),
):
    aggregate.to_json(target_path, orient='records')

In [None]:
# Words per speaker and episode, ordered by season, episode number, episode name
# and then by word count (descending).
words_per_speaker_and_episode = (
    lines_1998
    .groupby(['speaker', 'season', 'episode_nr', 'episode'])['word_count_for_line']
    .sum()
    .reset_index()
    .sort_values(
        ['season', 'episode_nr', 'episode', 'word_count_for_line'],
        ascending=[True, True, True, False],
    )
    .reset_index(drop=True)
)

# Keep the recurring main cast in every episode, plus anyone with more than 100 words in an episode.
main_cast = [
    "blossom",
    "professor",
    "buttercup",
    "narrator",
    "bubbles",
    "mayor",
    "ms. keane",
    "ms. bellum"
]
is_main_cast = words_per_speaker_and_episode['speaker'].isin(main_cast)
is_talkative = words_per_speaker_and_episode['word_count_for_line'] > 100
good_aggregate_episodes_1998 = words_per_speaker_and_episode[is_main_cast | is_talkative]
good_aggregate_episodes_1998.head(50)

In [None]:
# Words per villain and episode, ordered by season, episode number, episode name
# and then by word count (descending).
villains_aggregate_episodes_1998 = (
    lines_1998
    .groupby(['villain', 'season', 'episode_nr', 'episode'])['word_count_for_line']
    .sum()
    .reset_index()
    .sort_values(
        ['season', 'episode_nr', 'episode', 'word_count_for_line'],
        ascending=[True, True, True, False],
    )
    .reset_index(drop=True)
)
villains_aggregate_episodes_1998.head(50)

In [None]:
# Write the per-episode aggregates as JSON arrays of records.
for aggregate, target_path in (
    (good_aggregate_episodes_1998, './good_episodes.json'),
    (villains_aggregate_episodes_1998, './villains_episodes.json'),
):
    aggregate.to_json(target_path, orient='records')

In [None]:
# Total words per episode together with its metadata.
# dropna=False is essential: title_2 is missing for every episode whose title has
# no second part (and description may be missing too). With the default
# dropna=True, groupby discards all rows that have a missing value in any key,
# which silently removed those episodes from the result.
all_episodes = (
    lines_1998
    .groupby(
        ['season', 'episode_nr', 'episode', 'title_1', 'title_2', 'description'],
        dropna=False,
    )['word_count_for_line']
    .sum()
    .reset_index()
)
all_episodes = all_episodes.sort_values(['season', 'episode_nr', 'episode'], ascending=[True, True, True]).reset_index(drop=True)
all_episodes.head(50)

In [None]:
# Write the per-episode totals as a JSON array of records.
all_episodes.to_json('./all_episodes.json', orient='records')