In [1]:
# Bruno Vieira Ribeiro - 2021

In [2]:
import requests
from bs4 import BeautifulSoup
import re

import pandas as pd
import numpy as np
import pickle

In [3]:
%config Completer.use_jedi = False        # For autocomplete to work!

# Getting lines from *The Legend of Aang*

Page with all the links to episodes:

In [4]:
url = 'https://avatar.fandom.com/wiki/Avatar_Wiki:Transcripts#Book_One:_Water'

In [5]:
# Fetch the transcript index page. The timeout keeps a stalled connection from
# hanging the notebook forever, and raise_for_status() makes an HTTP error fail
# loudly here instead of being parsed below as if it were the real index page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# dfs = pd.read_html(page.text)

Getting all links to seasons from html tables:

In [6]:
seasons = soup.find_all('table', class_='wikitable')

We need to select only the ones for Aang's story:
Inspecting the base url we see that tables `1` through `6` correspond to ATLA episodes. So, we can simply pick these ones:

In [7]:
# Preview the full URL of every link found in the ATLA season tables
# (tables 1 through 6 of `seasons`).
base_url = 'https://avatar.fandom.com'
for season_table in seasons[1:7]:
    for anchor in season_table.find_all('a'):
        print(base_url + anchor['href'])

https://avatar.fandom.com/wiki/Transcript:The_Boy_in_the_Iceberg
https://avatar.fandom.com/wiki/Transcript:The_Avatar_Returns
https://avatar.fandom.com/wiki/Transcript:The_Southern_Air_Temple
https://avatar.fandom.com/wiki/Transcript:The_Warriors_of_Kyoshi
https://avatar.fandom.com/wiki/Transcript:The_King_of_Omashu
https://avatar.fandom.com/wiki/Transcript:Imprisoned
https://avatar.fandom.com/wiki/Transcript:Winter_Solstice,_Part_1:_The_Spirit_World
https://avatar.fandom.com/wiki/Transcript:Winter_Solstice,_Part_2:_Avatar_Roku
https://avatar.fandom.com/wiki/Transcript:The_Waterbending_Scroll
https://avatar.fandom.com/wiki/Transcript:Jet_(episode)
https://avatar.fandom.com/wiki/Transcript:The_Great_Divide
https://avatar.fandom.com/wiki/Transcript:The_Storm
https://avatar.fandom.com/wiki/Transcript:The_Blue_Spirit
https://avatar.fandom.com/wiki/Transcript:The_Fortuneteller
https://avatar.fandom.com/wiki/Transcript:Bato_of_the_Water_Tribe
https://avatar.fandom.com/wiki/Transcript:The_Des

Now to turn this into an iterable:
We'll create a list and append all links to it. After the list is complete, we will convert it to a tuple, as we are only going to iterate over it (only accessing elements).

There are 5 episodes here that I'll discard: All the `commentary` episodes and the `Escape from the Spirit World` one in the beginning of Book 3 (because it is, actually, an online game, not an episode - although canonical...).

In [8]:
# Build the list of transcript URLs for the ATLA season tables. Any link whose
# href contains 'commentary' (commentary transcripts) or 'Escape' (the online
# game "Escape from the Spirit World") is left out.
excluded_markers = ['commentary', 'Escape']
links_list = [
    'https://avatar.fandom.com' + anchor['href']
    for season_table in seasons[1:7]
    for anchor in season_table.find_all('a')
    if all(marker not in anchor['href'] for marker in excluded_markers)
]
# Freeze the result: from here on the links are only read, never modified.
links = tuple(links_list)

In [9]:
links

('https://avatar.fandom.com/wiki/Transcript:The_Boy_in_the_Iceberg',
 'https://avatar.fandom.com/wiki/Transcript:The_Avatar_Returns',
 'https://avatar.fandom.com/wiki/Transcript:The_Southern_Air_Temple',
 'https://avatar.fandom.com/wiki/Transcript:The_Warriors_of_Kyoshi',
 'https://avatar.fandom.com/wiki/Transcript:The_King_of_Omashu',
 'https://avatar.fandom.com/wiki/Transcript:Imprisoned',
 'https://avatar.fandom.com/wiki/Transcript:Winter_Solstice,_Part_1:_The_Spirit_World',
 'https://avatar.fandom.com/wiki/Transcript:Winter_Solstice,_Part_2:_Avatar_Roku',
 'https://avatar.fandom.com/wiki/Transcript:The_Waterbending_Scroll',
 'https://avatar.fandom.com/wiki/Transcript:Jet_(episode)',
 'https://avatar.fandom.com/wiki/Transcript:The_Great_Divide',
 'https://avatar.fandom.com/wiki/Transcript:The_Storm',
 'https://avatar.fandom.com/wiki/Transcript:The_Blue_Spirit',
 'https://avatar.fandom.com/wiki/Transcript:The_Fortuneteller',
 'https://avatar.fandom.com/wiki/Transcript:Bato_of_the_Wat

Great! Now to check if we actually got all 61 episodes:

In [10]:
len(links)

61

Episodes 1 and 2 of Book 1 and 8 of Book 3  have a first table with the opening lines from Katara or a **previous** line. So, we'll need to take care of that.

Also, the episode *The Tales of Ba Sing Se* is very different, as it has 7 html tables. Among these, there is one table for each *Tale* within the episode.

I would like to have, for each "script line" the number of the episode and the Book it corresponds to.

In [11]:
# # Checking number of html tables in each episode
# for number, link in enumerate(links):
#     print(number, len(pd.read_html(link)))

In [12]:
# Scrape every ATLA transcript into a DataFrame and tag each row with its Book
# and its episode number within that Book. Results are stored in
# `episodes_dict`, keyed by the episode's position in `links` (as a string).
episodes_dict = {}
ep_count = 0
# Big loop through all the links:
for number, link in enumerate(links):
    print('Getting episode number', number)

    # Download and parse the transcript page exactly once; every branch below
    # only picks tables out of this list.
    tables = pd.read_html(link)

    if number in (0, 1, 47):
        # These pages start with an extra table (an opening / "previous" block),
        # so the transcript itself is the second table.
        episode_df = tables[1]
    elif number == 34:
        # 'The Tales of Ba Sing Se' is split over several tables (one per tale);
        # stitch the first five together.
        # NOTE(review): the notebook text says this page has 7 tables - confirm
        # that the first 5 really cover every tale.
        episode_df = pd.concat([tables[i] for i in range(5)])
    else:
        episode_df = tables[0]

    # Renaming columns for clarity
    episode_df.columns = ['Character', 'script']

    # Books 1 and 2 have 20 episodes each and Book 3 has 21, so a new Book
    # starts at positions 0, 20 and 40 of `links`.
    if number < 20:
        book = 1
    elif number < 40:
        book = 2
    else:
        book = 3

    # Restart the per-Book episode counter on the first episode of Books 2 and 3.
    if number in (20, 40):
        ep_count = 0
    ep_count += 1

    # 'ep_number' is added before 'Book' to keep the column order
    # Character, script, ep_number, Book.
    episode_df['ep_number'] = ep_count
    episode_df['Book'] = book
    episodes_dict[str(number)] = episode_df

Getting episode number 0
Getting episode number 1
Getting episode number 2
Getting episode number 3
Getting episode number 4
Getting episode number 5
Getting episode number 6
Getting episode number 7
Getting episode number 8
Getting episode number 9
Getting episode number 10
Getting episode number 11
Getting episode number 12
Getting episode number 13
Getting episode number 14
Getting episode number 15
Getting episode number 16
Getting episode number 17
Getting episode number 18
Getting episode number 19
Getting episode number 20
Getting episode number 21
Getting episode number 22
Getting episode number 23
Getting episode number 24
Getting episode number 25
Getting episode number 26
Getting episode number 27
Getting episode number 28
Getting episode number 29
Getting episode number 30
Getting episode number 31
Getting episode number 32
Getting episode number 33
Getting episode number 34
Getting episode number 35
Getting episode number 36
Getting episode number 37
Getting episode number

Another sanity check:

In [13]:
len(episodes_dict)

61

I'll use pickle to dump this dictionary into an object for easy use later.

In [14]:
with open('./data/episodes_dict.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(episodes_dict, filehandle)

Now let's concatenate all episodes in a single big DataFrame:

In [15]:
# Create a list of all dataframes to concatenate
# Gather the per-episode frames in the order they were inserted into the dict
frames = list(episodes_dict.values())

df_atla = pd.concat(frames)
# Swap the repeated per-episode row labels for a single running 0..n-1 index
df_atla = df_atla.reset_index(drop=True)

How many "lines" does the whole series have:

In [16]:
len(df_atla)

13369

Checking for the head of our dataframe:

In [17]:
df_atla.head()

Unnamed: 0,Character,script,ep_number,Book
0,,"As the title card fades, the scene opens onto ...",1,1
1,Sokka,It's not getting away from me this time. [Clos...,1,1
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1
3,Katara,"[Happily surprised.] Sokka, look!",1,1
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1


Checking for unique values of `ep_number` for sanity check (this should be all integers between 1 and 21):

In [18]:
df_atla['ep_number'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21])

Let's create a column to store the episode number from 1 to 61:

In [19]:
# Function to count episodes from 1 to 61
def episode_number(ep, book):
    """Convert a within-Book episode number into a series-wide episode number.

    Parameters
    ----------
    ep : episode number inside its Book.
    book : Book number the episode belongs to.

    Returns
    -------
    ``ep + 40`` for Book 3, ``ep + 20`` for Book 2, and ``ep`` unchanged for
    any other ``book`` value.
    """
    return ep + 40 if book == 3 else (ep + 20 if book == 2 else ep)

In [20]:
df_atla['total_number'] = np.vectorize(episode_number)(df_atla['ep_number'],
                                                       df_atla['Book'])

In [21]:
df_atla.head()

Unnamed: 0,Character,script,ep_number,Book,total_number
0,,"As the title card fades, the scene opens onto ...",1,1,1
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1


Let's save our dataframe to a csv file for easy access later:

In [22]:
df_atla.to_csv('./data/ATLA-episodes-scripts.csv', index=False)

Also, let's use pickle to dump this full DataFrame to a file:

In [23]:
with open('./data/df_atla.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(df_atla, filehandle)

Testing if pickled object worked:

In [24]:
with open('./data/df_atla.data', 'rb') as filehandle:
    test_df = pickle.load(filehandle)

In [25]:
test_df.head()

Unnamed: 0,Character,script,ep_number,Book,total_number
0,,"As the title card fades, the scene opens onto ...",1,1,1
1,Sokka,It's not getting away from me this time. [Clos...,1,1,1
2,,"The shot pans quickly from Sokka to Katara, wh...",1,1,1
3,Katara,"[Happily surprised.] Sokka, look!",1,1,1
4,Sokka,"[Close-up of Sokka; whispering.] Sshh! Katara,...",1,1,1


In [26]:
test_df.tail()

Unnamed: 0,Character,script,ep_number,Book,total_number
13364,Suki,And why did you paint me firebending?,21,3,61
13365,Sokka,I thought it looked more exciting that way. [M...,21,3,61
13366,Iroh,"[Points at painting.] Hey, my belly's not that...",21,3,61
13367,Toph,Well I think you all look perfect! [They laugh.],21,3,61
13368,,"Aang walks past Appa, petting him briefly, bef...",21,3,61


Now we can enjoy our super cool new data!

## Helper dataframe

To better understand our data later on, we can create a dataframe with the names of episodes to use as a quick look up for exploring our results.

In [27]:
# Collect the visible link text (the episode title) of every ATLA link, leaving
# out any whose text contains 'commentary' or 'Escape' (the online game
# "Escape from the Spirit World").
excluded_markers = ['commentary', 'Escape']
names_list = [
    anchor.text
    for season_table in seasons[1:7]
    for anchor in season_table.find_all('a')
    if all(marker not in anchor.text for marker in excluded_markers)
]
ep_names = tuple(names_list)

In [28]:
ep_names

('The Boy in the Iceberg',
 'The Avatar Returns',
 'The Southern Air Temple',
 'The Warriors of Kyoshi',
 'The King of Omashu',
 'Imprisoned',
 'Winter Solstice, Part 1: The Spirit World',
 'Winter Solstice, Part 2: Avatar Roku',
 'The Waterbending Scroll',
 'Jet',
 'The Great Divide',
 'The Storm',
 'The Blue Spirit',
 'The Fortuneteller',
 'Bato of the Water Tribe',
 'The Deserter',
 'The Northern Air Temple',
 'The Waterbending Master',
 'The Siege of the North, Part 1',
 'The Siege of the North, Part 2',
 'The Avatar State',
 'The Cave of Two Lovers',
 'Return to Omashu',
 'The Swamp',
 'Avatar Day',
 'The Blind Bandit',
 'Zuko Alone',
 'The Chase',
 'Bitter Work',
 'The Library',
 'The Desert',
 "The Serpent's Pass",
 'The Drill',
 'City of Walls and Secrets',
 'The Tales of Ba Sing Se',
 "Appa's Lost Days",
 'Lake Laogai',
 'The Earth King',
 'The Guru',
 'The Crossroads of Destiny',
 'The Awakening',
 'The Headband',
 'The Painted Lady',
 "Sokka's Master",
 'The Beach',
 'The Av

In [29]:
len(ep_names)

61

In [30]:
ep_names_df = pd.DataFrame(ep_names, columns=['Episode'])

In [31]:
ep_names_df['ep_number'] = ''
ep_names_df['Book'] = ''

In [32]:
ep_names_df

Unnamed: 0,Episode,ep_number,Book
0,The Boy in the Iceberg,,
1,The Avatar Returns,,
2,The Southern Air Temple,,
3,The Warriors of Kyoshi,,
4,The King of Omashu,,
...,...,...,...
56,The Ember Island Players,,
57,"Sozin's Comet, Part 1: The Phoenix King",,
58,"Sozin's Comet, Part 2: The Old Masters",,
59,"Sozin's Comet, Part 3: Into the Inferno",,


In [33]:
# Fill in 'Book' and 'ep_number' row by row.
ep_count = 0
for number in range(len(ep_names_df)):
    # Rows 0-19 belong to Book 1, rows 20-39 to Book 2 and everything from
    # row 40 onwards to Book 3.
    if number < 20:
        book = 1
    elif number < 40:
        book = 2
    else:
        book = 3

    # Rows 20 and 40 open a new Book, so the per-Book counter starts over there.
    if number in (20, 40):
        ep_count = 0
    ep_count += 1

    ep_names_df.at[number, 'Book'] = book
    ep_names_df.at[number, 'ep_number'] = ep_count

In [34]:
ep_names_df.tail(24)

Unnamed: 0,Episode,ep_number,Book
37,The Earth King,18,2
38,The Guru,19,2
39,The Crossroads of Destiny,20,2
40,The Awakening,1,3
41,The Headband,2,3
42,The Painted Lady,3,3
43,Sokka's Master,4,3
44,The Beach,5,3
45,The Avatar and the Fire Lord,6,3
46,The Runaway,7,3


Great! Now we can easily look up the name of a given episode based on its `ep_number` and `Book`. Let's dump this into a pickled object:

In [35]:
with open('./data/ep_names.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(ep_names_df, filehandle)

Checking if it worked:

In [36]:
with open('./data/ep_names.data', 'rb') as filehandle:
    test_df = pickle.load(filehandle)

In [37]:
test_df

Unnamed: 0,Episode,ep_number,Book
0,The Boy in the Iceberg,1,1
1,The Avatar Returns,2,1
2,The Southern Air Temple,3,1
3,The Warriors of Kyoshi,4,1
4,The King of Omashu,5,1
...,...,...,...
56,The Ember Island Players,17,3
57,"Sozin's Comet, Part 1: The Phoenix King",18,3
58,"Sozin's Comet, Part 2: The Old Masters",19,3
59,"Sozin's Comet, Part 3: Into the Inferno",20,3


# Getting lines from *The Legend of Korra*

We can use a very similar procedure to get episodes from *The Legend of Korra*. For this, we pick tables 7 through 14 of our `seasons` object (the slice `seasons[7:15]`):

In [7]:
# Preview the full URL of every link found in the Korra season tables
# (the slice seasons[7:15]).
base_url = 'https://avatar.fandom.com'
for season_table in seasons[7:15]:
    for anchor in season_table.find_all('a'):
        print(base_url + anchor['href'])

https://avatar.fandom.com/wiki/Transcript:Welcome_to_Republic_City
https://avatar.fandom.com/wiki/Transcript:Welcome_to_Republic_City_(commentary)
https://avatar.fandom.com/wiki/Transcript:A_Leaf_in_the_Wind
https://avatar.fandom.com/wiki/Transcript:A_Leaf_in_the_Wind_(commentary)
https://avatar.fandom.com/wiki/Transcript:The_Revelation_(episode)
https://avatar.fandom.com/wiki/Transcript:The_Voice_in_the_Night
https://avatar.fandom.com/wiki/Transcript:The_Spirit_of_Competition
https://avatar.fandom.com/wiki/Transcript:And_the_Winner_Is...
https://avatar.fandom.com/wiki/Transcript:The_Aftermath
https://avatar.fandom.com/wiki/Transcript:When_Extremes_Meet
https://avatar.fandom.com/wiki/Transcript:When_Extremes_Meet_(commentary)
https://avatar.fandom.com/wiki/Transcript:Out_of_the_Past
https://avatar.fandom.com/wiki/Transcript:Turning_the_Tides
https://avatar.fandom.com/wiki/Transcript:Skeletons_in_the_Closet
https://avatar.fandom.com/wiki/Transcript:Skeletons_in_the_Closet_(commentary)
h

Getting rid of commentary episodes:

In [8]:
# Build the list of transcript URLs for the Korra season tables, leaving out
# any link whose href contains 'commentary'.
korra_links_list = [
    'https://avatar.fandom.com' + anchor['href']
    for season_table in seasons[7:15]
    for anchor in season_table.find_all('a')
    if 'commentary' not in anchor['href']
]
# Freeze the result: from here on the links are only read, never modified.
korra_links = tuple(korra_links_list)

In [9]:
korra_links

('https://avatar.fandom.com/wiki/Transcript:Welcome_to_Republic_City',
 'https://avatar.fandom.com/wiki/Transcript:A_Leaf_in_the_Wind',
 'https://avatar.fandom.com/wiki/Transcript:The_Revelation_(episode)',
 'https://avatar.fandom.com/wiki/Transcript:The_Voice_in_the_Night',
 'https://avatar.fandom.com/wiki/Transcript:The_Spirit_of_Competition',
 'https://avatar.fandom.com/wiki/Transcript:And_the_Winner_Is...',
 'https://avatar.fandom.com/wiki/Transcript:The_Aftermath',
 'https://avatar.fandom.com/wiki/Transcript:When_Extremes_Meet',
 'https://avatar.fandom.com/wiki/Transcript:Out_of_the_Past',
 'https://avatar.fandom.com/wiki/Transcript:Turning_the_Tides',
 'https://avatar.fandom.com/wiki/Transcript:Skeletons_in_the_Closet',
 'https://avatar.fandom.com/wiki/Transcript:Endgame',
 'https://avatar.fandom.com/wiki/Transcript:Rebel_Spirit',
 'https://avatar.fandom.com/wiki/Transcript:The_Southern_Lights',
 'https://avatar.fandom.com/wiki/Transcript:Civil_Wars,_Part_1',
 'https://avatar.fan

In [11]:
print(f'Number of episodes in TLOK: {len(korra_links)}')

Number of episodes in TLOK: 52


In [38]:
# # Checking number of html tables in each episode
# for number, link in enumerate(korra_links):
#     print(number, len(pd.read_html(link)))

In [13]:
# Scrape every Korra transcript into a DataFrame and tag each row with its Book
# and its episode number within that Book.
# NOTE: this rebinds `episodes_dict`, so the ATLA dictionary built earlier in
# the notebook is no longer reachable under that name after this cell runs.
episodes_dict = {}
ep_count = 0
# Positions in `korra_links` where Books 2, 3 and 4 begin: Book 1 has 12
# episodes, Book 2 has 14, and Books 3 and 4 have 13 each.
book_starts = (12, 26, 39)
# Big loop through all the links:
for number, link in enumerate(korra_links):
    print('Getting episode number', number)

    # Per the original note, the first table on these pages is an opening
    # block, so the transcript is read from the second table.
    episode_df = pd.read_html(link)[1]

    # Renaming columns for clarity
    episode_df.columns = ['Character', 'script']

    # First episode of a new Book: restart the per-Book episode counter.
    if number in book_starts:
        ep_count = 0
    ep_count += 1

    # 'ep_number' is added before 'Book' to keep the column order
    # Character, script, ep_number, Book.
    episode_df['ep_number'] = ep_count
    # Book number = 1 + the number of later Books that have already started.
    episode_df['Book'] = 1 + sum(number >= start for start in book_starts)
    episodes_dict[str(number)] = episode_df

Getting episode number 0
Getting episode number 1
Getting episode number 2
Getting episode number 3
Getting episode number 4
Getting episode number 5
Getting episode number 6
Getting episode number 7
Getting episode number 8
Getting episode number 9
Getting episode number 10
Getting episode number 11
Getting episode number 12
Getting episode number 13
Getting episode number 14
Getting episode number 15
Getting episode number 16
Getting episode number 17
Getting episode number 18
Getting episode number 19
Getting episode number 20
Getting episode number 21
Getting episode number 22
Getting episode number 23
Getting episode number 24
Getting episode number 25
Getting episode number 26
Getting episode number 27
Getting episode number 28
Getting episode number 29
Getting episode number 30
Getting episode number 31
Getting episode number 32
Getting episode number 33
Getting episode number 34
Getting episode number 35
Getting episode number 36
Getting episode number 37
Getting episode number

In [14]:
len(episodes_dict)

52

Dumping this for use later:

In [15]:
with open('./data/korra_episodes_dict.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(episodes_dict, filehandle)

In [16]:
# Create a list of all dataframes to concatenate
# Gather the per-episode frames in the order they were inserted into the dict
frames = list(episodes_dict.values())

df_atlk = pd.concat(frames)
# Swap the repeated per-episode row labels for a single running 0..n-1 index
df_atlk = df_atlk.reset_index(drop=True)

In [17]:
len(df_atlk)

9695

In [18]:
df_atlk.head()

Unnamed: 0,Character,script,ep_number,Book
0,,The episode opens to footage of the sky while ...,1,1
1,Tonraq,The White Lotus has honored my family by comin...,1,1
2,,The man pushes open the door to grant the thre...,1,1
3,Senna,"[Briefly bowing her head, and tugging her righ...",1,1
4,,A shield hanging on the wall falls down and ro...,1,1


In [19]:
df_atlk['ep_number'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [20]:
# Function to count episodes from 1 to 52
def korra_episode_number(ep, book):
    """Convert a within-Book Korra episode number into a series-wide number.

    Parameters
    ----------
    ep : episode number inside its Book.
    book : Book number the episode belongs to.

    Returns
    -------
    ``ep`` shifted by 12 for Book 2, 26 for Book 3 and 39 for Book 4; ``ep``
    is returned unchanged for any other ``book`` value.
    """
    # Checked in the same order as before: Book 4, then 3, then 2.
    for candidate, offset in ((4, 39), (3, 26), (2, 12)):
        if book == candidate:
            return ep + offset
    return ep

In [21]:
df_atlk['total_number'] = np.vectorize(korra_episode_number)(df_atlk['ep_number'],
                                                       df_atlk['Book'])

In [24]:
df_atlk.head()

Unnamed: 0,Character,script,ep_number,Book,total_number
0,,The episode opens to footage of the sky while ...,1,1,1
1,Tonraq,The White Lotus has honored my family by comin...,1,1,1
2,,The man pushes open the door to grant the thre...,1,1,1
3,Senna,"[Briefly bowing her head, and tugging her righ...",1,1,1
4,,A shield hanging on the wall falls down and ro...,1,1,1


In [25]:
df_atlk.tail()

Unnamed: 0,Character,script,ep_number,Book,total_number
9690,Korra,[Lightens up and smiles.] Let's do it! Let's g...,13,4,52
9691,Asami,[Surprised.] Really? Okay ... I've always want...,13,4,52
9692,Korra,Sounds perfect.,13,4,52
9693,,"Fade to the base of the new spirit portal, whe...",13,4,52
9694,,Roll credits.,13,4,52


Dumping the dataframe:

In [26]:
with open('./data/df_atlk.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(df_atlk, filehandle)

## Helper dataframe

In [27]:
# Collect the visible link text (the episode title) of every Korra link,
# leaving out any whose text contains 'commentary'.
# NOTE: `names_list` and `ep_names` are rebound here, replacing the ATLA values
# assigned earlier in the notebook.
names_list = [
    anchor.text
    for season_table in seasons[7:15]
    for anchor in season_table.find_all('a')
    if 'commentary' not in anchor.text
]
ep_names = tuple(names_list)

In [28]:
ep_names

('Welcome to Republic City',
 'A Leaf in the Wind',
 'The Revelation',
 'The Voice in the Night',
 'The Spirit of Competition',
 'And the Winner Is...',
 'The Aftermath',
 'When Extremes Meet',
 'Out of the Past',
 'Turning the Tides',
 'Skeletons in the Closet',
 'Endgame',
 'Rebel Spirit',
 'The Southern Lights',
 'Civil Wars, Part 1',
 'Civil Wars, Part 2',
 'Peacekeepers',
 'The Sting',
 'Beginnings, Part 1',
 'Beginnings, Part 2',
 'The Guide',
 'A New Spiritual Age',
 'Night of a Thousand Stars',
 'Harmonic Convergence',
 'Darkness Falls',
 'Light in the Dark',
 'A Breath of Fresh Air',
 'Rebirth',
 'The Earth Queen',
 "In Harm's Way",
 'The Metal Clan',
 'Old Wounds',
 'Original Airbenders',
 'The Terror Within',
 'The Stakeout',
 'Long Live the Queen',
 'The Ultimatum',
 'Enter the Void',
 'Venom of the Red Lotus',
 'After All These Years',
 'Korra Alone',
 'The Coronation',
 'The Calling',
 'Enemy at the Gates',
 'The Battle of Zaofu',
 'Reunion',
 'Remembrances',
 'Beyond the

In [29]:
korra_ep_names_df = pd.DataFrame(ep_names, columns=['Episode'])

In [31]:
korra_ep_names_df['ep_number'] = ''
korra_ep_names_df['Book'] = ''

In [33]:
korra_ep_names_df.head()

Unnamed: 0,Episode,ep_number,Book
0,Welcome to Republic City,,
1,A Leaf in the Wind,,
2,The Revelation,,
3,The Voice in the Night,,
4,The Spirit of Competition,,


In [34]:
# Fill in 'Book' and 'ep_number' row by row.
ep_count = 0
for number in range(len(korra_ep_names_df)):
    # Rows 0-11 are Book 1 (12 episodes), rows 12-25 Book 2 (14), rows 26-38
    # Book 3 (13) and everything from row 39 onwards Book 4.
    if number < 12:
        book = 1
    elif number < 26:
        book = 2
    elif number < 39:
        book = 3
    else:
        book = 4

    # Rows 12, 26 and 39 open a new Book, so the per-Book counter starts over.
    if number in (12, 26, 39):
        ep_count = 0
    ep_count += 1

    korra_ep_names_df.at[number, 'Book'] = book
    korra_ep_names_df.at[number, 'ep_number'] = ep_count

In [36]:
korra_ep_names_df.tail()

Unnamed: 0,Episode,ep_number,Book
47,Beyond the Wilds,9,4
48,Operation Beifong,10,4
49,Kuvira's Gambit,11,4
50,Day of the Colossus,12,4
51,The Last Stand,13,4


Dumping the result:

In [37]:
with open('./data/korra_ep_names.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(korra_ep_names_df, filehandle)