# Scrape Simpsons episodes
This notebook scrapes the episode list and the guest-star list of The Simpsons (seasons 1–20) from Wikipedia.

## Libraries

In [None]:
# Import all libraries
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scrape all episode tables

Fetch the Wikipedia episode list and collect every table whose class is `wikitable plainrowheaders wikiepisodetable`.

In [None]:
# Request the Wikipedia episode list (seasons 1-20)
url = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)'

# A timeout keeps the cell from hanging indefinitely on a stalled connection
res = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page,
# which would most likely contain none of the tables searched for below
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

# Find all episode tables
tables = soup.find_all(
    name='table',
    attrs={
        'class': 'wikitable plainrowheaders wikiepisodetable'
    }
)

Compile all the tables into a single dataframe.

In [None]:
# Columns to retrieve
cols = [
    'Title',
    'Original air date',
    'U.S. viewers (millions)'
]

# Parse every table, keeping only those that expose all columns in `cols`
frames = []
for table in tables:

    # Read HTML; the markup is wrapped in StringIO because passing a
    # literal HTML string to `read_html` is deprecated
    try:
        frame = pd.read_html(StringIO(str(table)))[0][cols]

    # Skip tables that lack the expected columns (e.g. movies) or that
    # contain no parsable table. Skipping explicitly matters: silently
    # passing here would leave the previous table's frame in scope and
    # append it a second time.
    except (KeyError, ValueError):
        continue

    frames.append(frame)

# Fail with a clear message rather than a NameError further down
if not frames:
    raise ValueError('No episode table with the expected columns was found')

# Concat once, with a fresh 0..n-1 index instead of repeated per-table indices
df = pd.concat(frames, ignore_index=True)

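Clean the episodes table: rename the columns, strip quotes from the titles, parse the air dates, and convert the viewer counts to floats.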

In [None]:
# Give the three columns short, lowercase names
df.columns = ['title', 'release', 'viewers']

# Strip double-quote characters from the titles
df['title'] = df['title'].str.replace('"', '')

# Parse air dates written as "<month name> <day>, <year>"
df['release'] = pd.to_datetime(df['release'], format='%B %d, %Y')

# Keep only the text before the first "[" and convert it to float
df['viewers'] = (
    df['viewers']
    .str.split('[')
    .str[0]
    .astype(float)
)

## Scrape Special Guests list

In [None]:
# Request the Wikipedia guest-star list (seasons 1-20)
url = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars_(seasons_1%E2%80%9320)'

# A timeout keeps the cell from hanging indefinitely on a stalled connection
res = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx) instead of parsing an error page
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

# Find the guests table (`find` returns only the first matching table)
guests = soup.find(
    name='table',
    attrs={
        'class': 'wikitable'
    }
)

# `find` returns None when nothing matches; stop here with a clear message
# instead of handing the string 'None' to the HTML parser
if guests is None:
    raise ValueError('No `wikitable` table found on the guest stars page')

# To pandas; the markup is wrapped in StringIO because passing a literal
# HTML string to `read_html` is deprecated. `.copy()` gives `gs` its own
# data so later column assignments do not operate on a slice.
gs = pd.read_html(StringIO(str(guests)))[0][['Season', 'Episode title']].copy()

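Clean the guests table: rename the columns, strip quotes and footnote markers from the titles, and count the rows per episode title.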

In [None]:
# Give the two columns short, lowercase names
gs.columns = ['season', 'title']

# Strip double-quote characters, then keep only the text before the first "["
gs['title'] = (
    gs['title']
    .str.replace('"', '')
    .str.split('[')
    .str[0]
)

# Count the rows per title and show the result as the cell output
(
    gs
    .groupby('title')
    .size()
    .reset_index(name='n_guests')
)