In [20]:
# Initial imports
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
import re

import random
import urllib.request
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [21]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [22]:
def _format_num(n):
    return '%02d' % n

In [23]:
episode_info_df = DataFrame(columns=('Season', 'EpisodeNo', 'Title', 'AirDate', 'Writers', 'Director', 'SEID'))
script_df = DataFrame(columns=('SEID', 'Character', 'Dialogue'))
BASE_URL = 'http://www.seinology.com/scripts/script-' 
EPISODE_NUMBERS = (
    list(map(_format_num, range(1, 82))) +

    # Double episode
    ['82and83'] +

    list(map(_format_num, range(84, 100))) +

    # Skip the clip show "100and101".

    list(map(_format_num, range(102, 177))) +

    # Skip the clip show "177and178".

    # Double episode (Finale)
    ['179and180']
)

In [24]:
def get_episode_soup(no):
    url = BASE_URL + str(no) + '.shtml'
    source_code = requests.get(url)
    plain_text = source_code.text
    return plain_text

In [25]:
def parse_episode_info(html):
    """Return a dict with meta-info about the episode."""
    groups = re.search(r'pc: .*? season (\d+), episode (\d+)', html).groups()
    season_num = int(groups[0])
    episode_num = int(groups[1])

    title = re.search(r'Episode \d+(.*?) - (.*?)<', html).groups()[1]
    date = re.search(r'Broadcast date: (.*?)<', html).groups()[0]
    writers = re.search(r'Written [bB]y([:]|&nbsp;)? (.*?)<', html).groups()[0]
    writers = tuple([w.strip() for w in re.split(r',|&amp;', writers) if w])
    director = re.search(r'Directed [bB]y (.*?)<', html).groups()[0]

    return {'season_num': season_num, 'episode_num': episode_num, 
            'title': title, 'date': date, 'writers': writers, 
            'director': director}

In [26]:
for no in log_progress(EPISODE_NUMBERS, every=1):
    html = get_episode_soup(no)
    html_split = re.split(r'={30}.*', html)
    header = html_split[0]
    content = html_split[1]
    episode_info = parse_episode_info(header)
    soup = BeautifulSoup(html_split[1])
    dialogues = list(filter(None, soup.find('body').text.replace('\t', '').split('\n')))
    temp1 = DataFrame([[
        episode_info['season_num'], 
        episode_info['episode_num'],
        re.sub(r'[^\x00-\x7f]',r'', episode_info['title']) ,
        episode_info['date'],
        ', '.join(episode_info['writers']),
        episode_info['director'],
        'S'+ str(_format_num(episode_info['season_num'])) + 'E' + str(_format_num(episode_info['episode_num']))
    ]], columns=('Season', 'EpisodeNo', 'Title', 'AirDate', 'Writers', 'Director', 'SEID'))
    episode_info_df = episode_info_df.append(temp1, ignore_index = True)
    for dialogue in dialogues:
        if len(dialogue.split(':')) <= 1:
            continue
        dialogue_split = dialogue.split(':')
        character = dialogue_split.pop(0)
        line = ''.join(dialogue_split).strip()
        line = re.sub(r'[^\x00-\x7f]',r'', line) 
        temp2 = DataFrame([[
            episode_info['season_num'], 
            episode_info['episode_num'], 
            'S'+ str(_format_num(episode_info['season_num'])) + 'E' + str(_format_num(episode_info['episode_num'])), 
            character, 
            line
        ]], columns=('Season', 'EpisodeNo', 'SEID', 'Character', 'Dialogue'))
        script_df = script_df.append(temp2, ignore_index = True)
    script_df.to_csv('scripts.csv', encoding='utf-8')
    episode_info_df.to_csv('episode_info.csv', encoding='utf-8')
    break

TypeError: expected string or bytes-like object

In [15]:
episode_info_df

Unnamed: 0,Season,EpisodeNo,Title,AirDate,Writers,Director,SEID
0,1.0,1.0,"Good News, Bad News","July 5, 1989","Larry David, Jerry Seinfeld",Art Wolff,S01E01
1,1.0,1.0,"Good News, Bad News","July 5, 1989","Larry David, Jerry Seinfeld",Art Wolff,S01E01
2,1.0,1.0,The Stakeout,"May 31, 1990","Larry David, Jerry Seinfeld",Tom Cherones,S01E01
3,1.0,2.0,The Robbery,"June 7, 1990",Matt Goldman,Tom Cherones,S02E01
4,1.0,3.0,Male Unbonding,"June 14, 1990","Larry David, Jerry Seinfeld",Tom Cherones,S03E01
5,1.0,4.0,The Stock Tip,"June 21, 1990","Larry David, Jerry Seinfeld",Tom Cherones,S04E01
6,2.0,1.0,The Ex-Girlfriend,"January 16, 1991","Larry David, Jerry Seinfeld",Tom Cherones,S01E02
7,2.0,2.0,The Pony Remark,"January 30, 1991","Larry David, Jerry Seinfeld",Tom Cherones,S02E02
8,2.0,3.0,The Jacket,"February 6, 1991","Larry David, Jerry Seinfeld",Tom Cherones,S03E02
9,2.0,4.0,The Phone Message,"February 13, 1991","Larry David, Jerry Seinfeld",Tom Cherones,S04E02


In [16]:
script_df

Unnamed: 0,Character,Dialogue,EpisodeNo,SEID,Season
0,JERRY,Do you know what this is all about? Do you kno...,1.0,S01E01,1.0
1,JERRY,"(pointing at Georges shirt) See, to me, that b...",1.0,S01E01,1.0
2,GEORGE,Are you through?,1.0,S01E01,1.0
3,JERRY,"You do of course try on, when you buy?",1.0,S01E01,1.0
4,GEORGE,"Yes, it was purple, I liked it, I dont actuall...",1.0,S01E01,1.0
5,JERRY,"Oh, you dont recall?",1.0,S01E01,1.0
6,GEORGE,"(on an imaginary microphone) Uh, no, not at th...",1.0,S01E01,1.0
7,JERRY,"Well, senator, Id just like to know, what you ...",1.0,S01E01,1.0
8,CLAIRE,Mr. Seinfeld. Mr. Costanza.,1.0,S01E01,1.0
9,GEORGE,"Are, are you sure this is decaf? Wheres the or...",1.0,S01E01,1.0


In [18]:
writers = re.search(r'Written [bB]y:? (.*?)<', html).groups()[0]

AttributeError: 'NoneType' object has no attribute 'groups'

In [19]:
html

