In [35]:
# Initial imports
import numpy as np
import pandas as pd 
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
import re

import random
import urllib.request
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

In [241]:
def _format_num(n):
    return '%02d' % n

In [249]:
episode_info_df = DataFrame(columns=('Season', 'EpisodeNo', 'Title', 'AirDate', 'Writers', 'Director', 'SEID'))
script_df = DataFrame(columns=('SEID', 'Character', 'Dialogue'))
BASE_URL = 'http://www.seinology.com/scripts/script-' 
EPISODE_NUMBERS = (
    list(map(_format_num, range(1, 82))) +

    # Double episode
    ['82and83'] +

    list(map(_format_num, range(84, 100))) +

    # Skip the clip show "100and101".

    list(map(_format_num, range(102, 177))) +

    # Skip the clip show "177and178".

    # Double episode (Finale)
    ['179and180']
)

In [243]:
def get_episode_soup(no):
    url = BASE_URL + str(no) + '.shtml'
    source_code = requests.get(url)
    plain_text = source_code.text
    return plain_text

In [244]:
def parse_episode_info(html):
    """Return a dict with meta-info about the episode."""
    groups = re.search(r'pc: .*? season (\d+), episode (\d+)', html).groups()
    season_num = int(groups[0])
    episode_num = int(groups[1])

    title = re.search(r'Episode \d+(.*?) - (.*?)<', html).groups()[1]
    date = re.search(r'Broadcast date: (.*?)<', html).groups()[0]
    writers = re.search(r'Written [bB]y:? (.*?)<', html).groups()[0]
    writers = tuple([w.strip() for w in re.split(r',|&amp;', writers) if w])
    director = re.search(r'Directed [bB]y (.*?)<', html).groups()[0]

    return {'season_num': season_num, 'episode_num': episode_num, 
            'title': title, 'date': date, 'writers': writers, 
            'director': director}

In [262]:
for no in EPISODE_NUMBERS:
    html = get_episode_soup(no)
    html_split = re.split(r'={30}.*', html)
    header = html_split[0]
    content = html_split[1]
    episode_info = parse_episode_info(header)
    soup = BeautifulSoup(html_split[1])
    dialogues = list(filter(None, soup.find('body').text.replace('\t', '').split('\n')))
    temp1 = DataFrame([[
        episode_info['season_num'], 
        episode_info['episode_num'],
        re.sub(r'[^\x00-\x7f]',r'', episode_info['title']) ,
        episode_info['date'],
        ', '.join(episode_info['writers']),
        episode_info['director'],
        'S'+ str(_format_num(episode_info['episode_num'])) + 'E' + str(_format_num(episode_info['season_num']))
    ]], columns=('Season', 'EpisodeNo', 'Title', 'AirDate', 'Writers', 'Director', 'SEID'))
    episode_info_df = episode_info_df.append(temp1, ignore_index = True)
    for dialogue in dialogues:
        if len(dialogue.split(':')) <= 1:
            continue
        dialogue_split = dialogue.split(':')
        character = dialogue_split.pop(0)
        line = ''.join(dialogue_split).strip()
        line = re.sub(r'[^\x00-\x7f]',r'', line) 
        temp2 = DataFrame([[
            episode_info['season_num'], 
            episode_info['episode_num'], 
            'S'+ str(_format_num(episode_info['episode_num'])) + 'E' + str(_format_num(episode_info['season_num'])), 
            character, 
            line
        ]], columns=('Season', 'EpisodeNo', 'SEID', 'Character', 'Dialogue'))
        script_df = script_df.append(temp2, ignore_index = True)
    script_df.to_csv('scripts.csv', encoding='utf-8')
    episode_info_df.to_csv('episode_info.csv', encoding='utf-8')
    break
    

In [266]:
re.sub(r'[^\x00-\x7f]',r'', line)

'I swear, I have absolutely no idea what women are thinking. I dont get it, okay? I, I, I admit, I, Im not getting the signals. I am not getting it! Women, theyre so subtle, their little...everything they do is subtle. Men are not subtle, we are obvious. Women know what men want, men know what men want, what do we want? We want women, thats it! Its the only thing we know for sure, it really is. We want women. How do we get them? Oh, we dont know bout that, we dont know. The next step after that we have no idea. This is why you see men honking car-horns, yelling from construction sites. These are the best ideas weve had so far. The car-horn honk, is that a beauty? Have you seen men doing this? What is this? The man is in the car, the woman walks by the front of the car, he honks. E-eeehh, eehhh, eehhh! This man is out of ideas. How does it...? E-e-e-eeeehhhh! I dont think she likes me. The amazing thing is, that we still get women, dont we? Men, I mean, men are with women. You see men w

In [260]:
episode_info_df

Unnamed: 0,Season,EpisodeNo,Title,AirDate,Writers,Director,SEID
0,1.0,1.0,"b'Good News, Bad News'","July 5, 1989","Larry David, Jerry Seinfeld",Art Wolff,SE01EP01


In [239]:
episode_info

{'date': 'July 5, 1989',
 'director': 'Art Wolff',
 'episode_num': 1,
 'season_num': 1,
 'title': 'Good News, Bad News',
 'writers': ('Larry David', 'Jerry Seinfeld')}

In [229]:
a = l.pop(0)
a
l

[' Do you know what this is all about? Do you know, why we\x92re here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about \x93We should go out\x94? This is what they\x92re talking about...this whole thing, we\x92re all out now, no one is home. Not one person here is home, we\x92re all out! There are people tryin\x92 to find us, they don\x92t know where we are. (on an imaginary phone) \x93Did you ring?, I can\x92t find him.\x94 \x93Where did he go?\x94 \x93He didn\x92t tell me where he was going\x94. He must have gone out. You wanna go out',
 ' you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then you\x92re standing around, whatta you do? You go',
 ' \x93We gotta be getting back\x94. Once you\x92re out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow,

In [230]:
a

'JERRY'

In [234]:
''.join(l).strip()

'Do you know what this is all about? Do you know, why we\x92re here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about \x93We should go out\x94? This is what they\x92re talking about...this whole thing, we\x92re all out now, no one is home. Not one person here is home, we\x92re all out! There are people tryin\x92 to find us, they don\x92t know where we are. (on an imaginary phone) \x93Did you ring?, I can\x92t find him.\x94 \x93Where did he go?\x94 \x93He didn\x92t tell me where he was going\x94. He must have gone out. You wanna go out you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then you\x92re standing around, whatta you do? You go \x93We gotta be getting back\x94. Once you\x92re out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Wher

In [188]:
episode_info

{'date': 'July 5, 1989',
 'director': 'Art Wolff',
 'episode_num': 1,
 'season_num': 1,
 'title': 'Good News, Bad News',
 'writers': ('Larry David', 'Jerry Seinfeld')}

In [187]:
list(filter(None, soup.find('body').text.replace('\t', '').split('\n')))

['INT. COMEDY CLUB \x96 NIGHT',
 '(Jerry is on stage, performing.)',
 'JERRY: Do you know what this is all about? Do you know, why we\x92re here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about \x93We should go out\x94? This is what they\x92re talking about...this whole thing, we\x92re all out now, no one is home. Not one person here is home, we\x92re all out! There are people tryin\x92 to find us, they don\x92t know where we are. (on an imaginary phone) \x93Did you ring?, I can\x92t find him.\x94 \x93Where did he go?\x94 \x93He didn\x92t tell me where he was going\x94. He must have gone out. You wanna go out: you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then you\x92re standing around, whatta you do? You go: \x93We gotta be getting back\x94. Once you\x92re out, you wanna get back! You wa

In [158]:
b = parse_episode_info(a)

In [162]:
list(b['writers'])

['Larry David', 'Jerry Seinfeld']

In [119]:
a = re.split(r'={30}.*', p[3].text)
episode_info = parse_episode_info(a[0])
a[0]

AttributeError: 'NoneType' object has no attribute 'groups'

In [141]:
a[0].replace('\t', '').split('\n')

['Episode 1 - Good News, Bad News',
 '',
 'pc: 101, season 1, episode 1 (Pilot)',
 '',
 'Broadcast date: July 5, 1989',
 '',
 '(The series is titled The Seinfeld Chronicles, then re-titled Seinfeld for the rest of the series)',
 '',
 '',
 'Written By Larry David & Jerry Seinfeld',
 '',
 'Directed By Art Wolff',
 '',
 '',
 '-------------------------------------------------------------------------------',
 '',
 '',
 'The Cast',
 '',
 'Regulars:',
 '',
 'Jerry Seinfeld ....................... Jerry Seinfeld',
 '',
 'Jason Alexander .................. George Costanza',
 '',
 'Michael Richards ................. Kessler',
 '',
 '',
 'Guest Stars:',
 '',
 '',
 'Lee Garlington ................ Claire',
 '',
 'Pamela Brull ..................... Laura',
 '',
 '',
 '',
 '']

In [87]:
all_text = p[3].text.replace('\t', '').strip().split('\n')

In [72]:
episode_no = all_text[0].split('-')[0].strip()
episode_name = all_text[0].split('-')[1].strip()

In [75]:
episode_no

'Episode 1'

In [98]:
str_list = list(filter(None, all_text))

In [99]:
str_list

['Episode 1 - Good News, Bad News',
 'pc: 101, season 1, episode 1 (Pilot)',
 'Broadcast date: July 5, 1989',
 '(The series is titled The Seinfeld Chronicles, then re-titled Seinfeld for the rest of the series)',
 'Written By Larry David & Jerry Seinfeld',
 'Directed By Art Wolff',
 '-------------------------------------------------------------------------------',
 'The Cast',
 'Regulars:',
 'Jerry Seinfeld ....................... Jerry Seinfeld',
 'Jason Alexander .................. George Costanza',
 'Michael Richards ................. Kessler',
 'Guest Stars:',
 'Lee Garlington ................ Claire',
 'Pamela Brull ..................... Laura',
 'INT. COMEDY CLUB \x96 NIGHT',
 '(Jerry is on stage, performing.)',
 'JERRY: Do you know what this is all about? Do you know, why we\x92re here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about \x93We should go out\x94? This is what they\x92re talking

In [30]:
a[2]

JERRY: Do you know what this is all about? Do you know, why were here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about We should go out? This is what theyre talking about...this whole thing, were all out now, no one is home. Not one person here is home, were all out! There are people tryin to find us, they dont know where we are. (on an imaginary phone) Did you ring?, I cant find him. Where did he go? He didnt tell me where he was going. He must have gone out. You wanna go out: you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then youre standing around, whatta you do? You go: We gotta be getting back. Once youre out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Where ever you are in life, its my feeling, youve gott