In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
import os
from tqdm.notebook import tqdm
import socket

In [8]:
# Clean-up rules applied, in order, to every text fragment of a transcript.
# Each entry is (compiled pattern, replacement string).
# Raw strings are used so the backslashes reach the regex engine as written,
# instead of relying on Python passing unknown string escapes through
# (deprecated behaviour that triggers a warning on recent Python versions).
regexps = [
    # Passages between ♪ marks, [bracketed] remarks and (parenthesised) remarks
    (re.compile(r'♪[^♪]*♪|\[[^\]]*\]|\([^\)]*\)'), ' '),
    # Leftover HTML-like tags such as <br> or </br>
    (re.compile(r'<\/?[\w ]*>'), ' '),
]

In [9]:
# Posts the scraping loop must not keep: when one of these URLs shows up on a
# listing page it is skipped, and a previously saved file for it is deleted.
# NOTE(review): judging by the slugs these are mostly interviews, reviews and
# non-English translations rather than English transcripts — confirm.
pages_to_skip = [
    'https://scrapsfromtheloft.com/2019/10/27/jerry-seinfeld-playboy-interview-1993/',
    'https://scrapsfromtheloft.com/2019/10/15/bill-cosby-playboy-interview-1985/',
    'https://scrapsfromtheloft.com/2019/07/06/katherine-ryan-in-trouble-transcript/', # link to other site
    'https://scrapsfromtheloft.com/2019/07/05/katherine-ryan-glitter-room-transcript/', # Unable to connect
    'https://scrapsfromtheloft.com/2019/04/01/in-conversation-with-jerry-seinfeld/',
    'https://scrapsfromtheloft.com/2018/10/30/lenny-bruce-unspruced-review-judith-crist/',
    'https://scrapsfromtheloft.com/2018/10/03/richard-pryor-live-in-concert-review-carl-bennett-cinemonkey/',
    'https://scrapsfromtheloft.com/2018/01/05/dave-chappelle-equanimity-2017-transcripcion-completa/',
    'https://scrapsfromtheloft.com/2018/01/05/dave-chappelle-hbo-half-hour-1998-traduzione-italiana/',
    'https://scrapsfromtheloft.com/2017/10/18/louis-c-k-the-rolling-stone-interview-2013/',
    'https://scrapsfromtheloft.com/2017/10/02/jim-jefferies-e-il-controllo-della-armi-in-america/',
    'https://scrapsfromtheloft.com/2017/08/23/doug-stanhope-no-refunds-2007-trascrizione-italiana/',
    'https://scrapsfromtheloft.com/2017/04/21/larry-king-interview-robin-williams-2007/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-jamming-new-york-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-diseased-1999-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-bad-2008-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/01/11/robin-williams-playboy-interview-1992/',
    'https://scrapsfromtheloft.com/2016/11/09/playboy-interview-george-carlin/',
]
# Posts whose title does not contain the word 'transcript' but which are known
# to be fine: listing a URL here silences the scraping loop's
# "Possibly page without transcript" warning for it.
ok_pages = [
    'https://scrapsfromtheloft.com/2020/01/13/dave-chappelle-acceptance-speech-2019-mark-twain-prize/',
    'https://scrapsfromtheloft.com/2020/01/07/ricky-gervais-2020-golden-globes-monologue-transcript/',
    'https://scrapsfromtheloft.com/2019/10/20/real-time-with-bill-maher-new-rule-prickstarter/',
    'https://scrapsfromtheloft.com/2019/09/11/new-rule-the-fudge-report-real-time-with-bill-maher/',
    'https://scrapsfromtheloft.com/2019/05/18/doug-stanhope-babies-and-abortion/',
    'https://scrapsfromtheloft.com/2019/03/18/ricky-gervais-2011-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2019/03/18/ricky-gervais-2016-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2019/02/18/politically-correct-language-george-carlin/',
    'https://scrapsfromtheloft.com/2018/08/11/dick-gregory-speech-st-johns-baptist-church-may-20-1963/',
    'https://scrapsfromtheloft.com/2018/05/23/trevor-noah-royal-wedding-2018/',
    'https://scrapsfromtheloft.com/2018/05/16/doug-stanhope-on-nationalism/',
    'https://scrapsfromtheloft.com/2018/03/27/ricky-gervais-2012-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2017/10/25/george-carlin-pro-life-abortion-and-the-sanctity-of-life/',
    'https://scrapsfromtheloft.com/2017/10/25/richard-pryors-monologue-saturday-night-live-1975/',
    'https://scrapsfromtheloft.com/2017/10/06/the-daily-show-fox-news-las-vegas-shooting-2017/',
    'https://scrapsfromtheloft.com/2017/10/03/george-carlin-religion-is-bullshit/',
]

In [10]:
# Listing-page URL template; the scraping loop fills `{}` with the page number.
URL = 'https://scrapsfromtheloft.com/comedy/page/{}/'

def save_file(path, txt, encoding=None):
    """Write `txt` to `path`, creating the parent folder first if needed.

    Args:
        path: Destination file path; an existing file is overwritten.
        txt: Either a single string, written as is, or an iterable of
            strings, written one after another with no separator added.
        encoding: Text encoding handed to `open` (None = platform default).
    """
    # A bare file name has no directory part, and os.makedirs('') would raise.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(path, 'w', encoding=encoding) as out_file:
        if isinstance(txt, str):
            # writelines() on a str would iterate it character by character.
            out_file.write(txt)
        else:
            out_file.writelines(txt)


def process_block(block):
    """Recursively collect the cleaned text fragments found inside `block`.

    Args:
        block: A BeautifulSoup `Tag`, or a string-like node taken from a
            tag's `contents`.

    Returns:
        list of str: the non-empty text fragments in document order. Each
        fragment has been passed through every rule in `regexps` and stripped
        of surrounding whitespace. `img` tags and the 'yarpp-related' div
        contribute nothing.
    """
    if isinstance(block, Tag):
        # Images carry no text.
        if block.name == 'img':
            return []
        # Membership test rather than looking only at the first class: this
        # copes with an empty class list and with 'yarpp-related' appearing
        # after other classes.
        if block.name == 'div' and 'yarpp-related' in (block.get('class') or []):
            return []
        result = []
        for sub_block in block.contents:
            result.extend(process_block(sub_block))
        return result

    # Anything that is not a tag is treated as text.
    text = block
    for regexp, sub_str in regexps:
        text = regexp.sub(sub_str, text)
    text = text.strip()
    return [text] if text else []


def _merge_short_lines(lines, min_len):
    """Glue consecutive fragments together until each line reaches `min_len` characters."""
    merged = []
    for line in lines:
        if merged and len(merged[-1]) < min_len:
            merged[-1] += ' ' + line
        else:
            # Starting a new line without a separator avoids the stray
            # leading space an empty seed string would produce.
            merged.append(line)
    return merged


def scrap_transcript(url, file_path, timeout=30, min_line_len=200):
    """Download one transcript page and save its text to `file_path`.

    The text is taken from the page's single `div.post-content` element via
    `process_block`; short fragments are merged so that every saved line
    (except possibly the last) has at least `min_line_len` characters.

    Problems are reported with a printed message and the function returns
    without writing anything: network/HTTP failures, or a page that does not
    contain exactly one `div.post-content`.

    Args:
        url: Address of the transcript page.
        file_path: Where to write the extracted text (UTF-8).
        timeout: Seconds to wait for the server before giving up.
        min_line_len: Fragments are appended to the current line while it is
            shorter than this many characters.
    """
    try:
        # Without a timeout a stalled connection would block the run forever.
        transcript_page = requests.get(url, timeout=timeout)
        # Do not parse an error page (404, 500, ...) as if it were a transcript.
        transcript_page.raise_for_status()
    except requests.exceptions.ConnectionError:
        print('[ERROR] Connection error to', url)
        return
    except requests.exceptions.RequestException as exc:
        print('[ERROR] Request failed for', url, repr(exc))
        return
    transcript_soup = BeautifulSoup(transcript_page.content, 'html.parser')
    content_blocks = transcript_soup.find_all('div', 'post-content')
    if len(content_blocks) != 1:
        print('[WARN] strange content in', url)
        return
    content = process_block(content_blocks[0])
    stripped_content = _merge_short_lines(content, min_line_len)
    save_file(file_path, '\n'.join(stripped_content), encoding='utf8')


n_batches = 40            # number of listing pages to walk through
skip_downloaded = False   # True: leave transcripts that already exist on disk untouched
request_timeout = 30      # seconds to wait for a listing page before giving up
n_processed = 0           # running count of post blocks seen so far

# The context manager closes the progress bar even if the run is interrupted.
with tqdm(total=n_batches) as pbar:
    # NOTE(review): pages are requested as 1..n_batches. The `/page/N/`
    # archive URLs look like WordPress pagination, which is 1-based, so
    # starting at 0 would fetch the first page twice and never reach the
    # last one — confirm against the site.
    for i in range(1, n_batches + 1):
        listing_url = URL.format(i)
        pbar.set_description('Loading {} batch...'.format(i))
        # time.sleep(1)  # optional politeness delay between listing pages
        try:
            page = requests.get(listing_url, timeout=request_timeout)
            page.raise_for_status()
        except requests.exceptions.RequestException as exc:
            # One unreachable listing page must not abort the whole run.
            print('[ERROR] Could not load listing page', listing_url, repr(exc))
            pbar.update(1)
            continue
        pbar.set_description('Processing {} batch...'.format(i))
        soup = BeautifulSoup(page.content, 'html.parser')
        blocks = soup.find_all('div', 'fusion-post-content post-content')
        for j, block in enumerate(blocks):
            pbar.set_description('Processing {} block...'.format(j+1))
            heading = block.find('h2', 'entry-title fusion-post-title')
            block_title = heading.a if heading is not None else None
            if block_title is None or not block_title.get('href'):
                print('[WARN] Post block without a title link on', listing_url)
                continue
            transcript_url = block_title['href']
            # Last path component of the URL, with or without a trailing slash.
            file_name = transcript_url.rstrip('/').rsplit('/', 1)[-1]
            file_path = os.path.join('data', 'transcripts', file_name + '.txt')
            # Skip `bad` pages, removing anything saved for them earlier
            if transcript_url in pages_to_skip:
                if os.path.exists(file_path):
                    os.remove(file_path)
                continue
            # get_text() also works when the link's first child is a tag.
            title_text = block_title.get_text().lower()
            if not ('transcript' in title_text or transcript_url in ok_pages):
                print('[WARN] Possibly page without transcript!', transcript_url)
            if skip_downloaded and os.path.exists(file_path):
                continue
            try:
                scrap_transcript(transcript_url, file_path)
            except Exception as exc:
                # Keep going, but say what actually went wrong.
                print('[ERROR] Some error on:', transcript_url, repr(exc))
        n_processed += len(blocks)
        print('[INFO] Processed another ', len(blocks), 'blocks, for a total of', n_processed)
        pbar.update(1)

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

[INFO] Processed another  10 blocks, for a total of 10
[INFO] Processed another  10 blocks, for a total of 20
[INFO] Processed another  10 blocks, for a total of 30
[INFO] Processed another  10 blocks, for a total of 40
[INFO] Processed another  10 blocks, for a total of 50
[INFO] Processed another  10 blocks, for a total of 60
[INFO] Processed another  10 blocks, for a total of 70
[INFO] Processed another  10 blocks, for a total of 80
[INFO] Processed another  10 blocks, for a total of 90
[INFO] Processed another  10 blocks, for a total of 100
[INFO] Processed another  10 blocks, for a total of 110
[INFO] Processed another  10 blocks, for a total of 120
[INFO] Processed another  10 blocks, for a total of 130
[INFO] Processed another  10 blocks, for a total of 140
[INFO] Processed another  10 blocks, for a total of 150
[INFO] Processed another  10 blocks, for a total of 160
[INFO] Processed another  10 blocks, for a total of 170
[INFO] Processed another  10 blocks, for a total of 180
[

## Test: run `process_block` on a single transcript page

In [131]:
# Download one known transcript page to try `process_block` on it.
# NOTE: this rebinds the global `URL`, which the scraping cell uses as the
# listing-page template; that cell sets it again when it is re-run.
URL = 'https://scrapsfromtheloft.com/2017/07/06/mitch-hedberg-comedy-central-special1999-full-transcript/'
try:
    # The address is complete, so it is requested as is: no `.format(i)`,
    # which needed the loop variable `i` left over from the scraping cell.
    page = requests.get(URL, timeout=30)
except requests.exceptions.RequestException:
    print('[ERROR] Connection error to', URL)
    # Stop here rather than parsing a stale or undefined `page` below.
    raise
soup = BeautifulSoup(page.content, 'html.parser')

In [192]:
# NOTE: this redefines `process_block` from the scraping cell above; whichever
# cell ran last provides the definition in use.
def process_block(block):
    """Recursively collect the cleaned text fragments found inside `block`.

    Args:
        block: A BeautifulSoup `Tag`, or a string-like node taken from a
            tag's `contents`.

    Returns:
        list of str: the non-empty text fragments in document order, each
        passed through every rule in `regexps` and stripped of surrounding
        whitespace. `img` tags and the 'yarpp-related' div yield nothing.
    """
    if not isinstance(block, Tag):
        # Text node: apply the clean-up rules and keep it if anything is left.
        cleaned = block
        for regexp, sub_str in regexps:
            cleaned = regexp.sub(sub_str, cleaned)
        cleaned = cleaned.strip()
        return [cleaned] if cleaned else []

    # Not text: images, and the related-posts div. The class is looked up by
    # membership so an empty class list or extra classes cannot break it.
    is_related_posts = block.name == 'div' and 'yarpp-related' in (block.get('class') or [])
    if block.name == 'img' or is_related_posts:
        return []

    # Any other tag: gather the fragments of its children, in order.
    result = []
    for sub_block in block.contents:
        result.extend(process_block(sub_block))
    return result



# Extract the text fragments of the page's first `div.post-content`; the
# resulting list is displayed as the cell output.
# `find_all` is the current name of the deprecated `findAll` alias.
process_block(soup.find_all('div', 'post-content')[0])

Mitch Hedberg’s half-hour special on Comedy Central
--------
This is the uncut 37 minute version.
--------
Tonight from the Palace in Hollywood, California. Comedy Central Presents Mitch Hedberg!
--------
Thank you. Hey. Welcome to my half-hour special. Does anybody know who I am? Why did a bunch of people who don’t know who I am show it my special? That’s bullshit. All right. Everybody. This will be fun.
--------
I used to live here in Los Angeles on Sierra Bonita. And I had an apartment. And I had a neighbor. And whenever he would knock on my wall, I knew he wanted me to turn my music down. That made me angry. Because I like loud music. So he knocked on the wall, I’d mess with his head. I’d say “go around.” “I cannot open the wall.” “I don’t know if you have a doorknob of the other side.” “But over here, there’s nothing.” “It’s just flat.”
--------
All right, man. I gotta do a half-hour. You get a like me more than that. I can’t be getting through a half-hour with that kind of action

['Mitch Hedberg’s half-hour special on Comedy Central',
 'This is the uncut 37 minute version.',
 'Tonight from the Palace in Hollywood, California. Comedy Central Presents Mitch Hedberg!',
 'Thank you. Hey. Welcome to my half-hour special. Does anybody know who I am? Why did a bunch of people who don’t know who I am show it my special? That’s bullshit. All right. Everybody. This will be fun.',
 'I used to live here in Los Angeles on Sierra Bonita. And I had an apartment. And I had a neighbor. And whenever he would knock on my wall, I knew he wanted me to turn my music down. That made me angry. Because I like loud music. So he knocked on the wall, I’d mess with his head. I’d say “go around.” “I cannot open the wall.” “I don’t know if you have a doorknob of the other side.” “But over here, there’s nothing.” “It’s just flat.”',
 'All right, man. I gotta do a half-hour. You get a like me more than that. I can’t be getting through a half-hour with that kind of action. I like an escalator, 