In [111]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
import os
import pandas as pd
from tqdm.notebook import tqdm
import socket

output_path = os.path.join('..', 'data', 'transcripts')

In [37]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

regexps = [
    (re.compile('♪[^♪]*♪|\[[^\]]*\]|\([^\)]*\)'), ' '),
    (re.compile('<\/br>'), '\n'),
    (re.compile('<\/?[\w ]*>'), ' '), # for <\br> and similar tags
]

In [None]:
def save_file(path, txt, encoding=None):
    # Create the corresponding folder (if needed)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding=encoding) as out_file:
        out_file.writelines(txt)


def process_block(block):
    result = []
    # If not text, skip
    try:
        if block.name in ['img'] or (block.name == 'div' and block.get('class', [''])[0] == 'yarpp-related'):
            return result
    except AttributeError:
        print('[ERROR] AttributeError!')
        print(type(block))
        print(block)
        print('---------------------------------')
    # If is a tag, process it's content
    if isinstance(block, Tag):
        for sub_block in block.contents:
            result.extend(process_block(sub_block))
        return result
    for regexp, sub_str in regexps:
        block = regexp.sub(sub_str, block)
    block = block.strip()
    if block:
        result.append(block)
    return result

## scrapsfromtheloft.com

In [174]:
pages_to_skip = [
    'https://scrapsfromtheloft.com/2019/10/27/jerry-seinfeld-playboy-interview-1993/',
    'https://scrapsfromtheloft.com/2019/10/15/bill-cosby-playboy-interview-1985/',
    'https://scrapsfromtheloft.com/2019/07/06/katherine-ryan-in-trouble-transcript/', # link to other site
    'https://scrapsfromtheloft.com/2019/07/05/katherine-ryan-glitter-room-transcript/', # Unable to connect
    'https://scrapsfromtheloft.com/2019/04/01/in-conversation-with-jerry-seinfeld/',
    'https://scrapsfromtheloft.com/2018/10/30/lenny-bruce-unspruced-review-judith-crist/',
    'https://scrapsfromtheloft.com/2018/10/03/richard-pryor-live-in-concert-review-carl-bennett-cinemonkey/',
    'https://scrapsfromtheloft.com/2018/01/05/dave-chappelle-equanimity-2017-transcripcion-completa/',
    'https://scrapsfromtheloft.com/2018/01/05/dave-chappelle-hbo-half-hour-1998-traduzione-italiana/',
    'https://scrapsfromtheloft.com/2017/10/18/louis-c-k-the-rolling-stone-interview-2013/',
    'https://scrapsfromtheloft.com/2017/10/02/jim-jefferies-e-il-controllo-della-armi-in-america/',
    'https://scrapsfromtheloft.com/2017/08/23/doug-stanhope-no-refunds-2007-trascrizione-italiana/',
    'https://scrapsfromtheloft.com/2017/04/21/larry-king-interview-robin-williams-2007/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-jamming-new-york-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-diseased-1999-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-bad-2008-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/01/11/robin-williams-playboy-interview-1992/',
    'https://scrapsfromtheloft.com/2016/11/09/playboy-interview-george-carlin/',
]
ok_pages = [
    'https://scrapsfromtheloft.com/2020/01/13/dave-chappelle-acceptance-speech-2019-mark-twain-prize/',
    'https://scrapsfromtheloft.com/2020/01/07/ricky-gervais-2020-golden-globes-monologue-transcript/',
    'https://scrapsfromtheloft.com/2019/10/20/real-time-with-bill-maher-new-rule-prickstarter/',
    'https://scrapsfromtheloft.com/2019/09/11/new-rule-the-fudge-report-real-time-with-bill-maher/',
    'https://scrapsfromtheloft.com/2019/05/18/doug-stanhope-babies-and-abortion/',
    'https://scrapsfromtheloft.com/2019/03/18/ricky-gervais-2011-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2019/03/18/ricky-gervais-2016-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2019/02/18/politically-correct-language-george-carlin/',
    'https://scrapsfromtheloft.com/2018/08/11/dick-gregory-speech-st-johns-baptist-church-may-20-1963/',
    'https://scrapsfromtheloft.com/2018/05/23/trevor-noah-royal-wedding-2018/',
    'https://scrapsfromtheloft.com/2018/05/16/doug-stanhope-on-nationalism/',
    'https://scrapsfromtheloft.com/2018/03/27/ricky-gervais-2012-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2017/10/25/george-carlin-pro-life-abortion-and-the-sanctity-of-life/',
    'https://scrapsfromtheloft.com/2017/10/25/richard-pryors-monologue-saturday-night-live-1975/',
    'https://scrapsfromtheloft.com/2017/10/06/the-daily-show-fox-news-las-vegas-shooting-2017/',
    'https://scrapsfromtheloft.com/2017/10/03/george-carlin-religion-is-bullshit/',
]

In [179]:
def scrap_transcript(soup, transcript_url):
    file_name = transcript_url[:-1].rsplit('/', 1)[-1]
    file_path = os.path.join('test', file_name + '.txt')
    if not ('transcript' in file_path.lower()):
        print('[WARN] Possibly page without transcript!', transcript_url)
    content_blocks = soup.findAll('div', 'post-content')
    if len(content_blocks) != 1:
        print('[WARN] strange content in', url)
        return
    content = process_block(content_blocks[0])
    # Merge smal paragraphs.
    stripped_content = ['']
    for line in content:
        if len(stripped_content[-1]) < 200:
            stripped_content[-1] += ' ' + line
        else:
            stripped_content.append(line)
    stripped_content = '\n'.join(stripped_content)
    if file_path:
        save_file(file_path, stripped_content, encoding='utf8')
    return [stripped_content]

def url_transcripts(n_batches, pbar):
    URL = 'https://scrapsfromtheloft.com/comedy/page/{}/'
    for i in range(n_batches):
        # Load block of pages
        pbar.set_description('Loading {} block...'.format(i+1))
        page = requests.get(URL.format(i))
        soup = BeautifulSoup(page.content, 'html.parser')
        blocks = soup.body.findAll('div', 'fusion-post-content post-content')
        # Extract link to the page
        for j, block in enumerate(blocks):
            block_title = block.find('h2', 'entry-title fusion-post-title').a
            yield block_title['href']
    
def scrap_page(func, urls_gen, n_batches, save_to_files=False):
    n_processed = 0
    pbar = tqdm(total=n_batches)
    accumulator = []
    for i, url in enumerate(urls_gen(n_batches, pbar)):
        # Skip "bad" pages
        if url in pages_to_skip:
            continue
        try:
            pbar.set_description('Loading {} page...'.format(i+1))
            page = requests.get(url)
            pbar.set_description('Processing {} page...'.format(i+1))
        except requests.exceptions.ConnectionError:
            print('[ERROR] Connection error to', url)
            pbar.update(1)
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        new_items = func(soup, url)
        accumulator.extend(new_items)
        n_processed += len(new_items)
        print('[INFO] Processed another ', len(new_items), 'items, for a total of', n_processed)
        pbar.update(1)
    return accumulator

scrap_page(scrap_transcript, url_transcripts, n_batches=40, save_to_files=True)

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

[INFO] Processed another  1 items, for a total of 1
[WARN] Possibly page without transcript! https://scrapsfromtheloft.com/2020/05/05/bill-burr-late-show-with-david-letterman-2010/
[INFO] Processed another  1 items, for a total of 2
[INFO] Processed another  1 items, for a total of 3
[INFO] Processed another  1 items, for a total of 4
[INFO] Processed another  1 items, for a total of 5
[INFO] Processed another  1 items, for a total of 6
[INFO] Processed another  1 items, for a total of 7
[INFO] Processed another  1 items, for a total of 8
[INFO] Processed another  1 items, for a total of 9
[INFO] Processed another  1 items, for a total of 10
[INFO] Processed another  1 items, for a total of 11
[INFO] Processed another  1 items, for a total of 12
[INFO] Processed another  1 items, for a total of 13


KeyboardInterrupt: 

## http://anecdotica.ru/

In [None]:
def scrap_anecdotica(soup, _):
    jokes = soup.findAll('div', 'item_text')
    return [' '.join(process_block(joke)) for joke in jokes]

def urls_anecdotica(n_batches, _):
    URL = 'http://anecdotica.ru/all/{}'
    for i in range(1, n_batches + 1):
        yield URL.format(i)

anecdotica = scrap_page(scrap_anecdotica, urls_anecdotica, n_batches=1000)

## https://www.anekdot.ru/

In [227]:
URL = 'https://www.anekdot.ru/an/an{}{}/j{}{}33;{},100.html' # year;month;year;month;start_from
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}


def urls_anekdot(max_num_jokes):
    # Шутки: j - свежие; s - повторные; x - остальные
    # Истории: o - свежие;
    URL = 'https://www.anekdot.ru/an/an{}{}/j{}{}33;{},100.html' # year;month;year;month;start_from
    # All years from 1995 to 2020
    years = list(map(lambda x: f'{x:0>2}', list(range(95, 100)) + list(range(0, 21))))
    months = list(map(lambda x: f'{x:0>2}', list(range(1, 13))))
    for year in years:
        for month in months:
            for start_ind in range(0, max_num_jokes, 100):
                yield URL.format(year, month, year, month, start_ind)

years = list(map(lambda x: f'{x:0>2}', list(range(95, 100)) + list(range(0, 21)))) # All years from 1995 to 2020
months = list(map(lambda x: f'{x:0>2}', list(range(1, 13))))
max_num = 3000
anekdot_jokes = []
n_processed = 0
pbar = tqdm(total=max_num * len(years) * len(months))
for year in years:
    for month in months:
        for start_ind in range(0, max_num, 100):
            page_url = URL.format(year, month, year, month, start_ind)
            page = requests.get(page_url, headers=headers)
            pbar.set_description(f'Processing {year} {month} {start_ind} page...')
            soup = BeautifulSoup(page.content, 'html.parser')
            new_items = [' '.join(process_block(joke)) for joke in soup.findAll('div', 'text')]
            if len(new_items) == 0:
                print('[INFO] No more jokes at:', page_url)
                pbar.update(max_num - start_ind)
                break
            anekdot_jokes.extend(new_items)
            n_processed += len(new_items)
            print('[INFO] Processed another ', len(new_items), 'items, for a total of', n_processed)
            pbar.update(100)

HBox(children=(FloatProgress(value=0.0, max=936000.0), HTML(value='')))

[INFO] No more jokes at: https://www.anekdot.ru/an/an9501/s950133;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9502/s950233;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9503/s950333;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9504/s950433;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9505/s950533;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9506/s950633;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9507/s950733;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9508/s950833;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9509/s950933;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9510/s951033;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9511/s951133;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9512/s951233;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9601/s960133;0,100.html

[INFO] Processed another  101 items, for a total of 7532
[INFO] Processed another  101 items, for a total of 7633
[INFO] Processed another  101 items, for a total of 7734
[INFO] Processed another  101 items, for a total of 7835
[INFO] Processed another  101 items, for a total of 7936
[INFO] Processed another  101 items, for a total of 8037
[INFO] Processed another  101 items, for a total of 8138
[INFO] Processed another  101 items, for a total of 8239
[INFO] Processed another  101 items, for a total of 8340
[INFO] Processed another  64 items, for a total of 8404
[INFO] No more jokes at: https://www.anekdot.ru/an/an9901/s990133;1400,100.html
[INFO] Processed another  101 items, for a total of 8505
[INFO] Processed another  101 items, for a total of 8606
[INFO] Processed another  101 items, for a total of 8707
[INFO] Processed another  101 items, for a total of 8808
[INFO] Processed another  101 items, for a total of 8909
[INFO] Processed another  101 items, for a total of 9010
[INFO] Pr

[INFO] Processed another  101 items, for a total of 20576
[INFO] Processed another  101 items, for a total of 20677
[INFO] Processed another  101 items, for a total of 20778
[INFO] Processed another  98 items, for a total of 20876
[INFO] No more jokes at: https://www.anekdot.ru/an/an9909/s990933;1600,100.html
[INFO] Processed another  101 items, for a total of 20977
[INFO] Processed another  101 items, for a total of 21078
[INFO] Processed another  101 items, for a total of 21179
[INFO] Processed another  101 items, for a total of 21280
[INFO] Processed another  101 items, for a total of 21381
[INFO] Processed another  101 items, for a total of 21482
[INFO] Processed another  101 items, for a total of 21583
[INFO] Processed another  101 items, for a total of 21684
[INFO] Processed another  101 items, for a total of 21785
[INFO] Processed another  101 items, for a total of 21886
[INFO] Processed another  101 items, for a total of 21987
[INFO] Processed another  101 items, for a total of

[INFO] Processed another  101 items, for a total of 33196
[INFO] Processed another  101 items, for a total of 33297
[INFO] Processed another  101 items, for a total of 33398
[INFO] Processed another  101 items, for a total of 33499
[INFO] Processed another  101 items, for a total of 33600
[INFO] Processed another  101 items, for a total of 33701
[INFO] Processed another  101 items, for a total of 33802
[INFO] Processed another  101 items, for a total of 33903
[INFO] Processed another  101 items, for a total of 34004
[INFO] Processed another  75 items, for a total of 34079
[INFO] No more jokes at: https://www.anekdot.ru/an/an0006/s000633;1300,100.html
[INFO] Processed another  65 items, for a total of 34144
[INFO] Processed another  101 items, for a total of 34245
[INFO] Processed another  101 items, for a total of 34346
[INFO] Processed another  101 items, for a total of 34447
[INFO] Processed another  101 items, for a total of 34548
[INFO] Processed another  101 items, for a total of 

[INFO] Processed another  101 items, for a total of 45955
[INFO] Processed another  101 items, for a total of 46056
[INFO] Processed another  101 items, for a total of 46157
[INFO] Processed another  101 items, for a total of 46258
[INFO] Processed another  101 items, for a total of 46359
[INFO] Processed another  101 items, for a total of 46460
[INFO] Processed another  101 items, for a total of 46561
[INFO] Processed another  48 items, for a total of 46609
[INFO] No more jokes at: https://www.anekdot.ru/an/an0102/s010233;1600,100.html
[INFO] Processed another  101 items, for a total of 46710
[INFO] Processed another  101 items, for a total of 46811
[INFO] Processed another  101 items, for a total of 46912
[INFO] Processed another  101 items, for a total of 47013
[INFO] Processed another  101 items, for a total of 47114
[INFO] Processed another  101 items, for a total of 47215
[INFO] Processed another  101 items, for a total of 47316
[INFO] Processed another  101 items, for a total of

[INFO] Processed another  101 items, for a total of 58774
[INFO] Processed another  101 items, for a total of 58875
[INFO] Processed another  101 items, for a total of 58976
[INFO] Processed another  101 items, for a total of 59077
[INFO] Processed another  101 items, for a total of 59178
[INFO] Processed another  101 items, for a total of 59279
[INFO] Processed another  101 items, for a total of 59380
[INFO] Processed another  101 items, for a total of 59481
[INFO] Processed another  101 items, for a total of 59582
[INFO] Processed another  101 items, for a total of 59683
[INFO] Processed another  101 items, for a total of 59784
[INFO] Processed another  101 items, for a total of 59885
[INFO] Processed another  101 items, for a total of 59986
[INFO] Processed another  2 items, for a total of 59988
[INFO] No more jokes at: https://www.anekdot.ru/an/an0111/s011133;1400,100.html
[INFO] Processed another  101 items, for a total of 60089
[INFO] Processed another  101 items, for a total of 

[INFO] Processed another  101 items, for a total of 72096
[INFO] Processed another  101 items, for a total of 72197
[INFO] Processed another  101 items, for a total of 72298
[INFO] Processed another  101 items, for a total of 72399
[INFO] Processed another  101 items, for a total of 72500
[INFO] Processed another  101 items, for a total of 72601
[INFO] Processed another  101 items, for a total of 72702
[INFO] Processed another  101 items, for a total of 72803
[INFO] Processed another  101 items, for a total of 72904
[INFO] Processed another  101 items, for a total of 73005
[INFO] Processed another  101 items, for a total of 73106
[INFO] Processed another  101 items, for a total of 73207
[INFO] Processed another  101 items, for a total of 73308
[INFO] Processed another  81 items, for a total of 73389
[INFO] No more jokes at: https://www.anekdot.ru/an/an0205/s020533;2200,100.html
[INFO] Processed another  101 items, for a total of 73490
[INFO] Processed another  101 items, for a total of

[INFO] Processed another  101 items, for a total of 85469
[INFO] Processed another  101 items, for a total of 85570
[INFO] Processed another  101 items, for a total of 85671
[INFO] Processed another  101 items, for a total of 85772
[INFO] Processed another  101 items, for a total of 85873
[INFO] Processed another  101 items, for a total of 85974
[INFO] Processed another  101 items, for a total of 86075
[INFO] Processed another  101 items, for a total of 86176
[INFO] Processed another  101 items, for a total of 86277
[INFO] Processed another  101 items, for a total of 86378
[INFO] Processed another  27 items, for a total of 86405
[INFO] No more jokes at: https://www.anekdot.ru/an/an0211/s021133;2400,100.html
[INFO] Processed another  101 items, for a total of 86506
[INFO] Processed another  101 items, for a total of 86607
[INFO] Processed another  101 items, for a total of 86708
[INFO] Processed another  101 items, for a total of 86809
[INFO] Processed another  101 items, for a total of

[INFO] Processed another  42 items, for a total of 97523
[INFO] No more jokes at: https://www.anekdot.ru/an/an0310/s031033;800,100.html
[INFO] Processed another  101 items, for a total of 97624
[INFO] Processed another  101 items, for a total of 97725
[INFO] Processed another  101 items, for a total of 97826
[INFO] Processed another  101 items, for a total of 97927
[INFO] Processed another  101 items, for a total of 98028
[INFO] Processed another  101 items, for a total of 98129
[INFO] Processed another  72 items, for a total of 98201
[INFO] No more jokes at: https://www.anekdot.ru/an/an0311/s031133;700,100.html
[INFO] Processed another  101 items, for a total of 98302
[INFO] Processed another  101 items, for a total of 98403
[INFO] Processed another  101 items, for a total of 98504
[INFO] Processed another  101 items, for a total of 98605
[INFO] Processed another  101 items, for a total of 98706
[INFO] Processed another  101 items, for a total of 98807
[INFO] Processed another  101 it

[INFO] Processed another  101 items, for a total of 109006
[INFO] Processed another  101 items, for a total of 109107
[INFO] Processed another  101 items, for a total of 109208
[INFO] Processed another  101 items, for a total of 109309
[INFO] Processed another  101 items, for a total of 109410
[INFO] Processed another  90 items, for a total of 109500
[INFO] No more jokes at: https://www.anekdot.ru/an/an0501/s050133;900,100.html
[INFO] Processed another  101 items, for a total of 109601
[INFO] Processed another  101 items, for a total of 109702
[INFO] Processed another  101 items, for a total of 109803
[INFO] Processed another  101 items, for a total of 109904
[INFO] Processed another  101 items, for a total of 110005
[INFO] Processed another  101 items, for a total of 110106
[INFO] Processed another  101 items, for a total of 110207
[INFO] Processed another  88 items, for a total of 110295
[INFO] No more jokes at: https://www.anekdot.ru/an/an0502/s050233;800,100.html
[INFO] Processed a

[INFO] Processed another  101 items, for a total of 120487
[INFO] Processed another  101 items, for a total of 120588
[INFO] Processed another  101 items, for a total of 120689
[INFO] Processed another  101 items, for a total of 120790
[INFO] Processed another  101 items, for a total of 120891
[INFO] Processed another  101 items, for a total of 120992
[INFO] Processed another  101 items, for a total of 121093
[INFO] Processed another  8 items, for a total of 121101
[INFO] No more jokes at: https://www.anekdot.ru/an/an0604/s060433;800,100.html
[INFO] Processed another  101 items, for a total of 121202
[INFO] Processed another  101 items, for a total of 121303
[INFO] Processed another  101 items, for a total of 121404
[INFO] Processed another  101 items, for a total of 121505
[INFO] Processed another  101 items, for a total of 121606
[INFO] Processed another  101 items, for a total of 121707
[INFO] Processed another  77 items, for a total of 121784
[INFO] No more jokes at: https://www.an

[INFO] Processed another  97 items, for a total of 131262
[INFO] No more jokes at: https://www.anekdot.ru/an/an0709/s070933;500,100.html
[INFO] Processed another  101 items, for a total of 131363
[INFO] Processed another  101 items, for a total of 131464
[INFO] Processed another  101 items, for a total of 131565
[INFO] Processed another  101 items, for a total of 131666
[INFO] Processed another  87 items, for a total of 131753
[INFO] No more jokes at: https://www.anekdot.ru/an/an0710/s071033;500,100.html
[INFO] Processed another  101 items, for a total of 131854
[INFO] Processed another  101 items, for a total of 131955
[INFO] Processed another  101 items, for a total of 132056
[INFO] Processed another  96 items, for a total of 132152
[INFO] No more jokes at: https://www.anekdot.ru/an/an0711/s071133;400,100.html
[INFO] Processed another  101 items, for a total of 132253
[INFO] Processed another  101 items, for a total of 132354
[INFO] Processed another  101 items, for a total of 132455

[INFO] Processed another  101 items, for a total of 141947
[INFO] Processed another  101 items, for a total of 142048
[INFO] Processed another  101 items, for a total of 142149
[INFO] Processed another  15 items, for a total of 142164
[INFO] No more jokes at: https://www.anekdot.ru/an/an0904/s090433;1100,100.html
[INFO] Processed another  101 items, for a total of 142265
[INFO] Processed another  101 items, for a total of 142366
[INFO] Processed another  101 items, for a total of 142467
[INFO] Processed another  101 items, for a total of 142568
[INFO] Processed another  101 items, for a total of 142669
[INFO] No more jokes at: https://www.anekdot.ru/an/an0905/s090533;500,100.html
[INFO] Processed another  101 items, for a total of 142770
[INFO] Processed another  101 items, for a total of 142871
[INFO] Processed another  101 items, for a total of 142972
[INFO] Processed another  101 items, for a total of 143073
[INFO] Processed another  101 items, for a total of 143174
[INFO] Processed

[INFO] Processed another  101 items, for a total of 152799
[INFO] Processed another  101 items, for a total of 152900
[INFO] Processed another  40 items, for a total of 152940
[INFO] No more jokes at: https://www.anekdot.ru/an/an1010/s101033;600,100.html
[INFO] Processed another  101 items, for a total of 153041
[INFO] Processed another  101 items, for a total of 153142
[INFO] Processed another  101 items, for a total of 153243
[INFO] Processed another  101 items, for a total of 153344
[INFO] Processed another  89 items, for a total of 153433
[INFO] No more jokes at: https://www.anekdot.ru/an/an1011/s101133;500,100.html
[INFO] Processed another  101 items, for a total of 153534
[INFO] Processed another  101 items, for a total of 153635
[INFO] Processed another  101 items, for a total of 153736
[INFO] Processed another  101 items, for a total of 153837
[INFO] Processed another  101 items, for a total of 153938
[INFO] Processed another  101 items, for a total of 154039
[INFO] Processed a

[INFO] No more jokes at: https://www.anekdot.ru/an/an1204/s120433;700,100.html
[INFO] Processed another  101 items, for a total of 163472
[INFO] Processed another  101 items, for a total of 163573
[INFO] Processed another  101 items, for a total of 163674
[INFO] Processed another  101 items, for a total of 163775
[INFO] Processed another  101 items, for a total of 163876
[INFO] Processed another  101 items, for a total of 163977
[INFO] Processed another  41 items, for a total of 164018
[INFO] No more jokes at: https://www.anekdot.ru/an/an1205/s120533;700,100.html
[INFO] Processed another  101 items, for a total of 164119
[INFO] Processed another  101 items, for a total of 164220
[INFO] Processed another  101 items, for a total of 164321
[INFO] Processed another  101 items, for a total of 164422
[INFO] Processed another  101 items, for a total of 164523
[INFO] Processed another  58 items, for a total of 164581
[INFO] No more jokes at: https://www.anekdot.ru/an/an1206/s120633;600,100.htm

[INFO] Processed another  101 items, for a total of 173935
[INFO] Processed another  62 items, for a total of 173997
[INFO] No more jokes at: https://www.anekdot.ru/an/an1311/s131133;500,100.html
[INFO] Processed another  101 items, for a total of 174098
[INFO] Processed another  101 items, for a total of 174199
[INFO] Processed another  101 items, for a total of 174300
[INFO] Processed another  101 items, for a total of 174401
[INFO] Processed another  5 items, for a total of 174406
[INFO] No more jokes at: https://www.anekdot.ru/an/an1312/s131233;500,100.html
[INFO] Processed another  101 items, for a total of 174507
[INFO] Processed another  101 items, for a total of 174608
[INFO] Processed another  101 items, for a total of 174709
[INFO] Processed another  61 items, for a total of 174770
[INFO] No more jokes at: https://www.anekdot.ru/an/an1401/s140133;400,100.html
[INFO] Processed another  101 items, for a total of 174871
[INFO] Processed another  101 items, for a total of 174972


[INFO] Processed another  101 items, for a total of 184883
[INFO] Processed another  101 items, for a total of 184984
[INFO] Processed another  101 items, for a total of 185085
[INFO] Processed another  21 items, for a total of 185106
[INFO] No more jokes at: https://www.anekdot.ru/an/an1505/s150533;600,100.html
[INFO] Processed another  101 items, for a total of 185207
[INFO] Processed another  101 items, for a total of 185308
[INFO] Processed another  101 items, for a total of 185409
[INFO] Processed another  101 items, for a total of 185510
[INFO] Processed another  101 items, for a total of 185611
[INFO] Processed another  101 items, for a total of 185712
[INFO] Processed another  13 items, for a total of 185725
[INFO] No more jokes at: https://www.anekdot.ru/an/an1506/s150633;700,100.html
[INFO] Processed another  101 items, for a total of 185826
[INFO] Processed another  101 items, for a total of 185927
[INFO] Processed another  101 items, for a total of 186028
[INFO] Processed a

[INFO] Processed another  101 items, for a total of 196098
[INFO] Processed another  101 items, for a total of 196199
[INFO] Processed another  101 items, for a total of 196300
[INFO] Processed another  24 items, for a total of 196324
[INFO] No more jokes at: https://www.anekdot.ru/an/an1608/s160833;600,100.html
[INFO] Processed another  101 items, for a total of 196425
[INFO] Processed another  101 items, for a total of 196526
[INFO] Processed another  101 items, for a total of 196627
[INFO] Processed another  84 items, for a total of 196711
[INFO] No more jokes at: https://www.anekdot.ru/an/an1609/s160933;400,100.html
[INFO] Processed another  101 items, for a total of 196812
[INFO] Processed another  101 items, for a total of 196913
[INFO] Processed another  101 items, for a total of 197014
[INFO] Processed another  101 items, for a total of 197115
[INFO] Processed another  5 items, for a total of 197120
[INFO] No more jokes at: https://www.anekdot.ru/an/an1610/s161033;500,100.html


[INFO] No more jokes at: https://www.anekdot.ru/an/an1805/s180533;500,100.html
[INFO] Processed another  101 items, for a total of 206334
[INFO] Processed another  101 items, for a total of 206435
[INFO] Processed another  101 items, for a total of 206536
[INFO] Processed another  66 items, for a total of 206602
[INFO] No more jokes at: https://www.anekdot.ru/an/an1806/s180633;400,100.html
[INFO] Processed another  101 items, for a total of 206703
[INFO] Processed another  101 items, for a total of 206804
[INFO] Processed another  101 items, for a total of 206905
[INFO] Processed another  101 items, for a total of 207006
[INFO] Processed another  81 items, for a total of 207087
[INFO] No more jokes at: https://www.anekdot.ru/an/an1807/s180733;500,100.html
[INFO] Processed another  101 items, for a total of 207188
[INFO] Processed another  101 items, for a total of 207289
[INFO] Processed another  101 items, for a total of 207390
[INFO] Processed another  101 items, for a total of 20749

[INFO] Processed another  101 items, for a total of 216130
[INFO] Processed another  101 items, for a total of 216231
[INFO] Processed another  101 items, for a total of 216332
[INFO] Processed another  101 items, for a total of 216433
[INFO] Processed another  50 items, for a total of 216483
[INFO] No more jokes at: https://www.anekdot.ru/an/an2004/s200433;500,100.html
[INFO] Processed another  62 items, for a total of 216545
[INFO] No more jokes at: https://www.anekdot.ru/an/an2005/s200533;100,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2006/s200633;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2007/s200733;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2008/s200833;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2009/s200933;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2010/s201033;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2011/s201133;0,100.html
[INFO] No more jokes at: htt

In [157]:
# pd.DataFrame(anekdot_jokes, columns=['Text']).to_csv('../data/anekdot_repetitive.csv')

### Telegram channel
https://t.me/ligaplohihshutok

In [235]:
import json
with open('../data/liga-plohih-shutok.json', encoding='utf-8') as in_file:
    liga_jokes = json.loads(in_file.read())

In [239]:
pd.DataFrame(liga_jokes, columns=['Text']).to_csv('../data/ru_lpsh_jokes.json')

### Extract QA jokes
We can extract the QA jokes from the datasets with the general jokes.

In [297]:
import nltk

regexps = [ # Regexp for the special chars
    (re.compile('♦'), '*'),
    (re.compile('\n *\n'), '\n'), # Replace multiple newlines with one
    (re.compile(r' {2,}'), ' '), # Replace multiple spaces with one
]

def fix_text(s):
    for regexp in regexps:
        s = regexp[0].sub(regexp[1], s)
    s = s.strip(' -—')
    s = re.sub('^(?:вопрос|ответ):?', '', s, flags=re.IGNORECASE)
    return s.strip()


def extract_qa_jokes(iterator, max_num_sents=2):
    res = []
    pbar = tqdm(total=len(iterator))
    for i, joke in enumerate(iterator):
        joke = re.sub(r'^[^\n\:\?\.]*(?:Армянское|Армянскому?|Армянского) *радио[^\n:?]*\:', '', joke, flags=re.IGNORECASE)
        sentences = list(map(lambda s: fix_text(s), nltk.sent_tokenize(joke, language="russian")))
        if sentences[0][-1] == '?' and  1 < len(sentences) <= max_num_sents:
            res.append({
                'Question': sentences[0],
                'Answer': ' '.join(sentences[1:])
            })
        if i % 500 == 0:
            pbar.set_description(f'Extracted: {len(res)} jokes')
        pbar.update(1)
    pbar.set_description(f'Extracted: {len(res)} jokes')
    pbar.close()
    return res

In [298]:
files = [
    '../data/anecdotika.csv',
    '../data/anekdot_fresh.csv',
    '../data/anekdot_others.csv',
    '../data/ru_lpsh_jokes.json',
]

qa_jokes = []

for file in files:
    jokes = pd.read_csv(file)
    qa_anekdot = extract_qa_jokes(jokes['Text'].values, max_num_sents=3)
    qa_jokes.extend(qa_anekdot)
len(qa_jokes)

HBox(children=(FloatProgress(value=0.0, max=20025.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200621.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=218411.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1806.0), HTML(value='')))




47983

In [299]:
pd.DataFrame.from_dict(qa_jokes).to_csv('../data/rus_qa_jokes.csv')