In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
import os
import pandas as pd
from tqdm.notebook import tqdm
import socket

output_path = os.path.join('..', 'data', 'transcripts')

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

regexps = [
    (re.compile('♪[^♪]*♪|\[[^\]]*\]|\([^\)]*\)'), ' '),
    (re.compile('<\/br>'), '\n'),
    (re.compile('<\/?[\w ]*>'), ' '), # for <\br> and similar tags
]

In [6]:
def save_file(path, txt, encoding=None):
    # Create the corresponding folder (if needed)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding=encoding) as out_file:
        out_file.writelines(txt)


def process_block(block):
    result = []
    # If not text, skip
    try:
        if block.name in ['img'] or (block.name == 'div' and block.get('class', [''])[0] == 'yarpp-related'):
            return result
    except AttributeError:
        print('[ERROR] AttributeError!')
        print(type(block))
        print(block)
        print('---------------------------------')
    # If is a tag, process it's content
    if isinstance(block, Tag):
        for sub_block in block.contents:
            result.extend(process_block(sub_block))
        return result
    for regexp, sub_str in regexps:
        block = regexp.sub(sub_str, block)
    block = block.strip()
    if block:
        result.append(block)
    return result

pages_to_skip = []

def scrap_page(func, urls_gen, n_batches, save_to_files=False):
    n_processed = 0
    pbar = tqdm(total=n_batches)
    accumulator = []
    for i, url in enumerate(urls_gen(n_batches, pbar)):
        # Skip "bad" pages
        if url in pages_to_skip:
            continue
        try:
            pbar.set_description('Loading {} page...'.format(i+1))
            page = requests.get(url)
            pbar.set_description('Processing {} page...'.format(i+1))
        except requests.exceptions.ConnectionError:
            print('[ERROR] Connection error to', url)
            pbar.update(1)
            continue
        soup = BeautifulSoup(page.content, 'html.parser')
        new_items = func(soup, url)
        accumulator.extend(new_items)
        n_processed += len(new_items)
        print('[INFO] Processed another ', len(new_items), 'items, for a total of', n_processed)
        pbar.update(1)
    return accumulator

## scrapsfromtheloft.com

In [174]:
pages_to_skip = [
    'https://scrapsfromtheloft.com/2019/10/27/jerry-seinfeld-playboy-interview-1993/',
    'https://scrapsfromtheloft.com/2019/10/15/bill-cosby-playboy-interview-1985/',
    'https://scrapsfromtheloft.com/2019/07/06/katherine-ryan-in-trouble-transcript/', # link to other site
    'https://scrapsfromtheloft.com/2019/07/05/katherine-ryan-glitter-room-transcript/', # Unable to connect
    'https://scrapsfromtheloft.com/2019/04/01/in-conversation-with-jerry-seinfeld/',
    'https://scrapsfromtheloft.com/2018/10/30/lenny-bruce-unspruced-review-judith-crist/',
    'https://scrapsfromtheloft.com/2018/10/03/richard-pryor-live-in-concert-review-carl-bennett-cinemonkey/',
    'https://scrapsfromtheloft.com/2018/01/05/dave-chappelle-equanimity-2017-transcripcion-completa/',
    'https://scrapsfromtheloft.com/2018/01/05/dave-chappelle-hbo-half-hour-1998-traduzione-italiana/',
    'https://scrapsfromtheloft.com/2017/10/18/louis-c-k-the-rolling-stone-interview-2013/',
    'https://scrapsfromtheloft.com/2017/10/02/jim-jefferies-e-il-controllo-della-armi-in-america/',
    'https://scrapsfromtheloft.com/2017/08/23/doug-stanhope-no-refunds-2007-trascrizione-italiana/',
    'https://scrapsfromtheloft.com/2017/04/21/larry-king-interview-robin-williams-2007/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-jamming-new-york-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-diseased-1999-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/04/12/george-carlin-bad-2008-testo-italiano-completo/',
    'https://scrapsfromtheloft.com/2017/01/11/robin-williams-playboy-interview-1992/',
    'https://scrapsfromtheloft.com/2016/11/09/playboy-interview-george-carlin/',
]
ok_pages = [
    'https://scrapsfromtheloft.com/2020/01/13/dave-chappelle-acceptance-speech-2019-mark-twain-prize/',
    'https://scrapsfromtheloft.com/2020/01/07/ricky-gervais-2020-golden-globes-monologue-transcript/',
    'https://scrapsfromtheloft.com/2019/10/20/real-time-with-bill-maher-new-rule-prickstarter/',
    'https://scrapsfromtheloft.com/2019/09/11/new-rule-the-fudge-report-real-time-with-bill-maher/',
    'https://scrapsfromtheloft.com/2019/05/18/doug-stanhope-babies-and-abortion/',
    'https://scrapsfromtheloft.com/2019/03/18/ricky-gervais-2011-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2019/03/18/ricky-gervais-2016-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2019/02/18/politically-correct-language-george-carlin/',
    'https://scrapsfromtheloft.com/2018/08/11/dick-gregory-speech-st-johns-baptist-church-may-20-1963/',
    'https://scrapsfromtheloft.com/2018/05/23/trevor-noah-royal-wedding-2018/',
    'https://scrapsfromtheloft.com/2018/05/16/doug-stanhope-on-nationalism/',
    'https://scrapsfromtheloft.com/2018/03/27/ricky-gervais-2012-golden-globes-opening-monologue/',
    'https://scrapsfromtheloft.com/2017/10/25/george-carlin-pro-life-abortion-and-the-sanctity-of-life/',
    'https://scrapsfromtheloft.com/2017/10/25/richard-pryors-monologue-saturday-night-live-1975/',
    'https://scrapsfromtheloft.com/2017/10/06/the-daily-show-fox-news-las-vegas-shooting-2017/',
    'https://scrapsfromtheloft.com/2017/10/03/george-carlin-religion-is-bullshit/',
]

In [179]:
def scrap_transcript(soup, transcript_url):
    file_name = transcript_url[:-1].rsplit('/', 1)[-1]
    file_path = os.path.join('test', file_name + '.txt')
    if not ('transcript' in file_path.lower()):
        print('[WARN] Possibly page without transcript!', transcript_url)
    content_blocks = soup.findAll('div', 'post-content')
    if len(content_blocks) != 1:
        print('[WARN] strange content in', url)
        return
    content = process_block(content_blocks[0])
    # Merge smal paragraphs.
    stripped_content = ['']
    for line in content:
        if len(stripped_content[-1]) < 200:
            stripped_content[-1] += ' ' + line
        else:
            stripped_content.append(line)
    stripped_content = '\n'.join(stripped_content)
    if file_path:
        save_file(file_path, stripped_content, encoding='utf8')
    return [stripped_content]

def url_transcripts(n_batches, pbar):
    URL = 'https://scrapsfromtheloft.com/comedy/page/{}/'
    for i in range(n_batches):
        # Load block of pages
        pbar.set_description('Loading {} block...'.format(i+1))
        page = requests.get(URL.format(i))
        soup = BeautifulSoup(page.content, 'html.parser')
        blocks = soup.body.findAll('div', 'fusion-post-content post-content')
        # Extract link to the page
        for j, block in enumerate(blocks):
            block_title = block.find('h2', 'entry-title fusion-post-title').a
            yield block_title['href']

scrap_page(scrap_transcript, url_transcripts, n_batches=40, save_to_files=True)

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

[INFO] Processed another  1 items, for a total of 1
[WARN] Possibly page without transcript! https://scrapsfromtheloft.com/2020/05/05/bill-burr-late-show-with-david-letterman-2010/
[INFO] Processed another  1 items, for a total of 2
[INFO] Processed another  1 items, for a total of 3
[INFO] Processed another  1 items, for a total of 4
[INFO] Processed another  1 items, for a total of 5
[INFO] Processed another  1 items, for a total of 6
[INFO] Processed another  1 items, for a total of 7
[INFO] Processed another  1 items, for a total of 8
[INFO] Processed another  1 items, for a total of 9
[INFO] Processed another  1 items, for a total of 10
[INFO] Processed another  1 items, for a total of 11
[INFO] Processed another  1 items, for a total of 12
[INFO] Processed another  1 items, for a total of 13


KeyboardInterrupt: 

## Russian

### http://anecdotica.ru/

In [7]:
def scrap_anecdotica(soup, _):
    jokes = soup.findAll('div', 'item_text')
    return [' '.join(process_block(joke)) for joke in jokes]

def urls_anecdotica(n_batches, _):
    URL = 'http://anecdotica.ru/all/{}'
    for i in range(1, n_batches + 1):
        yield URL.format(i)

anecdotica = scrap_page(scrap_anecdotica, urls_anecdotica, n_batches=1000)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

[INFO] Processed another  25 items, for a total of 25
[INFO] Processed another  25 items, for a total of 50
[INFO] Processed another  25 items, for a total of 75
[INFO] Processed another  25 items, for a total of 100
[INFO] Processed another  25 items, for a total of 125
[INFO] Processed another  25 items, for a total of 150
[INFO] Processed another  25 items, for a total of 175
[INFO] Processed another  25 items, for a total of 200
[INFO] Processed another  24 items, for a total of 224
[INFO] Processed another  25 items, for a total of 249
[INFO] Processed another  25 items, for a total of 274
[INFO] Processed another  25 items, for a total of 299
[INFO] Processed another  25 items, for a total of 324
[INFO] Processed another  25 items, for a total of 349
[INFO] Processed another  25 items, for a total of 374
[INFO] Processed another  25 items, for a total of 399
[INFO] Processed another  25 items, for a total of 424
[INFO] Processed another  24 items, for a total of 448
[INFO] Proces

[INFO] Processed another  25 items, for a total of 3707
[INFO] Processed another  25 items, for a total of 3732
[INFO] Processed another  25 items, for a total of 3757
[INFO] Processed another  25 items, for a total of 3782
[INFO] Processed another  25 items, for a total of 3807
[INFO] Processed another  25 items, for a total of 3832
[INFO] Processed another  25 items, for a total of 3857
[INFO] Processed another  25 items, for a total of 3882
[INFO] Processed another  25 items, for a total of 3907
[INFO] Processed another  25 items, for a total of 3932
[INFO] Processed another  24 items, for a total of 3956
[INFO] Processed another  25 items, for a total of 3981
[INFO] Processed another  25 items, for a total of 4006
[INFO] Processed another  25 items, for a total of 4031
[INFO] Processed another  25 items, for a total of 4056
[INFO] Processed another  25 items, for a total of 4081
[INFO] Processed another  25 items, for a total of 4106
[INFO] Processed another  25 items, for a total 

[INFO] Processed another  25 items, for a total of 7369
[INFO] Processed another  25 items, for a total of 7394
[INFO] Processed another  25 items, for a total of 7419
[INFO] Processed another  25 items, for a total of 7444
[INFO] Processed another  25 items, for a total of 7469
[INFO] Processed another  25 items, for a total of 7494
[INFO] Processed another  25 items, for a total of 7519
[INFO] Processed another  25 items, for a total of 7544
[INFO] Processed another  25 items, for a total of 7569
[INFO] Processed another  25 items, for a total of 7594
[INFO] Processed another  25 items, for a total of 7619
[INFO] Processed another  25 items, for a total of 7644
[INFO] Processed another  25 items, for a total of 7669
[INFO] Processed another  25 items, for a total of 7694
[INFO] Processed another  25 items, for a total of 7719
[INFO] Processed another  25 items, for a total of 7744
[INFO] Processed another  25 items, for a total of 7769
[INFO] Processed another  25 items, for a total 

[INFO] Processed another  25 items, for a total of 11006
[INFO] Processed another  25 items, for a total of 11031
[INFO] Processed another  25 items, for a total of 11056
[INFO] Processed another  25 items, for a total of 11081
[INFO] Processed another  25 items, for a total of 11106
[INFO] Processed another  25 items, for a total of 11131
[INFO] Processed another  25 items, for a total of 11156
[INFO] Processed another  25 items, for a total of 11181
[INFO] Processed another  25 items, for a total of 11206
[INFO] Processed another  25 items, for a total of 11231
[INFO] Processed another  25 items, for a total of 11256
[INFO] Processed another  25 items, for a total of 11281
[INFO] Processed another  25 items, for a total of 11306
[INFO] Processed another  25 items, for a total of 11331
[INFO] Processed another  25 items, for a total of 11356
[INFO] Processed another  25 items, for a total of 11381
[INFO] Processed another  25 items, for a total of 11406
[INFO] Processed another  25 it

[INFO] Processed another  25 items, for a total of 14597
[INFO] Processed another  25 items, for a total of 14622
[INFO] Processed another  25 items, for a total of 14647
[INFO] Processed another  25 items, for a total of 14672
[INFO] Processed another  25 items, for a total of 14697
[INFO] Processed another  25 items, for a total of 14722
[INFO] Processed another  25 items, for a total of 14747
[INFO] Processed another  25 items, for a total of 14772
[INFO] Processed another  25 items, for a total of 14797
[INFO] Processed another  25 items, for a total of 14822
[INFO] Processed another  25 items, for a total of 14847
[INFO] Processed another  25 items, for a total of 14872
[INFO] Processed another  25 items, for a total of 14897
[INFO] Processed another  25 items, for a total of 14922
[INFO] Processed another  25 items, for a total of 14947
[INFO] Processed another  25 items, for a total of 14972
[INFO] Processed another  25 items, for a total of 14997
[INFO] Processed another  25 it

[INFO] Processed another  24 items, for a total of 18177
[INFO] Processed another  25 items, for a total of 18202
[INFO] Processed another  25 items, for a total of 18227
[INFO] Processed another  25 items, for a total of 18252
[INFO] Processed another  25 items, for a total of 18277
[INFO] Processed another  25 items, for a total of 18302
[INFO] Processed another  25 items, for a total of 18327
[INFO] Processed another  25 items, for a total of 18352
[INFO] Processed another  25 items, for a total of 18377
[INFO] Processed another  25 items, for a total of 18402
[INFO] Processed another  25 items, for a total of 18427
[INFO] Processed another  25 items, for a total of 18452
[INFO] Processed another  25 items, for a total of 18477
[INFO] Processed another  25 items, for a total of 18502
[INFO] Processed another  25 items, for a total of 18527
[INFO] Processed another  25 items, for a total of 18552
[INFO] Processed another  25 items, for a total of 18577
[INFO] Processed another  25 it

[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total of 20025
[INFO] Processed another  0 items, for a total o

In [11]:
# pd.DataFrame(anecdotica, columns=['Text']).to_csv('../data/anecdotika.csv')

### https://www.anekdot.ru/

In [44]:
def urls_anekdot(j_type, step, max_num_jokes, pbar):
    # Шутки: j - свежие; s - повторные; x - остальные
    # Истории: o - свежие;
    URL = f'https://www.anekdot.ru/an/an{{}}{{}}/{j_type}{{}}{{}}33;{{}},100.html' # year;month;year;month;start_from
    # All years from 1995 to 2020
    years = list(map(lambda x: f'{x:0>2}', list(range(95, 100)) + list(range(0, 21))))
    months = list(map(lambda x: f'{x:0>2}', list(range(1, 13))))
    for year in years:
        for month in months:
            pbar.set_description(f'Processing {year} {month} page...')
            for start_ind in range(0, max_num_jokes, step):
                yield (URL.format(year, month, year, month, start_ind),
                       ((max_num - start_ind) // step) - 1)

anekdot_jokes, n_processed = [], 0
pbar = tqdm(total=max_num * len(years) * len(months))
max_num, step = 4000, 100
iterator = urls_anekdot('x', step, max_num, pbar)
for page_url, n_skip in iterator:
    page = requests.get(page_url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    new_items = [' '.join(process_block(joke)) for joke in soup.findAll('div', 'text')]
    if len(new_items) == 0:
        print('[INFO] No more jokes at:', page_url)
        for _ in range(n_skip):
            next(iterator)
            pbar.update(100)
        continue
    anekdot_jokes.extend(new_items)
    n_processed += len(new_items)
    print('[INFO] Processed another ', len(new_items), 'items, for a total of', n_processed)
    pbar.update(100)

HBox(children=(FloatProgress(value=0.0, max=1248000.0), HTML(value='')))

[INFO] No more jokes at: https://www.anekdot.ru/an/an9501/x950133;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9502/x950233;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9503/x950333;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9504/x950433;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9505/x950533;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9506/x950633;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9507/x950733;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9508/x950833;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9509/x950933;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9510/x951033;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9511/x951133;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9512/x951233;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an9601/x960133;0,100.html

[INFO] Processed another  101 items, for a total of 5667
[INFO] Processed another  101 items, for a total of 5768
[INFO] Processed another  101 items, for a total of 5869
[INFO] No more jokes at: https://www.anekdot.ru/an/an0002/x000233;400,100.html
[INFO] Processed another  101 items, for a total of 5970
[INFO] Processed another  101 items, for a total of 6071
[INFO] Processed another  101 items, for a total of 6172
[INFO] Processed another  86 items, for a total of 6258
[INFO] No more jokes at: https://www.anekdot.ru/an/an0003/x000333;400,100.html
[INFO] Processed another  101 items, for a total of 6359
[INFO] Processed another  101 items, for a total of 6460
[INFO] Processed another  101 items, for a total of 6561
[INFO] Processed another  101 items, for a total of 6662
[INFO] Processed another  6 items, for a total of 6668
[INFO] No more jokes at: https://www.anekdot.ru/an/an0004/x000433;500,100.html
[INFO] Processed another  101 items, for a total of 6769
[INFO] Processed another 

[INFO] Processed another  101 items, for a total of 16036
[INFO] Processed another  99 items, for a total of 16135
[INFO] No more jokes at: https://www.anekdot.ru/an/an0112/x011233;500,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0201/x020133;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0202/x020233;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0203/x020333;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0204/x020433;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0205/x020533;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0206/x020633;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0207/x020733;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0208/x020833;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0209/x020933;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an0210/x021033;0,100.html
[INFO] No more jokes at: https://www

[INFO] Processed another  101 items, for a total of 25609
[INFO] Processed another  59 items, for a total of 25668
[INFO] No more jokes at: https://www.anekdot.ru/an/an0406/x040633;800,100.html
[INFO] Processed another  101 items, for a total of 25769
[INFO] Processed another  101 items, for a total of 25870
[INFO] Processed another  101 items, for a total of 25971
[INFO] Processed another  101 items, for a total of 26072
[INFO] Processed another  101 items, for a total of 26173
[INFO] Processed another  101 items, for a total of 26274
[INFO] Processed another  79 items, for a total of 26353
[INFO] No more jokes at: https://www.anekdot.ru/an/an0407/x040733;700,100.html
[INFO] Processed another  101 items, for a total of 26454
[INFO] Processed another  101 items, for a total of 26555
[INFO] Processed another  101 items, for a total of 26656
[INFO] Processed another  101 items, for a total of 26757
[INFO] Processed another  101 items, for a total of 26858
[INFO] Processed another  101 it

[INFO] Processed another  101 items, for a total of 36332
[INFO] Processed another  101 items, for a total of 36433
[INFO] Processed another  101 items, for a total of 36534
[INFO] Processed another  101 items, for a total of 36635
[INFO] Processed another  101 items, for a total of 36736
[INFO] Processed another  101 items, for a total of 36837
[INFO] Processed another  101 items, for a total of 36938
[INFO] Processed another  81 items, for a total of 37019
[INFO] No more jokes at: https://www.anekdot.ru/an/an0604/x060433;1000,100.html
[INFO] Processed another  101 items, for a total of 37120
[INFO] Processed another  101 items, for a total of 37221
[INFO] Processed another  101 items, for a total of 37322
[INFO] Processed another  101 items, for a total of 37423
[INFO] Processed another  101 items, for a total of 37524
[INFO] Processed another  101 items, for a total of 37625
[INFO] Processed another  101 items, for a total of 37726
[INFO] Processed another  101 items, for a total of

[INFO] Processed another  101 items, for a total of 48658
[INFO] Processed another  101 items, for a total of 48759
[INFO] Processed another  101 items, for a total of 48860
[INFO] Processed another  101 items, for a total of 48961
[INFO] Processed another  101 items, for a total of 49062
[INFO] Processed another  101 items, for a total of 49163
[INFO] Processed another  101 items, for a total of 49264
[INFO] Processed another  101 items, for a total of 49365
[INFO] Processed another  65 items, for a total of 49430
[INFO] No more jokes at: https://www.anekdot.ru/an/an0704/x070433;900,100.html
[INFO] Processed another  101 items, for a total of 49531
[INFO] Processed another  101 items, for a total of 49632
[INFO] Processed another  101 items, for a total of 49733
[INFO] Processed another  101 items, for a total of 49834
[INFO] Processed another  101 items, for a total of 49935
[INFO] Processed another  101 items, for a total of 50036
[INFO] Processed another  101 items, for a total of 

[INFO] Processed another  20 items, for a total of 60857
[INFO] No more jokes at: https://www.anekdot.ru/an/an0803/x080333;1100,100.html
[INFO] Processed another  101 items, for a total of 60958
[INFO] Processed another  101 items, for a total of 61059
[INFO] Processed another  101 items, for a total of 61160
[INFO] Processed another  101 items, for a total of 61261
[INFO] Processed another  101 items, for a total of 61362
[INFO] Processed another  101 items, for a total of 61463
[INFO] Processed another  101 items, for a total of 61564
[INFO] Processed another  101 items, for a total of 61665
[INFO] Processed another  101 items, for a total of 61766
[INFO] Processed another  97 items, for a total of 61863
[INFO] No more jokes at: https://www.anekdot.ru/an/an0804/x080433;1000,100.html
[INFO] Processed another  101 items, for a total of 61964
[INFO] Processed another  101 items, for a total of 62065
[INFO] Processed another  101 items, for a total of 62166
[INFO] Processed another  101 

[INFO] Processed another  101 items, for a total of 72880
[INFO] Processed another  101 items, for a total of 72981
[INFO] Processed another  101 items, for a total of 73082
[INFO] Processed another  101 items, for a total of 73183
[INFO] Processed another  101 items, for a total of 73284
[INFO] Processed another  68 items, for a total of 73352
[INFO] No more jokes at: https://www.anekdot.ru/an/an0904/x090433;1300,100.html
[INFO] Processed another  101 items, for a total of 73453
[INFO] Processed another  101 items, for a total of 73554
[INFO] Processed another  101 items, for a total of 73655
[INFO] Processed another  101 items, for a total of 73756
[INFO] Processed another  101 items, for a total of 73857
[INFO] Processed another  101 items, for a total of 73958
[INFO] Processed another  101 items, for a total of 74059
[INFO] Processed another  101 items, for a total of 74160
[INFO] Processed another  101 items, for a total of 74261
[INFO] Processed another  101 items, for a total of

[INFO] No more jokes at: https://www.anekdot.ru/an/an1006/x100633;1000,100.html
[INFO] Processed another  101 items, for a total of 84625
[INFO] Processed another  101 items, for a total of 84726
[INFO] Processed another  101 items, for a total of 84827
[INFO] Processed another  101 items, for a total of 84928
[INFO] Processed another  101 items, for a total of 85029
[INFO] Processed another  101 items, for a total of 85130
[INFO] Processed another  101 items, for a total of 85231
[INFO] Processed another  89 items, for a total of 85320
[INFO] No more jokes at: https://www.anekdot.ru/an/an1007/x100733;800,100.html
[INFO] Processed another  101 items, for a total of 85421
[INFO] Processed another  101 items, for a total of 85522
[INFO] Processed another  101 items, for a total of 85623
[INFO] Processed another  101 items, for a total of 85724
[INFO] Processed another  101 items, for a total of 85825
[INFO] Processed another  101 items, for a total of 85926
[INFO] Processed another  101 

[INFO] Processed another  101 items, for a total of 96840
[INFO] Processed another  101 items, for a total of 96941
[INFO] Processed another  101 items, for a total of 97042
[INFO] Processed another  74 items, for a total of 97116
[INFO] No more jokes at: https://www.anekdot.ru/an/an1106/x110633;1000,100.html
[INFO] Processed another  101 items, for a total of 97217
[INFO] Processed another  101 items, for a total of 97318
[INFO] Processed another  101 items, for a total of 97419
[INFO] Processed another  101 items, for a total of 97520
[INFO] Processed another  101 items, for a total of 97621
[INFO] Processed another  101 items, for a total of 97722
[INFO] Processed another  101 items, for a total of 97823
[INFO] Processed another  101 items, for a total of 97924
[INFO] Processed another  73 items, for a total of 97997
[INFO] No more jokes at: https://www.anekdot.ru/an/an1107/x110733;900,100.html
[INFO] Processed another  101 items, for a total of 98098
[INFO] Processed another  101 i

[INFO] Processed another  52 items, for a total of 108922
[INFO] No more jokes at: https://www.anekdot.ru/an/an1205/x120533;1000,100.html
[INFO] Processed another  101 items, for a total of 109023
[INFO] Processed another  101 items, for a total of 109124
[INFO] Processed another  101 items, for a total of 109225
[INFO] Processed another  101 items, for a total of 109326
[INFO] Processed another  101 items, for a total of 109427
[INFO] Processed another  101 items, for a total of 109528
[INFO] Processed another  101 items, for a total of 109629
[INFO] Processed another  101 items, for a total of 109730
[INFO] Processed another  101 items, for a total of 109831
[INFO] Processed another  101 items, for a total of 109932
[INFO] Processed another  33 items, for a total of 109965
[INFO] No more jokes at: https://www.anekdot.ru/an/an1206/x120633;1100,100.html
[INFO] Processed another  101 items, for a total of 110066
[INFO] Processed another  101 items, for a total of 110167
[INFO] Processed

[INFO] Processed another  101 items, for a total of 120884
[INFO] Processed another  101 items, for a total of 120985
[INFO] Processed another  101 items, for a total of 121086
[INFO] Processed another  101 items, for a total of 121187
[INFO] Processed another  101 items, for a total of 121288
[INFO] Processed another  101 items, for a total of 121389
[INFO] Processed another  5 items, for a total of 121394
[INFO] No more jokes at: https://www.anekdot.ru/an/an1305/x130533;1000,100.html
[INFO] Processed another  101 items, for a total of 121495
[INFO] Processed another  101 items, for a total of 121596
[INFO] Processed another  101 items, for a total of 121697
[INFO] Processed another  101 items, for a total of 121798
[INFO] Processed another  101 items, for a total of 121899
[INFO] Processed another  101 items, for a total of 122000
[INFO] Processed another  101 items, for a total of 122101
[INFO] Processed another  101 items, for a total of 122202
[INFO] Processed another  101 items, 

[INFO] Processed another  101 items, for a total of 132715
[INFO] Processed another  101 items, for a total of 132816
[INFO] Processed another  101 items, for a total of 132917
[INFO] Processed another  101 items, for a total of 133018
[INFO] Processed another  101 items, for a total of 133119
[INFO] Processed another  101 items, for a total of 133220
[INFO] No more jokes at: https://www.anekdot.ru/an/an1404/x140433;1200,100.html
[INFO] Processed another  101 items, for a total of 133321
[INFO] Processed another  101 items, for a total of 133422
[INFO] Processed another  101 items, for a total of 133523
[INFO] Processed another  101 items, for a total of 133624
[INFO] Processed another  101 items, for a total of 133725
[INFO] Processed another  101 items, for a total of 133826
[INFO] Processed another  101 items, for a total of 133927
[INFO] Processed another  101 items, for a total of 134028
[INFO] Processed another  101 items, for a total of 134129
[INFO] Processed another  101 items

[INFO] Processed another  101 items, for a total of 144858
[INFO] Processed another  101 items, for a total of 144959
[INFO] Processed another  101 items, for a total of 145060
[INFO] Processed another  101 items, for a total of 145161
[INFO] Processed another  101 items, for a total of 145262
[INFO] Processed another  101 items, for a total of 145363
[INFO] Processed another  77 items, for a total of 145440
[INFO] No more jokes at: https://www.anekdot.ru/an/an1503/x150333;1100,100.html
[INFO] Processed another  101 items, for a total of 145541
[INFO] Processed another  101 items, for a total of 145642
[INFO] Processed another  101 items, for a total of 145743
[INFO] Processed another  101 items, for a total of 145844
[INFO] Processed another  101 items, for a total of 145945
[INFO] Processed another  101 items, for a total of 146046
[INFO] Processed another  101 items, for a total of 146147
[INFO] Processed another  101 items, for a total of 146248
[INFO] Processed another  101 items,

[INFO] Processed another  101 items, for a total of 156823
[INFO] Processed another  101 items, for a total of 156924
[INFO] Processed another  76 items, for a total of 157000
[INFO] No more jokes at: https://www.anekdot.ru/an/an1602/x160233;1000,100.html
[INFO] Processed another  101 items, for a total of 157101
[INFO] Processed another  101 items, for a total of 157202
[INFO] Processed another  101 items, for a total of 157303
[INFO] Processed another  101 items, for a total of 157404
[INFO] Processed another  101 items, for a total of 157505
[INFO] Processed another  101 items, for a total of 157606
[INFO] Processed another  101 items, for a total of 157707
[INFO] Processed another  101 items, for a total of 157808
[INFO] Processed another  78 items, for a total of 157886
[INFO] No more jokes at: https://www.anekdot.ru/an/an1603/x160333;900,100.html
[INFO] Processed another  101 items, for a total of 157987
[INFO] Processed another  101 items, for a total of 158088
[INFO] Processed 

[INFO] Processed another  101 items, for a total of 168536
[INFO] Processed another  101 items, for a total of 168637
[INFO] Processed another  101 items, for a total of 168738
[INFO] Processed another  101 items, for a total of 168839
[INFO] Processed another  101 items, for a total of 168940
[INFO] Processed another  101 items, for a total of 169041
[INFO] Processed another  101 items, for a total of 169142
[INFO] Processed another  101 items, for a total of 169243
[INFO] Processed another  5 items, for a total of 169248
[INFO] No more jokes at: https://www.anekdot.ru/an/an1703/x170333;1100,100.html
[INFO] Processed another  101 items, for a total of 169349
[INFO] Processed another  101 items, for a total of 169450
[INFO] Processed another  101 items, for a total of 169551
[INFO] Processed another  101 items, for a total of 169652
[INFO] Processed another  101 items, for a total of 169753
[INFO] Processed another  101 items, for a total of 169854
[INFO] Processed another  101 items, 

[INFO] Processed another  101 items, for a total of 180370
[INFO] Processed another  101 items, for a total of 180471
[INFO] Processed another  101 items, for a total of 180572
[INFO] Processed another  101 items, for a total of 180673
[INFO] Processed another  101 items, for a total of 180774
[INFO] Processed another  101 items, for a total of 180875
[INFO] Processed another  101 items, for a total of 180976
[INFO] Processed another  101 items, for a total of 181077
[INFO] Processed another  101 items, for a total of 181178
[INFO] Processed another  92 items, for a total of 181270
[INFO] No more jokes at: https://www.anekdot.ru/an/an1803/x180333;1300,100.html
[INFO] Processed another  101 items, for a total of 181371
[INFO] Processed another  101 items, for a total of 181472
[INFO] Processed another  101 items, for a total of 181573
[INFO] Processed another  101 items, for a total of 181674
[INFO] Processed another  101 items, for a total of 181775
[INFO] Processed another  101 items,

[INFO] Processed another  101 items, for a total of 192917
[INFO] Processed another  101 items, for a total of 193018
[INFO] Processed another  101 items, for a total of 193119
[INFO] Processed another  101 items, for a total of 193220
[INFO] Processed another  101 items, for a total of 193321
[INFO] Processed another  101 items, for a total of 193422
[INFO] Processed another  101 items, for a total of 193523
[INFO] Processed another  101 items, for a total of 193624
[INFO] Processed another  101 items, for a total of 193725
[INFO] Processed another  10 items, for a total of 193735
[INFO] No more jokes at: https://www.anekdot.ru/an/an1812/x181233;1600,100.html
[INFO] Processed another  101 items, for a total of 193836
[INFO] Processed another  101 items, for a total of 193937
[INFO] Processed another  101 items, for a total of 194038
[INFO] Processed another  101 items, for a total of 194139
[INFO] Processed another  101 items, for a total of 194240
[INFO] Processed another  101 items,

[INFO] Processed another  101 items, for a total of 205074
[INFO] Processed another  101 items, for a total of 205175
[INFO] Processed another  101 items, for a total of 205276
[INFO] Processed another  101 items, for a total of 205377
[INFO] Processed another  81 items, for a total of 205458
[INFO] No more jokes at: https://www.anekdot.ru/an/an1909/x190933;1200,100.html
[INFO] Processed another  43 items, for a total of 205501
[INFO] Processed another  101 items, for a total of 205602
[INFO] Processed another  101 items, for a total of 205703
[INFO] Processed another  101 items, for a total of 205804
[INFO] Processed another  101 items, for a total of 205905
[INFO] Processed another  101 items, for a total of 206006
[INFO] Processed another  101 items, for a total of 206107
[INFO] Processed another  101 items, for a total of 206208
[INFO] Processed another  101 items, for a total of 206309
[INFO] Processed another  101 items, for a total of 206410
[INFO] Processed another  101 items, 

[INFO] Processed another  101 items, for a total of 217712
[INFO] Processed another  101 items, for a total of 217813
[INFO] Processed another  79 items, for a total of 217892
[INFO] No more jokes at: https://www.anekdot.ru/an/an2004/x200433;2500,100.html
[INFO] Processed another  101 items, for a total of 217993
[INFO] Processed another  101 items, for a total of 218094
[INFO] Processed another  101 items, for a total of 218195
[INFO] Processed another  101 items, for a total of 218296
[INFO] Processed another  101 items, for a total of 218397
[INFO] Processed another  101 items, for a total of 218498
[INFO] Processed another  46 items, for a total of 218544
[INFO] No more jokes at: https://www.anekdot.ru/an/an2005/x200533;700,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2006/x200633;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2007/x200733;0,100.html
[INFO] No more jokes at: https://www.anekdot.ru/an/an2008/x200833;0,100.html
[INFO] No more jokes a

In [45]:
len(anekdot_jokes)

218544

In [46]:
# pd.DataFrame(anekdot_jokes, columns=['Text']).to_csv('../data/anekdot_others.csv')

### Telegram channel
https://t.me/ligaplohihshutok

In [47]:
import json
with open('../data/liga-plohih-shutok.json', encoding='utf-8') as in_file:
    liga_jokes = json.loads(in_file.read())

In [48]:
pd.DataFrame(liga_jokes, columns=['Text']).to_csv('../data/ru_lpsh_jokes.csv')

### Extract QA jokes
We can extract the QA jokes from the datasets with the general jokes.

In [61]:
import nltk
import traceback

regexps = [ # Regexp for the special chars
    (re.compile('♦'), '*'),
    (re.compile('\n *\n'), '\n'), # Replace multiple newlines with one
    (re.compile(r' {2,}'), ' '), # Replace multiple spaces with one
]

def fix_text(s):
    for regexp in regexps:
        s = regexp[0].sub(regexp[1], s)
    s = s.strip(' -—')
    s = re.sub('^(?:вопрос|ответ):?', '', s, flags=re.IGNORECASE)
    return s.strip()


def extract_qa_jokes(iterator, max_num_sents=2):
    res = []
    pbar = tqdm(total=len(iterator))
    for i, joke in enumerate(iterator):
        try:
            joke = re.sub(r'^[^\n\:\?\.]*(?:Армянское|Армянскому?|Армянского) *радио[^\n:?]*\:', '', joke, flags=re.IGNORECASE)
            sentences = [fix_text(s) for s in nltk.sent_tokenize(joke, language="russian")]
            sentences = [s for s in sentences if s]
            if sentences and sentences[0][-1] == '?' and 1 < len(sentences) <= max_num_sents:
                res.append({
                    'Question': sentences[0],
                    'Answer': ' '.join(sentences[1:])
                })
        except:
            print(f'Error at {i}')
            traceback.print_exc()
        if i % 500 == 0:
            pbar.set_description(f'Extracted: {len(res)} jokes')
        pbar.update(1)
    pbar.set_description(f'Extracted: {len(res)} jokes')
    pbar.close()
    return res

In [62]:
files = [
    '../data/anecdotika.csv',
    '../data/anekdot_fresh.csv',
    '../data/anekdot_repetative.csv',
    '../data/anekdot_others.csv',
    '../data/ru_lpsh_jokes.csv',
]

qa_jokes = []

for file in files:
    jokes = pd.read_csv(file)
    qa_anekdot = extract_qa_jokes(jokes['Text'].values, max_num_sents=3)
    qa_jokes.extend(qa_anekdot)
len(qa_jokes)

HBox(children=(FloatProgress(value=0.0, max=20025.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200476.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=216603.0), HTML(value='')))

Error at 49407
Error at 64195
Error at 69074


Traceback (most recent call last):
  File "<ipython-input-61-5ffadd4deacd>", line 23, in extract_qa_jokes
    joke = re.sub(r'^[^\n\:\?\.]*(?:Армянское|Армянскому?|Армянского) *радио[^\n:?]*\:', '', joke, flags=re.IGNORECASE)
  File "C:\Users\Alex\Anaconda3\envs\pytorch\lib\re.py", line 192, in sub
    return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or bytes-like object
Traceback (most recent call last):
  File "<ipython-input-61-5ffadd4deacd>", line 23, in extract_qa_jokes
    joke = re.sub(r'^[^\n\:\?\.]*(?:Армянское|Армянскому?|Армянского) *радио[^\n:?]*\:', '', joke, flags=re.IGNORECASE)
  File "C:\Users\Alex\Anaconda3\envs\pytorch\lib\re.py", line 192, in sub
    return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or bytes-like object
Traceback (most recent call last):
  File "<ipython-input-61-5ffadd4deacd>", line 23, in extract_qa_jokes
    joke = re.sub(r'^[^\n\:\?\.]*(?:Армянское|Армянскому?|Армянского) *радио

Error at 132540
Error at 134146


Traceback (most recent call last):
  File "<ipython-input-61-5ffadd4deacd>", line 23, in extract_qa_jokes
    joke = re.sub(r'^[^\n\:\?\.]*(?:Армянское|Армянскому?|Армянского) *радио[^\n:?]*\:', '', joke, flags=re.IGNORECASE)
  File "C:\Users\Alex\Anaconda3\envs\pytorch\lib\re.py", line 192, in sub
    return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or bytes-like object
Traceback (most recent call last):
  File "<ipython-input-61-5ffadd4deacd>", line 23, in extract_qa_jokes
    joke = re.sub(r'^[^\n\:\?\.]*(?:Армянское|Армянскому?|Армянского) *радио[^\n:?]*\:', '', joke, flags=re.IGNORECASE)
  File "C:\Users\Alex\Anaconda3\envs\pytorch\lib\re.py", line 192, in sub
    return _compile(pattern, flags).sub(repl, string, count)
TypeError: expected string or bytes-like object





HBox(children=(FloatProgress(value=0.0, max=218544.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1806.0), HTML(value='')))




67563

In [63]:
# pd.DataFrame.from_dict(qa_jokes).to_csv('../data/rus_qa_jokes.csv')