# Setting things up

In [1]:
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook, tqdm
import multiprocessing as mp
import os
import time
from pathlib import Path
import numpy as np
import re

In [2]:
"""Set up the working path"""
home = os.path.expanduser("~")


# raw html paths
html_root = os.path.join(home, 'Desktop/data')
path_master_indexes = os.path.join(html_root, 'master_indexes')
path_daily_data_html = os.path.join(html_root, 'daily_data')

# filtered text paths
filtered_text_root = os.path.join(home, 'Desktop/filtered_text_data')
filtered_text_log = os.path.join(filtered_text_root, 'errors.log')
path_daily_data_text_log = os.path.join(filtered_text_root, 'daily_data')

# Split the work

## Generate the list of files
Ultimately, this will come from the S3 bucket.

In [3]:
list_html_paths = []
file_size = []
t0 = time.perf_counter()
for filename in Path(path_daily_data_html).glob('**/*.html'):
    list_html_paths.append(filename)
    file_size.append(os.path.getsize(filename))
t1 = time.perf_counter()
print("[INFO] Found {} html files in {:.3f} s ({:,.0f} files/s)"
      .format(len(list_html_paths), t1-t0, len(list_html_paths)/(t1-t0)))
print("[INFO] Total size: {:,} bytes".format(sum(file_size)))

[INFO] Found 104 html files in 0.008 s (12,900 files/s)
[INFO] Total size: 1,539,017,824 bytes


## Split the work equally among N instances

In [4]:
def balanced_split_of_paths(list_html_paths, instances):
    # In the unlikely event we have summoned more machines than html links
    nb_instances = min(len(list_html_paths), len(instances))
    instances = instances[:nb_instances]
    
    # Split the work equally
    html_indexes = np.linspace(0, len(list_html_paths), len(instances), endpoint=False)
    html_indexes = [int(i) for i in html_indexes]  # Cast to int
    #print(qtr_indexes)
    html_indexes.append(len(list_html_paths))  # Add the last element for the comprehension below
    #print(qtr_indexes)
    split_work = []
    for i in range(len(html_indexes)-1):
        split_work.append(list_html_paths[html_indexes[i]:html_indexes[i+1]])
    
    return split_work

def test_balanced_split_of_paths():
    list_html_paths = [1]*100
    list_instances = [[1]*1, [1]*8, [1]*150]  # Three key cases
    
    for instances in list_instances:
        # Test 1: test length of the list
        test_1 = balanced_split_of_paths(list_html_paths, instances)
        assert len(test_1) == min(len(list_html_paths), len(instances))

        # Test 2: verify that wp are of near equal length
        theoretical_wp_len = len(list_html_paths)/len(instances)
        for work_package in test_1:
            assert theoretical_wp_len-1 <= len(work_package) <= theoretical_wp_len+1

        # Test 3: check package length count
        counter = 0
        for work_package in test_1:
            counter += len(work_package)
        assert counter == len(list_html_paths)
    
    return True


test_balanced_split_of_paths()

True

In [5]:
list_path_html = balanced_split_of_paths(list_html_paths, [1]*1)

In [6]:
def path_html_to_path_text(path_html):
    #print(path_html)
    end_path_html = path_html.split(html_root)[1].lstrip('/')
    #print(end_path_html)
    end_path_text = end_path_html[:-5] + '.txt'
    #print(end_path_text)
    filtered_text_path = os.path.join(filtered_text_root, end_path_text)
    return filtered_text_path


In [7]:
work_package = list_path_html[0]
path_html = str(work_package[0])
path_html_to_path_text(path_html)

'/home/alex/Desktop/filtered_text_data/daily_data/20180222/100517/000119312518054235.txt'

In [8]:
def is_downloaded(filepath):
    #expected_path = master_url_to_filepath(url_idx)
    if os.path.isfile(filepath):
        return True
    else:  # Build the folder architecture if needed
        if not os.path.isdir(os.path.split(filepath)[0]):
            os.makedirs(os.path.split(filepath)[0])
        return False

In [9]:
def filter_html_file(path_html):
    # Build the file path and make sure there is not already something
    # WARNING: path_html comes as a PosixPath or WindowsPath object - cast to str first
    path_filtered_text = path_html_to_path_text(str(path_html))
    if not is_downloaded(path_filtered_text):
        # 1. Read the file
        with open(path_html, 'r', encoding="utf8", errors='ignore') as f:
            data_html = f.read()
        
        # 2. Find the PART I, II, III and IV markers
        # WARNING: This is not easy due to the amount of entangled HTML in the text.
        # Instead, I might have to parse with bs4 and then perform the search. Will take a long time.
        soup = BeautifulSoup(data_html, 'lxml')
        data_html = soup.get_text()
        
        end_toc = re.search('PART[ \n\t]*IV', data_html)
        
        if not end_toc:
            print("[INFO] Failure 1 in", path_html)  # PART IV could not be located
            return
        data_html = data_html[end_toc.end():]  # Skip the ToC references
        
        # 3. Find all mentions of "PART XX"
        pattern = re.compile('PART[ \n\t]*I[IV]{0,2}')
        #pattern = re.compile('P(<[ A-Za-z0-9\"\-\;\/\=\:]*>)*A(<[ A-Za-z0-9\"\-\;\/\=\:]*>)R(<[ A-Za-z0-9\"\-\;\/\=\:]*>)T[ \n\t]*I[IV]{0,2}')
        parts = {}
        
        # There are many variations for what separates PART from the number
        # So we normalize as many as we can
        for m in re.finditer(pattern, data_html):
            part = m.group()
            value = re.search('I[IV]{0,2}$', part)  # Search the end
            part = part[:4] + ' ' + part[value.start():]
            parts[part] = m.span()
        # parts = {m.group(): m.span() for m in re.finditer(pattern, data_html)}
        try:
            assert len(parts) == 4
            assert 'PART I' in parts
            assert 'PART II' in parts
            assert 'PART III' in parts
            assert 'PART IV' in parts
        except:
            print("[INFO] Failure 2 in {} | {}".format(path_html, parts))
            return
        
        # 4. Filter the text - get PART I, II, III but discard PART IV
        cropped_data_html = data_html[parts['PART I'][0]:parts['PART IV'][0]]
        filtered_text = cropped_data_html
        
        """[TBR] Originally, get_text() after cropping but nearly impossible due to entangled HTML
        soup = BeautifulSoup(cropped_data_html, 'lxml')
        filtered_text = soup.get_text()
        """
        
        # 5. Write to file
        with open(path_filtered_text, 'w') as f:
            f.write(filtered_text)
        return True
   
    else:  # Nothing to do if the file already exists
        return False

In [10]:
work_package = list_path_html[0]  # Let's say we are instance 0
processing_stats = {
    'count_processed': 0,
    'count_already_processed': 0
}

with mp.Pool(processes=min(mp.cpu_count(), len(work_package))) as p:
    with tqdm(total=len(work_package)) as pbar:
        for i, value in tqdm(enumerate(p.imap_unordered(filter_html_file, work_package))):
            pbar.update()
            if value:
                processing_stats['count_processed'] += 1
            else:
                processing_stats['count_already_processed'] += 1
            """
            print("[INFO] Free space: {:,} | count_processed: {} | count_already_processed: {}"
                  .format(os.statvfs(filtered_text_root).f_frsize * os.statvfs(filtered_text_root).f_bavail,
                 processing_stats['count_processed'],
                 processing_stats['count_already_processed']))
            """

  0%|          | 0/104 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180222/1013462/000101346218000004.html | {'PART II': (1124, 1131), 'PART III': (188802, 188810), 'PART IVI': (190077, 190085)}


  1%|          | 1/104 [00:02<04:40,  2.73s/it]
  2%|▏         | 2/104 [00:09<06:48,  4.00s/it]
  3%|▎         | 3/104 [00:11<05:46,  3.43s/it]
3it [00:11,  3.43s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180222/101778/000010177818000026.html | {}


  4%|▍         | 4/104 [00:13<04:57,  2.97s/it]
  5%|▍         | 5/104 [00:13<03:34,  2.17s/it]
5it [00:13,  2.16s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180301/1003642/000100364218000038.html | {'PART I': (6376, 6382)}


  6%|▌         | 6/104 [00:14<02:32,  1.55s/it]
6it [00:14,  1.55s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180301/1000623/000100062318000044.html | {}


  7%|▋         | 7/104 [00:15<02:30,  1.55s/it]
7it [00:15,  1.55s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180222/1004036/000089971518000070.html | {}


  8%|▊         | 8/104 [00:17<02:46,  1.74s/it]
8it [00:17,  1.74s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180222/100517/000119312518054235.html


  9%|▊         | 9/104 [00:18<02:19,  1.47s/it]
9it [00:18,  1.47s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180301/1012019/000143774918003630.html | {}


 10%|▉         | 10/104 [00:26<05:10,  3.30s/it]
 11%|█         | 11/104 [00:27<04:17,  2.77s/it]
11it [00:27,  2.77s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180301/1013871/000101387118000011.html


 12%|█▏        | 12/104 [00:30<04:07,  2.69s/it]
 12%|█▎        | 13/104 [00:31<03:35,  2.37s/it]
 13%|█▎        | 14/104 [00:35<04:00,  2.67s/it]
14it [00:35,  2.67s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180326/1003509/000119312518096155.html | {}


 14%|█▍        | 15/104 [00:35<02:56,  1.98s/it]
15it [00:35,  1.98s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180309/1015155/000114036118012655.html | {'PART II': (102, 109), 'PART III': (688185, 688193), 'PART IVI': (693795, 693803)}


 15%|█▌        | 16/104 [00:36<02:28,  1.69s/it]
 16%|█▋        | 17/104 [00:42<04:07,  2.85s/it]
 17%|█▋        | 18/104 [00:42<03:00,  2.09s/it]
18it [00:42,  2.09s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180301/1005817/000100581718000002.html | {}


 18%|█▊        | 19/104 [00:44<03:00,  2.13s/it]
 19%|█▉        | 20/104 [00:44<02:07,  1.52s/it]

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180216/1003410/000078328018000012.html | {'PART II': (4017, 4024), 'PART III': (180292, 180300), 'PART IVI': (187130, 187138)}


 20%|██        | 21/104 [00:46<02:19,  1.68s/it]
21it [00:46,  1.81s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180326/1001171/000104746918002146.html | {'PART I': (3296, 3302), 'PART II': (109123, 109130), 'PART III': (160744, 160752)}


 21%|██        | 22/104 [00:53<04:07,  3.01s/it]
22it [00:53,  3.11s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180302/1011509/000155837018001416.html | {}


 22%|██▏       | 23/104 [00:53<02:53,  2.15s/it]
23it [00:53,  2.21s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180216/1014473/000101447318000018.html | {}


 23%|██▎       | 24/104 [00:59<04:30,  3.38s/it]
 24%|██▍       | 25/104 [01:00<03:29,  2.65s/it]
 25%|██▌       | 26/104 [01:01<02:41,  2.07s/it]
 26%|██▌       | 27/104 [01:03<02:37,  2.05s/it]
27it [01:03,  2.07s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180301/1001233/000156459018004163.html


 27%|██▋       | 28/104 [01:07<03:24,  2.69s/it]
28it [01:07,  2.70s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180302/1007587/000100758718000006.html | {'PART II': (113, 120), 'PART III': (254152, 254160), 'PART IVI': (256008, 256016)}


 28%|██▊       | 29/104 [01:10<03:42,  2.97s/it]
29it [01:10,  2.98s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180301/1012771/000101277118000006.html | {'PART II': (5238, 5245), 'PART III': (382959, 382967), 'PART IVI': (383258, 383266)}


 29%|██▉       | 30/104 [01:11<02:46,  2.24s/it]
30it [01:11,  2.25s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180221/1015780/000101578018000033.html | {'PART I': (62371, 62376)}


 30%|██▉       | 31/104 [01:13<02:44,  2.25s/it]
31it [01:13,  2.26s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180302/1011659/000162828018002667.html | {'PART II': (12072, 12079), 'PART III': (164560, 164568), 'PART IVI': (169261, 169269)}


 31%|███       | 32/104 [01:17<03:06,  2.58s/it]
 32%|███▏      | 33/104 [01:17<02:23,  2.02s/it]
33it [01:17,  2.02s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180221/1004155/000009212218000012.html | {'PART I': (299433, 299438)}


 33%|███▎      | 34/104 [01:22<03:19,  2.85s/it]
 34%|███▎      | 35/104 [01:23<02:31,  2.20s/it]
 35%|███▍      | 36/104 [01:26<02:52,  2.54s/it]
 36%|███▌      | 37/104 [01:26<02:02,  1.83s/it]
37it [01:26,  1.83s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180328/101594/000149315218004053.html | {'PART II': (2106, 2113), 'PART III': (301895, 301903), 'PART IVI': (321357, 321365)}


 37%|███▋      | 38/104 [01:27<01:36,  1.46s/it]
38it [01:27,  1.46s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180209/101829/000010182918000005.html | {}


 38%|███▊      | 39/104 [01:31<02:18,  2.12s/it]
39it [01:31,  2.12s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180316/1017968/000119312518086033.html


 38%|███▊      | 40/104 [01:35<03:01,  2.84s/it]
40it [01:35,  2.84s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180221/1001082/000155837018000826.html


 39%|███▉      | 41/104 [01:35<02:12,  2.10s/it]
 40%|████      | 42/104 [01:37<01:58,  1.92s/it]
 41%|████▏     | 43/104 [01:38<01:36,  1.59s/it]
43it [01:38,  1.59s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180221/1012100/000162828018001957.html


 42%|████▏     | 44/104 [01:41<02:08,  2.14s/it]
 43%|████▎     | 45/104 [01:47<03:11,  3.25s/it]
 44%|████▍     | 46/104 [01:48<02:31,  2.62s/it]
 45%|████▌     | 47/104 [01:50<02:10,  2.28s/it]
47it [01:50,  2.28s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180316/1012477/000101247718000008.html | {'PART I': (759432, 759437), 'PART III': (420170, 420178), 'PART IV': (422227, 422234)}


 46%|████▌     | 48/104 [01:50<01:39,  1.78s/it]
 47%|████▋     | 49/104 [01:55<02:34,  2.80s/it]
 48%|████▊     | 50/104 [01:57<02:12,  2.46s/it]
 49%|████▉     | 51/104 [01:59<02:07,  2.41s/it]
51it [01:59,  2.41s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180316/1000232/000155837018002201.html


 50%|█████     | 52/104 [02:00<01:34,  1.82s/it]
52it [02:00,  1.82s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180316/1001385/000143774918004857.html | {}


 51%|█████     | 53/104 [02:00<01:13,  1.43s/it]
53it [02:00,  1.44s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180131/1011452/000101145218000003.html | {}


 52%|█████▏    | 54/104 [02:01<00:55,  1.12s/it]
 53%|█████▎    | 55/104 [02:01<00:48,  1.01it/s]
55it [02:01,  1.01it/s][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180227/1011570/000101157018000012.html | {'PART II': (94, 101), 'PART III': (84510, 84518)}


 54%|█████▍    | 56/104 [02:02<00:39,  1.22it/s]
56it [02:02,  1.22it/s][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180330/1017655/000165495418003409.html


 55%|█████▍    | 57/104 [02:04<00:50,  1.08s/it]
57it [02:04,  1.08s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180321/1017303/000101730318000022.html | {'PART II': (79480, 79487), 'PART III': (223714, 223722), 'PART IV': (229233, 229240)}


 56%|█████▌    | 58/104 [02:07<01:23,  1.81s/it]
 57%|█████▋    | 59/104 [02:11<01:53,  2.52s/it]
 58%|█████▊    | 60/104 [02:12<01:22,  1.88s/it]
 59%|█████▊    | 61/104 [02:14<01:25,  1.98s/it]
61it [02:14,  1.98s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180314/1006045/000156459018005593.html | {}


 60%|█████▉    | 62/104 [02:16<01:25,  2.04s/it]
62it [02:16,  2.04s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180227/1000697/000119312518060830.html | {'PART I': (634937, 634942)}


 61%|██████    | 63/104 [02:18<01:20,  1.96s/it]
63it [02:18,  1.96s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180227/1008654/000100865418000006.html | {}


 62%|██████▏   | 64/104 [02:25<02:19,  3.48s/it]
64it [02:25,  3.48s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180215/100790/000002991518000005.html | {'PART II': (2080, 2087), 'PART III': (173676, 173684), 'PART IVI': (175521, 175529)}


 63%|██████▎   | 66/104 [02:31<02:05,  3.30s/it]
66it [02:31,  3.30s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180215/100122/000010012218000005.html | {'PART II': (7777, 7784), 'PART III': (287547, 287555), 'PART IVI': (291581, 291589)}


 64%|██████▍   | 67/104 [02:31<01:33,  2.54s/it]
67it [02:31,  2.54s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180306/1010086/000101008618000003.html | {'PART II': (133913, 133920), 'PART III': (272014, 272022), 'PART IV': (273914, 273921)}


 65%|██████▌   | 68/104 [02:33<01:20,  2.24s/it]
68it [02:33,  2.24s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180214/1001902/000119312518045618.html | {}


 66%|██████▋   | 69/104 [02:35<01:18,  2.25s/it]
 67%|██████▋   | 70/104 [02:35<00:54,  1.62s/it]
 68%|██████▊   | 71/104 [02:37<00:58,  1.77s/it]
 69%|██████▉   | 72/104 [02:41<01:11,  2.22s/it]
72it [02:41,  2.22s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180322/101295/000117184318002183.html


 70%|███████   | 73/104 [02:41<00:52,  1.71s/it]
73it [02:41,  1.71s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180314/1006837/000100683718000023.html | {'PART I': (35995, 36001)}


 71%|███████   | 74/104 [02:43<00:51,  1.70s/it]
74it [02:43,  1.70s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180228/101199/000010119918000040.html | {'PART I': (4829, 4835), 'PART II': (113016, 113023)}


 72%|███████▏  | 75/104 [02:44<00:46,  1.62s/it]
75it [02:44,  1.62s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180220/1018963/000101896318000004.html


 73%|███████▎  | 76/104 [02:48<01:00,  2.17s/it]
76it [02:48,  2.17s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180228/1018979/000156459018003779.html


 74%|███████▍  | 77/104 [02:50<00:58,  2.16s/it]
 75%|███████▌  | 78/104 [02:50<00:43,  1.66s/it]
78it [02:50,  1.67s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180220/1018254/000007420818000024.html


 76%|███████▌  | 79/104 [02:54<00:56,  2.28s/it]
79it [02:54,  2.28s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180228/1004702/000100470218000078.html | {}


 77%|███████▋  | 80/104 [02:56<00:54,  2.26s/it]
 78%|███████▊  | 81/104 [02:58<00:46,  2.03s/it]
81it [02:58,  2.03s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180228/1012620/000101262018000008.html | {'PART II': (3337, 3344), 'PART III': (394363, 394371), 'PART IVI': (397594, 397602)}


 79%|███████▉  | 82/104 [03:04<01:11,  3.25s/it]
 80%|███████▉  | 83/104 [03:04<00:48,  2.32s/it]
 81%|████████  | 84/104 [03:09<01:00,  3.00s/it]
84it [03:09,  3.00s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180228/100826/000100291018000069.html | {'PART II': (25901, 25908), 'PART III': (591402, 591410), 'PART IVI': (600459, 600467), 'PART I': (1061878, 1061883)}


 82%|████████▏ | 85/104 [03:18<01:34,  4.97s/it]
 83%|████████▎ | 86/104 [03:20<01:09,  3.86s/it]
86it [03:19,  3.86s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180228/1002910/000100291018000069.html | {'PART II': (25901, 25908), 'PART III': (591402, 591410), 'PART IVI': (600459, 600467), 'PART I': (1061878, 1061883)}
[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180313/1007330/000100733018000002.html | {}


 85%|████████▍ | 88/104 [03:20<00:45,  2.83s/it]
 86%|████████▌ | 89/104 [03:21<00:33,  2.26s/it]
 87%|████████▋ | 90/104 [03:22<00:26,  1.91s/it]
90it [03:22,  1.91s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180313/1010858/000156459018005434.html | {'PART I': (2282807, 2282812)}


 88%|████████▊ | 91/104 [03:23<00:20,  1.57s/it]
 88%|████████▊ | 92/104 [03:25<00:20,  1.68s/it]
92it [03:25,  1.67s/it][A

[INFO] Failure 1 in /home/alex/Desktop/data/daily_data/20180323/1013238/000119312518094094.html


 89%|████████▉ | 93/104 [03:26<00:16,  1.54s/it]
93it [03:26,  1.54s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180212/1000753/000100075318000015.html | {}


 90%|█████████ | 94/104 [03:27<00:12,  1.29s/it]
 91%|█████████▏| 95/104 [03:29<00:12,  1.43s/it]
95it [03:29,  1.43s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180212/1000229/000100022918000025.html | {'PART II': (79, 86), 'PART III': (146948, 146956), 'PART IVI': (147335, 147343)}


 92%|█████████▏| 96/104 [03:31<00:13,  1.68s/it]
96it [03:31,  1.68s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180223/1004434/000100443418000004.html | {'PART I': (287811, 287816), 'PART III': (269006, 269014), 'PART IVI': (270055, 270063)}


 93%|█████████▎| 97/104 [03:31<00:09,  1.31s/it]
97it [03:31,  1.31s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180313/1016169/000156459018005459.html | {'PART I': (67570, 67575)}


 94%|█████████▍| 98/104 [03:35<00:11,  2.00s/it]
 95%|█████████▌| 99/104 [03:37<00:10,  2.12s/it]
99it [03:37,  2.12s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180315/1018164/000155837018002139.html | {'PART II': (98581, 98588), 'PART III': (106282, 106290), 'PART IVI': (112669, 112677)}


 96%|█████████▌| 100/104 [03:38<00:06,  1.61s/it]
 97%|█████████▋| 101/104 [03:39<00:04,  1.48s/it]
101it [03:39,  1.47s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180227/1017673/000110465918012876.html | {}


 98%|█████████▊| 102/104 [03:41<00:03,  1.51s/it]
102it [03:41,  1.51s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180226/1013857/000119312518058491.html | {'PART II': (16986, 16993), 'PART III': (395978, 395986), 'PART IVI': (417633, 417641), 'PART I': (1318433, 1318438)}


 99%|█████████▉| 103/104 [03:50<00:03,  3.93s/it]
103it [03:50,  3.93s/it][A

[INFO] Failure 2 in /home/alex/Desktop/data/daily_data/20180315/1013272/000094627518000013.html | {}


100%|██████████| 104/104 [05:14<00:00, 27.96s/it]
104it [05:14, 27.96s/it][A
