In [1]:
%load_ext dotenv
%dotenv -o

import os
import random
import multiprocessing
from datetime import datetime
from urllib.parse import urlsplit

import ujson
import psycopg2
import pandas as pd
from tqdm import tqdm
from peewee import Model
from newspaper import Article
from newspaper.article import ArticleDownloadState
from playhouse.shortcuts import model_to_dict
from playhouse.postgres_ext import PostgresqlExtDatabase, DateTimeField, CharField, TextField, IntegerField, fn

In [None]:
## this stuff should be moved somewhere :) 

peewee_database = PostgresqlExtDatabase(os.environ['RESEARCHABLY_DB_NAME'], **{
    'host': os.environ    ['RESEARCHABLY_DB_HOST'],
    'user': os.environ    ['RESEARCHABLY_DB_USER'],
    'password': os.environ['RESEARCHABLY_DB_PASSWORD'],
    'register_hstore':    False
})


class BaseModel(Model):
    class Meta:
        database = peewee_database

        
class ScrapedPage(BaseModel):
    batch = IntegerField(null=False)
    url = CharField(null=False)
    html = TextField(null=False)
    inserted_at = DateTimeField(null=False, default=datetime.now())
    updated_at = DateTimeField(null=False, default=datetime.now())

    class Meta:
        db_table = 'fnr_scraped_pages'
        
        
def big_select_query(query, batch_size=2000):
    cursor = peewee_database._local.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    cursor_name = 'task_big_query' + str(random.randint(1, 1000 * 1000))
    cursor.execute('begin; declare ' + cursor_name + ' cursor for ' + query.sql()[0], query.sql()[1])

    while True:
        cursor.execute('fetch ' + str(batch_size) + ' from ' + cursor_name + ' ;')
        batch_results = cursor.fetchall()

        if len(batch_results) <= 0:
            break

        for row in batch_results:
            yield row

    cursor.close()

# Get scraped articles from database

In [None]:
scraped_pages_count = list(ScrapedPage.select(fn.Count(ScrapedPage.id).alias('count')).where(ScrapedPage.batch == 2))
scraped_pages_count = scraped_pages_count[0].count
scraped_pages_count

In [None]:
peewee_database.connect()

In [None]:
with open('data/7_opensources_co/scraped_pages.jsonl', 'w') as _out:
    for page in tqdm(big_select_query(ScrapedPage.select().where(ScrapedPage.batch == 2), 100)):
        _out.write(ujson.dumps(page) + '\n')

In [None]:
1

# Clean articles

In [4]:
def parse_article(line):
    try:
        page = ujson.loads(line)
        article = Article(page['url'])
        article.set_html(page['html'])
        article.config.fetch_images = False
        article.parse()

        return ujson.dumps({
            'id': page['id'],
            'batch': page['batch'],
            'url': page['url'],
            'title': article.title,
            'content': article.text,
            'published_at': article.publish_date,
            'authors': article.authors,
            'meta_description': article.meta_description,
            'meta_keywords': article.meta_keywords
        })
    except Exception as a:
        print(a)
        return None

In [5]:
def yield_scraped_pages():
    with open('data/7_opensources_co/scraped_pages.jsonl', 'r') as _in:
        for line in _in:
            yield line

with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
    with open('data/7_opensources_co/scraped_pages_articles.jsonl', 'w') as _out:
        for result in tqdm(pool.imap(parse_article, yield_scraped_pages(), chunksize=100)):
            if result is None:
                continue

            _out.write(result + '\n')


0it [00:00, ?it/s][A
1it [00:11, 11.83s/it][A
101it [00:13,  7.50it/s][A
101it [00:26,  3.86it/s][A
201it [01:05,  3.05it/s][A
5501it [01:06, 82.26it/s][A
5672it [01:08, 83.29it/s][A
5796it [01:11, 81.53it/s][A
5884it [01:11, 81.97it/s][A
5950it [01:12, 82.57it/s][A
6003it [01:12, 82.45it/s][A
6301it [01:13, 85.70it/s][A
6401it [01:14, 86.14it/s][A
6701it [01:14, 89.43it/s][A
6801it [01:17, 88.07it/s][A
6901it [01:17, 88.91it/s][A
7001it [01:19, 87.71it/s][A
7201it [01:21, 88.32it/s][A
7301it [01:24, 86.78it/s][A
7501it [01:24, 88.44it/s][A
7701it [01:27, 88.11it/s][A
8001it [01:28, 90.69it/s][A
8301it [01:29, 92.48it/s][A
8401it [01:31, 91.73it/s][A
8501it [01:32, 92.14it/s][A
8601it [01:33, 92.12it/s][A
8801it [01:35, 92.25it/s][A
8901it [01:37, 90.85it/s][A
9001it [01:38, 91.13it/s][A
9201it [01:39, 92.67it/s][A
9301it [01:41, 91.22it/s][A
9401it [01:42, 91.90it/s][A
9601it [01:42, 93.71it/s][A
9901it [01:44, 94.92it/s][A
10001it [01:46, 94.30it/s]

44601it [06:48, 109.31it/s][A
44801it [06:49, 109.34it/s][A
44901it [06:50, 109.50it/s][A
45001it [06:50, 109.56it/s][A
45101it [06:50, 109.73it/s][A
45201it [06:52, 109.55it/s][A
45301it [06:52, 109.73it/s][A
45401it [06:53, 109.70it/s][A
45601it [06:54, 110.04it/s][A
45701it [06:54, 110.23it/s][A
45801it [06:56, 110.06it/s][A
45901it [06:56, 110.15it/s][A
46001it [06:58, 109.94it/s][A
46101it [06:59, 109.91it/s][A
46201it [07:01, 109.53it/s][A
46401it [07:03, 109.52it/s][A
46501it [07:03, 109.72it/s][A
46601it [07:04, 109.79it/s][A
46801it [07:06, 109.66it/s][A
47001it [07:07, 110.00it/s][A
47201it [07:07, 110.42it/s][A
47301it [07:07, 110.57it/s][A
47401it [07:08, 110.52it/s][A
47501it [07:09, 110.49it/s][A
47601it [07:11, 110.26it/s][A
47701it [07:13, 110.15it/s][A
47801it [07:14, 109.89it/s][A
47901it [07:15, 110.08it/s][A
48001it [07:16, 109.87it/s][A
48301it [07:17, 110.33it/s][A
48401it [07:19, 110.07it/s][A
48501it [07:20, 110.21it/s][A
48701it 

81801it [12:12, 111.66it/s][A
81901it [12:13, 111.65it/s][A
82001it [12:14, 111.65it/s][A
82101it [12:16, 111.46it/s][A
82201it [12:18, 111.38it/s][A
82301it [12:18, 111.37it/s][A
82401it [12:19, 111.48it/s][A
82501it [12:19, 111.51it/s][A
82601it [12:19, 111.62it/s][A
82701it [12:21, 111.57it/s][A
82801it [12:21, 111.66it/s][A
82901it [12:22, 111.59it/s][A
83301it [12:23, 112.01it/s][A
83401it [12:25, 111.81it/s][A
83501it [12:27, 111.76it/s][A
83601it [12:27, 111.83it/s][A
83701it [12:29, 111.72it/s][A
83801it [12:29, 111.75it/s][A
83901it [12:31, 111.69it/s][A
84001it [12:31, 111.78it/s][A
84101it [12:32, 111.82it/s][A
84201it [12:32, 111.87it/s][A
84301it [12:34, 111.80it/s][A
84401it [12:34, 111.84it/s][A
84501it [12:36, 111.67it/s][A
84701it [12:37, 111.85it/s][A
84801it [12:37, 111.88it/s][A
84901it [12:39, 111.84it/s][A
85001it [12:40, 111.79it/s][A
85101it [12:41, 111.73it/s][A
85201it [12:41, 111.82it/s][A
85301it [12:43, 111.67it/s][A
85401it 

129101it [17:41, 121.66it/s][A
129501it [17:41, 122.00it/s][A
129801it [17:41, 122.27it/s][A
130001it [17:41, 122.43it/s][A
130110it [17:41, 122.52it/s][A
130213it [17:42, 122.54it/s][A
130401it [17:43, 122.60it/s][A
130601it [17:47, 122.30it/s][A
130901it [17:48, 122.54it/s][A
131201it [17:48, 122.78it/s][A
131601it [17:48, 123.14it/s][A
131719it [17:49, 123.21it/s][A
131813it [17:49, 123.26it/s][A
131901it [17:49, 123.29it/s][A
132001it [17:50, 123.31it/s][A
132101it [17:51, 123.34it/s][A
132201it [17:54, 122.98it/s][A
132501it [17:55, 123.24it/s][A
132601it [17:55, 123.32it/s][A
132801it [17:55, 123.43it/s][A
133301it [17:56, 123.82it/s][A
133501it [17:57, 123.94it/s][A
133601it [17:57, 124.01it/s][A
133701it [17:57, 124.03it/s][A
133801it [18:01, 123.67it/s][A
134001it [18:02, 123.83it/s][A
134101it [18:02, 123.91it/s][A
134601it [18:02, 124.31it/s][A
134701it [18:03, 124.37it/s][A
134901it [18:03, 124.52it/s][A
135001it [18:03, 124.57it/s][A
135101it

176601it [23:09, 127.10it/s][A
176701it [23:13, 126.81it/s][A
176801it [23:14, 126.77it/s][A
177201it [23:16, 126.91it/s][A
177801it [23:18, 127.16it/s][A
178101it [23:22, 127.00it/s][A
178201it [23:23, 126.94it/s][A
178301it [23:28, 126.63it/s][A
178501it [23:30, 126.52it/s][A
179301it [23:31, 126.99it/s][A
179401it [23:32, 127.04it/s][A
179701it [23:35, 126.98it/s][A
179801it [23:38, 126.73it/s][A
180001it [23:39, 126.76it/s][A
180201it [23:41, 126.80it/s][A
180301it [23:41, 126.86it/s][A
180401it [23:43, 126.73it/s][A
180701it [23:44, 126.85it/s][A
181301it [23:46, 127.14it/s][A
181401it [23:47, 127.05it/s][A
181501it [23:49, 126.95it/s][A
181801it [23:51, 127.03it/s][A
181901it [23:51, 127.07it/s][A
182101it [23:54, 126.96it/s][A
182201it [23:55, 126.90it/s][A
182301it [23:56, 126.94it/s][A
182601it [23:57, 127.01it/s][A
182901it [23:59, 127.08it/s][A
183001it [24:00, 127.00it/s][A
183201it [24:03, 126.88it/s][A
183301it [24:04, 126.91it/s][A
183601it

You must `download()` an article first!

You must `download()` an article first!




201701it [26:36, 126.38it/s][A
202201it [26:37, 126.56it/s][A
202601it [26:38, 126.71it/s][A
202701it [26:45, 126.23it/s][A
202701it [26:57, 125.32it/s][A
203101it [26:58, 125.49it/s][A
203201it [27:00, 125.43it/s][A
203601it [27:00, 125.61it/s][A
203701it [27:02, 125.56it/s][A
204201it [27:05, 125.61it/s][A
204601it [27:10, 125.46it/s][A
205601it [27:13, 125.89it/s][A
205701it [27:22, 125.27it/s][A
206601it [27:28, 125.35it/s][A
207101it [27:41, 124.67it/s][A
208901it [27:41, 125.72it/s][A
209201it [27:41, 125.89it/s][A
209401it [27:44, 125.78it/s][A
209515it [27:46, 125.75it/s][A
209601it [27:46, 125.74it/s][A
209701it [27:52, 125.38it/s][A
210401it [27:53, 125.71it/s][A
210701it [27:54, 125.86it/s][A
211101it [27:55, 126.03it/s][A
211201it [27:55, 126.07it/s][A
211301it [27:58, 125.90it/s][A
211601it [28:00, 125.92it/s][A
211701it [28:02, 125.81it/s][A
212301it [28:04, 126.01it/s][A
212501it [28:05, 126.08it/s][A
212601it [28:06, 126.06it/s][A
212701i

251001it [33:32, 124.71it/s][A
251101it [33:32, 124.75it/s][A
251201it [33:33, 124.76it/s][A
251301it [33:34, 124.76it/s][A
251401it [33:35, 124.74it/s][A
251501it [33:36, 124.74it/s][A
251601it [33:36, 124.75it/s][A
251701it [33:38, 124.72it/s][A
251801it [33:38, 124.74it/s][A
251901it [33:38, 124.77it/s][A
252201it [33:39, 124.87it/s][A
252301it [33:41, 124.83it/s][A
252401it [33:43, 124.76it/s][A
252501it [33:43, 124.78it/s][A
252701it [33:43, 124.87it/s][A
252801it [33:43, 124.90it/s][A
252901it [33:45, 124.85it/s][A
253001it [33:47, 124.79it/s][A
253301it [33:48, 124.87it/s][A
253701it [33:48, 125.06it/s][A
253828it [33:50, 125.04it/s][A
253920it [33:50, 125.05it/s][A
254001it [33:53, 124.93it/s][A
254301it [33:54, 125.01it/s][A
254501it [33:54, 125.09it/s][A
254601it [33:54, 125.12it/s][A
254701it [33:56, 125.09it/s][A
254801it [33:56, 125.10it/s][A
254901it [34:00, 124.95it/s][A
255501it [34:00, 125.22it/s][A
255601it [34:03, 125.10it/s][A
255701it

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




303901it [39:42, 127.55it/s][A
304001it [39:45, 127.43it/s][A
304301it [39:46, 127.49it/s][A

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




304401it [39:53, 127.18it/s][A

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




304401it [40:08, 126.41it/s][A

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




304501it [40:39, 124.80it/s][A

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




305301it [41:13, 123.43it/s][A

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




305301it [41:28, 122.70it/s][A

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




305401it [41:39, 122.17it/s][A

You must `download()` an article first!




305501it [42:22, 120.16it/s][A
313736it [42:22, 123.39it/s][A
316027it [42:23, 124.27it/s][A
316027it [42:38, 123.53it/s][A
317401it [42:38, 124.04it/s][A
317501it [42:44, 123.82it/s][A
318401it [42:45, 124.12it/s][A
318701it [42:47, 124.14it/s][A
318801it [42:50, 124.00it/s][A
319001it [42:54, 123.89it/s][A
319701it [42:54, 124.16it/s][A
319862it [42:55, 124.21it/s][A
320201it [42:56, 124.30it/s][A
320309it [42:58, 124.24it/s][A
320401it [43:00, 124.18it/s][A
320501it [43:01, 124.14it/s][A
320601it [43:02, 124.13it/s][A
320701it [43:03, 124.14it/s][A
320801it [43:04, 124.14it/s][A
320901it [43:04, 124.15it/s][A
321101it [43:05, 124.20it/s][A
321501it [43:05, 124.35it/s][A
321601it [43:06, 124.36it/s][A
321801it [43:06, 124.43it/s][A
321901it [43:08, 124.38it/s][A
322001it [43:10, 124.31it/s][A
322101it [43:11, 124.29it/s][A
322201it [43:13, 124.26it/s][A
322301it [43:13, 124.27it/s][A
322401it [43:14, 124.27it/s][A
322501it [43:14, 124.29it/s][A
322601i

355001it [47:14, 125.22it/s][A
355101it [47:16, 125.20it/s][A
355401it [47:16, 125.29it/s][A
355501it [47:17, 125.29it/s][A
355601it [47:17, 125.31it/s][A
355701it [47:18, 125.32it/s][A
355801it [47:18, 125.34it/s][A
355901it [47:19, 125.36it/s][A
356001it [47:20, 125.32it/s][A
356101it [47:20, 125.35it/s][A
356201it [47:22, 125.32it/s][A
356401it [47:25, 125.27it/s][A
356501it [47:27, 125.19it/s][A
356601it [47:28, 125.17it/s][A
356701it [47:29, 125.18it/s][A
356801it [47:29, 125.19it/s][A
357001it [47:30, 125.23it/s][A
357101it [47:31, 125.24it/s][A
357201it [47:31, 125.27it/s][A
357301it [47:32, 125.27it/s][A
357501it [47:41, 124.93it/s][A
357601it [47:43, 124.89it/s][A
358401it [47:44, 125.10it/s][A
358501it [47:46, 125.07it/s][A
358601it [47:46, 125.08it/s][A
358701it [47:57, 124.67it/s][A
358901it [47:57, 124.71it/s][A
359101it [48:02, 124.59it/s][A
359201it [48:02, 124.59it/s][A
359401it [48:04, 124.62it/s][A
359501it [48:07, 124.52it/s][A
359601it

397301it [54:53, 120.64it/s][A
397401it [54:54, 120.62it/s][A
397701it [54:55, 120.66it/s][A
397801it [54:58, 120.61it/s][A
397901it [54:59, 120.61it/s][A
398101it [55:00, 120.63it/s][A
398201it [55:00, 120.64it/s][A
398601it [55:01, 120.73it/s][A
398701it [55:03, 120.70it/s][A
398801it [55:07, 120.58it/s][A
398901it [55:09, 120.55it/s][A
399001it [55:09, 120.55it/s][A
399101it [55:10, 120.54it/s][A
399201it [55:11, 120.55it/s][A
399301it [55:13, 120.52it/s][A
399501it [55:13, 120.57it/s][A
399801it [55:13, 120.65it/s][A
399901it [55:14, 120.63it/s][A
400101it [55:17, 120.61it/s][A
400201it [55:18, 120.60it/s][A
400301it [55:19, 120.59it/s][A
400401it [55:21, 120.54it/s][A
400501it [55:23, 120.51it/s][A
400601it [55:25, 120.45it/s][A
400701it [55:27, 120.43it/s][A
400801it [55:28, 120.43it/s][A
400901it [55:29, 120.43it/s][A
401101it [55:31, 120.40it/s][A
401501it [55:31, 120.51it/s][A
401701it [55:34, 120.47it/s][A
402001it [55:34, 120.56it/s][A
402108it

You must `download()` an article first!




419001it [57:46, 120.89it/s][A
419501it [57:46, 121.02it/s][A
419601it [57:54, 120.77it/s][A
420601it [57:55, 121.03it/s][A
420901it [57:55, 121.09it/s][A

You must `download()` an article first!




421001it [58:06, 120.75it/s][A
422101it [58:06, 121.06it/s][A
422325it [58:10, 121.00it/s][A
422484it [58:12, 120.98it/s][A
422598it [58:14, 120.94it/s][A
423401it [58:14, 121.17it/s][A
423657it [58:14, 121.23it/s][A
423858it [58:18, 121.14it/s][A
424001it [58:20, 121.14it/s][A
424104it [58:21, 121.12it/s][A
424201it [58:29, 120.88it/s][A
424701it [58:32, 120.92it/s][A
425801it [58:32, 121.22it/s][A
425920it [58:34, 121.19it/s][A
426005it [58:36, 121.15it/s][A
426101it [58:37, 121.13it/s][A
426301it [58:38, 121.17it/s][A
426401it [58:39, 121.17it/s][A
426501it [58:40, 121.16it/s][A
426601it [58:43, 121.08it/s][A
426801it [58:45, 121.06it/s][A
427301it [58:47, 121.15it/s][A
427601it [58:50, 121.13it/s][A
427701it [58:50, 121.14it/s][A
427801it [58:51, 121.15it/s][A
427901it [58:52, 121.13it/s][A
428101it [58:54, 121.12it/s][A
428201it [58:58, 121.02it/s][A

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




428701it [59:04, 120.93it/s][A
429201it [59:07, 120.97it/s][A
429801it [59:10, 121.05it/s][A
430101it [59:11, 121.11it/s][A
430301it [59:14, 121.06it/s][A
430601it [59:15, 121.11it/s][A
430701it [59:18, 121.03it/s][A
430801it [59:21, 120.97it/s][A
431201it [59:21, 121.08it/s][A
431401it [59:21, 121.14it/s][A
431528it [59:22, 121.15it/s][A
431623it [59:24, 121.09it/s][A
431701it [59:28, 120.99it/s][A
432101it [59:28, 121.08it/s][A
432201it [59:33, 120.94it/s][A
432601it [59:33, 121.04it/s][A
432701it [59:35, 121.03it/s][A
433101it [59:35, 121.12it/s][A
433201it [59:40, 121.00it/s][A
433601it [59:41, 121.08it/s][A
433701it [59:41, 121.10it/s][A
433801it [59:43, 121.07it/s][A
433901it [59:43, 121.07it/s][A
434001it [59:44, 121.06it/s][A
434101it [59:46, 121.03it/s][A
434401it [59:47, 121.10it/s][A
434501it [59:47, 121.10it/s][A
434601it [59:49, 121.08it/s][A
434801it [59:50, 121.09it/s][A
434901it [59:52, 121.06it/s][A
435001it [59:53, 121.05it/s][A
435101i

You must `download()` an article first!

You must `download()` an article first!




439601it [1:00:31, 121.05it/s][A
439901it [1:00:33, 121.05it/s][A
440001it [1:00:35, 121.02it/s][A
440101it [1:00:36, 121.02it/s][A
440201it [1:00:37, 121.02it/s][A
440301it [1:00:37, 121.05it/s][A
440401it [1:00:37, 121.06it/s][A
440501it [1:00:38, 121.07it/s][A
440701it [1:00:40, 121.06it/s][A
440801it [1:00:41, 121.05it/s][A
440901it [1:00:43, 121.00it/s][A
441301it [1:00:44, 121.08it/s][A
441401it [1:00:44, 121.11it/s][A
441501it [1:00:47, 121.03it/s][A
441701it [1:00:48, 121.07it/s][A
441801it [1:00:50, 121.02it/s][A

You must `download()` an article first!




441901it [1:00:57, 120.82it/s][A
443001it [1:00:58, 121.07it/s][A
443101it [1:00:59, 121.07it/s][A
443201it [1:01:01, 121.03it/s][A
443501it [1:01:06, 120.96it/s][A
443901it [1:01:07, 121.04it/s][A
444001it [1:01:08, 121.02it/s][A
444201it [1:01:09, 121.05it/s][A
444401it [1:01:10, 121.07it/s][A
444501it [1:01:11, 121.05it/s][A
444601it [1:01:13, 121.03it/s][A
444701it [1:01:16, 120.95it/s][A
444901it [1:01:17, 120.98it/s][A
445001it [1:01:22, 120.85it/s][A
446001it [1:01:22, 121.10it/s][A
446101it [1:01:26, 121.02it/s][A
446301it [1:01:28, 121.01it/s][A
446401it [1:01:29, 120.98it/s][A
446701it [1:01:34, 120.93it/s][A
447201it [1:01:34, 121.05it/s][A
447301it [1:01:34, 121.07it/s][A
447501it [1:01:36, 121.05it/s][A
447701it [1:01:37, 121.08it/s][A
447801it [1:01:38, 121.09it/s][A
447901it [1:01:41, 121.01it/s][A
448001it [1:01:42, 121.01it/s][A
448201it [1:01:42, 121.04it/s][A
448301it [1:01:43, 121.04it/s][A
448401it [1:01:43, 121.06it/s][A
448501it [1:0

You must `download()` an article first!




456701it [1:02:34, 121.63it/s][A
457001it [1:02:35, 121.69it/s][A
457101it [1:02:35, 121.71it/s][A
457301it [1:02:39, 121.65it/s][A
457401it [1:02:39, 121.65it/s][A
457501it [1:02:40, 121.67it/s][A
457601it [1:02:40, 121.68it/s][A
457701it [1:02:41, 121.68it/s][A
457801it [1:02:43, 121.65it/s][A
458201it [1:02:43, 121.75it/s][A
458301it [1:02:45, 121.72it/s][A
458601it [1:02:52, 121.56it/s][A
458701it [1:03:05, 121.18it/s][A
458801it [1:03:20, 120.71it/s][A
461501it [1:03:24, 121.31it/s][A
462201it [1:03:25, 121.45it/s][A
462301it [1:03:26, 121.46it/s][A
462401it [1:03:27, 121.45it/s][A
462501it [1:03:27, 121.46it/s][A
462601it [1:03:28, 121.48it/s][A
462801it [1:03:30, 121.45it/s][A
463001it [1:03:31, 121.46it/s][A
463101it [1:03:34, 121.41it/s][A
463401it [1:03:34, 121.47it/s][A
463501it [1:03:39, 121.35it/s][A
463701it [1:03:40, 121.38it/s][A
463801it [1:03:43, 121.31it/s][A
463901it [1:03:45, 121.26it/s][A
464001it [1:03:48, 121.19it/s][A
464601it [1:0

You must `download()` an article first!




497801it [1:07:05, 123.66it/s][A
497901it [1:07:07, 123.64it/s][A
498201it [1:07:07, 123.70it/s][A
498301it [1:07:08, 123.70it/s][A
498401it [1:07:08, 123.71it/s][A
498501it [1:07:09, 123.72it/s][A
498801it [1:07:09, 123.79it/s][A
499001it [1:07:10, 123.80it/s][A
499101it [1:07:13, 123.74it/s][A
499501it [1:07:14, 123.82it/s][A
499601it [1:07:16, 123.78it/s][A
499701it [1:07:16, 123.80it/s][A
499801it [1:07:17, 123.80it/s][A
499901it [1:07:17, 123.81it/s][A
500201it [1:07:21, 123.76it/s][A
500301it [1:07:22, 123.77it/s][A
500401it [1:07:22, 123.79it/s][A
500501it [1:07:23, 123.77it/s][A
500801it [1:07:27, 123.72it/s][A
501201it [1:07:30, 123.75it/s][A
501401it [1:07:30, 123.79it/s][A
501901it [1:07:32, 123.85it/s][A
502001it [1:07:33, 123.84it/s][A
502201it [1:07:33, 123.88it/s][A
502501it [1:07:34, 123.94it/s][A
502601it [1:07:34, 123.96it/s][A
502701it [1:07:35, 123.94it/s][A
502801it [1:07:36, 123.94it/s][A
502901it [1:07:37, 123.93it/s][A
503001it [1:0

540801it [1:12:27, 124.38it/s][A
540901it [1:12:29, 124.36it/s][A
541001it [1:12:30, 124.35it/s][A
541201it [1:12:31, 124.38it/s][A
541501it [1:12:31, 124.44it/s][A
541601it [1:12:32, 124.43it/s][A
541801it [1:12:32, 124.47it/s][A
542001it [1:12:33, 124.50it/s][A
542101it [1:12:34, 124.50it/s][A
542201it [1:12:35, 124.49it/s][A
542301it [1:12:36, 124.48it/s][A
542401it [1:12:39, 124.42it/s][A
542501it [1:12:42, 124.37it/s][A
542601it [1:12:42, 124.37it/s][A
542901it [1:12:42, 124.44it/s][A
543001it [1:12:43, 124.45it/s][A
543201it [1:12:44, 124.47it/s][A
543301it [1:12:44, 124.47it/s][A
543401it [1:12:45, 124.48it/s][A
543801it [1:12:47, 124.52it/s][A
543901it [1:12:47, 124.53it/s][A
544001it [1:12:50, 124.46it/s][A
544101it [1:12:53, 124.41it/s][A
544201it [1:12:53, 124.43it/s][A
544601it [1:12:53, 124.52it/s][A
544801it [1:12:54, 124.55it/s][A
545001it [1:12:54, 124.58it/s][A
545201it [1:12:55, 124.59it/s][A
545401it [1:12:57, 124.58it/s][A
545501it [1:12

585601it [1:18:12, 124.81it/s][A
585701it [1:18:15, 124.74it/s][A
585801it [1:18:16, 124.72it/s][A
585901it [1:18:18, 124.70it/s][A
586101it [1:18:18, 124.73it/s][A
586201it [1:18:19, 124.74it/s][A
586301it [1:18:19, 124.76it/s][A
586401it [1:18:19, 124.77it/s][A
586701it [1:18:20, 124.81it/s][A
586801it [1:18:21, 124.82it/s][A
587001it [1:18:23, 124.81it/s][A
587301it [1:18:25, 124.81it/s][A
587401it [1:18:26, 124.82it/s][A
587501it [1:18:27, 124.80it/s][A
587601it [1:18:27, 124.82it/s][A
587701it [1:18:28, 124.83it/s][A
588101it [1:18:28, 124.89it/s][A
588201it [1:18:29, 124.90it/s][A
588301it [1:18:30, 124.88it/s][A
588601it [1:18:31, 124.94it/s][A
588701it [1:18:32, 124.93it/s][A
588801it [1:18:33, 124.92it/s][A
588901it [1:18:36, 124.87it/s][A
589101it [1:18:36, 124.90it/s][A
589201it [1:18:37, 124.90it/s][A
589301it [1:18:39, 124.87it/s][A
589501it [1:18:39, 124.90it/s][A
589601it [1:18:40, 124.91it/s][A
589701it [1:18:42, 124.87it/s][A
590001it [1:18

You must `download()` an article first!




613001it [1:22:46, 123.42it/s][A
613201it [1:22:49, 123.40it/s][A
613601it [1:22:49, 123.46it/s][A
613701it [1:22:50, 123.48it/s][A
613901it [1:22:50, 123.51it/s][A
614001it [1:22:50, 123.53it/s][A
614401it [1:22:50, 123.60it/s][A
614601it [1:22:54, 123.54it/s][A
615101it [1:22:56, 123.61it/s][A
615601it [1:22:59, 123.62it/s][A
615901it [1:23:00, 123.66it/s][A
616801it [1:23:00, 123.84it/s][A
616945it [1:23:04, 123.76it/s][A
617301it [1:23:05, 123.82it/s][A
617801it [1:23:08, 123.85it/s][A
617901it [1:23:12, 123.78it/s][A
618201it [1:23:12, 123.83it/s][A
618701it [1:23:15, 123.85it/s][A
619201it [1:23:16, 123.94it/s][A
619301it [1:23:16, 123.95it/s][A
619401it [1:23:19, 123.89it/s][A
619501it [1:23:20, 123.88it/s][A
619601it [1:23:22, 123.85it/s][A
619701it [1:23:23, 123.86it/s][A
619901it [1:23:23, 123.89it/s][A
620001it [1:23:26, 123.85it/s][A
620201it [1:23:27, 123.87it/s][A
620301it [1:23:29, 123.83it/s][A
620401it [1:23:29, 123.84it/s][A
620601it [1:2

665101it [1:32:01, 120.45it/s][A
665601it [1:32:02, 120.53it/s][A
665701it [1:32:06, 120.47it/s][A
665801it [1:32:06, 120.48it/s][A
666001it [1:32:06, 120.51it/s][A
666101it [1:32:06, 120.53it/s][A
666601it [1:32:06, 120.61it/s][A
666746it [1:32:07, 120.61it/s][A
666851it [1:32:08, 120.62it/s][A
666931it [1:32:12, 120.54it/s][A
667001it [1:32:16, 120.48it/s][A
667601it [1:32:16, 120.58it/s][A
667701it [1:32:17, 120.58it/s][A
668401it [1:32:18, 120.68it/s][A
668501it [1:32:21, 120.64it/s][A
668601it [1:32:21, 120.66it/s][A
668701it [1:32:22, 120.64it/s][A
668801it [1:32:25, 120.60it/s][A
668901it [1:32:26, 120.60it/s][A
669501it [1:32:26, 120.71it/s][A
669901it [1:32:27, 120.75it/s][A
670030it [1:32:28, 120.76it/s][A
670125it [1:32:31, 120.70it/s][A
670201it [1:32:34, 120.66it/s][A
670601it [1:32:34, 120.72it/s][A
670701it [1:32:36, 120.71it/s][A
670801it [1:32:36, 120.71it/s][A
671001it [1:32:37, 120.73it/s][A
671101it [1:32:38, 120.74it/s][A
671601it [1:32

716201it [1:37:20, 122.62it/s][A
716401it [1:37:20, 122.65it/s][A
716501it [1:37:21, 122.66it/s][A
716701it [1:37:21, 122.69it/s][A
716801it [1:37:21, 122.71it/s][A
716901it [1:37:22, 122.71it/s][A
717101it [1:37:22, 122.73it/s][A
717201it [1:37:25, 122.70it/s][A
717301it [1:37:26, 122.70it/s][A
717501it [1:37:26, 122.73it/s][A
717701it [1:37:26, 122.76it/s][A
717801it [1:37:26, 122.77it/s][A
717901it [1:37:26, 122.78it/s][A
718201it [1:37:27, 122.83it/s][A
718401it [1:37:27, 122.85it/s][A
718601it [1:37:27, 122.88it/s][A
718701it [1:37:29, 122.87it/s][A
718801it [1:37:30, 122.86it/s][A
718901it [1:37:31, 122.86it/s][A
719001it [1:37:32, 122.85it/s][A
719801it [1:37:33, 122.97it/s][A
720001it [1:37:33, 123.00it/s][A
720201it [1:37:34, 123.03it/s][A
720301it [1:37:35, 123.02it/s][A
720401it [1:37:36, 123.01it/s][A
720501it [1:37:37, 123.01it/s][A
720601it [1:37:37, 123.01it/s][A
720701it [1:37:38, 123.03it/s][A
720801it [1:37:38, 123.04it/s][A
721201it [1:37

754901it [1:39:30, 126.43it/s][A
755001it [1:39:31, 126.43it/s][A
755101it [1:39:31, 126.44it/s][A
755301it [1:39:32, 126.47it/s][A
755401it [1:39:32, 126.49it/s][A
755601it [1:39:32, 126.51it/s][A
755701it [1:39:32, 126.52it/s][A
755901it [1:39:33, 126.53it/s][A
756101it [1:39:34, 126.55it/s][A
756401it [1:39:35, 126.59it/s][A
756501it [1:39:35, 126.60it/s][A
756601it [1:39:36, 126.59it/s][A
756801it [1:39:37, 126.61it/s][A
757201it [1:39:37, 126.68it/s][A
757309it [1:39:37, 126.69it/s][A
757401it [1:39:38, 126.70it/s][A
757501it [1:39:38, 126.70it/s][A
757601it [1:39:39, 126.71it/s][A
757701it [1:39:39, 126.72it/s][A
757801it [1:39:40, 126.72it/s][A
758001it [1:39:40, 126.74it/s][A
758101it [1:39:41, 126.75it/s][A
758201it [1:39:41, 126.76it/s][A
758301it [1:39:42, 126.76it/s][A
758501it [1:39:42, 126.79it/s][A
758701it [1:39:42, 126.82it/s][A
758901it [1:39:42, 126.85it/s][A
759001it [1:39:42, 126.86it/s][A
759101it [1:39:43, 126.87it/s][A
759201it [1:39

You must `download()` an article first!




777901it [1:41:05, 128.25it/s][A

You must `download()` an article first!




778001it [1:41:06, 128.24it/s][A
778101it [1:41:09, 128.21it/s][A
778201it [1:41:10, 128.20it/s][A
778601it [1:41:10, 128.26it/s][A
778901it [1:41:12, 128.27it/s][A
779501it [1:41:15, 128.29it/s][A
779701it [1:41:17, 128.29it/s][A
779901it [1:41:19, 128.28it/s][A
780101it [1:41:20, 128.29it/s][A
780601it [1:41:21, 128.35it/s][A
781101it [1:41:23, 128.39it/s][A
781201it [1:41:25, 128.37it/s][A
781301it [1:41:28, 128.33it/s][A
781401it [1:41:28, 128.34it/s][A
781601it [1:41:28, 128.36it/s][A
781701it [1:41:29, 128.37it/s][A
781801it [1:41:29, 128.38it/s][A
782001it [1:41:30, 128.41it/s][A
782201it [1:41:31, 128.40it/s][A
782301it [1:41:33, 128.39it/s][A
782401it [1:41:33, 128.40it/s][A
782701it [1:41:34, 128.43it/s][A
782801it [1:41:35, 128.41it/s][A
782901it [1:41:38, 128.38it/s][A
783001it [1:41:41, 128.34it/s][A
783801it [1:41:42, 128.44it/s][A
783901it [1:41:44, 128.41it/s][A
784401it [1:41:45, 128.47it/s][A
784501it [1:41:48, 128.42it/s][A
785001it [1:4

You must `download()` an article first!




802501it [1:43:56, 128.68it/s][A
802601it [1:44:05, 128.51it/s][A
804001it [1:44:06, 128.70it/s][A
804101it [1:44:11, 128.63it/s][A
804201it [1:44:13, 128.60it/s][A
804501it [1:44:13, 128.64it/s][A
804701it [1:44:15, 128.64it/s][A
804801it [1:44:15, 128.65it/s][A
804901it [1:44:15, 128.66it/s][A
805001it [1:44:16, 128.67it/s][A
805101it [1:44:16, 128.67it/s][A
805301it [1:44:17, 128.70it/s][A
805401it [1:44:18, 128.70it/s][A
805601it [1:44:18, 128.72it/s][A
805701it [1:44:24, 128.61it/s][A
805801it [1:44:25, 128.61it/s][A
805901it [1:44:34, 128.43it/s][A
806401it [1:44:40, 128.40it/s][A
806401it [1:44:51, 128.18it/s][A
806501it [1:44:51, 128.18it/s][A
806601it [1:45:15, 127.73it/s][A
806701it [1:45:19, 127.66it/s][A
806701it [1:45:31, 127.41it/s][A
807001it [1:45:40, 127.28it/s][A
807301it [1:45:45, 127.22it/s][A
807801it [1:45:54, 127.12it/s][A
807801it [1:46:11, 126.79it/s][A
808301it [1:46:17, 126.73it/s][A
811801it [1:46:18, 127.26it/s][A
811911it [1:4

855301it [1:51:18, 128.07it/s][A
855501it [1:51:21, 128.05it/s][A
855601it [1:51:23, 128.01it/s][A
855701it [1:51:24, 128.01it/s][A
855801it [1:51:25, 128.02it/s][A
856001it [1:51:30, 127.95it/s][A
856301it [1:51:32, 127.95it/s][A
856901it [1:51:33, 128.02it/s][A
857001it [1:51:33, 128.03it/s][A
857101it [1:51:35, 128.01it/s][A
857201it [1:51:39, 127.95it/s][A
857401it [1:51:40, 127.95it/s][A
857701it [1:51:41, 127.98it/s][A
857901it [1:51:45, 127.95it/s][A
858501it [1:51:46, 128.01it/s][A
858701it [1:51:46, 128.03it/s][A
858801it [1:51:50, 127.97it/s][A
859101it [1:51:54, 127.95it/s][A
859301it [1:51:56, 127.95it/s][A
859501it [1:51:56, 127.96it/s][A
859601it [1:51:58, 127.95it/s][A
860201it [1:51:58, 128.04it/s][A
860312it [1:51:59, 128.02it/s][A
860401it [1:52:00, 128.03it/s][A
860501it [1:52:04, 127.97it/s][A
860601it [1:52:05, 127.97it/s][A
860701it [1:52:07, 127.95it/s][A
860801it [1:52:08, 127.94it/s][A
861001it [1:52:08, 127.96it/s][A
861101it [1:52

You must `download()` an article first!




887301it [1:55:38, 127.89it/s][A
887801it [1:55:42, 127.89it/s][A
888001it [1:55:43, 127.89it/s][A
888101it [1:55:44, 127.88it/s][A
888601it [1:55:45, 127.93it/s][A
888701it [1:55:48, 127.90it/s][A
888801it [1:55:49, 127.90it/s][A
888901it [1:55:53, 127.84it/s][A
889401it [1:55:56, 127.85it/s][A
889601it [1:55:57, 127.86it/s][A
889701it [1:55:57, 127.87it/s][A
890001it [1:55:58, 127.89it/s][A
890301it [1:56:00, 127.91it/s][A
890401it [1:56:02, 127.88it/s][A
890501it [1:56:06, 127.83it/s][A
890801it [1:56:06, 127.87it/s][A
891001it [1:56:08, 127.87it/s][A
891101it [1:56:08, 127.87it/s][A
891201it [1:56:09, 127.86it/s][A
891301it [1:56:11, 127.84it/s][A
891501it [1:56:13, 127.84it/s][A

You must `download()` an article first!




891701it [1:56:15, 127.83it/s][A
892001it [1:56:16, 127.86it/s][A
892101it [1:56:18, 127.83it/s][A
892201it [1:56:22, 127.77it/s][A
892301it [1:56:24, 127.75it/s][A
893301it [1:56:25, 127.88it/s][A
893401it [1:56:26, 127.87it/s][A
893501it [1:56:28, 127.86it/s][A
893601it [1:56:28, 127.86it/s][A
893701it [1:56:30, 127.85it/s][A
893801it [1:56:32, 127.83it/s][A
894001it [1:56:35, 127.80it/s][A
894701it [1:56:36, 127.88it/s][A
894801it [1:56:38, 127.86it/s][A
895001it [1:56:38, 127.88it/s][A
895101it [1:56:40, 127.86it/s][A
895301it [1:56:41, 127.87it/s][A
895401it [1:56:43, 127.84it/s][A
895501it [1:56:45, 127.82it/s][A
895701it [1:56:46, 127.84it/s][A
895901it [1:56:47, 127.85it/s][A
896301it [1:56:48, 127.90it/s][A
896401it [1:56:49, 127.88it/s][A
896501it [1:56:50, 127.87it/s][A
896701it [1:56:51, 127.88it/s][A
896801it [1:56:53, 127.87it/s][A
896901it [1:56:56, 127.83it/s][A
897101it [1:56:58, 127.82it/s][A
897701it [1:56:58, 127.90it/s][A
897801it [1:5

940701it [2:01:52, 128.64it/s][A
941001it [2:01:53, 128.67it/s][A
941101it [2:01:54, 128.67it/s][A
941201it [2:01:54, 128.68it/s][A
941301it [2:01:56, 128.65it/s][A
941501it [2:01:58, 128.64it/s][A
941701it [2:01:58, 128.67it/s][A
941901it [2:01:59, 128.69it/s][A
942001it [2:02:00, 128.68it/s][A
942101it [2:02:02, 128.67it/s][A
942301it [2:02:02, 128.69it/s][A
942401it [2:02:03, 128.67it/s][A
942601it [2:02:05, 128.68it/s][A
942701it [2:02:05, 128.69it/s][A
942901it [2:02:06, 128.70it/s][A
943101it [2:02:09, 128.67it/s][A
943201it [2:02:10, 128.67it/s][A
943301it [2:02:10, 128.68it/s][A
943601it [2:02:11, 128.70it/s][A
943801it [2:02:13, 128.70it/s][A
943901it [2:02:14, 128.70it/s][A
944001it [2:02:15, 128.69it/s][A
944101it [2:02:15, 128.69it/s][A
944201it [2:02:16, 128.69it/s][A
944601it [2:02:18, 128.71it/s][A
944701it [2:02:20, 128.70it/s][A
944801it [2:02:21, 128.70it/s][A
944901it [2:02:21, 128.71it/s][A
945001it [2:02:22, 128.70it/s][A
945401it [2:02

980101it [2:07:09, 128.46it/s][A
980201it [2:07:11, 128.43it/s][A
980301it [2:07:15, 128.39it/s][A
980401it [2:07:18, 128.35it/s][A
980501it [2:07:19, 128.35it/s][A
980601it [2:07:20, 128.34it/s][A
980701it [2:07:21, 128.34it/s][A
981001it [2:07:23, 128.35it/s][A
981101it [2:07:24, 128.34it/s][A
981301it [2:07:25, 128.35it/s][A
981401it [2:07:28, 128.31it/s][A
981501it [2:07:29, 128.31it/s][A
981601it [2:07:31, 128.29it/s][A
981701it [2:07:34, 128.26it/s][A
981801it [2:07:35, 128.24it/s][A
981901it [2:07:39, 128.19it/s][A
982001it [2:07:42, 128.16it/s][A
982101it [2:07:42, 128.17it/s][A
982201it [2:07:43, 128.16it/s][A
982301it [2:07:44, 128.16it/s][A
982401it [2:07:45, 128.16it/s][A
982601it [2:07:46, 128.16it/s][A
982801it [2:07:48, 128.16it/s][A
982901it [2:07:49, 128.15it/s][A
983001it [2:07:52, 128.12it/s][A
983201it [2:07:54, 128.11it/s][A
983301it [2:07:58, 128.06it/s][A
983401it [2:07:59, 128.06it/s][A
983501it [2:08:02, 128.01it/s][A
983601it [2:08

1011901it [2:13:05, 126.72it/s][A
1012001it [2:13:06, 126.72it/s][A
1012501it [2:13:08, 126.74it/s][A
1012801it [2:13:10, 126.75it/s][A
1012901it [2:13:12, 126.74it/s][A
1013001it [2:13:12, 126.74it/s][A
1013101it [2:13:13, 126.75it/s][A
1013201it [2:13:14, 126.74it/s][A
1013501it [2:13:14, 126.77it/s][A
1014101it [2:13:17, 126.80it/s][A
1014201it [2:13:17, 126.81it/s][A
1014301it [2:13:18, 126.82it/s][A
1014401it [2:13:18, 126.82it/s][A
1014501it [2:13:19, 126.82it/s][A
1014601it [2:13:20, 126.81it/s][A
1014701it [2:13:20, 126.82it/s][A
1015301it [2:13:21, 126.89it/s][A
1015501it [2:13:21, 126.91it/s][A
1015701it [2:13:24, 126.89it/s][A
1015901it [2:13:24, 126.91it/s][A
1016001it [2:13:25, 126.92it/s][A
1016101it [2:13:26, 126.91it/s][A
1016301it [2:13:26, 126.93it/s][A
1016401it [2:13:30, 126.88it/s][A
1016501it [2:13:30, 126.89it/s][A
1016601it [2:13:32, 126.88it/s][A
1017701it [2:13:33, 126.99it/s][A
1017801it [2:13:34, 127.00it/s][A
1018001it [2:13:36, 

You must `download()` an article first!




1029101it [2:14:46, 127.25it/s][A
1029201it [2:14:48, 127.25it/s][A
1029501it [2:14:50, 127.25it/s][A
1029601it [2:14:50, 127.26it/s][A
1029701it [2:14:51, 127.26it/s][A
1029801it [2:14:52, 127.26it/s][A
1029901it [2:14:53, 127.26it/s][A
1030201it [2:14:56, 127.24it/s][A
1030301it [2:14:56, 127.25it/s][A
1030401it [2:14:57, 127.26it/s][A
1030501it [2:14:58, 127.24it/s][A
1030601it [2:15:02, 127.19it/s][A
1030701it [2:15:07, 127.14it/s][A
1030901it [2:15:08, 127.14it/s][A
1031101it [2:15:08, 127.16it/s][A
1031201it [2:15:10, 127.15it/s][A
1031501it [2:15:11, 127.16it/s][A
1031601it [2:15:12, 127.16it/s][A
1031801it [2:15:13, 127.17it/s][A
1032001it [2:15:14, 127.19it/s][A
1032101it [2:15:19, 127.11it/s][A
1032301it [2:15:23, 127.07it/s][A
1032501it [2:15:25, 127.07it/s][A
1032601it [2:15:29, 127.01it/s][A
1033301it [2:15:30, 127.09it/s][A
1033401it [2:15:30, 127.10it/s][A
1033501it [2:15:32, 127.09it/s][A
1033601it [2:15:32, 127.09it/s][A
1033701it [2:15:41,

You must `download()` an article first!




1080501it [2:20:20, 128.31it/s][A
1080801it [2:20:22, 128.33it/s][A
1080901it [2:20:24, 128.30it/s][A
1081201it [2:20:25, 128.33it/s][A
1081601it [2:20:26, 128.35it/s][A
1081901it [2:20:28, 128.36it/s][A
1082001it [2:20:29, 128.35it/s][A
1082101it [2:20:34, 128.29it/s][A
1082301it [2:20:35, 128.30it/s][A
1082501it [2:20:35, 128.32it/s][A
1082601it [2:20:37, 128.31it/s][A
1082701it [2:20:39, 128.29it/s][A
1083001it [2:20:39, 128.32it/s][A
1083201it [2:20:40, 128.34it/s][A
1083301it [2:20:42, 128.32it/s][A
1083601it [2:20:43, 128.34it/s][A
1083701it [2:20:47, 128.29it/s][A
1083801it [2:20:47, 128.29it/s][A
1083901it [2:20:48, 128.29it/s][A
1084201it [2:20:49, 128.32it/s][A
1084301it [2:20:51, 128.29it/s][A
1085001it [2:20:52, 128.37it/s][A
1085108it [2:20:52, 128.38it/s][A
1085201it [2:20:53, 128.37it/s][A
1085301it [2:20:56, 128.33it/s][A
1085401it [2:20:58, 128.32it/s][A
1085801it [2:20:59, 128.36it/s][A
1085901it [2:20:59, 128.37it/s][A
1086001it [2:21:00,

You must `download()` an article first!




1090701it [2:21:37, 128.35it/s][A
1091601it [2:21:38, 128.44it/s][A
1091701it [2:21:41, 128.41it/s][A
1092001it [2:21:42, 128.43it/s][A
1092101it [2:21:43, 128.43it/s][A
1092201it [2:21:47, 128.38it/s][A
1092301it [2:21:48, 128.38it/s][A
1092501it [2:21:53, 128.33it/s][A
1092801it [2:21:54, 128.35it/s][A
1093301it [2:21:57, 128.36it/s][A
1093401it [2:21:58, 128.36it/s][A
1093701it [2:21:59, 128.37it/s][A
1094401it [2:22:00, 128.44it/s][A
1094501it [2:22:02, 128.43it/s][A
1094601it [2:22:02, 128.44it/s][A
1094801it [2:22:03, 128.45it/s][A
1094901it [2:22:06, 128.41it/s][A
1095101it [2:22:06, 128.43it/s][A
1095201it [2:22:07, 128.43it/s][A
1095501it [2:22:08, 128.45it/s][A
1095601it [2:22:11, 128.42it/s][A
1095701it [2:22:11, 128.42it/s][A
1095901it [2:22:12, 128.44it/s][A
1096001it [2:22:13, 128.44it/s][A
1096101it [2:22:16, 128.41it/s][A
1096301it [2:22:17, 128.41it/s][A
1096401it [2:22:18, 128.40it/s][A
1096501it [2:22:20, 128.38it/s][A
1096601it [2:22:21,

1129501it [2:27:08, 127.94it/s][A
1129601it [2:27:08, 127.94it/s][A
1129701it [2:27:09, 127.94it/s][A
1129801it [2:27:11, 127.93it/s][A
1130001it [2:27:11, 127.95it/s][A
1130101it [2:27:13, 127.94it/s][A
1130201it [2:27:14, 127.93it/s][A
1130301it [2:27:15, 127.92it/s][A
1130401it [2:27:16, 127.92it/s][A
1130601it [2:27:20, 127.90it/s][A
1130701it [2:27:22, 127.88it/s][A
1131001it [2:27:25, 127.87it/s][A
1131301it [2:27:25, 127.89it/s][A
1131401it [2:27:27, 127.88it/s][A
1131701it [2:27:28, 127.89it/s][A
1131801it [2:27:29, 127.90it/s][A
1131901it [2:27:30, 127.90it/s][A
1132001it [2:27:30, 127.91it/s][A
1132101it [2:27:30, 127.91it/s][A
1132201it [2:27:34, 127.87it/s][A
1132501it [2:27:37, 127.86it/s][A
1132801it [2:27:39, 127.86it/s][A
1133001it [2:27:39, 127.88it/s][A
1133201it [2:27:47, 127.80it/s][A
1133301it [2:27:47, 127.80it/s][A
1134101it [2:27:47, 127.89it/s][A
1134330it [2:27:50, 127.88it/s][A
1134493it [2:27:52, 127.86it/s][A
1134701it [2:27:52, 

You must `download()` an article first!

You must `download()` an article first!



1163201it [2:31:13, 128.20it/s][A


You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!

You must `download()` an article first!




1163401it [2:31:13, 128.22it/s][A
1163701it [2:31:14, 128.23it/s][A
1163901it [2:31:15, 128.25it/s][A
1164001it [2:31:16, 128.25it/s][A
1164401it [2:31:18, 128.26it/s][A
1164501it [2:31:21, 128.23it/s][A
1164601it [2:31:24, 128.20it/s][A
1164701it [2:31:25, 128.19it/s][A
1164801it [2:31:26, 128.19it/s][A
1164901it [2:31:26, 128.20it/s][A
1165001it [2:31:27, 128.21it/s][A
1165101it [2:31:27, 128.21it/s][A
1165301it [2:31:29, 128.20it/s][A
1166001it [2:31:31, 128.25it/s][A
1166101it [2:31:35, 128.20it/s][A
1166201it [2:31:39, 128.16it/s][A
1166401it [2:31:41, 128.15it/s][A
1167301it [2:31:43, 128.23it/s][A
1167501it [2:31:43, 128.25it/s][A
1167601it [2:31:44, 128.24it/s][A
1167701it [2:31:49, 128.18it/s][A
1167801it [2:31:51, 128.17it/s][A
1167901it [2:31:52, 128.16it/s][A
1168101it [2:31:54, 128.16it/s][A
1168601it [2:31:54, 128.21it/s][A
1168701it [2:31:54, 128.22it/s][A
1168801it [2:31:55, 128.22it/s][A
1168901it [2:31:56, 128.22it/s][A
1169001it [2:31:56,

1212601it [2:37:28, 128.34it/s][A
1212801it [2:37:29, 128.34it/s][A
1212901it [2:37:30, 128.34it/s][A
1213201it [2:37:31, 128.36it/s][A
1213301it [2:37:33, 128.35it/s][A
1213401it [2:37:35, 128.33it/s][A
1213501it [2:37:35, 128.34it/s][A
1213701it [2:37:35, 128.35it/s][A
1213801it [2:37:37, 128.34it/s][A
1214101it [2:37:38, 128.37it/s][A
1214201it [2:37:39, 128.35it/s][A
1214301it [2:37:40, 128.35it/s][A
1214401it [2:37:42, 128.34it/s][A
1214701it [2:37:43, 128.35it/s][A
1214801it [2:37:43, 128.36it/s][A
1214901it [2:37:46, 128.34it/s][A
1215101it [2:37:46, 128.36it/s][A
1215201it [2:37:48, 128.34it/s][A
1215301it [2:37:49, 128.34it/s][A
1215401it [2:37:49, 128.35it/s][A
1215601it [2:37:51, 128.34it/s][A
1215801it [2:37:52, 128.36it/s][A
1215901it [2:37:53, 128.35it/s][A
1216101it [2:37:54, 128.36it/s][A
1216301it [2:37:56, 128.34it/s][A
1216501it [2:38:00, 128.32it/s][A
1216801it [2:38:05, 128.28it/s][A
1217701it [2:38:06, 128.36it/s][A
1218001it [2:38:08, 

You must `download()` an article first!




1227601it [2:39:32, 128.24it/s][A
1227701it [2:39:33, 128.24it/s][A
1227901it [2:39:33, 128.26it/s][A
1228001it [2:39:37, 128.22it/s][A
1228101it [2:39:40, 128.18it/s][A
1228201it [2:39:41, 128.19it/s][A
1228401it [2:39:42, 128.19it/s][A
1228601it [2:39:43, 128.20it/s][A
1228701it [2:39:44, 128.20it/s][A
1229101it [2:39:45, 128.23it/s][A
1229201it [2:39:45, 128.24it/s][A
1229301it [2:39:47, 128.22it/s][A
1229601it [2:39:49, 128.23it/s][A
1229701it [2:39:54, 128.17it/s][A
1229801it [2:39:54, 128.18it/s][A
1229901it [2:39:55, 128.18it/s][A
1230001it [2:39:56, 128.17it/s][A
1230101it [2:39:56, 128.18it/s][A
1230301it [2:39:57, 128.19it/s][A
1230501it [2:39:57, 128.20it/s][A
1230601it [2:39:58, 128.20it/s][A
1230701it [2:39:59, 128.21it/s][A
1230801it [2:40:00, 128.21it/s][A
1230901it [2:40:00, 128.22it/s][A
1231001it [2:40:00, 128.22it/s][A
1231101it [2:40:01, 128.21it/s][A
1231201it [2:40:03, 128.20it/s][A
1231301it [2:40:09, 128.14it/s][A
1231501it [2:40:09,

You must `download()` an article first!

You must `download()` an article first!




1256334it [2:44:21, 127.40it/s][A
1257501it [2:44:23, 127.49it/s][A
1257601it [2:44:24, 127.49it/s][A
1258301it [2:44:26, 127.53it/s][A
1258401it [2:44:26, 127.54it/s][A
1258501it [2:44:28, 127.52it/s][A
1258601it [2:44:30, 127.51it/s][A
1258701it [2:44:32, 127.50it/s][A
1259001it [2:44:32, 127.52it/s][A
1259101it [2:44:34, 127.51it/s][A
1259401it [2:44:37, 127.51it/s][A
1259701it [2:44:37, 127.54it/s][A
1259901it [2:44:41, 127.51it/s][A
1260401it [2:44:45, 127.50it/s][A
1261301it [2:44:45, 127.59it/s][A
1261401it [2:44:47, 127.58it/s][A
1261501it [2:44:47, 127.59it/s][A
1261601it [2:44:48, 127.59it/s][A
1261701it [2:44:49, 127.58it/s][A
1261801it [2:44:49, 127.58it/s][A
1261901it [2:44:51, 127.58it/s][A
1262001it [2:44:53, 127.56it/s][A
1262201it [2:44:56, 127.55it/s][A
1263101it [2:44:56, 127.63it/s][A
1263201it [2:44:57, 127.62it/s][A
1263301it [2:44:58, 127.62it/s][A
1263401it [2:44:59, 127.62it/s][A
1263501it [2:45:00, 127.61it/s][A
1263701it [2:45:02,

# Analyse the articles

In [6]:
pages = []
with open('data/7_opensources_co/scraped_pages_articles.jsonl', 'r') as _in:
    for line in tqdm(_in):
        pages.append(ujson.loads(line))

df_pages = pd.DataFrame(pages)
df_pages


0it [00:00, ?it/s][A
4851it [00:00, 48124.87it/s][A
11394it [00:00, 56774.05it/s][A
17626it [00:00, 58468.59it/s][A
21583it [00:00, 52868.03it/s][A
27683it [00:00, 54481.40it/s][A
35426it [00:00, 58190.16it/s][A
44486it [00:00, 62802.12it/s][A
51330it [00:00, 57282.27it/s][A
57022it [00:01, 56461.94it/s][A
64542it [00:01, 58152.91it/s][A
71642it [00:01, 59192.83it/s][A
78564it [00:01, 59975.76it/s][A
85116it [00:01, 55843.22it/s][A
91198it [00:01, 56148.80it/s][A
96876it [00:01, 54998.25it/s][A
102091it [00:01, 54708.23it/s][A
107616it [00:01, 54743.28it/s][A
113417it [00:02, 51236.06it/s][A
120942it [00:02, 52265.69it/s][A
133576it [00:02, 55333.87it/s][A
148769it [00:02, 59173.11it/s][A
158587it [00:02, 55380.28it/s][A
166302it [00:03, 54544.86it/s][A
172951it [00:03, 53687.88it/s][A
178762it [00:03, 53010.94it/s][A
183988it [00:03, 52486.52it/s][A
188821it [00:03, 52052.24it/s][A
193392it [00:03, 51544.36it/s][A
197694it [00:03, 50996.08it/s][A
201742i

Unnamed: 0,authors,batch,content,id,meta_description,meta_keywords,published_at,title,url
0,[],2,"Listen up, America. Things are going great for...",260606,"Fueled by bees’ busyness, seagulls’ saving gra...",[],,Putting the UT in TRUTH,http://beehivebugle.com
1,[],2,"Trump’s Salary Donation Is Just Bullsh*t PR, A...",260521,Addicting Info | The Knowledge You Crave:,[Addicting Info | The Knowledge You Crave],,The Knowledge You Crave,http://addictinginfo.org
2,[],2,There is something to be said for a physically...,260522,,[],,AWM,http://americanoverlook.com
3,[Vanessa Beeley],2,Episode #209 – ‘By the Rivers of Babylon’ gues...,260523,News for the Waking Generation,[],,News for the Waking Generation,http://21stcenturywire.com
4,[Kate Anslinger],2,There is something to be said for a physically...,260524,,[],,Bodybuilder Sees Woman In Distress After Thug ...,http://americanoverlook.com/bodybuilder-sees-w...
5,[Brandi Philip],2,Just when you thought it was safe to watch sum...,260525,,[],,Here’s Video Footage Of What Experts Are Calli...,http://americanoverlook.com/heres-video-footag...
6,[],2,This web site is updated multiple times daily....,260526,,[],,World Class Investigative Truth,http://82.221.129.208/.zu1.html
7,[],2,,260527,,[],,,http://abcnewsgo.co
8,"[John Aravosis, Chris Andoe]",2,"In today’s podcast, Cliff Schecter and I discu...",260528,"News, breaking news, analysis and original con...","[news, breaking news, news blog, politics]",,A great nation deserves the truth // One of Am...,http://americablog.com
9,"[Alexander Smith, Robert Winthrop, Kate Anslin...",2,"When you were a kid, did you use certain words...",260529,,[],,AWM,http://americanoverlook.com/category/quiz/


In [13]:
netlocs_count = {}
with tqdm(total=len(df_pages)) as progress:
    for url in df_pages.url:
        netloc = urlsplit(url).netloc
        if netloc not in netlocs_count:
            netlocs_count[netloc] = 0

        netlocs_count[netloc] += 1
        progress.update()


  0%|          | 0/1265093 [00:00<?, ?it/s][A
  1%|          | 8313/1265093 [00:00<00:15, 83127.11it/s][A
  2%|▏         | 23497/1265093 [00:00<00:12, 96185.06it/s][A
  3%|▎         | 39769/1265093 [00:00<00:11, 109632.83it/s][A
  4%|▍         | 53861/1265093 [00:00<00:10, 117450.92it/s][A
  6%|▌         | 71640/1265093 [00:00<00:09, 130764.17it/s][A
  7%|▋         | 89687/1265093 [00:00<00:08, 142540.86it/s][A
  8%|▊         | 107379/1265093 [00:00<00:07, 151364.78it/s][A
 10%|▉         | 122966/1265093 [00:00<00:07, 152686.98it/s][A
 11%|█         | 138417/1265093 [00:00<00:07, 153099.63it/s][A
 12%|█▏        | 154876/1265093 [00:01<00:07, 156373.66it/s][A
 13%|█▎        | 170715/1265093 [00:01<00:06, 156972.19it/s][A
 15%|█▍        | 186488/1265093 [00:01<00:07, 152957.77it/s][A
 16%|█▌        | 201859/1265093 [00:01<00:06, 152415.91it/s][A
 17%|█▋        | 219441/1265093 [00:01<00:06, 158752.55it/s][A
 19%|█▊        | 236071/1265093 [00:01<00:06, 160943.68it/s][A
 

In [18]:
df_netlocs_count = pd.DataFrame([{'domain': k, 'count': v} for k, v in netlocs_count.items()]).sort_values('count', ascending=False)
df_netlocs_count

Unnamed: 0,count,domain
115,165817,www.dailykos.com
463,99444,www.express.co.uk
423,90977,sputniknews.com
131,71362,www.lifezette.com
332,62211,beforeitsnews.com
75,26395,www.judicialwatch.org
107,26136,www.breitbart.com
257,20840,dailycaller.com
248,20819,www.abovetopsecret.com
226,18339,www.washingtonexaminer.com


In [39]:
df_pages[df_pages.url.str.contains('www.express.co.uk')]

Unnamed: 0,authors,batch,content,id,meta_description,meta_keywords,published_at,title,url
1550,"[Charlie Bayliss Exclusive, Karin Von Hippel]",2,REUTERS/ GETTY Karin von Hippel warned Europe ...,262071,EUROPE should brace itself for more terror att...,[],1477447320,BRITAIN TERROR WARNING: ISIS jihadis ‘WILL unl...,http://www.express.co.uk/news/world/725238/Mos...
1551,"[Alix Culbertson, Lord Hague]",2,GETTY William Hague has blasted President-elec...,262072,TORY heavyweight William Hague has branded Don...,[],1479882840,William Hague brands Donald Trump ‘a JOKE’ who...,http://www.express.co.uk/news/world/735343/Wil...
1562,[],2,Coronation Street spoilers: Todd Grimshaw exit...,262083,TV and Celebrity news,[],,TV and Celebrity,http://www.express.co.uk/showbiz
1563,[],2,When is the next meteor shower in the UK? Date...,262084,"Latest UK news, breaking news and current news...",[],,Breaking News and Opinion,http://www.express.co.uk/news
1571,[],2,BONFIRE NIGHT WEATHER: Check the fireworks nig...,262092,"The weather forecast for London, Manchester, B...",[],,Weather Forecast for the UK,http://www.express.co.uk/news/weather
1610,"[Nicole Stinson, Theresa May]",2,Her stark warning comes amid fears her “weak” ...,262131,THERESA May has warned Tory rebels and Remoane...,[],1510272480,Brexit WILL happen: May warns Remoaner MPs she...,http://www.express.co.uk/news/uk/877753/Theres...
1611,[Taryn Tarrant-Cornish],2,The demand comes after the Prime Minister fail...,262132,THERESA May’s only chance to redeem herself an...,[],1510302480,Furious Question Time audience attack May 'fai...,http://www.express.co.uk/news/uk/877755/Questi...
1612,[Matt Drake],2,The former Prime Minister is not hopeful about...,262133,GORDON Brown has attacked Britain’s chances of...,[],1510279740,"'We will have NO control of borders, courts or...",http://www.express.co.uk/news/politics/877758/...
1615,"[Rebecca Perring, Lady Pamela Hicks]",2,But from that momentous day in 1947 the monarc...,262136,THE QUEEN will mark her 70th wedding anniversa...,[],1510237560,Queen and Prince Philip's 70th wedding anniver...,http://www.express.co.uk/news/royal/877508/Que...
1616,[Prince Charles],2,GETTY Prince Charles praised the Commonwealth'...,262137,"THE Commonwealth's diversity of people, cultur...",[],1510185600,Charles praises Commonwealth: Diversity can he...,http://www.express.co.uk/news/royal/877514/Pri...


# Extract the urls for continuing of scraping

In [40]:
with open('data/7_opensources_co/scraped_pages_urls.json', 'w') as _out: 
    ujson.dump(list(df_pages.url.values), _out)