In [27]:
from time import sleep
import multiprocessing
from datetime import datetime

import ujson
import requests
import newspaper
from tqdm import tqdm
from peewee import SqliteDatabase, fn
from playhouse.shortcuts import model_to_dict

from database import Page

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Download-urls-from-archive" data-toc-modified-id="Download-urls-from-archive-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Download urls from archive</a></span><ul class="toc-item"><li><span><a href="#Explore-document-types" data-toc-modified-id="Explore-document-types-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Explore document types</a></span></li></ul></li><li><span><a href="#Scrape-the-content" data-toc-modified-id="Scrape-the-content-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Scrape the content</a></span></li><li><span><a href="#Insert-into-DB" data-toc-modified-id="Insert-into-DB-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Insert into DB</a></span></li></ul></div>

# Download urls from archive

In [14]:
url = 'https://api.nytimes.com/svc/archive/v1/%s/%s.json?api-key=29a51170349f43d9abe651b0e2331ea6'
year_from = 2000  # 1852
year_to = 2018

with open('data/nytimes/archive.json', 'w') as out_archive:
    with tqdm(total=(year_from - year_to) * 12) as progress:
        for year in range(year_from, year_to + 1):
            for month in range(1, 13):
                docs = None
                try:
                    response = requests.get(url % (year, month))
                    docs = response.json()['response']['docs']
                except Exception:
                    sleep(10)

                if docs is None:
                    response = requests.get(url % (year, month))
                    docs = response.json()['response']['docs']

                for doc in docs:
                    out_archive.write(ujson.dumps(doc) + '\n')

                progress.update()

218it [29:43,  7.66s/it]


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

## Explore document types

In [15]:
archive_entries_doc_types = {}
with open('data/nytimes/archive.json', 'r') as in_archive: 
    with tqdm() as progress:
        for line in in_archive:
            archive_entry = ujson.loads(line)
            doc_type = archive_entry['document_type']
            
            archive_entries_doc_types.setdefault(doc_type, 0)
            archive_entries_doc_types[doc_type] += 1
            
            progress.update()

2002799it [02:06, 15852.34it/s]


In [16]:
archive_entries_doc_types

{'article': 1550951, 'blogpost': 372608, 'multimedia': 79240}

# Scrape the content

In [2]:
def scrape_article(line):
    archive_entry = ujson.loads(line)
    url = archive_entry['web_url']

    if archive_entry['document_type'] != 'article':
        return None

    try:
        article = newspaper.Article(url, fetch_images=False)
        article.download()
        article.parse()
    except Exception:
        print('Something went wrong parsing url:', url)
        return None

    archive_entry['newspaper'] = {
        'title': article.title,
        'content': article.text,
        'authors': ', '.join(article.authors),
        'keywords': ', '.join(article.keywords),
        'meta_keywords': article.meta_keywords,
        'meta_description': article.meta_description,
        'tags': ', '.join(article.tags),
        'summary': article.summary
    }
    
    return archive_entry

In [5]:
with open('data/nytimes/archive_scraped.jsonl', 'w') as out_archive_scraped:
    with open('data/nytimes/archive.json', 'r') as in_archive: 
        with tqdm() as progress:
            with multiprocessing.Pool(processes=32) as pool:
                for archive_entry in pool.imap(scrape_article, in_archive, chunksize=1):
                    if archive_entry is None:
                        continue

                    out_archive_scraped.write(ujson.dumps(archive_entry) + '\n')
                    progress.update()

5it [00:08,  3.46s/it]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/01/world/mideast-talks-begin-monday.html on URL https://www.nytimes.com/2000/01/01/world/mideast-talks-begin-monday.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/01/world/mideast-talks-begin-monday.html


203it [00:35,  6.00it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/the-year-2000-a-night-of-jubilation-as-times-square-welcomes-the-dawn-of-2000.html on URL https://www.nytimes.com/2000/01/02/nyregion/the-year-2000-a-night-of-jubilation-as-times-square-welcomes-the-dawn-of-2000.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/the-year-2000-a-night-of-jubilation-as-times-square-welcomes-the-dawn-of-2000.html


264it [00:43,  1.98it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/neighborhood-report-winemaker-splits.html on URL https://www.nytimes.com/2000/01/02/nyregion/neighborhood-report-winemaker-splits.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/neighborhood-report-winemaker-splits.html


362it [00:56,  4.55it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/a-110-million-deficit-for-this.html on URL https://www.nytimes.com/2000/01/02/nyregion/a-110-million-deficit-for-this.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/a-110-million-deficit-for-this.html


441it [01:05,  4.70it/s]

Article `download()` failed with 404 Client Error: Not Found for url: http://www.nytimes.com/2000/01/02/nyregion/photographer-s-journal-087289.html on URL https://www.nytimes.com/2000/01/02/nyregion/photographer-s-journal-087289.html
Something went wrong parsing url: https://www.nytimes.com/2000/01/02/nyregion/photographer-s-journal-087289.html


455it [01:07,  6.70it/s]


KeyboardInterrupt: 

# Insert into DB

In [45]:
path_data = '/Volumes/ExternalSSD/FakeNewsRecognition/'
peewee_database_nytimes = SqliteDatabase(path_data + 'nytimes/news_cleaned_nytimes.db')
Page._meta.database = peewee_database_nytimes

In [46]:
Page.create_table()

In [47]:
pages_to_insert = []
with tqdm() as progress:
    with open(path_data + 'nytimes/archive_scraped.jsonl', 'r') as in_archive_scraped:
        for line in in_archive_scraped:
            page = ujson.loads(line)
            
            if 'main' not in page['headline'] and 'content_kicker' not in page['headline']:
                continue

            pages_to_insert.append({
                'batch': 0,
                'scraped_page_id': 0,
                'domain': 'nytimes.com',
                'type': 'reliable',
                'scraped_at': datetime.now(),
                'url': page['web_url'],
                'title': page['headline']['main'] if 'main' in page['headline'] else page['headline']['content_kicker'],
                'content': page['newspaper']['content'],
                'authors': page['newspaper']['authors'],
                'keywords': page['newspaper']['keywords'],
                'meta_keywords': page['newspaper']['meta_keywords'],
                'meta_description': page['newspaper']['meta_description'],
                'tags': page['newspaper']['tags'],
                'summary': page['newspaper']['summary']
            })

            progress.update()

            if len(pages_to_insert) > 25:
                with peewee_database_nytimes.atomic():
                    Page.insert_many(pages_to_insert).execute()
                    pages_to_insert = []

with peewee_database_nytimes.atomic():
    Page.insert_many(pages_to_insert).execute()

1542967it [15:11, 1692.09it/s]


In [39]:
page

{'_id': '54b4171b38f0d8623a353a07',
 'abstract': None,
 'blog': [],
 'byline': None,
 'document_type': 'article',
 'headline': {'sub': 'May 22, 2006  ;'},
 'keywords': [],
 'lead_paragraph': 'Lottery Numbers',
 'multimedia': [],
 'news_desk': 'Metro',
 'newspaper': {'authors': '',
  'content': 'We’re interested in your feedback on this page. Tell us what you think.',
  'keywords': '',
  'meta_description': 'Lottery Numbers',
  'meta_keywords': [''],
  'summary': '',
  'tags': '',
  'title': '- The New York Times'},
 'print_page': '3',
 'pub_date': '2006-05-23T00:00:00Z',
 'section_name': 'N.Y. / Region',
 'slideshow_credits': None,
 'snippet': 'Lottery Numbers...',
 'source': 'The New York Times',
 'subsection_name': None,
 'type_of_material': 'News',
 'web_url': 'https://www.nytimes.com/2006/05/23/nyregion/23lottery.html',
 'word_count': '168'}