# **Scraping Articles from U.S. Library of Congress database, "Chronicling America"**
This database contains America's historic newspaper pages from 1789-1963.  We'll be searching for the phrase "spanish flu".

## __Setup__

In [None]:
!pip install wget
import wget
import requests
import json



In [None]:
#Point to file location on Google Drive
import os
from google.colab import drive

# Mount Google Drive at a fixed, absolute location.
mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)

# Anchor the data directory to the mount point's parent rather than to
# os.getcwd(): this cell ends with a chdir, so deriving the root from the
# current directory makes a second run of the cell join the relative path onto
# the data directory itself and fail. With a fixed root the cell is idempotent.
root = os.path.dirname(mount_point)
download_destination = 'gdrive/My Drive/COVID-19/data/spanish_flu_data'
cwd = os.path.join(root, download_destination)

# Every file written by later cells lands in this directory.
os.chdir(cwd)
print('Current working directory: ', os.getcwd())

Mounted at /content/gdrive
Current working directory:  /content/gdrive/My Drive/COVID-19/data/spanish_flu_data


## __Scraping__

We'll now define a function that scrapes a single page of search results, and then cycle through each page in the search to gather all relevant articles.  Note that if you'd like to search for a different phrase, you can modify the `proxtext` parameter in the (long) search URL inside the download loop that follows the function definition.

## **Getting PDFs**
We'll get the PDFs here, so that we can run them through our own OCR.

In [None]:
import time

#Download files for a single search results page
Base_URL = "https://chroniclingamerica.loc.gov"
def get_pdf(ids, format='pdf'):
    """Download the file for every page id and return the pages' metadata.

    For each id the page's JSON record is fetched from ``Base_URL``, the
    download link stored under the ``format`` key of that record is retrieved,
    and the bytes are written to ``<lccn>.pdf`` in the current working
    directory. The function pauses 3 seconds after each page.

    Args:
        ids: iterable of page id paths of the form
            ``/lccn/<lccn>/<date>/<edition>/<sequence>/`` (trailing slash
            expected, since the last character is stripped to build the URL).
        format: key of the page's JSON record that holds the download URL.

    Returns:
        A list with one dict per id, holding the keys ``id``, ``date_issued``,
        ``ed``, ``seq`` and ``paper``.

    Raises:
        requests.HTTPError: if either request returns an error status.
    """
    article_metadata = []
    for page_id in ids:
        #Build and query url for the page's JSON record (download options)
        url = Base_URL + page_id[:-1] + '.json'
        resp = requests.get(url)
        resp.raise_for_status()  # fail here rather than on a confusing JSON/Key error
        page_info = resp.json()  # parse the record once and reuse it

        #Fetch the file itself
        download = requests.get(page_info[format])
        download.raise_for_status()

        #Save metadata taken from the id path and the JSON record
        split_url = page_id.split('/')
        current_article = {
            'id': split_url[2],
            'date_issued': split_url[3],
            'ed': split_url[4],
            'seq': split_url[5],
            'paper': page_info['title']['name'],
        }

        #Download the PDF
        # NOTE(review): the file name uses only the first id component, so two
        # pages that share it overwrite each other on disk -- confirm whether
        # date/edition/sequence should be part of the name.
        pdf_name = ''.join((current_article['id'], '.pdf'))
        with open(pdf_name, 'wb') as pdf_file:  # context manager closes the handle
            pdf_file.write(download.content)
        article_metadata.append(current_article)

        # Pause between pages to avoid hammering the server.
        time.sleep(3)

    return article_metadata

# articles = get_pdf(ids)
# print('Number of articles retrieved: ', len(articles))

In [None]:
#Download the PDFs for one page of search results at a time (results_per_page
#results per request) and save each page's metadata to file in json format
num_results = 320  # used by range() below as the exclusive upper bound on the page number
page_increment = 1
results_per_page = 50


for page_num in range(1, num_results, page_increment):
    #Setup query page, and get url extensions containing persistent IDs
    print('Querying page: ', page_num)
    search_URL = "https://chroniclingamerica.loc.gov/search/pages/results/?date1=1918&rows={}&searchType=basic&format=json&state=&date2=1920&proxtext=spanish+flu&y=50&x=14&dateFilterType=yearRange&page={}&sort=relevance".format(results_per_page, page_num)
    response = requests.get(search_URL)
    response.raise_for_status()  # surface HTTP errors instead of a JSON decode failure
    data = response.json()
    ids = [i["id"] for i in data["items"]]

    # An empty page means there is nothing left to harvest: stop instead of
    # issuing further requests and writing empty metadata files.
    if not ids:
        print('No results on page {}; stopping.'.format(page_num))
        break

    #Harvest articles & metadata
    base_URL = "https://chroniclingamerica.loc.gov"  # not read in this cell; get_pdf is called with ids only
    articles = get_pdf(ids)

    #Save to disk
    filename = ''.join(('Metadata_LOC - ', str(page_num), '_', str(page_num+page_increment), '.json'))
    with open(filename, 'w') as f:
        json.dump(articles, f)
    print('Saved as {}'.format(filename))

    print('{} articles harvested!'.format(len(articles)))

## **Getting Text**
With the following code, we'll get the text that Chronicling America secured via their own OCR implementation.

In [None]:
#Download files for a single search results page
Base_URL = "https://chroniclingamerica.loc.gov"
def get_text(ids):
    """Fetch the text and metadata for every page id.

    For each id the page's JSON record is fetched from ``Base_URL`` and the
    document behind the record's ``text`` link is downloaded.

    Args:
        ids: iterable of page id paths of the form
            ``/lccn/<lccn>/<date>/<edition>/<sequence>/`` (trailing slash
            expected, since the last character is stripped to build the URL).

    Returns:
        A list with one dict per id, holding the keys ``id``, ``date_issued``,
        ``ed``, ``seq``, ``paper`` and ``text``.

    Raises:
        requests.HTTPError: if either request returns an error status.
    """
    articles = []
    for page_id in ids:
        #Build and query url for the page's JSON record
        url = Base_URL + page_id[:-1] + '.json'
        resp = requests.get(url)
        resp.raise_for_status()  # fail here rather than on a confusing JSON/Key error
        page_info = resp.json()  # parse the record once and reuse it
        split_url = page_id.split('/')

        #Save metadata taken from the id path and the JSON record
        current_article = {
            'id': split_url[2],
            'date_issued': split_url[3],
            'ed': split_url[4],
            'seq': split_url[5],
            'paper': page_info['title']['name'],
        }

        #Get text
        article_text = requests.get(page_info['text'])
        article_text.raise_for_status()  # don't store an error page as article text
        current_article['text'] = article_text.text
        articles.append(current_article)

    return articles

# Try the function on the ids already held in `ids` and report how many
# articles came back.
articles = get_text(ids)
retrieved_count = len(articles)
print('Number of articles retrieved: ', retrieved_count)

Number of articles retrieved:  50


Using the function above, we'll cycle through each page and retrieve the article text and metadata that we need.

In [None]:
#Download the text for one page of search results at a time (results_per_page
#results per request) and save each page's articles to file in json format
num_results = 320  # used by range() below as the exclusive upper bound on the page number
page_increment = 1
results_per_page = 50  # was misspelled `results_per_age`, which nothing read

for page_num in range(1, num_results, page_increment):
    #Setup query page, and get url extensions containing persistent IDs
    print('Querying page: ', page_num)
    # The URL has to be rebuilt for every page. With this assignment disabled,
    # each iteration re-requested whatever search_URL value was left over in
    # the kernel, so every output file held the same results.
    search_URL = "https://chroniclingamerica.loc.gov/search/pages/results/?date1=1918&rows={}&searchType=basic&format=json&state=&date2=1920&proxtext=spanish+flu&y=50&x=14&dateFilterType=yearRange&page={}&sort=relevance".format(results_per_page, page_num)
    response = requests.get(search_URL)
    response.raise_for_status()  # surface HTTP errors instead of a JSON decode failure
    data = response.json()
    ids = [i["id"] for i in data["items"]]

    # An empty page means there is nothing left to harvest: stop instead of
    # issuing further requests and writing empty files.
    if not ids:
        print('No results on page {}; stopping.'.format(page_num))
        break

    #Harvest articles & metadata
    base_URL = "https://chroniclingamerica.loc.gov"  # not read in this cell; get_text is called with ids only
    articles = get_text(ids)

    #Save to disk
    filename = ''.join(('LOC - ', str(page_num), '_', str(page_num+page_increment), '.json'))
    with open(filename, 'w') as f:
        json.dump(articles, f)
    print('Saved as {}'.format(filename))

Querying page:  1
Saved as LOC - 1_2.json
Querying page:  2
Saved as LOC - 2_3.json
Querying page:  3
Saved as LOC - 3_4.json
Querying page:  4
Saved as LOC - 4_5.json
Querying page:  5
Saved as LOC - 5_6.json
Querying page:  6
Saved as LOC - 6_7.json
Querying page:  7
Saved as LOC - 7_8.json
Querying page:  8
Saved as LOC - 8_9.json
Querying page:  9
Saved as LOC - 9_10.json
Querying page:  10
Saved as LOC - 10_11.json
Querying page:  11
Saved as LOC - 11_12.json
Querying page:  12
Saved as LOC - 12_13.json
Querying page:  13
Saved as LOC - 13_14.json
Querying page:  14
Saved as LOC - 14_15.json
Querying page:  15
Saved as LOC - 15_16.json
Querying page:  16
Saved as LOC - 16_17.json
Querying page:  17
Saved as LOC - 17_18.json
Querying page:  18
Saved as LOC - 18_19.json
Querying page:  19
Saved as LOC - 19_20.json
Querying page:  20
Saved as LOC - 20_21.json
Querying page:  21
Saved as LOC - 21_22.json
Querying page:  22
Saved as LOC - 22_23.json
Querying page:  23
Saved as LOC - 23