# **Scraping Articles from U.S. Library of Congress database, "Chronicling America"**
This database contains America's historic newspaper pages from 1789-1963. We'll search it for the phrase "spanish flu" in issues published between 1918 and 1920.

## __Setup__

In [None]:
# !pip install wget
import wget
import requests
import json
import os
import pandas as pd

In [None]:
#Point to file location on Google Drive
import os
from google.colab import drive

# Mount Google Drive at a fixed, absolute mount point.
mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)

# Anchor the data folder at the parent of the mount point instead of os.getcwd().
# This cell ends with os.chdir(), so on a second run os.getcwd() would already be
# the data folder and the relative path below would be joined onto itself.
root = os.path.dirname(mount_point)
download_destination = 'gdrive/My Drive/COVID-19/data/spanish_flu_data'
cwd = os.path.join(root, download_destination)

# Every later cell reads and writes files relative to this directory.
os.chdir(cwd)
print('Current working directory: ', os.getcwd())

## __Scraping__

We'll now define a function that scrapes a single page of search results, and then cycle through every page of the search to gather all relevant articles. To search for a different phrase, change the `proxtext` parameter in the (long) search URL in the second code cell below.

## **Getting PDFs**
We'll get the PDFs here, so that we can run them through our own OCR.

In [None]:
#Download files for a single search results page
Base_URL = "https://chroniclingamerica.loc.gov"
def get_pdf(ids, format='pdf'):
    """Download the file for each search-result page id and return its metadata.

    Parameters
    ----------
    ids : iterable of str
        Page ids taken from the "id" field of the search results. Each id is
        expected to look like '/lccn/<lccn>/<date>/<edition>/<sequence>/'
        (inferred from how the id is split below).
    format : str, default 'pdf'
        Key of the page's JSON record whose value is the download URL.
        The file is always saved with a '.pdf' extension.

    Returns
    -------
    list of dict
        One dict per file written by this call, with keys 'id',
        'date_issued', 'ed', 'seq' and 'paper'. Pages whose file is already
        on disk are skipped and are not included.

    Raises
    ------
    requests.HTTPError
        If the page record or the file itself cannot be fetched.

    Notes
    -----
    Files are written to the current working directory as
    '<lccn>_<date>_<edition>_<sequence>.pdf'. The LCCN alone identifies a
    newspaper, not a page, so using it as the whole file name made every
    page of the same paper share one file.
    """
    article_metadata = []
    for page_id in ids:
        #Split the id into its metadata fields
        split_url = page_id.split('/')
        current_article = {
            'id': split_url[2],
            'date_issued': split_url[3],
            'ed': split_url[4],
            'seq': split_url[5],
        }

        #Skip pages that are already on disk before making any request
        pdf_name = '_'.join((current_article['id'],
                             current_article['date_issued'],
                             current_article['ed'],
                             current_article['seq'])) + '.pdf'
        if os.path.isfile(pdf_name):
            continue

        #Query the page record (drop the trailing '/' before adding '.json')
        url = Base_URL+page_id[:-1]+'.json'
        resp = requests.get(url) #get download options
        resp.raise_for_status()
        page_info = resp.json()
        current_article['paper'] = page_info['title']['name']

        #Download the file; fail loudly rather than saving an error page as a PDF
        pdf_resp = requests.get(page_info[format]) #download url
        pdf_resp.raise_for_status()
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(pdf_resp.content)
        article_metadata.append(current_article)

    return article_metadata

# articles = get_pdf(ids)
# print('Number of articles retrieved: ', len(article_metadata))

In [None]:
# NOTE: despite its name, num_results is used as the exclusive upper bound on the
# page number, so pages 1 .. num_results-1 are requested.
num_results = 320
page_increment = 1
results_per_page = 50

import pandas as pd

# Collect every search hit as a plain dict and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per row re-copied
# the whole frame each time.
records = []

for page_num in range(1, num_results, page_increment):
    search_URL = "https://chroniclingamerica.loc.gov/search/pages/results/?date1=1918&sort=relevance&date2=1920&searchType=basic&sequence=0&format=json&state=&rows={}&proxtext=spanish+flu&y=50&x=14&dateFilterType=yearRange&page={}".format(results_per_page, page_num)
    resp = requests.get(search_URL)
    resp.raise_for_status()  # surface HTTP errors before trying to parse the body
    results = resp.json()

    records.extend(results['items'])

# One row per search hit; the index is a fresh 0..n-1 range, and columns follow
# the key order of the returned records.
metadata = pd.DataFrame(records)

# Cache the result so later cells can reload it without re-querying the site.
metadata.to_pickle('metadata.pkl')

In [None]:
# Preview the first rows of the search-result metadata frame.
metadata.head()

Unnamed: 0,alt_title,batch,city,country,county,date,edition,edition_label,end_year,frequency,...,section_label,sequence,start_year,state,subject,title,title_normal,type,url,ocr_spa
0,[],msar_garnet_ver01,[Water Valley],Mississippi,[Yalobusha],19181108,,,1929.0,Weekly,...,,7.0,1888.0,[Mississippi],[Mississippi--Water Valley.--fast--(OCoLC)fst0...,The North Mississippi herald.,north mississippi herald.,page,https://chroniclingamerica.loc.gov/lccn/sn8706...,
1,[Evening journal and the daily Republican],deu_hockessin_ver01,[Wilmington],Delaware,[New Castle],19181017,,,1932.0,Daily (except Sun.),...,,8.0,1888.0,[Delaware],[Delaware--Wilmington.--fast--(OCoLC)fst012039...,Evening journal. [volume],evening journal.,page,https://chroniclingamerica.loc.gov/lccn/sn8504...,
2,[Omaha Sunday bee],nbu_coreopsis_ver01,[Omaha],Nebraska,[Douglas],19181009,[Morning ed.].,,1922.0,Daily,...,,5.0,1870.0,[Nebraska],"[Nebraska--Omaha.--fast--(OCoLC)fst01204995, O...",Omaha daily bee.,omaha daily bee.,page,https://chroniclingamerica.loc.gov/lccn/sn9902...,
3,[Washington times herald],dlc_jetblue_ver01,[Washington],District of Columbia,[None],19181029,,FINAL EDITION,1939.0,Daily (except Sunday),...,SPORTING PAGE,14.0,1902.0,[District of Columbia],"[Washington (D.C.)--fast--(OCoLC)fst01204505, ...",The Washington times. [volume],washington times.,page,https://chroniclingamerica.loc.gov/lccn/sn8402...,
4,[],kyu_gold_ver01,[Paris],Kentucky,[Bourbon],19181022,,,1999.0,Semiweekly,...,,2.0,1895.0,[Kentucky],"[Bourbon County (Ky.)--Newspapers., Kentucky--...",The Bourbon news. [volume],bourbon news.,page,https://chroniclingamerica.loc.gov/lccn/sn8606...,


In [None]:
# Summarise the metadata frame: column names, non-null counts and dtypes.
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15946 entries, 0 to 49
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   alt_title             15946 non-null  object 
 1   batch                 15946 non-null  object 
 2   city                  15946 non-null  object 
 3   country               15946 non-null  object 
 4   county                15946 non-null  object 
 5   date                  15946 non-null  object 
 6   edition               2418 non-null   object 
 7   edition_label         15946 non-null  object 
 8   end_year              15946 non-null  float64
 9   frequency             15941 non-null  object 
 10  id                    15946 non-null  object 
 11  language              15946 non-null  object 
 12  lccn                  15946 non-null  object 
 13  note                  15946 non-null  object 
 14  ocr_eng               15945 non-null  object 
 15  page                  

In [None]:
#Import pickled file
metadata = pd.read_pickle('metadata.pkl')

#Find the columns that hold list objects, judged by the value in the first row
cols_with_lists = [
    name for name in metadata.columns
    if isinstance(metadata[name].iloc[0], list)
]

#Expand those columns to one list element per row so that we can deduplicate
metadata_copy = metadata.copy()
for list_col in cols_with_lists:
    metadata_copy = metadata_copy.explode(column=list_col)

#Deduplicate on the columns that didn't have list objects
scalar_cols = [name for name in metadata_copy.columns if name not in cols_with_lists]
metadata_copy = metadata_copy.drop_duplicates(subset=scalar_cols)
metadata_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2196 entries, 0 to 45
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   alt_title             1192 non-null   object 
 1   batch                 2196 non-null   object 
 2   city                  2196 non-null   object 
 3   country               2196 non-null   object 
 4   county                2062 non-null   object 
 5   date                  2196 non-null   object 
 6   edition               218 non-null    object 
 7   edition_label         2196 non-null   object 
 8   end_year              2196 non-null   float64
 9   frequency             2191 non-null   object 
 10  id                    2196 non-null   object 
 11  language              2196 non-null   object 
 12  lccn                  2196 non-null   object 
 13  note                  2181 non-null   object 
 14  ocr_eng               2195 non-null   object 
 15  page                  2

In [None]:
# Download one PDF per row of the deduplicated metadata into the current directory.
# Leave overwrite_existing as True to re-download every file; set it to False to
# keep files already on disk so that an interrupted run can be resumed.
overwrite_existing = True

for lccn, url, article_date, sequence in zip(metadata_copy['lccn'],metadata_copy['url'], metadata_copy['date'], metadata_copy['sequence']):
    #File name is <lccn>_<date>_<sequence>.pdf; int() drops a float's ".0"
    pdf_name = ''.join((lccn, '_', article_date, '_', str(int(sequence)), '.pdf'))
    if not overwrite_existing and os.path.isfile(pdf_name):
        continue

    #Look up the PDF link in the page's JSON record
    resp = requests.get(url)
    resp.raise_for_status()
    pdf_url = resp.json()['pdf']

    #Download the PDF; fail loudly rather than saving an error page as a PDF
    pdf_resp = requests.get(pdf_url)
    pdf_resp.raise_for_status()
    with open(pdf_name, 'wb') as pdf_file:
        pdf_file.write(pdf_resp.content)

In [None]:
#Fetch one page of search results at a time (results_per_page hits each),
#download its PDFs, and save that page's metadata to file in json format
num_results = 320
page_increment = 1
results_per_page = 50

#Search query; the two placeholders are rows-per-page and page number
pdf_search_template = "https://chroniclingamerica.loc.gov/search/pages/results/?date1=1918&rows={}&searchType=basic&format=json&state=&date2=1920&proxtext=spanish+flu&y=50&x=14&dateFilterType=yearRange&page={}&sort=relevance"

for page_num in range(1, num_results, page_increment):
    #Query this results page and pull out the persistent ID of every hit
    print('Querying page: ', page_num)
    search_URL = pdf_search_template.format(results_per_page, page_num)
    response = requests.get(search_URL)
    data = response.json()
    ids = [item["id"] for item in data["items"]]

    #Download the PDFs and collect their metadata
    base_URL = "https://chroniclingamerica.loc.gov"
    articles = get_pdf(ids)

    #Write this page's metadata to its own file
    filename = 'Metadata_LOC - ' + str(page_num) + '_' + str(page_num+page_increment) + '.json'
    with open(filename, 'w') as out_file:
        json.dump(articles, out_file)
    print('Saved as {}'.format(filename))

    print('{} articles harvested!'.format(len(articles)))

Querying page:  1
Saved as Metadata_LOC - 1_2.json
0 articles harvested!
Querying page:  2
Saved as Metadata_LOC - 2_3.json
0 articles harvested!
Querying page:  3
Saved as Metadata_LOC - 3_4.json
0 articles harvested!
Querying page:  4
Saved as Metadata_LOC - 4_5.json
0 articles harvested!
Querying page:  5
Saved as Metadata_LOC - 5_6.json
0 articles harvested!
Querying page:  6
Saved as Metadata_LOC - 6_7.json
0 articles harvested!
Querying page:  7
Saved as Metadata_LOC - 7_8.json
0 articles harvested!
Querying page:  8
Saved as Metadata_LOC - 8_9.json
0 articles harvested!
Querying page:  9
Saved as Metadata_LOC - 9_10.json
0 articles harvested!
Querying page:  10
Saved as Metadata_LOC - 10_11.json
0 articles harvested!
Querying page:  11
Saved as Metadata_LOC - 11_12.json
0 articles harvested!
Querying page:  12
Saved as Metadata_LOC - 12_13.json
0 articles harvested!
Querying page:  13
Saved as Metadata_LOC - 13_14.json
0 articles harvested!
Querying page:  14
Saved as Metadata_L

## **Getting Text**
With the following code, we'll get the text that Chronicling America secured via their own OCR implementation.

In [None]:
#Fetch OCR text and metadata for a single search results page
Base_URL = "https://chroniclingamerica.loc.gov"
def get_text(ids):
    """Return the metadata and OCR text for each search-result page id.

    Each id is expected to look like '/lccn/<lccn>/<date>/<edition>/<sequence>/'
    (inferred from how it is split). For every id the page's JSON record is
    requested, then the URL stored under its 'text' key is fetched.

    Returns a list of dicts with keys 'id', 'date_issued', 'ed', 'seq',
    'paper' and 'text', in the same order as `ids`.
    """
    harvested = []
    for page_id in ids:
        #Request the page record (trailing '/' dropped before adding '.json')
        record_resp = requests.get(Base_URL + page_id[:-1] + '.json')

        #Metadata fields come straight from the id's path segments
        parts = page_id.split('/')
        entry = {
            'id': parts[2],
            'date_issued': parts[3],
            'ed': parts[4],
            'seq': parts[5],
        }

        #Newspaper title and the link to the OCR text live in the record
        record = record_resp.json()
        entry['paper'] = record['title']['name']
        entry['text'] = requests.get(record['text']).text

        harvested.append(entry)

    return harvested

# Quick check of get_text on one page of results.
# NOTE: `ids` is not defined in this cell; it is whatever the most recently run
# search loop left behind, so that loop must have been run first.
articles = get_text(ids)
print('Number of articles retrieved: ', len(articles))

Number of articles retrieved:  50


Using the function above, we'll cycle through each page and retrieve the article text and metadata that we need.

In [None]:
#Fetch one page of search results at a time (results_per_page hits each) and
#save the articles' text and metadata to file in json format
# NOTE: despite its name, num_results is used as the exclusive upper bound on the
# page number, so pages 1 .. num_results-1 are requested.
num_results = 320
page_increment = 1
results_per_page = 50

for page_num in range(1, num_results, page_increment):
    #Setup query page, and get url extensions containing persistent IDs
    print('Querying page: ', page_num)
    # The URL must be rebuilt for every page. Without this assignment the loop
    # re-queried whatever search_URL an earlier cell had left behind and saved
    # the same results under every file name.
    search_URL = "https://chroniclingamerica.loc.gov/search/pages/results/?date1=1918&rows={}&searchType=basic&format=json&state=&date2=1920&proxtext=spanish+flu&y=50&x=14&dateFilterType=yearRange&page={}&sort=relevance".format(results_per_page, page_num)
    response = requests.get(search_URL)
    response.raise_for_status()  # surface HTTP errors before trying to parse the body
    data = response.json()
    ids = [i["id"] for i in data["items"]]

    #Harvest articles & metadata
    articles = get_text(ids)

    #Save to disk
    filename = ''.join(('LOC - ', str(page_num), '_', str(page_num+page_increment), '.json'))
    with open(filename, 'w') as f:
        json.dump(articles, f)
    print('Saved as {}'.format(filename))

Querying page:  1
Saved as LOC - 1_2.json
Querying page:  2
Saved as LOC - 2_3.json
Querying page:  3
Saved as LOC - 3_4.json
Querying page:  4
Saved as LOC - 4_5.json
Querying page:  5
Saved as LOC - 5_6.json
Querying page:  6
Saved as LOC - 6_7.json
Querying page:  7
Saved as LOC - 7_8.json
Querying page:  8
Saved as LOC - 8_9.json
Querying page:  9
Saved as LOC - 9_10.json
Querying page:  10
Saved as LOC - 10_11.json
Querying page:  11
Saved as LOC - 11_12.json
Querying page:  12
Saved as LOC - 12_13.json
Querying page:  13
Saved as LOC - 13_14.json
Querying page:  14
Saved as LOC - 14_15.json
Querying page:  15
Saved as LOC - 15_16.json
Querying page:  16
Saved as LOC - 16_17.json
Querying page:  17
Saved as LOC - 17_18.json
Querying page:  18
Saved as LOC - 18_19.json
Querying page:  19
Saved as LOC - 19_20.json
Querying page:  20
Saved as LOC - 20_21.json
Querying page:  21
Saved as LOC - 21_22.json
Querying page:  22
Saved as LOC - 22_23.json
Querying page:  23
Saved as LOC - 23