In [1]:
# imports
import requests
import json
import math
import pandas as pd
import spacy

In [39]:
# initial search
url = 'https://chroniclingamerica.loc.gov/search/pages/results/?state=California&date1=1770&date2=1963&proxtext=gold+rush&x=28&y=9&dateFilterType=yearRange&rows=20&searchType=basic&format=json'
response = requests.get(url)
raw = response.text
results = json.loads(raw)

In [40]:
results.keys()

dict_keys(['totalItems', 'endIndex', 'startIndex', 'itemsPerPage', 'items'])

In [41]:
# explore items
print(type(results['items']))

<class 'list'>


In [42]:
print(results['items'][0])



In [31]:
print('totalItems:', results['totalItems'])
print('endIndex:', results['endIndex'])
print('startIndex:', results['startIndex'])
print('itemsPerPage:', results['itemsPerPage'])
print('Length and type of items:', len(results['items']), type(results['items']))

totalItems: 30966
endIndex: 20
startIndex: 1
itemsPerPage: 20
Length and type of items: 20 <class 'list'>


In [58]:
# find total amount of pages
total_pages = math.ceil(results['totalItems'] / results['itemsPerPage'])
print(total_pages)

62


In [59]:
# create empty list for data
data = []

In [60]:
# set search parameters
start_date = '1770'
end_date = '1963'
search_term = 'gold+rush'
state = 'California'

In [61]:
# loop through search results and collect data
for i in range(1, 11):
    url = (f'https://chroniclingamerica.loc.gov/search/pages/results/?state={state}&date1={start_date}'
           f'&date2={end_date}&proxtext={search_term}&x=16&y=8&dateFilterType=yearRange&rows=20'
           f'&searchType=basic&format=json&page={i}')
    response = requests.get(url)
    raw = response.text
    print(f'page {i} status code:', response.status_code)
    results = json.loads(raw)
    items_ = results['items']
    for item_ in items_:
        row_data = {}
        try:
          row_data['title'] = item_['title_normal']
        except:
          row_data['city'] = "none"
        try:
          row_data['city'] = item_['city']
        except:
          row_data['city'] = "none"
        try:
          row_data['date'] = item_['date']
        except:
          row_data['date'] = "none"
        try:
          row_data['raw_text'] = item_['ocr_eng']
        except:
          row_data['raw_text'] = 'none'
    data.append(row_data)

page 1 status code: 200
page 2 status code: 200
page 3 status code: 200
page 4 status code: 200
page 5 status code: 200
page 6 status code: 200
page 7 status code: 200
page 8 status code: 200
page 9 status code: 200
page 10 status code: 200


In [62]:
print(data)



In [63]:
# put data into DataFrame
df = pd.DataFrame.from_dict(data)

In [64]:
df.head()

Unnamed: 0,title,city,date,raw_text
0,imperial valley press.,[El Centro],19361227,What Mammy Pleasant’s Sinister Power?\nThe Sto...
1,imperial valley press.,[El Centro],19251228,2\nSOCIETY AND CLUE NEWS\nESTHER RALSTON\nAND ...
2,san francisco call.,[San Francisco],19030909,themselves to realize that the men who\nflocke...
3,imperial valley press.,[El Centro],19121204,THE EL CENTRO NATIONAL BANK\n•\nDIRECTORS\nW. ...
4,san francisco call.,[San Francisco],19130309,62\nRochester Begins Like All New Camps\nEVERY...


In [65]:
# convert date column from string to date-time object
df['date'] = pd.to_datetime(df['date'])

In [66]:
df.head()

Unnamed: 0,title,city,date,raw_text
0,imperial valley press.,[El Centro],1936-12-27,What Mammy Pleasant’s Sinister Power?\nThe Sto...
1,imperial valley press.,[El Centro],1925-12-28,2\nSOCIETY AND CLUE NEWS\nESTHER RALSTON\nAND ...
2,san francisco call.,[San Francisco],1903-09-09,themselves to realize that the men who\nflocke...
3,imperial valley press.,[El Centro],1912-12-04,THE EL CENTRO NATIONAL BANK\n•\nDIRECTORS\nW. ...
4,san francisco call.,[San Francisco],1913-03-09,62\nRochester Begins Like All New Camps\nEVERY...


In [67]:
# sort by date
df = df.sort_values(by='date')

In [68]:
df.head()

Unnamed: 0,title,city,date,raw_text
2,san francisco call.,[San Francisco],1903-09-09,themselves to realize that the men who\nflocke...
3,imperial valley press.,[El Centro],1912-12-04,THE EL CENTRO NATIONAL BANK\n•\nDIRECTORS\nW. ...
4,san francisco call.,[San Francisco],1913-03-09,62\nRochester Begins Like All New Camps\nEVERY...
6,imperial valley press.,[El Centro],1924-06-06,"Friday, June 6,1924\nWomens Interests\nDR. EDI..."
1,imperial valley press.,[El Centro],1925-12-28,2\nSOCIETY AND CLUE NEWS\nESTHER RALSTON\nAND ...


In [69]:
# write fuction to process text
# load nlp model
nlp = spacy.load("en_core_web_sm")
nlp.disable_pipes('ner', 'parser')

def process_text(text):
    """Remove new line characters and lemmatize text. Returns string of lemmas"""
    text = text.replace('\n', ' ')
    doc = nlp(text)
    tokens = [token for token in doc]
    no_stops = [token for token in tokens if not token.is_stop]
    no_punct = [token for token in no_stops if token.is_alpha]
    lemmas = [token.lemma_ for token in no_punct]
    lemmas_lower = [lemma.lower() for lemma in lemmas]
    lemmas_string = ' '.join(lemmas_lower)
    return lemmas_string

In [70]:
# apply process_text function
# this may take a few minutes
df['lemmas'] = df['raw_text'].apply(process_text)

In [71]:
# save to csv
df.to_csv(f'{search_term}{start_date}-{end_date}.csv', index=False)