In [1]:
import json
import math

import requests

# Lower-case two-letter abbreviation -> lower-case full name for the 50 US states.
# get_newspaper_count_for_all_states iterates over the values (the full names)
# of whatever mapping it is given and uses each one as a search term.
states_dict = {"al":"alabama","ak":"alaska","az":"arizona","ar":"arkansas","ca":"california","co":"colorado","ct":"connecticut","de":"delaware","fl":"florida","ga":"georgia","hi":"hawaii","id":"idaho","il":"illinois","in":"indiana","ia":"iowa","ks":"kansas","ky":"kentucky","la":"louisiana","me":"maine","md":"maryland","ma":"massachusetts","mi":"michigan","mn":"minnesota","ms":"mississippi","mo":"missouri","mt":"montana","ne":"nebraska","nv":"nevada","nh":"new hampshire","nj":"new jersey","nm":"new mexico","ny":"new york","nc":"north carolina","nd":"north dakota","oh":"ohio","ok":"oklahoma","or":"oregon","pa":"pennsylvania","ri":"rhode island","sc":"south carolina","sd":"south dakota","tn":"tennessee","tx":"texas","ut":"utah","vt":"vermont","va":"virginia","wa":"washington","wv":"west virginia","wi":"wisconsin","wy":"wyoming"}

def write_to_file(fn, data):
    """Serialize ``data`` as JSON into the file at path ``fn``.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(fn, mode='w') as sink:
        json.dump(data, fp=sink)

def read_from_file(fn):
    """Return the object decoded from the JSON document stored at path ``fn``."""
    with open(fn) as source:
        return json.load(source)

def query_q1():
    """Download the newspaper listing, cache it in ``1.json`` and return it.

    Returns:
        The decoded JSON body of the ``newspapers.json`` endpoint.
    """
    listing = requests.get(url="https://chroniclingamerica.loc.gov/newspapers.json").json()
    write_to_file("1.json", listing)
    return listing

# Refresh the on-disk cache, then work from the cached copy.
query_q1()
data_all = read_from_file("1.json")
print("Total number of newspapers: ", len(data_all["newspapers"]))

# Tally newspapers per state; states_list records each state in the order
# it is first encountered in the listing.
states_list = []
state_wise_count = {}
for item in data_all["newspapers"]:
    state_name = item["state"]
    if state_name in state_wise_count:
        state_wise_count[state_name] += 1
    else:
        state_wise_count[state_name] = 1
        states_list.append(state_name)
    

def get_newspaper_count_for_all_states(states_dict):
    """Query the title search once per state and cache the reported totals.

    Args:
        states_dict: Mapping whose values are the state names used as search
            terms; the keys are ignored.

    Returns:
        ``(total_newspaper_count_dict, counts)`` where the dict maps each
        state name to the response's ``totalItems`` value (a falsy value is
        stored as 0) and ``counts`` holds the same numbers in iteration order.

    Side effects:
        Issues one HTTP request per state and writes both results to
        ``total_newspaper_count_dict.json``.
    """
    total_newspaper_count_dict = {}
    counts = []

    for state in states_dict.values():
        search_url = "https://chroniclingamerica.loc.gov/search/titles/results/?terms=" + state + "&format=json"
        payload = requests.get(url=search_url).json()
        n_titles = payload["totalItems"] or 0
        total_newspaper_count_dict[state] = n_titles
        counts.append(n_titles)

    cache_payload = {"total_newspaper_count_dict": total_newspaper_count_dict, "counts": counts}
    write_to_file("total_newspaper_count_dict.json", cache_payload)
    return total_newspaper_count_dict, counts

# Load the cached per-state totals. Nothing else in the notebook calls
# get_newspaper_count_for_all_states, so on a fresh run the cache file does not
# exist yet: build it once (one HTTP request per state) and read it back.
try:
    statewise_total_newpaper_count = read_from_file("total_newspaper_count_dict.json")
except FileNotFoundError:
    get_newspaper_count_for_all_states(states_dict)
    statewise_total_newpaper_count = read_from_file("total_newspaper_count_dict.json")
total_newspaper_count_dict, counts = statewise_total_newpaper_count["total_newspaper_count_dict"], statewise_total_newpaper_count["counts"]


# Position (within counts) of the smallest and largest totals; on ties the
# first occurrence wins.
idx_of_state_with_min_publications = counts.index(min(counts))
idx_of_state_with_max_publications = counts.index(max(counts))

# The positions are translated back to state names through the dict's key
# order, which relies on counts and the dict having been filled in the same
# order when the cache file was written.
list_of_states = list(total_newspaper_count_dict.keys())
state_with_min_publications = list_of_states[idx_of_state_with_min_publications]
state_with_max_publications = list_of_states[idx_of_state_with_max_publications]

print("State with the highest number of publications,", state_with_max_publications+" :",counts[idx_of_state_with_max_publications])
print("State with the least number of publications,", state_with_min_publications+" :",counts[idx_of_state_with_min_publications])
print("Alabama, total number of newspapers:", total_newspaper_count_dict["alabama"])


def query_q5(state="oregon"):
    """Run the title search for ``state`` and cache the response in ``5.json``.

    No page number is sent with the request, so only whatever the endpoint
    returns for an un-paged query is stored (presumably the first page --
    TODO confirm).

    Args:
        state: Search term for the title search. Defaults to "oregon", the
            value that used to be hard-coded, so existing ``query_q5()``
            calls behave as before.

    Returns:
        The decoded JSON response.
    """
    # Passing the query through ``params`` lets requests URL-encode the term,
    # which matters for names containing spaces such as "new york".
    r = requests.get(
        url="https://chroniclingamerica.loc.gov/search/titles/results/",
        params={"terms": state, "format": "json"},
    )
    data = r.json()
    write_to_file("5.json", data)
    return data

# State whose newspapers are examined in detail in the following steps.
my_state = "oregon"
my_state_total = total_newspaper_count_dict[my_state]
print("Total number of newspapers for my state, "+my_state+" :", my_state_total)

def save_all_newspaper_info_for_q5(total_newspaper_count_dict, my_state):
    """Collect title and start year of every search result for ``my_state``.

    Args:
        total_newspaper_count_dict: Mapping from state name to the total
            number of search results for that state; used to work out how
            many result pages to request.
        my_state: State name used both as the lookup key and as the search
            term.

    Returns:
        A list of ``{'title': ..., 'start_year': ...}`` dicts, one per item
        returned by the search, in page order.

    Side effects:
        Issues one HTTP request per result page and writes the list to
        ``5-2.json``.
    """
    # The page count assumes the endpoint returns 50 items per page -- TODO
    # confirm against the API's default page size.
    results_per_page = 50
    total_no_pages = math.ceil(total_newspaper_count_dict[my_state] / results_per_page)

    res = []
    for page_no in range(1, total_no_pages + 1):
        # ``params`` lets requests URL-encode the search term.
        r = requests.get(
            url="https://chroniclingamerica.loc.gov/search/titles/results/",
            params={"terms": my_state, "format": "json", "page": page_no},
        )
        data = r.json()
        res.extend(
            {'title': item["title"], 'start_year': item["start_year"]}
            for item in data["items"]
        )

    write_to_file('5-2.json', res)
    return res

# Load the cached per-newspaper records for my_state. Nothing else in the
# notebook calls save_all_newspaper_info_for_q5, so on a fresh run the cache
# file is missing: build it once and read it back.
try:
    all_news_my_state = read_from_file("5-2.json")
except FileNotFoundError:
    save_all_newspaper_info_for_q5(total_newspaper_count_dict, my_state)
    all_news_my_state = read_from_file("5-2.json")
print("\n")
print("Printing information for all newspapers from "+my_state+":")
# Only a five-record preview is printed; slicing (rather than indexing 0..4)
# keeps this working when fewer than five records are available.
for news_item in all_news_my_state[:5]:
    print("Title     :", news_item["title"])
    print("Start year:", news_item["start_year"])
# Report the actual number of cached records instead of a hard-coded total.
print("Please refer to 5-2.json file for all", len(all_news_my_state), "results")



def get_replacement_position(start_year, top5_start_year):
    """Walk the five tracked years from the last slot towards the first.

    Args:
        start_year: Year to place.
        top5_start_year: Sequence of five years.

    Returns:
        The highest index in 4..1 whose year is not greater than
        ``start_year``; 0 when every one of those slots holds a greater year.
    """
    for position in range(4, 0, -1):
        if not (start_year < top5_start_year[position]):
            return position
    return 0

def get_updated_top_5_results(idx, start_year, item, top5_start_year, top5):
    """Insert a publication into the five-slot ranking and drop the last slot.

    Args:
        idx: Position reported by ``get_replacement_position``. For 1..3 the
            new entry goes directly after that slot. 0 is ambiguous (the year
            may sort before slot 0 or between slots 0 and 1), so it is
            resolved here by comparing against slot 0. 4 or more leaves the
            ranking unchanged.
        start_year: Year stored in the parallel list of years.
        item: Mapping providing the "title" and "start_year" values stored in
            the ranking entry.
        top5_start_year: Current list of five years.
        top5: Current list of five ranking entries, parallel to
            ``top5_start_year``.

    Returns:
        ``(top5_start_year, top5)`` as new lists; the inputs are not mutated.
    """
    if idx >= 4:
        return top5_start_year, top5

    # Previously idx == 0 always inserted at the front, which put a year that
    # is >= slot 0 ahead of it and broke the ordering of the ranking.
    if idx == 0 and start_year < top5_start_year[0]:
        insert_at = 0
    else:
        insert_at = idx + 1

    entry = {'title': item["title"], 'start_year': item["start_year"]}
    top5_start_year = top5_start_year[:insert_at] + [start_year] + top5_start_year[insert_at:-1]
    top5 = top5[:insert_at] + [entry] + top5[insert_at:-1]
    return top5_start_year, top5

# Blank lines followed by a visual separator before the next section's output.
print("\n", "=================================", sep="\n")
def query_q6(data_all):
    """Page through the un-filtered title search and track five early start years.

    Args:
        data_all: Mapping whose "totalItems" entry gives the number of
            publications to page through.
            NOTE(review): elsewhere in this notebook ``data_all`` is the
            ``newspapers.json`` listing, which is only shown with a
            "newspapers" key -- confirm the value passed here really carries
            "totalItems".

    Returns:
        ``(top5_start_year, top5)``: the five years kept and the parallel
        list of ``{'title': ..., 'start_year': ...}`` entries. Slots that were
        never filled keep their initial placeholders (2019 and ``{}``).

    Side effects:
        Prints a progress line, issues one HTTP request per result page and
        writes ``top5`` to ``6.json``.
    """
    print("Querying all newspaper publications")
    # The page count assumes 50 results per page -- TODO confirm against the
    # API's default page size.
    total_no_pages_all_publications = math.ceil(data_all["totalItems"] / 50)
    top5 = [{}, {}, {}, {}, {}]
    # 2019 acts as the initial upper bound: a publication has to start
    # strictly before the current last slot to be considered at all.
    top5_start_year = [2019, 2019, 2019, 2019, 2019]

    for page_no in range(1, total_no_pages_all_publications + 1):
        r = requests.get(
            url="https://chroniclingamerica.loc.gov/search/titles/results/",
            params={"terms": "", "format": "json", "page": page_no},
        )
        data = r.json()

        for item in data["items"]:
            start_year = item["start_year"]
            # Candidates must beat the current last slot; years of 1689 or
            # earlier are ignored (presumably to filter out placeholder or
            # bogus values -- verify).
            if 1689 < start_year < top5_start_year[-1]:
                idx = get_replacement_position(start_year, top5_start_year)
                top5_start_year, top5 = get_updated_top_5_results(idx, start_year, item, top5_start_year, top5)
    write_to_file('6.json', top5)
    return top5_start_year, top5

# Disabled driver for query_q6: the call and the printing loop are wrapped in a
# bare string literal, so none of it executes. Because the string is the
# cell's final expression, its repr is echoed as the cell's output value.
'''
top5_start_year, top5 = query_q6(data_all)
print("Printing top 5 publications with the earliest start years:\n")

for result in top5:
    print(result["title"]+" - "+str(result["start_year"]))
'''

Total number of newspapers:  155856
State with the highest number of publications, ohio : 7847
State with the least number of publications, rhode island : 150
Alabama, total number of newspapers: 285
Total number of newspapers for my state, oregon : 1541


Printing information for all newspapers from oregon:
Title     : Oregon emerald.
Start year: 1909
Title     : Oregon news.
Start year: 1959
Title     : Oregon summer emerald.
Start year: 1966
Title     : Ōshū nippō = The Oregon news. [microfiche]
Start year: 1908
Title     : The Eastern Oregon Republican. [volume]
Start year: 1888
Please refer to 5-2.json file for all 1541 results




'\ntop5_start_year, top5 = query_q6(data_all)\nprint("Printing top 5 publications with the earliest start years:\n")\n\nfor result in top5:\n    print(result["title"]+" - "+str(result["start_year"]))\n'

In [3]:
# Fetch the newspaper listing straight from the API again, without going
# through the 1.json cache, and rebind data_all to the fresh response.
r_all = requests.get("https://chroniclingamerica.loc.gov/newspapers.json")
data_all = r_all.json()

In [8]:
# Number of entries under "newspapers" in the listing currently bound to data_all.
len(data_all['newspapers'])

3066

In [9]:
# Show the first record to see which fields each newspaper entry carries.
data_all['newspapers'][0]

{'lccn': 'sn86072192',
 'url': 'https://chroniclingamerica.loc.gov/lccn/sn86072192.json',
 'state': 'Alabama',
 'title': 'The Age-herald. [volume]'}

In [11]:
# Scratch cell: start with an empty set.
x = set()

In [12]:
# Scratch cell: add a single element to the set.
x.add('a')

In [13]:
# Scratch cell: display the set's current contents.
x

{'a'}