In [6]:
import requests
import pandas as pd

def fetch_meteorite_mentions(search_term="meteorite", date1="1800", date2="1963",
                             rows=20, timeout=30):
    """Page through a Chronicling America page search and collect the hits.

    Sends repeated GET requests to the chroniclingamerica.loc.gov page-search
    endpoint (JSON format), advancing the ``page`` parameter until the page
    count derived from the response's ``totalItems`` is reached, a page comes
    back without items, or a request fails. Fetching is best-effort: on any
    failure the rows collected so far are returned instead of raising.

    Parameters
    ----------
    search_term : str
        Value sent as ``proxtext``. Defaults to ``"meteorite"``.
    date1, date2 : str
        Values sent as the ``date1`` / ``date2`` year-range bounds.
    rows : int
        Results requested per page; also used to compute the number of pages.
    timeout : float
        Per-request timeout in seconds, passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One row per returned item with columns ``date``, ``title``, ``id``,
        ``place`` (from ``place_of_publication``) and ``frequency``. Missing
        fields are filled with ``""``. Empty (but with those columns) when
        nothing could be fetched.

    Raises
    ------
    ValueError
        If ``rows`` is smaller than 1.
    """
    if rows < 1:
        raise ValueError("rows must be a positive integer")

    URL = "https://chroniclingamerica.loc.gov/search/pages/results/"
    meteorite_params = {
        "date1": date1,
        "date2": date2,
        "proxtext": search_term,
        "searchType": "basic",
        "dateFilterType": "yearRange",
        "format": "json",
        "rows": rows,
        "page": 1,
    }

    columns = ["date", "title", "id", "place", "frequency"]
    # Accumulate plain dicts and build the DataFrame once at the end rather
    # than creating and concatenating one frame per page.
    records = []

    while True:
        print("Fetching page:", meteorite_params['page'])  # Debug print

        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(URL, params=meteorite_params, timeout=timeout)
        except requests.RequestException as exc:
            # Keep whatever was already downloaded instead of losing it.
            print(f"Failed to fetch data: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch data: {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError:
            # Raised when the body is not valid JSON.
            print("Failed to fetch data: response was not valid JSON")
            break

        items = data.get('items') if isinstance(data, dict) else None
        if not items:
            print("No more items to fetch.")
            break

        records.extend(
            {
                "date": item.get("date", ""),
                "title": item.get("title", ""),
                "id": item.get("id", ""),
                "place": item.get("place_of_publication", ""),
                "frequency": item.get("frequency", ""),
            }
            for item in items
        )

        # Ceiling division by the page size actually requested; a missing
        # 'totalItems' is treated as "no further pages".
        total_pages = (data.get('totalItems', 0) + rows - 1) // rows
        if meteorite_params['page'] < total_pages:
            meteorite_params['page'] += 1
        else:
            print("Fetched all pages.")
            break

    return pd.DataFrame(records, columns=columns)

# Run the paginated search, write every collected row to CSV without the
# index column, then echo the resulting table to the cell output.
result_df = fetch_meteorite_mentions()
result_df.to_csv(path_or_buf="cham_api_1800_1963.csv", index=False)
print(result_df)

Fetching page: 1
Fetching page: 2
Fetching page: 3
Fetching page: 4
Fetching page: 5
Fetching page: 6
Fetching page: 7
Fetching page: 8
Fetching page: 9
Fetching page: 10
Fetching page: 11
Fetching page: 12
Fetching page: 13
Fetching page: 14
Fetching page: 15
Fetching page: 16
Fetching page: 17
Fetching page: 18
Fetching page: 19
Fetching page: 20
Fetching page: 21
Fetching page: 22
Fetching page: 23
Fetching page: 24
Fetching page: 25
Fetching page: 26
Fetching page: 27
Fetching page: 28
Fetching page: 29
Fetching page: 30
Fetching page: 31
Fetching page: 32
Fetching page: 33
Fetching page: 34
Fetching page: 35
Fetching page: 36
Fetching page: 37
Fetching page: 38
Fetching page: 39
Fetching page: 40
Fetching page: 41
Fetching page: 42
Fetching page: 43
Fetching page: 44
Fetching page: 45
Fetching page: 46
Fetching page: 47
Fetching page: 48
Fetching page: 49
Fetching page: 50
Fetching page: 51
Fetching page: 52
Fetching page: 53
Fetching page: 54
Fetching page: 55
Fetching page: 56
F

In [3]:
# import requests
# import pandas as pd

# URL = "https://chroniclingamerica.loc.gov/search/pages/results/?date1=1900&rows=&searchType=basic&state=&date2=1963&proxtext=meteorite+found&y=0&x=0&dateFilterType=yearRange&page=1&format=json"

# response_df = pd.read_json(URL)
# response_df

# # Pull out each page's date, title, id, place, frequency, and OCR text, and convert them to a DataFrame
# pages_dict = {"date" : [], "title" : [], "id" : [], "place" : [], "frequency" : [], "text" : []}
# for item in response_df["items"]:
#   pages_dict["date"].append(item["date"])
#   pages_dict["title"].append(item["title"])
#   pages_dict["id"].append(item["id"])
#   pages_dict["place"].append(item["place_of_publication"])
#   pages_dict["frequency"].append(item["frequency"])
#   pages_dict["text"].append(item["ocr_eng"])

# pages_df = pd.DataFrame(pages_dict)
# pages_df

# pages_df.to_csv("cham_api_1900_1963.csv")
