In [1]:
# Import Dependencies
import requests
import pandas as pd
import json
from splinter import Browser
from bs4 import BeautifulSoup as bs
from unicodedata import normalize
import numpy as np
import time
from io import StringIO

# Finding 1st Source #

In [2]:
# NOAA NCEI "billions" events feed (JSON) for US events, 1980-2024
first_url = (
    "https://www.ncei.noaa.gov/access/billions/"
    "events-US-1980-2024.json"
)

In [3]:
# Read JSON
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors before the body is parsed as JSON.
first_response = requests.get(first_url, timeout=30)
first_response.raise_for_status()
res = first_response.json()

In [4]:
# Extract Necessary Data from JSON
# Keep only the payload stored under the top-level 'data' key of the response;
# it is what the DataFrame in the next cell is built from.
res_data = res['data']

In [5]:
# Create a DataFrame from JSON
# presumably res_data is a list of per-event records — TODO confirm against the feed
million_dollar_disasters = pd.DataFrame(res_data)

In [6]:
# Row count of the disasters DataFrame (sanity check on the download)
million_dollar_disasters.shape[0]

383

In [7]:
# Look at one row of data
# sample(1) picks the row at random, so the displayed row changes between runs
million_dollar_disasters.sample(1)

Unnamed: 0,name,disaster,begDate,endDate,adjCost,unadjCost,deaths
137,Southeast Tornadoes and Severe Weather (Februa...,Severe Storm,20080205,20080206,1793.2,1211.6,57


In [8]:
# Export to CSV
# Written to the current working directory. to_csv is called with its defaults,
# so the DataFrame index is saved as an unnamed first column.
million_dollar_disasters.to_csv('million_dollar_disasters.csv')

# Finding 2nd Source #

In [9]:
# Set up the browser
# Starts a splinter-driven Chrome session; it is shut down later with browser.quit()
browser = Browser('chrome')

In [10]:
# Set URL & Visit URL
# Named second_url (in line with first_url / third_url) so that `html` only
# ever refers to page markup, which the next cell reads from browser.html.
second_url = (
    "https://public.opendatasoft.com/explore/dataset/openaq/table/"
    "?disjunctive.city"
    "&disjunctive.location"
    "&disjunctive.measurements_parameter"
    "&sort=measurements_lastupdated"
    "&refine.country_name_en=United+States"
)
browser.visit(second_url)

In [11]:
# Parse the data
# Take the page markup as currently rendered in the browser and parse it with
# Python's built-in 'html.parser' backend.
html = browser.html
soup = bs(html, 'html.parser')

In [12]:
# Collect the column labels shown in the rendered table header
records_headers = soup.find_all('div', class_='odswidget-table__label')
header_list = [label.text for label in records_headers]
header_list

['Country Label', 'Pollutant', 'Value', 'Unit', 'Last Updated', 'Source Name']

In [13]:
# Extract rows for DataFrame
# Walk the rendered rows by their "record-<n>" class, starting at 0 and
# stopping at the first index that is absent from the parsed page. Iterating
# by record index (rather than over the children of the first <tr>) ties the
# loop length to the number of record rows actually present.
row_list = []
n = 0

while True:
    record_row = soup.find('tr', class_=f'odswidget-table__internal-table-row record-{n}')
    if record_row is None:
        break  # no further rendered records

    first_cell = record_row.find('td', class_="odswidget-table__cell")
    throwaway = (
        first_cell.find('div', class_="odswidget-table__cell-container")
        if first_cell is not None
        else None
    )

    if throwaway is not None:
        # The first cell container itself is skipped; the six <div>s that
        # follow it in document order supply the row's values.
        # NOTE(review): assumes those six <div>s line up with header_list —
        # confirm against the page markup.
        values = [div.text for div in throwaway.find_all_next('div', limit=6)]
        row_list.append(values)

    n += 1

In [14]:
# Build the DataFrame from the scraped rows and report how many were captured
df_2 = pd.DataFrame(data=row_list, columns=header_list)
df_2.shape[0]

17

In [15]:
# Quit the browsing session
# The page markup has already been captured in `html` / `soup`, so the browser
# is no longer needed.
browser.quit()

Note: Scraping the rendered page yielded only 17 rows, so we decided to pull the data from the JSON API instead.

# Finding 3rd Source #

In [16]:
# Opendatasoft records API request for the openaq dataset (United States only),
# written one query parameter per line for readability
third_url = (
    "https://public.opendatasoft.com/api/records/1.0/search/"
    "?rows=100"
    "&disjunctive.city=true"
    "&disjunctive.location=true"
    "&disjunctive.measurements_parameter=true"
    "&sort=measurements_lastupdated"
    "&refine.country_name_en=United+States"
    "&start=0"
    "&fields=country_name_en,measurements_parameter,measurements_value,"
    "measurements_unit,measurements_lastupdated,measurements_sourcename"
    "&dataset=openaq"
    "&timezone=America%2FChicago"
    "&lang=en"
)

In [17]:
# Get the data
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors before the body is parsed as JSON.
response_2 = requests.get(third_url, timeout=30)
response_2.raise_for_status()
res_2 = response_2.json()
res_2

{'nhits': 6051,
 'parameters': {'dataset': 'openaq',
  'lang': 'en',
  'rows': 100,
  'start': 0,
  'sort': ['measurements_lastupdated'],
  'refine': {'country_name_en': 'United States'},
  'disjunctive': {'city': 'true',
   'location': 'true',
   'measurements_parameter': 'true'},
  'format': 'json',
  'timezone': 'America/Chicago',
  'fields': ['country_name_en',
   'measurements_parameter',
   'measurements_value',
   'measurements_unit',
   'measurements_lastupdated',
   'measurements_sourcename']},
 'records': [{'datasetid': 'openaq',
   'recordid': '1e2db59bc1c67d7e1f818ff0e4a5c4d5bfdb618f',
   'fields': {'measurements_unit': 'ppm',
    'measurements_value': 0.037,
    'measurements_sourcename': 'AirNow',
    'measurements_lastupdated': '2023-05-30T18:00:00-05:00',
    'measurements_parameter': 'O3',
    'country_name_en': 'United States'}},
  {'datasetid': 'openaq',
   'recordid': '488082fdc2cce20bb0321437c9db425e35d8d3be',
   'fields': {'measurements_unit': 'µg/m³',
    'measur

In [18]:
# Drill down to data needed
# Each entry of res_2['records'] carries its measurement values under 'fields';
# collect that dict from every record (one per row of the eventual DataFrame).
res_2_data = res_2['records']
res_2_data_list = [record['fields'] for record in res_2_data]

In [19]:
# Create DataFrame
# One row per 'fields' dict collected above; sample(1) then shows a randomly
# chosen row, so the displayed row changes between runs.
air_pollution_df = pd.DataFrame(res_2_data_list)
air_pollution_df.sample(1)

Unnamed: 0,measurements_unit,measurements_value,measurements_sourcename,measurements_lastupdated,measurements_parameter,country_name_en
12,ppm,0.001,AirNow,2023-05-30T18:00:00-05:00,NO2,United States


In [20]:
# Row count of the air pollution DataFrame
air_pollution_df.shape[0]

100

In [21]:
# Export to CSV
# Written to the current working directory. to_csv is called with its defaults,
# so the DataFrame index is saved as an unnamed first column.
air_pollution_df.to_csv('air_pollution_df.csv')

Note: Due to time constraints, we extracted only 100 records for this assignment (the `rows=100` limit set in the request URL), out of the 6,051 hits reported by the API.