In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, engine
import requests
from IPython.display import JSON
from pprint import pprint as pp

from config import local_mysql_password, local_mysql_user, NOAA_token

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

## Fetch the list of countries in the fao data

In [36]:
fao_file_path = "./data/fao_data/"

# Load the crop yield data
file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized)'
fao_crop_yield_data = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{file_name}.csv", encoding='latin-1')

regions = [
    'World', 'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Americas',
    'Northern America', 'Central America', 'Caribbean', 'South America', 'Asia', 'Central Asia', 'Eastern Asia',
    'Southern Asia', 'South-eastern Asia', 'Western Asia', 'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe', 'Oceania', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia'
    ]
special_groups = [
    'European Union (28)', 'European Union (27)', 'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States',
    'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries', 'Annex I countries', 'Non-Annex I countries',  'OECD'
    ]
fao_countries = fao_crop_yield_data.Area.loc[~fao_crop_yield_data.Area.isin(regions+special_groups)].unique().tolist()
len(fao_countries)

211

# NOAA data api

In [3]:
base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/"
headers = {
    'token': NOAA_token
    }

## Fetch all available location categories

In [34]:
endpoint='locationcategories'
querystring = {}

url = f"{base_url}{endpoint}"
response = requests.request("GET", url, headers=headers, params=querystring)
pp(response.json())

{'metadata': {'resultset': {'count': 12, 'limit': 25, 'offset': 1}},
 'results': [{'id': 'CITY', 'name': 'City'},
             {'id': 'CLIM_DIV', 'name': 'Climate Division'},
             {'id': 'CLIM_REG', 'name': 'Climate Region'},
             {'id': 'CNTRY', 'name': 'Country'},
             {'id': 'CNTY', 'name': 'County'},
             {'id': 'HYD_ACC', 'name': 'Hydrologic Accounting Unit'},
             {'id': 'HYD_CAT', 'name': 'Hydrologic Cataloging Unit'},
             {'id': 'HYD_REG', 'name': 'Hydrologic Region'},
             {'id': 'HYD_SUB', 'name': 'Hydrologic Subregion'},
             {'id': 'ST', 'name': 'State'},
             {'id': 'US_TERR', 'name': 'US Territory'},
             {'id': 'ZIP', 'name': 'Zip Code'}]}


## Fetch all available countries

In [7]:
endpoint='locations'
url = f"{base_url}{endpoint}#?locationcategoryid=CNTRY"
querystring = {
    'locationcategoryid': 'CNTRY',
    'limit': 1000
}

noaa_countries_response = requests.request("GET", url, headers=headers, params=querystring)

noaa_countries = {}
for country in noaa_countries_response.json()['results']:
    noaa_countries[country['name']] = country['id']

print(len(noaa_countries))

201


## Compare NOAA countries to FAO countries

In [40]:
count = 0
for country in fao_countries:
    if country not in noaa_countries.keys():
        print(country)
        count += 1
print(count)

Antigua and Barbuda
Bahamas
Barbados
Belgium-Luxembourg
Bolivia (Plurinational State of)
Bosnia and Herzegovina
Brunei Darussalam
Cabo Verde
China, Hong Kong SAR
China, Macao SAR
China, mainland
China, Taiwan Province of
Comoros
Cook Islands
Côte d'Ivoire
Czechia
Czechoslovakia
Democratic People's Republic of Korea
Democratic Republic of the Congo
Djibouti
Eswatini
Ethiopia PDR
Faroe Islands
French Guyana
French Polynesia
Gambia
Grenada
Guadeloupe
Haiti
Iran (Islamic Republic of)
Lao People's Democratic Republic
Maldives
Marshall Islands
Martinique
Micronesia (Federated States of)
Nauru
New Caledonia
North Macedonia
Palestine
Puerto Rico
Republic of Korea
Republic of Moldova
Réunion
Russian Federation
Saint Kitts and Nevis
Saint Lucia
Saint Vincent and the Grenadines
Samoa
Sao Tome and Principe
Serbia and Montenegro
Seychelles
Solomon Islands
Somalia
Sudan (former)
Syrian Arab Republic
Timor-Leste
Tokelau
Trinidad and Tobago
Tuvalu
United Kingdom of Great Britain and Northern Ireland
U

In [41]:
count = 0
for country in noaa_countries.keys():
    if country not in fao_countries:
        print(country)
        count += 1
print(count)

Aruba
Antigua & Barbuda
Antarctica
Bermuda
The Bahamas
Bosnia & Herzegovina
Bolivia
Solomon Is.
Brunei
Congo, DRC
Cayman Is.
Cape Verde
Cook Is.
Jarvis I.
Czech Republic
Falkland Is.
Micronesia
Faroe Is.
French Southern & Antarctic Lands
The Gambia
Gibraltar
Guernsey
Greenland
Gaza Strip
Heard I. & McDonald Is.
Isle of Man
British Indian Ocean Territory
Iran
Cote d'Ivoire
Jersey
North Korea
South Korea
Christmas I.
Laos
Moldova
Macedonia
Norfolk I.
Sint Maarten
Pitcairn Is.
Palau
Reunion
Marshall Is.
Russia
St. Helena
St. Lucia
Syria
Trinidad & Tobago
Turks & Caicos Is.
Tanzania
Curacao
United Kingdom
United States
St. Vincent & the Grenadines
Venezuela
British Virgin Is.
Vietnam
Swaziland
57


## Fetch all available datasets

In [None]:
endpoint='datasets'
url = f"{base_url}{endpoint}"
querystring = {'limit': 1000}
datasets = requests.request("GET", url, headers=headers, params=querystring)
for dataset in datasets.json()['results']:
    print(dataset['id'])
    print(dataset['name'])
    print('-----------------------')

In [45]:
noaa_datasets = []
for dataset in datasets.json()['results']:
    data = {
        'id': dataset['id'],
        'name': dataset['name']
    }
    noaa_datasets.append(data)
pp(noaa_datasets)

[{'id': 'GHCND', 'name': 'Daily Summaries'},
 {'id': 'GSOM', 'name': 'Global Summary of the Month'},
 {'id': 'GSOY', 'name': 'Global Summary of the Year'},
 {'id': 'NEXRAD2', 'name': 'Weather Radar (Level II)'},
 {'id': 'NEXRAD3', 'name': 'Weather Radar (Level III)'},
 {'id': 'NORMAL_ANN', 'name': 'Normals Annual/Seasonal'},
 {'id': 'NORMAL_DLY', 'name': 'Normals Daily'},
 {'id': 'NORMAL_HLY', 'name': 'Normals Hourly'},
 {'id': 'NORMAL_MLY', 'name': 'Normals Monthly'},
 {'id': 'PRECIP_15', 'name': 'Precipitation 15 Minute'},
 {'id': 'PRECIP_HLY', 'name': 'Precipitation Hourly'}]


In [None]:
data_set_ids_monthly = {
    'GSOM': 'Global Summary of the Month',
    'NORMAL_MLY': 'Normals Monthly',
}
data_set_ids_daily = {
    'GHCND': 'Daily Summaries',
    'NORMAL_DLY': 'Normals Daily',
}
data_set_ids_yearly = {
    'GSOY': 'Global Summary of the Year',
    'NORMAL_ANN': 'Normals Annual/Seasonal',
}
data_set_ids_other = {
    'NEXRAD2': 'Weather Radar (Level II)',
    'NEXRAD3': 'Weather Radar (Level III)',
    'NORMAL_HLY': 'Normals Hourly',
    'PRECIP_15': 'Precipitation 15 Minute',
    'PRECIP_HLY': 'Precipitation Hourly',
}

##  Fetch all availabe data types

In [4]:
endpoint='datatypes'
url = f"{base_url}{endpoint}"
querystring = {'limit': 1000}
datatypes = requests.request("GET", url, headers=headers, params=querystring)
pp(datatypes.json())

{'metadata': {'resultset': {'count': 1565, 'limit': 1000, 'offset': 1}},
 'results': [{'datacoverage': 1,
              'id': 'ACMC',
              'maxdate': '1996-05-28',
              'mindate': '1994-03-19',
              'name': 'Average cloudiness midnight to midnight from 30-second '
                      'ceilometer data'},
             {'datacoverage': 1,
              'id': 'ACMH',
              'maxdate': '2005-12-31',
              'mindate': '1965-01-01',
              'name': 'Average cloudiness midnight to midnight from manual '
                      'observations'},
             {'datacoverage': 1,
              'id': 'ACSC',
              'maxdate': '1996-05-28',
              'mindate': '1994-02-01',
              'name': 'Average cloudiness sunrise to sunset from 30-second '
                      'ceilometer data'},
             {'datacoverage': 1,
              'id': 'ACSH',
              'maxdate': '2005-12-31',
              'mindate': '1965-01-01',
              

In [48]:
len(datatypes.json()['results'])

1000

# Examine each dataset

{'id': 'GHCND', 'name': 'Daily Summaries'},

{'id': 'GSOM', 'name': 'Global Summary of the Month'},

{'id': 'GSOY', 'name': 'Global Summary of the Year'},

{'id': 'NEXRAD2', 'name': 'Weather Radar (Level II)'},

{'id': 'NEXRAD3', 'name': 'Weather Radar (Level III)'},

{'id': 'NORMAL_ANN', 'name': 'Normals Annual/Seasonal'},

{'id': 'NORMAL_DLY', 'name': 'Normals Daily'}, - Only exists for 2010

{'id': 'NORMAL_HLY', 'name': 'Normals Hourly'},

{'id': 'NORMAL_MLY', 'name': 'Normals Monthly'},

{'id': 'PRECIP_15', 'name': 'Precipitation 15 Minute'},

{'id': 'PRECIP_HLY', 'name': 'Precipitation Hourly'}

In [12]:
endpoint='datatypes'
url = f"{base_url}{endpoint}"
querystring = {
    'limit': 1000,
    'datasetid': 'NEXRAD2'
    }
NORMAL_DLY_datatypes= requests.request("GET", url, headers=headers, params=querystring)
pp(NORMAL_DLY_datatypes.json())

{'metadata': {'resultset': {'count': 1, 'limit': 1000, 'offset': 1}},
 'results': [{'datacoverage': 0.95,
              'id': 'ALL',
              'maxdate': '2022-02-01',
              'mindate': '1991-06-05',
              'name': 'Base Data'}]}


### 'id': 'GHCND', 'name': 'Daily Summaries'

In [49]:
'''
Daily reports
{'id': 'GHCND', 'name': 'Daily Summaries'},
{'id': 'NORMAL_DLY', 'name': 'Normals Daily'},
{'id': 'PRECIP_HLY', 'name': 'Precipitation Hourly'}

{'id': 'NEXRAD2', 'name': 'Weather Radar (Level II)'},
{'id': 'NEXRAD3', 'name': 'Weather Radar (Level III)'},


'''

endpoint='datatypes'
url = f"{base_url}{endpoint}"
querystring = {
    'limit': 1000,
    'datasetid': 'GHCND'
    }
GHCND_datatypes= requests.request("GET", url, headers=headers, params=querystring)
pp(GHCND_datatypes.json())

GHCND_data_categories = [
             {'datacoverage': 1,
              'id': 'ACMH',
              'maxdate': '2005-12-31',
              'mindate': '1965-01-01',
              'name': 'Average cloudiness midnight to midnight from manual '
                      'observations'},
                      
             {'datacoverage': 1,
              'id': 'AWND',
              'maxdate': '2022-02-01',
              'mindate': '1982-01-01',
              'name': 'Average wind speed'},
             {'datacoverage': 1,
              'id': 'DAEV',
              'maxdate': '2012-07-23',
              'mindate': '1948-08-02',
              'name': 'Number of days included in the multiday evaporation '
                      'total (MDEV)'},
             {'datacoverage': 1,
              'id': 'DAPR',
              'maxdate': '2022-02-02',
              'mindate': '1832-05-11',
              'name': 'Number of days included in the multiday precipitation '
                      'total (MDPR)'},
             {'datacoverage': 1,
              'id': 'DASF',
              'maxdate': '2021-02-19',
              'mindate': '1877-01-02',
              'name': 'Number of days included in the multiday snow fall total '
                      '(MDSF) '},
             {'datacoverage': 1,
              'id': 'DATN',
              'maxdate': '2022-02-01',
              'mindate': '1863-05-04',
              'name': 'Number of days included in the multiday minimum '
                      'temperature (MDTN)'},
             {'datacoverage': 1,
              'id': 'DATX',
              'maxdate': '2022-01-31',
              'mindate': '1863-05-04',
              'name': 'Number of days included in the multiday maximum '
                      'temperature (MDTX)'},
             {'datacoverage': 1,
              'id': 'DAWM',
              'maxdate': '2010-06-21',
              'mindate': '1935-09-23',
              'name': 'Number of days included in the multiday wind movement '
                      '(MDWM)'},
             {'datacoverage': 1,
              'id': 'DWPR',
              'maxdate': '2022-02-01',
              'mindate': '1832-05-11',
              'name': 'Number of days with non-zero precipitation included in '
                      'multiday precipitation total (MDPR)'},
             {'datacoverage': 1,
              'id': 'EVAP',
              'maxdate': '2021-12-31',
              'mindate': '1893-01-09',
              'name': 'Evaporation of water from evaporation pan'},
             {'datacoverage': 1,
              'id': 'FMTM',
              'maxdate': '2013-03-31',
              'mindate': '1982-01-01',
              'name': 'Time of fastest mile or fastest 1-minute wind'},
             {'datacoverage': 1,
              'id': 'FRGB',
              'maxdate': '1964-12-31',
              'mindate': '1946-10-29',
              'name': 'Base of frozen ground layer'},
             {'datacoverage': 1,
              'id': 'FRGT',
              'maxdate': '1964-12-31',
              'mindate': '1946-10-29',
              'name': 'Top of frozen ground layer'},
             {'datacoverage': 1,
              'id': 'FRTH',
              'maxdate': '1964-12-31',
              'mindate': '1948-01-01',
              'name': 'Thickness of frozen ground layer'},
             {'datacoverage': 1,
              'id': 'GAHT',
              'maxdate': '1964-12-31',
              'mindate': '1913-07-25',
              'name': 'Difference between river and gauge height'},
             {'datacoverage': 1,
              'id': 'MDEV',
              'maxdate': '2010-08-16',
              'mindate': '1948-08-02',
              'name': 'Multiday evaporation total (use with DAEV)'},
             {'datacoverage': 1,
              'id': 'MDPR',
              'maxdate': '2022-02-02',
              'mindate': '1832-05-11',
              'name': 'Multiday precipitation total (use with DAPR and DWPR, '
                      'if available)'},
             {'datacoverage': 1,
              'id': 'MDSF',
              'maxdate': '2021-02-19',
              'mindate': '1859-12-29',
              'name': 'Multiday snowfall total '},
             {'datacoverage': 1,
              'id': 'MDTN',
              'maxdate': '2022-02-01',
              'mindate': '1863-05-04',
              'name': 'Multiday minimum temperature (use with DATN)'},
             {'datacoverage': 1,
              'id': 'MDTX',
              'maxdate': '2022-01-31',
              'mindate': '1863-05-04',
              'name': 'Multiday maximum temperature (use with DATX)'},
             {'datacoverage': 1,
              'id': 'MDWM',
              'maxdate': '2010-06-21',
              'mindate': '1935-09-23',
              'name': 'Multiday wind movement'},
             {'datacoverage': 1,
              'id': 'MNPN',
              'maxdate': '2021-12-31',
              'mindate': '1950-11-28',
              'name': 'Daily minimum temperature of water in an evaporation '
                      'pan'},
             {'datacoverage': 1,
              'id': 'MXPN',
              'maxdate': '2021-12-31',
              'mindate': '1921-12-09',
              'name': 'Daily maximum temperature of water in an evaporation '
                      'pan'},
]

{'metadata': {'resultset': {'count': 136, 'limit': 1000, 'offset': 1}},
 'results': [{'datacoverage': 1,
              'id': 'ACMC',
              'maxdate': '1996-05-28',
              'mindate': '1994-03-19',
              'name': 'Average cloudiness midnight to midnight from 30-second '
                      'ceilometer data'},
             {'datacoverage': 1,
              'id': 'ACMH',
              'maxdate': '2005-12-31',
              'mindate': '1965-01-01',
              'name': 'Average cloudiness midnight to midnight from manual '
                      'observations'},
             {'datacoverage': 1,
              'id': 'ACSC',
              'maxdate': '1996-05-28',
              'mindate': '1994-02-01',
              'name': 'Average cloudiness sunrise to sunset from 30-second '
                      'ceilometer data'},
             {'datacoverage': 1,
              'id': 'ACSH',
              'maxdate': '2005-12-31',
              'mindate': '1965-01-01',
              '

In [52]:
len(GHCND_datatypes.json()['results'])

136

In [11]:
country = 'Ireland'
#  start_date = '1961-01-01'
start_date = '2020-01-01'
end_date = '2020-12-31'
data_set_id = 'NORMAL_DLY'

querystring = {
    'datasetid': data_set_id,
    # datatypeid =
    'locationid': noaa_countries[country],
    'units': 'metric',
    'startdate': start_date,
    'enddate': end_date,
    # 'includemetadata': False
    'limit': 1000
}

# Fetch all available data categories
endpoint='data'
url = f"{base_url}{endpoint}"

GSOM_response = requests.request("GET", url, headers=headers, params=querystring)
pp(GSOM_response.json())

{}


In [10]:
GSOM_response.content

b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><response><statusCode>400</statusCode><userMessage>There was an error with the request.</userMessage><developerMessage>The date range must be less than 1 year.</developerMessage></response>'

In [None]:
country = 'Ireland'
start_date = '1961-01-01'
end_date = '2020-12-31'
locationcategoryid = 'CNTRY'
#locationId = 
#datatypeid =
querystring = {
    'datasetid': 'GSOM',
    # 'locationid': noaa_countries[country],
    'units': 'metric',
    'startdate': start_date,
    'enddate': end_date,
    # 'includemetadata': False
}

# Fetch all available data categories
endpoint='data'
url = f"{base_url}{endpoint}"

response = requests.request("GET", url, headers=headers, params=querystring)
pp(response.json())

In [None]:
end_point = "data"
stationid = "GHCND:USC00010008"
data_set = "GSOM"
units = "metric"
start_date = "2010-05-01"
end_date = "2010-05-31"
url = f"{base_url}{endpoint}?datasetid={data_set}&stationid={stationid}&units={units}&startdate={start_date}&end_date={end_date}"


## Fetch all available data categories

In [14]:
# Fetch all available data categories
endpoint='datacategories'
querystring = {'limit': 1000}
url = f"{base_url}{endpoint}"

datacategories = requests.request("GET", url, headers=headers, params=querystring)
pp(datacategories.json())

{'metadata': {'resultset': {'count': 42, 'limit': 1000, 'offset': 1}},
 'results': [{'id': 'ANNAGR', 'name': 'Annual Agricultural'},
             {'id': 'ANNDD', 'name': 'Annual Degree Days'},
             {'id': 'ANNPRCP', 'name': 'Annual Precipitation'},
             {'id': 'ANNTEMP', 'name': 'Annual Temperature'},
             {'id': 'AUAGR', 'name': 'Autumn Agricultural'},
             {'id': 'AUDD', 'name': 'Autumn Degree Days'},
             {'id': 'AUPRCP', 'name': 'Autumn Precipitation'},
             {'id': 'AUTEMP', 'name': 'Autumn Temperature'},
             {'id': 'COMP', 'name': 'Computed'},
             {'id': 'COMPAGR', 'name': 'Computed Agricultural'},
             {'id': 'DD', 'name': 'Degree Days'},
             {'id': 'DUALPOLMOMENT', 'name': 'Dual-Pol Moments'},
             {'id': 'ECHOTOP', 'name': 'Echo Tops'},
             {'id': 'EVAP', 'name': 'Evaporation'},
             {'id': 'HYDROMETEOR', 'name': 'Hydrometeor Type'},
             {'id': 'LAND', 'name': 'L

In [15]:
# Fetch all available data categories
endpoint='locationcategories'
url = f"{base_url}{endpoint}"

response = requests.request("GET", url, headers=headers, params=querystring)
pp(response.json())

{'metadata': {'resultset': {'count': 12, 'limit': 25, 'offset': 1}},
 'results': [{'id': 'CITY', 'name': 'City'},
             {'id': 'CLIM_DIV', 'name': 'Climate Division'},
             {'id': 'CLIM_REG', 'name': 'Climate Region'},
             {'id': 'CNTRY', 'name': 'Country'},
             {'id': 'CNTY', 'name': 'County'},
             {'id': 'HYD_ACC', 'name': 'Hydrologic Accounting Unit'},
             {'id': 'HYD_CAT', 'name': 'Hydrologic Cataloging Unit'},
             {'id': 'HYD_REG', 'name': 'Hydrologic Region'},
             {'id': 'HYD_SUB', 'name': 'Hydrologic Subregion'},
             {'id': 'ST', 'name': 'State'},
             {'id': 'US_TERR', 'name': 'US Territory'},
             {'id': 'ZIP', 'name': 'Zip Code'}]}


In [15]:
aws_url = 'http://noaa-ghcn-pds.s3.amazonaws.com/csv/'

# Fetch all available data categories
endpoint='2020.csv'
querystring = {}
url = f"{base_url}{endpoint}"

aws_data = requests.request("GET", url, headers=headers, params=querystring)
pp(aws_data.json())

JSONDecodeError: [Errno Extra data] "statusCode","userMessage","developerMessage"
"404","The requested resource could not be found.","The requested resource could not be found: /cdo-web/api/v2/2020.csv"
: 12

### Example query

In [None]:

# data?datasetid=GSOM&stationid=GHCND:USC00010008&units=standardstartdate=2010-05-01&enddate=2010-05-31
end_point = "data"
stationid = "GHCND:USC00010008"
data_set = "GSOM"
units = "metric"
start_date = "2010-05-01"
end_date = "2010-05-31"
url = f"{base_url}{endpoint}?datasetid={data_set}&stationid={stationid}&units={units}&startdate={start_date}&end_date={end_date}"

# https://www.ncei.noaa.gov/access/services/data/v1?
# dataset=global-marine
# &dataTypes=WIND_DIR,WIND_SPEED
# &stations=AUCE
# &startDate=2016-01-01
# &endDate=2016-01-02
# &boundingBox=90,-180,-90,180
# https://www.ncei.noaa.gov/access/services/data/v1?dataset=global-summar…

querystring = {
    'datasetid': 'GSOM',
    'stationid': 'GHCND:USC00010008',
    'units': 'standard',
    'startdate': '2010-05-01',
    'enddate': '2010-05-31',
}

# Fetch all available data categories
endpoint='data'
url = f"{base_url}{endpoint}"

response = requests.request("GET", url, headers=headers, params=querystring)
pp(response.json())

# Data exploration and cleaning 
## NOAA Global Historical Climatology Network Daily (GHCN-D) from CSV file
AWS bucket url: https://noaa-ghcn-pds.s3.amazonaws.com/index.html#csv/

In [86]:
from datetime import datetime
custom_date_parser = lambda x: datetime.strptime(x, "%Y%m%d")

file_name = '2021'
noaa_ghcn_aws_data_2021 = pd.read_csv(
    f"./data/noaa_ghcn_aws_data/{file_name}.csv", 
    header=None, 
    names=['station_id', 'date', 'element', 'data_value', 'measurement_id', 'quality_id', 'source_id', 'observation_time'],
    # nrows=1000, 
    parse_dates=['date'],
    date_parser=custom_date_parser,
    dtype={'observation_time':str}
    )
noaa_ghcn_aws_data_2021.observation_time.replace({np.nan: '0000', '2400': '0000'}, inplace=True)
noaa_ghcn_aws_data_2021.observation_time = pd.to_datetime(noaa_ghcn_aws_data_2021.observation_time, format='%H%M').dt.time
noaa_ghcn_aws_data_2021.head()

Unnamed: 0,station_id,date,element,data_value,measurement_flag,quality_flag,source_flag,observation_time
0,AE000041196,2021-01-01,TMAX,278,,,S,00:00:00
1,AE000041196,2021-01-01,PRCP,0,D,,S,00:00:00
2,AE000041196,2021-01-01,TAVG,214,H,,S,00:00:00
3,AEM00041194,2021-01-01,TMAX,266,,,S,00:00:00
4,AEM00041194,2021-01-01,TMIN,178,,,S,00:00:00


In [111]:
noaa_ghcn_aws_data_2021.head()

Unnamed: 0,station_id,date,element,data_value,measurement_id,quality_id,source_id,observation_time
0,AE000041196,2021-01-01,TMAX,278,,,S,00:00:00
1,AE000041196,2021-01-01,PRCP,0,D,,S,00:00:00
2,AE000041196,2021-01-01,TAVG,214,H,,S,00:00:00
3,AEM00041194,2021-01-01,TMAX,266,,,S,00:00:00
4,AEM00041194,2021-01-01,TMIN,178,,,S,00:00:00


In [119]:
# Replace NaN values in measurement_id, quality_id and source_id columns to match id descriptions
noaa_ghcn_aws_data_2021.measurement_id.replace(np.nan, 'None', inplace=True)
noaa_ghcn_aws_data_2021.quality_id.replace(np.nan, 'None', inplace=True)
noaa_ghcn_aws_data_2021.source_id.replace(np.nan, 'None', inplace=True)
noaa_ghcn_aws_data_2021.head()

Unnamed: 0,station_id,date,element,data_value,measurement_id,quality_id,source_id,observation_time
0,AE000041196,2021-01-01,TMAX,278,,,S,00:00:00
1,AE000041196,2021-01-01,PRCP,0,D,,S,00:00:00
2,AE000041196,2021-01-01,TAVG,214,H,,S,00:00:00
3,AEM00041194,2021-01-01,TMAX,266,,,S,00:00:00
4,AEM00041194,2021-01-01,TMIN,178,,,S,00:00:00


In [87]:
noaa_ghcn_aws_data_2021.shape

(34620379, 8)

In [88]:
noaa_ghcn_aws_data_2021.element.unique()

array(['TMAX', 'PRCP', 'TAVG', 'TMIN', 'SNWD', 'AWND', 'WDF2', 'WSF2',
       'DATX', 'MDTX', 'DATN', 'MDTN', 'DAPR', 'MDPR', 'SNOW', 'WESF',
       'WESD', 'TOBS', 'WDF5', 'WSF5', 'WT01', 'EVAP', 'PGTM', 'SN32',
       'SX32', 'MNPN', 'MXPN', 'WT11', 'WDMV', 'SN31', 'SX31', 'WT03',
       'SN52', 'SX52', 'SN33', 'SN35', 'SX33', 'SX35', 'WDFG', 'WSFG',
       'WT06', 'WT04', 'THIC', 'SN51', 'SX51', 'SN53', 'SN55', 'SX53',
       'SX55', 'SN36', 'SN56', 'SX36', 'SX56', 'AWDR', 'WSFI', 'WT08',
       'WT02', 'WT09', 'WT05', 'PSUN', 'TSUN', 'DWPR', 'WT07', 'WT17',
       'DASF', 'MDSF', 'WT10', 'WT18'], dtype=object)

In [125]:
# Check for multiple measurements at different times
grouped = noaa_ghcn_aws_data_2021.groupby(['station_id', 'date', 'element']).count().sort_values('observation_time')
grouped

In [1]:
# We can drop the observation_time column because there is only one obervation per day
noaa_ghcn_aws_data_2021.drop('observation_time', inplace=True)
noaa_ghcn_aws_data_2021.head()

NameError: name 'noaa_ghcn_aws_data_2021' is not defined

In [97]:
# Check for multiple temperature measurements on different soil types and depths
grouped = noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element.str.contains('SX')].groupby(['station_id', 'date']).count()

In [101]:
grouped[grouped.element > 1].sort_values('element') # We need to create all possible soil temp element_ids in case they show up in other yyears

Unnamed: 0_level_0,Unnamed: 1_level_0,element,data_value,measurement_flag,quality_flag,source_flag,observation_time
station_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
USC00033821,2021-01-01,2,2,0,0,2,2
USC00206012,2021-05-23,2,2,0,0,2,2
USC00206012,2021-05-22,2,2,0,0,2,2
USC00206012,2021-05-21,2,2,0,0,2,2
USC00206012,2021-05-20,2,2,0,0,2,2
...,...,...,...,...,...,...,...
USC00218450,2021-06-27,8,8,0,0,8,8
USC00218450,2021-06-28,8,8,0,1,8,8
USC00218450,2021-06-29,8,8,0,0,8,8
USC00218450,2021-06-22,8,8,0,0,8,8


In [106]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id=='USC00218450')&(noaa_ghcn_aws_data_2021.date=='2021-06-27')][['element', 'data_value']]

Unnamed: 0,element,data_value
17141532,TMAX,228
17141533,TMIN,172
17141534,PRCP,18
17141535,SNOW,0
17141536,SNWD,0
17141537,SN32,211
17141538,SN33,217
17141539,SN35,206
17141540,SN36,178
17141541,SN51,211


In [93]:
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element=='FMTM'].head(100)

Unnamed: 0,station_id,date,element,data_value,measurement_flag,quality_flag,source_flag,observation_time


## Create dataframes for datatype descriptions

In [None]:
measurement_ids = [
    {'measurement_id': 'None', 'description': 'no measurement information applicable'},
    {'measurement_id': 'B', 'description': 'precipitation total formed from two 12-hour totals'},
    {'measurement_id': 'D', 'description': 'precipitation total formed from four six-hour totals'},
    {'measurement_id': 'H', 'description': 'represents highest or lowest hourly temperature (TMAX or TMIN) or the average of hourly values (TAVG)'},
    {'measurement_id': 'K', 'description': 'converted from knots'},
    {'measurement_id': 'L', 'description': 'temperature appears to be lagged with respect to reported hour of observation'},
    {'measurement_id': 'O', 'description': 'converted from oktas'},
    {'measurement_id': 'P', 'description': 'identified as “missing presumed zero” in DSI 3200 and 3206'},
    {'measurement_id': 'T', 'description': 'trace of precipitation, snowfall, or snow depth'},
    {'measurement_id': 'W', 'description': 'converted from 16-point WBAN code (for wind direction)'},
]
quality_ids = [
    {'quality_id': 'None', 'description': 'did not fail any quality assurance check'},
    {'quality_id': 'D', 'description': 'failed duplicate check'},
    {'quality_id': 'G', 'description': 'failed gap check'},
    {'quality_id': 'I', 'description': 'failed internal consistency check'},
    {'quality_id': 'K', 'description': 'failed streak/frequent-value check'},
    {'quality_id': 'L', 'description': 'failed check on length of multiday period'},
    {'quality_id': 'M', 'description': 'failed mega consistency check'},
    {'quality_id': 'N', 'description': 'failed naught check'},
    {'quality_id': 'O', 'description': 'failed climatological outlier check'},
    {'quality_id': 'R', 'description': 'failed lagged range check'},
    {'quality_id': 'S', 'description': 'failed spatial consistency check'},
    {'quality_id': 'T', 'description': 'failed temporal consistency check'},
    {'quality_id': 'W', 'description': 'temperature too warm for snow'},
    {'quality_id': 'X', 'description': 'failed bounds check'},
    {'quality_id': 'Z', 'description': 'flagged as a result of an official Datzilla Investigation'},
]
source_ids = [
    {'source_id': 'None', 'description': 'No source (i.e., data value missing)'},
    {'source_id': '0', 'description': 'U.S. Cooperative Summary of the Day (NCDC DSI-3200)'},
    {'source_id': '6', 'description': 'CDMP Cooperative Summary of the Day (NCDC DSI-3206)'},
    {'source_id': '7', 'description': 'U.S. Cooperative Summary of the Day - Transmitted via WxCoder3 (NCDC SI-3207)'},
    {'source_id': 'A', 'description': 'U.S. Automated Surface Observing System (ASOS) real-time data (since January 1, 2006)'},
    {'source_id': 'a', 'description': 'Australian data from the Australian Bureau of Meteorology'},
    {'source_id': 'B', 'description': 'U.S. ASOS data for October 2000-December 2005 (NCDC DSI-3211)'},
    {'source_id': 'b', 'description': 'Belarus update'},
    {'source_id': 'C', 'description': 'Environment Canada'},
    {'source_id': 'E', 'description': 'European Climate Assessment and Dataset (Klein Tank et al., 2002)'},
    {'source_id': 'F', 'description': 'U.S. Fort data'},
    {'source_id': 'G', 'description': 'Official Global Climate Observing System (GCOS) or other government-supplied data'},
    {'source_id': 'H', 'description': 'High Plains Regional Climate Center real-time data'},
    {'source_id': 'I', 'description': 'International collection (non U.S. data received through personal contacts)'},
    {'source_id': 'K', 'description': 'U.S. Cooperative Summary of the Day data digitized from paper observer forms (from 2011 to present)'},
    {'source_id': 'M', 'description': 'Monthly METAR Extract (additional ASOS data)'},
    {'source_id': 'N', 'description': 'Community Collaborative Rain, Hail,and Snow (CoCoRaHS)'},
    {'source_id': 'Q', 'description': 'Data from several African countries that had been “quarantined”, that is, withheld from public release until permission was granted from the respective meteorological services'},
    {'source_id': 'R', 'description': 'NCEI Reference Network Database (Climate Reference Network and Regional Climate Reference Network)'},
    {'source_id': 'r', 'description': 'All-Russian Research Institute of Hydro-meteorological Information-World Data Center'},
    {'source_id': 'S', 'description': 'Global Summary of the Day (NCDC DSI-9618)NOTE: “S” values are derived from hourly synoptic reports exchanged on the Global Telecommunications System (GTS). Daily values derived in this fashion may differ significantly from “true” daily data, particularly for precipitation (i.e., use with caution).'},
    {'source_id': 's', 'description': 'China Meteorological Administration/National Meteorological Information Center/Climatic Data Center (http://cdc.cma.gov.cn)'},
    {'source_id': 'T', 'description': "SNOwpack TELemtry (SNOTEL) data obtained from the U.S. Department of Agriculture's Natural Resources Conservation Service"},
    {'source_id': 'U', 'description': 'Remote Automatic Weather Station (RAWS) data obtained from the Western Regional Climate Center'},
    {'source_id': 'u', 'description': 'Ukraine update'},
    {'source_id': 'W', 'description': "WBAN/ASOS Summary of the Day from NCDC's Integrated Surface Data (ISD)."},
    {'source_id': 'X', 'description': 'U.S. First-Order Summary of the Day (NCDC DSI-3210)'},
    {'source_id': 'Z', 'description': 'Datzilla official additions or replacements'},
    {'source_id': 'z', 'description': 'Uzbekistan update'},
]

### Convert element ids to dataframe

In [124]:
soil_temp_units = 'tenths of degrees C'
# Minimum soil temperature element_id=SN*#, where * corresponds to a code for ground cover and # corresponds to a code for soil depth.
# Maximum soil temperature element_id=SX*#, where * corresponds to a code for ground cover and # corresponds to a code for soil depth.

ground_cover_map = { 
    '0': "unknown",
    '1': "grass",
    '2': "fallow",
    '3': "bare ground",
    '4': "brome grass",
    '5': "sod",
    '6': "straw mulch",
    '7': "grass muck",
    '8': "bare muck",
}

soil_depth_map = {
    '1': "5 cm",
    '2': "10 cm",
    '3': "20 cm",
    '4': "50 cm",
    '5': "100 cm",
    '6': "150 cm",
    '7': "180 cm",
}

min_soil_temp_element_ids = []
max_soil_temp_element_ids = []

for gc_id, gc_desc in ground_cover_map.items():
    for sd_id, sd_desc in soil_depth_map.items():
        min_soil_temp_element_ids.append({
            'element_id': 'SN'+gc_id+sd_id, 
            'description': f'Minimum soil temperature: Ground cover = {gc_desc}, Soil depth = {sd_desc}',
            'units': soil_temp_units
            })
        max_soil_temp_element_ids.append({
            'element_id': 'SX'+gc_id+sd_id, 
            'description': f'Maximum soil temperature: Ground cover = {gc_desc}, Soil depth = {sd_desc}',
            'units': soil_temp_units
            })
pd.DataFrame(min_soil_temp_element_ids).head()


Unnamed: 0,element_id,description,units
0,SN01,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
1,SN02,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
2,SN03,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
3,SN04,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
4,SN05,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C


In [None]:
element_ids = [
    {'element_id': 'PRCP', 'description': 'Precipitation', 'units': 'tenths of mm'},
    {'element_id': 'SNOW', 'description': 'Snowfall', 'units': 'mm'},
    {'element_id': 'SNWD', 'description': 'Snow depth', 'units': 'mm'},
    {'element_id': 'TMAX', 'description': 'Maximum temperature', 'units': 'tenths of degrees C'},
    {'element_id': 'TMIN', 'description': 'Minimum temperature', 'units': 'tenths of degrees C'},
    {'element_id': 'ACMC', 'description': 'Average cloudiness midnight to midnight from 30-second ceilometer data', 'units': 'percent'},
    {'element_id': 'ACMH', 'description': 'Average cloudiness midnight to midnight from manual observations', 'units': 'percent'},
    {'element_id': 'ACSC', 'description': 'Average cloudiness sunrise to sunset from 30-second ceilometer data', 'units': 'percent'},
    {'element_id': 'ACSH', 'description': 'Average cloudiness sunrise to sunset from manual observations', 'units': 'percent'},
    {'element_id': 'AWDR', 'description': 'Average daily wind direction', 'units': 'degrees'},
    {'element_id': 'AWND', 'description': 'Average daily wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'EVAP', 'description': 'Evaporation of water from evaporation pan', 'units': 'tenths of mm'},
    {'element_id': 'FRGB', 'description': 'Base of frozen ground layer', 'units': 'cm'},
    {'element_id': 'FRGT', 'description': 'Top of frozen ground layer', 'units': 'cm'},
    {'element_id': 'FRTH', 'description': 'Thickness of frozen ground layer', 'units': 'cm'},
    {'element_id': 'GAHT', 'description': 'Difference between river and gauge height', 'units': 'cm'},
    {'element_id': 'MNPN', 'description': 'Daily minimum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'element_id': 'MXPN', 'description': 'Daily maximum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'element_id': 'PGTM', 'description': 'Peak gust time', 'units': '(hours and minutes, i.e., HHMM)'},
    {'element_id': 'PSUN', 'description': 'Daily percent of possible sunshine', 'units': 'percent'},
    {'element_id': 'THIC', 'description': 'Thickness of ice on water', 'units': 'tenths of mm'},
    {'element_id': 'TOBS', 'description': 'Temperature at the time of observation', 'units': 'tenths of degrees C'},
    {'element_id': 'TSUN', 'description': 'Daily total sunshine', 'units': '(minutes)'},
    {'element_id': 'WDF1', 'description': 'Direction of fastest 1-minute wind', 'units': 'degrees'},
    {'element_id': 'WDF2', 'description': 'Direction of fastest 2-minute wind', 'units': 'degrees'},
    {'element_id': 'WDF5', 'description': 'Direction of fastest 5-second wind', 'units': 'degrees'},
    {'element_id': 'WDFG', 'description': 'Direction of peak wind gust', 'units': 'degrees'},
    {'element_id': 'WDFI', 'description': 'Direction of highest instantaneous wind', 'units': 'degrees'},
    {'element_id': 'WDFM', 'description': 'Fastest mile wind direction', 'units': 'degrees'},
    {'element_id': 'WDMV', 'description': '24-hour wind movement', 'units': 'km'},
    {'element_id': 'WESD', 'description': 'Water equivalent of snow on the ground', 'units': 'tenths of mm'},
    {'element_id': 'WESF', 'description': 'Water equivalent of snowfall', 'units': 'tenths of mm'},
    {'element_id': 'WSF1', 'description': 'Fastest 1-minute wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSF2', 'description': 'Fastest 2-minute wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSF5', 'description': 'Fastest 5-second wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSFG', 'description': 'Peak gust wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSFI', 'description': 'Highest instantaneous wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSFM', 'description': 'Fastest mile wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'TAVG', 'description': "Average temperature', 'units': 'tenths of degrees C"}, # [Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]
    {'element_id': 'WT01', 'description': "Weather type: Fog, ice fog, or freezing fog (may include heavy fog)"},
    {'element_id': 'WT02', 'description': "Weather type: Heavy fog or heaving freezing fog (not always distinguished from fog)"},
    {'element_id': 'WT03', 'description': "Weather type: Thunder"},
    {'element_id': 'WT04', 'description': "Weather type: Ice pellets, sleet, snow pellets, or small hail"},
    {'element_id': 'WT05', 'description': "Weather type: Hail (may include small hail)"},
    {'element_id': 'WT06', 'description': "Weather type: Glaze or rime"},
    {'element_id': 'WT07', 'description': "Weather type: Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction"},
    {'element_id': 'WT08', 'description': "Weather type: Smoke or haze"},
    {'element_id': 'WT09', 'description': "Weather type: Blowing or drifting snow"},
    {'element_id': 'WT10', 'description': "Weather type: Tornado, waterspout, or funnel cloud"},
    {'element_id': 'WT11', 'description': "Weather type: High or damaging winds"},
    {'element_id': 'WT12', 'description': "Weather type: Blowing spray"},
    {'element_id': 'WT13', 'description': "Weather type: Mist"},
    {'element_id': 'WT14', 'description': "Weather type: Drizzle"},
    {'element_id': 'WT15', 'description': "Weather type: Freezing drizzle"},
    {'element_id': 'WT16', 'description': "Weather type: Rain (may include freezing rain, drizzle, and freezing drizzle)"},
    {'element_id': 'WT17', 'description': "Weather type: Freezing rain"},
    {'element_id': 'WT18', 'description': "Weather type: Snow, snow pellets, snow grains, or ice crystals"},
    {'element_id': 'WT19', 'description': "Weather type: Unknown source of precipitation"},
    {'element_id': 'WT21', 'description': "Weather type: Ground fog"},
    {'element_id': 'WT22', 'description': "Weather type: Ice fog or freezing fog"},

    {'element_id': 'WV01', 'description': "Weather in vicinity: Fog, ice fog, or freezing fog (may include heavy fog)"},
    {'element_id': 'WV03', 'description': "Weather in vicinity: Thunder"},
    {'element_id': 'WV07', 'description': "Weather in vicinity: Ash, dust, sand, or other blowing obstruction"},
    {'element_id': 'WV18', 'description': "Weather in vicinity: Snow or ice crystals"},
    {'element_id': 'WV20', 'description': "Weather in vicinity: Rain or snow shower"},

    {'element_id': 'FMTM', 'description': 'Time of fastest mile or fastest 1-minute wind', 'units': '(hours and minutes,i.e., HHMM)'},

    {'element_id': 'DASF', 'description': 'Number of days included in the multiday snowfall total', 'units': '(MDSF)'},
    {'element_id': 'MDSF', 'description': 'Multiday snowfall total'},

    {'element_id': 'DAWM', 'description': 'Number of days included in the multiday wind movement', 'units': '(MDWM)'},
    {'element_id': 'MDWM', 'description': 'Multiday wind movement', 'units': 'km'},

    {'element_id': 'MDEV', 'description': 'Multiday evaporation total', 'units': '(tenths of mm; use with DAEV)'},
    {'element_id': 'DAEV', 'description': 'Number of days included in the multiday evaporation total', 'units': '(MDEV)'},

    {'element_id': 'MDPR', 'description': 'Multiday precipitation total', 'units': '(tenths of mm; use with DAPR and DWPR, if available)'},
    {'element_id': 'DWPR', 'description': 'Number of days with non-zero precipitation included in multiday precipitation total', 'units': '(MDPR)'},
    {'element_id': 'DAPR', 'description': 'Number of days included in the multiday precipitation total', 'units': '(MDPR)'},

    {'element_id': 'MDTN', 'description': 'Multiday minimum temperature', 'units': 'tenths of degrees C; (use with DATN)'},
    {'element_id': 'DATN', 'description': 'Number of days included in the multiday minimum temperature', 'units': '(MDTN)'},

    {'element_id': 'MDTX', 'description': 'Multiday maximum temperature', 'units': 'tenths of degrees C; (use with DATX)'},
    {'element_id': 'DATX', 'description': 'Number of days included in the multiday maximum temperature', 'units': '(MDTX)'},
]

In [107]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id=='USC00218450')&(noaa_ghcn_aws_data_2021.date=='2021-06-27')][['element', 'data_value']]

Unnamed: 0,element,data_value
17141532,TMAX,228
17141533,TMIN,172
17141534,PRCP,18
17141535,SNOW,0
17141536,SNWD,0
17141537,SN32,211
17141538,SN33,217
17141539,SN35,206
17141540,SN36,178
17141541,SN51,211
