In [109]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, ForeignKey
from pprint import pprint as pp

from config import local_mysql_password, local_mysql_user

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

## Import and preprocess FAO crop yield data

In [240]:
fao_file_path = "./data/fao_data/"

# Load the crop yield data
file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized)'
fao_crop_yield_data = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{file_name}.csv", encoding='latin-1')
# Replace spaces column names with underscores and make lower case
fao_crop_yield_data.columns = fao_crop_yield_data.columns.str.replace(' ','_').str.lower()
# Rename confusing data aggregation flags
fao_crop_yield_data.flag.replace(np.nan, 'O', inplace=True)
fao_crop_yield_data.flag.replace('*', 'U', inplace=True)
# Remove 0 crop yield values to prevent bad training data
fao_crop_yield_data = fao_crop_yield_data[fao_crop_yield_data.value != 0.0]
# Drop the year_code becuase it is always the same as the year value
fao_crop_yield_data.drop('year_code', axis=1, inplace=True)
# Drop areas which are not countries
regions = [
    'World', 'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Americas',
    'Northern America', 'Central America', 'Caribbean', 'South America', 'Asia', 'Central Asia', 'Eastern Asia',
    'Southern Asia', 'South-eastern Asia', 'Western Asia', 'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe', 'Oceania', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia'
    ]
special_groups = [
    'European Union (28)', 'European Union (27)', 'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States',
    'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries', 'Annex I countries', 'Non-Annex I countries',  'OECD'
    ]
fao_countries = fao_crop_yield_data.area.loc[~fao_crop_yield_data.area.isin(regions+special_groups)].unique().tolist()
fao_crop_yield_data = fao_crop_yield_data[fao_crop_yield_data.area.isin(fao_countries)]
# Rename columns so they aren't confused with the NOAA data
fao_crop_yield_data.rename({
    'area_code': 'fao_country_code', 
    'area': 'fao_country_name', 
    'flag': 'fao_data_quality_flag', 
    'item': 'product', 
    'item_code': 'product_id',
    'element': 'production_type',
    'element_code': 'production_type_id'
    }, axis=1, inplace=True)

fao_crop_yield_data.head()


Unnamed: 0,fao_country_code,fao_country_name,product_id,product,production_type_id,production_type,year,unit,value,fao_data_quality_flag
1,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,ha,5900.0,F
2,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,ha,6000.0,F
3,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,ha,6000.0,F
4,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,ha,6000.0,F
5,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1980,ha,5800.0,F


In [241]:
fao_crop_yield_data.shape

(2802701, 10)

In [242]:
# Extract the country data
fao_country_data = fao_crop_yield_data[['fao_country_code', 'fao_country_name']].copy()
fao_country_data.drop_duplicates(inplace=True)
fao_country_data.shape

(211, 2)

In [243]:
# Load the noaa ghcnd countries
noaa_file_path = './data/noaa_ghcn_aws_data/'
noaa_countries_file_name = 'ghcnd-countries'
noaa_countries = pd.read_fwf(f"{noaa_file_path}{noaa_countries_file_name}.txt", header=None, names = ["FIPS_country_code", "ghcnd_country"])
noaa_countries.shape


(219, 2)

In [244]:
fao_to_noaa_country_map = {
    'Bahamas': 'Bahamas, The',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Brunei Darussalam': 'Brunei',
    'Cabo Verde': 'Cape Verde',
    'Congo': 'Congo (Brazzaville)',
    'Cook Islands': 'Cook Islands [New Zealand]',
    'Czechia': 'Czech Republic',
    "Democratic People's Republic of Korea": 'Korea, North',
    'Democratic Republic of the Congo': 'Congo (Kinshasa)',
    'French Guyana': 'French Guiana [France]',
    'Gambia': 'Gambia, The',
    'Guadeloupe': 'Guadeloupe [France]',
    'Iran (Islamic Republic of)': 'Iran',
    'Martinique': 'Martinique [France]',
    'Micronesia (Federated States of)': 'Federated States of Micronesia',
    'Myanmar': 'Burma',
    'New Caledonia': 'New Caledonia [France]',
    'Niue': 'Niue [New Zealand]',
    'Puerto Rico': 'Puerto Rico [United States]',
    'Republic of Korea': 'Korea, South',
    'Russian Federation': 'Russia',
    'Samoa': 'American Samoa [United States]',
    'Syrian Arab Republic': 'Syria',
    'Tokelau': 'Tokelau [New Zealand]',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United Republic of Tanzania': 'Tanzania',
    'United States of America': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'North Macedonia': 'Macedonia',
    'Réunion': 'Reunion [France]',
    "Côte d'Ivoire": "Cote D'Ivoire",
    "China, Macao SAR": "Macau S.A.R",
    "Lao People's Democratic Republic": "Laos",
    "Republic of Moldova": "Moldova",
    }

country_data = []
for fao_country_name, noaa_country_name in fao_to_noaa_country_map.items():
        country_data.append({
            'fao_country_code': fao_country_data[fao_country_data.fao_country_name==fao_country_name].fao_country_code.values[0],
            'fao_country_name': fao_country_name,
            'noaa_country_name': noaa_country_name,
            'noaa_country_code': noaa_countries[noaa_countries.ghcnd_country==noaa_country_name].FIPS_country_code.values[0],
        })

for country in fao_countries:
    if country in noaa_countries.ghcnd_country.tolist() and country not in fao_to_noaa_country_map.keys():
        country_data.append({
            'fao_country_code': fao_country_data[fao_country_data.fao_country_name==country].fao_country_code.values[0],
            'fao_country_name': country,
            'noaa_country_name': country,
            'noaa_country_code': noaa_countries[noaa_countries.ghcnd_country==country].FIPS_country_code.values[0],
        })
    elif country not in fao_to_noaa_country_map.keys():
        country_data.append({
            'fao_country_code': fao_country_data[fao_country_data.fao_country_name==country].fao_country_code.values[0],
            'fao_country_name': country,
            'noaa_country_name': np.nan,
            'noaa_country_code': np.nan,
        })
country_data = pd.DataFrame(country_data).sort_values(['noaa_country_name', 'fao_country_name'])
country_data.dropna(inplace=True)
country_data.head()

Unnamed: 0,fao_country_code,fao_country_name,noaa_country_name,noaa_country_code
35,2,Afghanistan,Afghanistan,AF
36,3,Albania,Albania,AL
37,4,Algeria,Algeria,AG
21,244,Samoa,American Samoa [United States],AQ
38,7,Angola,Angola,AO


In [248]:
# Clean up the crop yield data
# Remove rows which belong to dropped countries/regions
fao_crop_yield_data = fao_crop_yield_data[fao_crop_yield_data.fao_country_code.isin(country_data.fao_country_code)]

# Remove rows relating to unnecessary production_types.
# Yield = production/area_harvested so we can drop those two rows. The rest are for animals, which do not depend on weather.
production_types_to_drop = ['Area harvested', 'Production', 'Stocks', 'Laying', 'Producing Animals/Slaughtered', 'Yield/Carcass Weight', 'Milk Animals', 'Prod Popultn']
print(fao_crop_yield_data.shape)
fao_crop_yield_data = fao_crop_yield_data[~fao_crop_yield_data.production_type.isin(production_types_to_drop)]
print(fao_crop_yield_data.shape)

# Removing unnecessary columns
fao_crop_yield_data.drop(['fao_country_name', 'product', 'production_type_id', 'production_type'], axis=1, inplace=True, errors='ignore')

(570893, 7)
(570893, 7)


In [249]:
# Split the country_data to create two tables with primary keys
fao_country_codes = country_data[['fao_country_code', 'fao_country_name', 'noaa_country_code']]
noaa_country_codes = country_data[['noaa_country_code', 'noaa_country_name', 'fao_country_code']]
noaa_country_codes.head()

Unnamed: 0,noaa_country_code,noaa_country_name,fao_country_code
35,AF,Afghanistan,2
36,AL,Albania,3
37,AG,Algeria,4
21,AQ,American Samoa [United States],244
38,AO,Angola,7


In [250]:
# Import fao data aggregation codes
file_name = 'Production_Crops_Livestock_E_Flags'
fao_data_quality_description = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{file_name}.csv", encoding='latin-1')
# Make column names lower case
fao_data_quality_description.columns = fao_data_quality_description.columns.str.lower()
fao_data_quality_description.rename({'flag': 'fao_data_quality_flag'}, axis=1, inplace=True)
# Rename confusing data aggregation flags
fao_data_quality_description.replace('<blank>', 'O', inplace=True)
fao_data_quality_description.replace('*', 'U', inplace=True)
fao_data_quality_description

Unnamed: 0,fao_data_quality_flag,description
0,U,Unofficial figure
1,O,Official data
2,A,Aggregate; may include official; semi-official...
3,F,FAO estimate
4,Fc,Calculated data
5,Im,FAO data based on imputation methodology
6,M,Data not available


In [253]:
# Import product item codes
file_name = 'Production_Crops_Livestock_E_ItemCodes'
fao_product_ids = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{file_name}.csv", encoding='latin-1')
# Replace spaces column names with underscores and make lower case
fao_product_ids.columns = fao_product_ids.columns.str.replace(' ','_').str.lower()
# Rename columns so they aren't confused with the NOAA data
fao_product_ids.rename({'item': 'product', 'item_code': 'product_id'}, axis=1, inplace=True)
# Remove product_ids that don't exist in the fao_crop_yield data
fao_product_ids = fao_product_ids[fao_product_ids.product_id.isin(fao_crop_yield_data.product_id)]

fao_product_ids.head()

(306, 3)
(185, 3)


Unnamed: 0,product_id,cpc_code,product
0,101,'01195,Canary seed
5,1020,'02292,Milk; whole fresh goat
8,1025,'02954,Skins; goat; fresh
9,103,'01199.02,Grain; mixed
17,1062,'0231,Eggs; hen; in shell


## Push FAO crop yield data to database

In [234]:
class DatabaseInterface:
    def __init__(self,
                 db_name,
                 user,
                 password,
                 host='localhost',
                 port=3306,
                 driver='mysql+pymysql'):
        
        self.con = create_engine(f'{driver}://{user}:{password}@{host}')
        self.con.execute(f"CREATE DATABASE IF NOT EXISTS {db_name}")
        self.db_engine = create_engine(f"{driver}://{user}:{password}@{host}:{port}/{db_name}", echo=True)

    def insert_data(self, df: pd.DataFrame, table_name: str, if_exists: str = 'append'):
        df.to_sql(table_name, self.db_engine, if_exists=if_exists, index=False)

    def close_connection(self):
        self.db_engine.dispose()

In [256]:
dbi = DatabaseInterface(db_name='crop_yield_prediction', 
                        user=local_mysql_user, 
                        password=local_mysql_password)
meta = MetaData()


dbi.db_engine.execute(f"DROP TABLE IF EXISTS fao_crop_yields")
dbi.db_engine.execute(f"DROP TABLE IF EXISTS fao_data_quality_description")
dbi.db_engine.execute(f"DROP TABLE IF EXISTS fao_product_ids")
dbi.db_engine.execute(f"SET FOREIGN_KEY_CHECKS=0;")
dbi.db_engine.execute(f"DROP TABLE IF EXISTS fao_country_codes")
dbi.db_engine.execute(f"DROP TABLE IF EXISTS noaa_country_codes")
dbi.db_engine.execute(f"SET FOREIGN_KEY_CHECKS=1;")

fao_data_quality_description_table = Table(
   'fao_data_quality_description', meta, 
   Column('fao_data_quality_flag', String(2), primary_key = True), 
   Column('description', String(100)),
)

fao_product_ids_table = Table(
   'fao_product_ids', meta, 
   Column('product_id', Integer, primary_key = True), 
   Column('cpc_code', String(9)),
   Column('product', String(100)),
)

fao_country_code_table = Table(
   'fao_country_codes', meta, 
   Column('fao_country_code', Integer, primary_key = True),
   Column('fao_country_name', String(100)),
   Column('noaa_country_code', String(2), ForeignKey("noaa_country_codes.noaa_country_code")), 
)

noaa_country_code_table = Table(
   'noaa_country_codes', meta, 
   Column('noaa_country_code', String(2), primary_key = True), 
   Column('noaa_country_name', String(100)),
   Column('fao_country_code', Integer, ForeignKey("fao_country_codes.fao_country_code")),
)

fao_crop_yields_table = Table(
   'fao_crop_yields', meta, 
   Column('id', Integer, primary_key = True), 
   Column('fao_country_code', Integer, ForeignKey("fao_country_codes.fao_country_code")), 
   Column('product_id', Integer, ForeignKey("fao_product_ids.product_id")),
   Column('year', Integer),
   Column('unit', String(100)),
   Column('value', Float(20)),
   Column('fao_data_quality_flag', String(2), ForeignKey("fao_data_quality_description.fao_data_quality_flag")),
)							
meta.create_all(dbi.db_engine)


2022-02-05 14:17:54,179 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2022-02-05 14:17:54,180 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-02-05 14:17:54,183 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2022-02-05 14:17:54,184 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-02-05 14:17:54,188 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2022-02-05 14:17:54,189 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-02-05 14:17:54,192 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS fao_crop_yields
2022-02-05 14:17:54,193 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-02-05 14:17:54,320 INFO sqlalchemy.engine.Engine COMMIT
2022-02-05 14:17:54,321 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS fao_data_quality_description
2022-02-05 14:17:54,323 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-02-05 14:17:54,352 INFO sqlalchemy.engine.Engine COMMIT
2022-02-05 14:17:54,355 INFO sqlalchemy.engine.Engine DROP TABLE IF EXISTS fao_product_ids
2022-02-05 14:17:54,356 INF

In [257]:
print("Inserting fao_data_quality_descriptions")
dbi.insert_data(fao_data_quality_description, 'fao_data_quality_description')

print("Inserting fao_product_ids")
dbi.insert_data(fao_product_ids, 'fao_product_ids')

 # The foreign keys for fao and noaa country codes have a circular dependency so we have to turn off the checks while inserting the data
dbi.db_engine.execute(f"SET FOREIGN_KEY_CHECKS=0;")
print("Inserting fao_country_codes")
dbi.insert_data(fao_country_codes, 'fao_country_codes')

print("Inserting noaa_country_codes")
dbi.insert_data(noaa_country_codes, 'noaa_country_codes')
dbi.db_engine.execute(f"SET FOREIGN_KEY_CHECKS=1;")

print("Inserting fao_crop_yields")
dbi.insert_data(fao_crop_yield_data, 'fao_crop_yields')

print("Closing connection")
dbi.close_connection()
print("Done")

Inserting fao_data_quality_descriptions
2022-02-05 14:18:03,514 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-02-05 14:18:03,515 INFO sqlalchemy.engine.Engine [cached since 9.057s ago] {'table_schema': 'crop_yield_prediction', 'table_name': 'fao_data_quality_description'}
2022-02-05 14:18:03,524 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-02-05 14:18:03,526 INFO sqlalchemy.engine.Engine INSERT INTO fao_data_quality_description (fao_data_quality_flag, description) VALUES (%(fao_data_quality_flag)s, %(description)s)
2022-02-05 14:18:03,527 INFO sqlalchemy.engine.Engine [generated in 0.00138s] ({'fao_data_quality_flag': 'U', 'description': 'Unofficial figure'}, {'fao_data_quality_flag': 'O', 'description': 'Official data'}, {'fao_data_quality_flag': 'A', 'description': 'Aggregate; may include official; semi-official; estimated or calculated data'}, {'fao_data_quality_flag': 'F

# Data exploration and cleaning 
## NOAA Global Historical Climatology Network Daily (GHCN-D) from CSV file
AWS bucket url: https://noaa-ghcn-pds.s3.amazonaws.com/index.html#csv/

In [190]:
from datetime import datetime
custom_date_parser = lambda x: datetime.strptime(x, "%Y%m%d")

noaa_ghcnd_file_path = './data/noaa_ghcn_aws_data/'

file_name = '2021'
noaa_ghcn_aws_data_2021 = pd.read_csv(
    f"{noaa_ghcnd_file_path}{file_name}.csv", 
    header=None, 
    names=['station_id', 'date', 'element', 'data_value', 'measurement_id', 'quality_id', 'source_id', 'observation_time'],
    nrows=1000000, 
    parse_dates=['date'],
    date_parser=custom_date_parser,
    dtype={'observation_time':str}
    )
noaa_ghcn_aws_data_2021.observation_time.replace({np.nan: '0000', '2400': '0000'}, inplace=True)
noaa_ghcn_aws_data_2021.observation_time = pd.to_datetime(noaa_ghcn_aws_data_2021.observation_time, format='%H%M').dt.time
noaa_ghcn_aws_data_2021.head()

Inserting fao_data_quality_descriptions
2022-02-05 13:12:52,721 INFO sqlalchemy.engine.Engine SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = %(table_schema)s AND table_name = %(table_name)s
2022-02-05 13:12:52,724 INFO sqlalchemy.engine.Engine [cached since 13.23s ago] {'table_schema': 'crop_yield_prediction', 'table_name': 'fao_data_quality_description'}
2022-02-05 13:12:52,729 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-02-05 13:12:52,736 INFO sqlalchemy.engine.Engine INSERT INTO fao_data_quality_description (fao_data_quality_flag, description) VALUES (%(fao_data_quality_flag)s, %(description)s)
2022-02-05 13:12:52,738 INFO sqlalchemy.engine.Engine [generated in 0.00181s] ({'fao_data_quality_flag': 'U', 'description': 'Unofficial figure'}, {'fao_data_quality_flag': 'O', 'description': 'Official data'}, {'fao_data_quality_flag': 'A', 'description': 'Aggregate; may include official; semi-official; estimated or calculated data'}, {'fao_data_quality_flag': 'F

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fe9f806b970>

In [None]:
# Replace NaN values in measurement_id, quality_id and source_id columns to match id descriptions
noaa_ghcn_aws_data_2021.measurement_id.replace(np.nan, 'None', inplace=True)
noaa_ghcn_aws_data_2021.quality_id.replace(np.nan, 'None', inplace=True)
noaa_ghcn_aws_data_2021.source_id.replace(np.nan, 'None', inplace=True)
noaa_ghcn_aws_data_2021.head()

In [None]:
noaa_ghcn_aws_data_2021.shape

In [None]:
noaa_ghcn_aws_data_2021.element.unique()

In [None]:
# Check for multiple measurements at different times
grouped = noaa_ghcn_aws_data_2021.groupby(['station_id', 'date', 'element']).count().sort_values('observation_time')
grouped

In [None]:
# We can drop the observation_time column because there is only one obervation per day
noaa_ghcn_aws_data_2021.drop('observation_time', axis=1, inplace=True)
noaa_ghcn_aws_data_2021.head()

In [None]:
# Extract country code from station_id
noaa_ghcn_aws_data_2021['country_code'] = noaa_ghcn_aws_data_2021.station_id.str[:2]
noaa_ghcn_aws_data_2021.head()

In [None]:
# Check for multiple temperature measurements on different soil types and depths
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element.str.contains('MD')]

In [None]:
grouped[grouped.element > 1].sort_values('element') # We need to create all possible soil temp element_ids in case they show up in other years

In [None]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id=='USC00218450')&(noaa_ghcn_aws_data_2021.date=='2021-06-27')][['element', 'data_value']]

In [None]:
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element=='FMTM'].head(100)

## Create dataframes for datatype descriptions

In [None]:
measurement_ids = [
    {'measurement_id': 'None', 'description': 'no measurement information applicable'},
    {'measurement_id': 'B', 'description': 'precipitation total formed from two 12-hour totals'},
    {'measurement_id': 'D', 'description': 'precipitation total formed from four six-hour totals'},
    {'measurement_id': 'H', 'description': 'represents highest or lowest hourly temperature (TMAX or TMIN) or the average of hourly values (TAVG)'},
    {'measurement_id': 'K', 'description': 'converted from knots'},
    {'measurement_id': 'L', 'description': 'temperature appears to be lagged with respect to reported hour of observation'},
    {'measurement_id': 'O', 'description': 'converted from oktas'},
    {'measurement_id': 'P', 'description': 'identified as “missing presumed zero” in DSI 3200 and 3206'},
    {'measurement_id': 'T', 'description': 'trace of precipitation, snowfall, or snow depth'},
    {'measurement_id': 'W', 'description': 'converted from 16-point WBAN code (for wind direction)'},
]
quality_ids = [
    {'quality_id': 'None', 'description': 'did not fail any quality assurance check'},
    {'quality_id': 'D', 'description': 'failed duplicate check'},
    {'quality_id': 'G', 'description': 'failed gap check'},
    {'quality_id': 'I', 'description': 'failed internal consistency check'},
    {'quality_id': 'K', 'description': 'failed streak/frequent-value check'},
    {'quality_id': 'L', 'description': 'failed check on length of multiday period'},
    {'quality_id': 'M', 'description': 'failed mega consistency check'},
    {'quality_id': 'N', 'description': 'failed naught check'},
    {'quality_id': 'O', 'description': 'failed climatological outlier check'},
    {'quality_id': 'R', 'description': 'failed lagged range check'},
    {'quality_id': 'S', 'description': 'failed spatial consistency check'},
    {'quality_id': 'T', 'description': 'failed temporal consistency check'},
    {'quality_id': 'W', 'description': 'temperature too warm for snow'},
    {'quality_id': 'X', 'description': 'failed bounds check'},
    {'quality_id': 'Z', 'description': 'flagged as a result of an official Datzilla Investigation'},
]
source_ids = [
    {'source_id': 'None', 'description': 'No source (i.e., data value missing)'},
    {'source_id': '0', 'description': 'U.S. Cooperative Summary of the Day (NCDC DSI-3200)'},
    {'source_id': '6', 'description': 'CDMP Cooperative Summary of the Day (NCDC DSI-3206)'},
    {'source_id': '7', 'description': 'U.S. Cooperative Summary of the Day - Transmitted via WxCoder3 (NCDC SI-3207)'},
    {'source_id': 'A', 'description': 'U.S. Automated Surface Observing System (ASOS) real-time data (since January 1, 2006)'},
    {'source_id': 'a', 'description': 'Australian data from the Australian Bureau of Meteorology'},
    {'source_id': 'B', 'description': 'U.S. ASOS data for October 2000-December 2005 (NCDC DSI-3211)'},
    {'source_id': 'b', 'description': 'Belarus update'},
    {'source_id': 'C', 'description': 'Environment Canada'},
    {'source_id': 'E', 'description': 'European Climate Assessment and Dataset (Klein Tank et al., 2002)'},
    {'source_id': 'F', 'description': 'U.S. Fort data'},
    {'source_id': 'G', 'description': 'Official Global Climate Observing System (GCOS) or other government-supplied data'},
    {'source_id': 'H', 'description': 'High Plains Regional Climate Center real-time data'},
    {'source_id': 'I', 'description': 'International collection (non U.S. data received through personal contacts)'},
    {'source_id': 'K', 'description': 'U.S. Cooperative Summary of the Day data digitized from paper observer forms (from 2011 to present)'},
    {'source_id': 'M', 'description': 'Monthly METAR Extract (additional ASOS data)'},
    {'source_id': 'N', 'description': 'Community Collaborative Rain, Hail,and Snow (CoCoRaHS)'},
    {'source_id': 'Q', 'description': 'Data from several African countries that had been “quarantined”, that is, withheld from public release until permission was granted from the respective meteorological services'},
    {'source_id': 'R', 'description': 'NCEI Reference Network Database (Climate Reference Network and Regional Climate Reference Network)'},
    {'source_id': 'r', 'description': 'All-Russian Research Institute of Hydro-meteorological Information-World Data Center'},
    {'source_id': 'S', 'description': 'Global Summary of the Day (NCDC DSI-9618)NOTE: “S” values are derived from hourly synoptic reports exchanged on the Global Telecommunications System (GTS). Daily values derived in this fashion may differ significantly from “true” daily data, particularly for precipitation (i.e., use with caution).'},
    {'source_id': 's', 'description': 'China Meteorological Administration/National Meteorological Information Center/Climatic Data Center (http://cdc.cma.gov.cn)'},
    {'source_id': 'T', 'description': "SNOwpack TELemtry (SNOTEL) data obtained from the U.S. Department of Agriculture's Natural Resources Conservation Service"},
    {'source_id': 'U', 'description': 'Remote Automatic Weather Station (RAWS) data obtained from the Western Regional Climate Center'},
    {'source_id': 'u', 'description': 'Ukraine update'},
    {'source_id': 'W', 'description': "WBAN/ASOS Summary of the Day from NCDC's Integrated Surface Data (ISD)."},
    {'source_id': 'X', 'description': 'U.S. First-Order Summary of the Day (NCDC DSI-3210)'},
    {'source_id': 'Z', 'description': 'Datzilla official additions or replacements'},
    {'source_id': 'z', 'description': 'Uzbekistan update'},
]

In [None]:
soil_temp_units = 'tenths of degrees C'
# Minimum soil temperature element_id=SN*#, where * corresponds to a code for ground cover and # corresponds to a code for soil depth.
# Maximum soil temperature element_id=SX*#, where * corresponds to a code for ground cover and # corresponds to a code for soil depth.

ground_cover_map = { 
    '0': "unknown",
    '1': "grass",
    '2': "fallow",
    '3': "bare ground",
    '4': "brome grass",
    '5': "sod",
    '6': "straw mulch",
    '7': "grass muck",
    '8': "bare muck",
}

soil_depth_map = {
    '1': "5 cm",
    '2': "10 cm",
    '3': "20 cm",
    '4': "50 cm",
    '5': "100 cm",
    '6': "150 cm",
    '7': "180 cm",
}

min_soil_temp_element_ids = []
max_soil_temp_element_ids = []

for gc_id, gc_desc in ground_cover_map.items():
    for sd_id, sd_desc in soil_depth_map.items():
        min_soil_temp_element_ids.append({
            'element_id': 'SN'+gc_id+sd_id, 
            'description': f'Minimum soil temperature: Ground cover = {gc_desc}, Soil depth = {sd_desc}',
            'units': soil_temp_units
            })
        max_soil_temp_element_ids.append({
            'element_id': 'SX'+gc_id+sd_id, 
            'description': f'Maximum soil temperature: Ground cover = {gc_desc}, Soil depth = {sd_desc}',
            'units': soil_temp_units
            })
pd.DataFrame(min_soil_temp_element_ids).head()


In [None]:
element_ids = [
    {'element_id': 'PRCP', 'description': 'Precipitation', 'units': 'tenths of mm'},
    {'element_id': 'SNOW', 'description': 'Snowfall', 'units': 'mm'},
    {'element_id': 'SNWD', 'description': 'Snow depth', 'units': 'mm'},
    {'element_id': 'TMAX', 'description': 'Maximum temperature', 'units': 'tenths of degrees C'},
    {'element_id': 'TMIN', 'description': 'Minimum temperature', 'units': 'tenths of degrees C'},
    {'element_id': 'ACMC', 'description': 'Average cloudiness midnight to midnight from 30-second ceilometer data', 'units': 'percent'},
    {'element_id': 'ACMH', 'description': 'Average cloudiness midnight to midnight from manual observations', 'units': 'percent'},
    {'element_id': 'ACSC', 'description': 'Average cloudiness sunrise to sunset from 30-second ceilometer data', 'units': 'percent'},
    {'element_id': 'ACSH', 'description': 'Average cloudiness sunrise to sunset from manual observations', 'units': 'percent'},
    {'element_id': 'AWDR', 'description': 'Average daily wind direction', 'units': 'degrees'},
    {'element_id': 'AWND', 'description': 'Average daily wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'EVAP', 'description': 'Evaporation of water from evaporation pan', 'units': 'tenths of mm'},
    {'element_id': 'FRGB', 'description': 'Base of frozen ground layer', 'units': 'cm'},
    {'element_id': 'FRGT', 'description': 'Top of frozen ground layer', 'units': 'cm'},
    {'element_id': 'FRTH', 'description': 'Thickness of frozen ground layer', 'units': 'cm'},
    {'element_id': 'GAHT', 'description': 'Difference between river and gauge height', 'units': 'cm'},
    {'element_id': 'MNPN', 'description': 'Daily minimum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'element_id': 'MXPN', 'description': 'Daily maximum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'element_id': 'PGTM', 'description': 'Peak gust time', 'units': '(hours and minutes, i.e., HHMM)'},
    {'element_id': 'PSUN', 'description': 'Daily percent of possible sunshine', 'units': 'percent'},
    {'element_id': 'THIC', 'description': 'Thickness of ice on water', 'units': 'tenths of mm'},
    {'element_id': 'TOBS', 'description': 'Temperature at the time of observation', 'units': 'tenths of degrees C'},
    {'element_id': 'TSUN', 'description': 'Daily total sunshine', 'units': '(minutes)'},
    {'element_id': 'WDF1', 'description': 'Direction of fastest 1-minute wind', 'units': 'degrees'},
    {'element_id': 'WDF2', 'description': 'Direction of fastest 2-minute wind', 'units': 'degrees'},
    {'element_id': 'WDF5', 'description': 'Direction of fastest 5-second wind', 'units': 'degrees'},
    {'element_id': 'WDFG', 'description': 'Direction of peak wind gust', 'units': 'degrees'},
    {'element_id': 'WDFI', 'description': 'Direction of highest instantaneous wind', 'units': 'degrees'},
    {'element_id': 'WDFM', 'description': 'Fastest mile wind direction', 'units': 'degrees'},
    {'element_id': 'WDMV', 'description': '24-hour wind movement', 'units': 'km'},
    {'element_id': 'WESD', 'description': 'Water equivalent of snow on the ground', 'units': 'tenths of mm'},
    {'element_id': 'WESF', 'description': 'Water equivalent of snowfall', 'units': 'tenths of mm'},
    {'element_id': 'WSF1', 'description': 'Fastest 1-minute wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSF2', 'description': 'Fastest 2-minute wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSF5', 'description': 'Fastest 5-second wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSFG', 'description': 'Peak gust wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSFI', 'description': 'Highest instantaneous wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'WSFM', 'description': 'Fastest mile wind speed', 'units': 'tenths of meters per second'},
    {'element_id': 'TAVG', 'description': "Average temperature', 'units': 'tenths of degrees C"}, # [Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]
    {'element_id': 'WT01', 'description': "Weather type: Fog, ice fog, or freezing fog (may include heavy fog)"},
    {'element_id': 'WT02', 'description': "Weather type: Heavy fog or heaving freezing fog (not always distinguished from fog)"},
    {'element_id': 'WT03', 'description': "Weather type: Thunder"},
    {'element_id': 'WT04', 'description': "Weather type: Ice pellets, sleet, snow pellets, or small hail"},
    {'element_id': 'WT05', 'description': "Weather type: Hail (may include small hail)"},
    {'element_id': 'WT06', 'description': "Weather type: Glaze or rime"},
    {'element_id': 'WT07', 'description': "Weather type: Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction"},
    {'element_id': 'WT08', 'description': "Weather type: Smoke or haze"},
    {'element_id': 'WT09', 'description': "Weather type: Blowing or drifting snow"},
    {'element_id': 'WT10', 'description': "Weather type: Tornado, waterspout, or funnel cloud"},
    {'element_id': 'WT11', 'description': "Weather type: High or damaging winds"},
    {'element_id': 'WT12', 'description': "Weather type: Blowing spray"},
    {'element_id': 'WT13', 'description': "Weather type: Mist"},
    {'element_id': 'WT14', 'description': "Weather type: Drizzle"},
    {'element_id': 'WT15', 'description': "Weather type: Freezing drizzle"},
    {'element_id': 'WT16', 'description': "Weather type: Rain (may include freezing rain, drizzle, and freezing drizzle)"},
    {'element_id': 'WT17', 'description': "Weather type: Freezing rain"},
    {'element_id': 'WT18', 'description': "Weather type: Snow, snow pellets, snow grains, or ice crystals"},
    {'element_id': 'WT19', 'description': "Weather type: Unknown source of precipitation"},
    {'element_id': 'WT21', 'description': "Weather type: Ground fog"},
    {'element_id': 'WT22', 'description': "Weather type: Ice fog or freezing fog"},

    {'element_id': 'WV01', 'description': "Weather in vicinity: Fog, ice fog, or freezing fog (may include heavy fog)"},
    {'element_id': 'WV03', 'description': "Weather in vicinity: Thunder"},
    {'element_id': 'WV07', 'description': "Weather in vicinity: Ash, dust, sand, or other blowing obstruction"},
    {'element_id': 'WV18', 'description': "Weather in vicinity: Snow or ice crystals"},
    {'element_id': 'WV20', 'description': "Weather in vicinity: Rain or snow shower"},

    {'element_id': 'FMTM', 'description': 'Time of fastest mile or fastest 1-minute wind', 'units': '(hours and minutes,i.e., HHMM)'},

    {'element_id': 'DASF', 'description': 'Number of days included in the multiday snowfall total', 'units': '(MDSF)'},
    {'element_id': 'MDSF', 'description': 'Multiday snowfall total'},

    {'element_id': 'DAWM', 'description': 'Number of days included in the multiday wind movement', 'units': '(MDWM)'},
    {'element_id': 'MDWM', 'description': 'Multiday wind movement', 'units': 'km'},

    {'element_id': 'DAEV', 'description': 'Number of days included in the multiday evaporation total', 'units': '(MDEV)'},
    {'element_id': 'MDEV', 'description': 'Multiday evaporation total', 'units': '(tenths of mm; use with DAEV)'},

    {'element_id': 'DWPR', 'description': 'Number of days with non-zero precipitation included in multiday precipitation total', 'units': '(MDPR)'},
    {'element_id': 'DAPR', 'description': 'Number of days included in the multiday precipitation total', 'units': '(MDPR)'},
    {'element_id': 'MDPR', 'description': 'Multiday precipitation total', 'units': '(tenths of mm; use with DAPR and DWPR, if available)'},

    {'element_id': 'DATN', 'description': 'Number of days included in the multiday minimum temperature', 'units': '(MDTN)'},
    {'element_id': 'MDTN', 'description': 'Multiday minimum temperature', 'units': 'tenths of degrees C; (use with DATN)'},

    {'element_id': 'DATX', 'description': 'Number of days included in the multiday maximum temperature', 'units': '(MDTX)'},
    {'element_id': 'MDTX', 'description': 'Multiday maximum temperature', 'units': 'tenths of degrees C; (use with DATX)'},
]
element_ids.extend(max_soil_temp_element_ids)
element_ids.extend(min_soil_temp_element_ids)
len(element_ids)

In [None]:
# Fetch all available data categories
endpoint='datatypes'
querystring = {'limit': 1000, 'datacategoryid': 'TMAX'}
url = f"{base_url}{endpoint}"

datatype = requests.request("GET", url, headers=headers, params=querystring)
pp(datatype.json())

In [None]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id.str.contains('AS')) & (noaa_ghcn_aws_data_2021.date=='2021-01-11')].head(500)

In [None]:
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element.str.contains('MD')]

In [None]:
set([item['element_id'] for item in element_ids]) - set(noaa_ghcn_aws_data_2021.element.unique().tolist())

In [None]:
len(noaa_ghcn_aws_data_2021.element.unique().tolist())

In [None]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id=='USC00218450')&(noaa_ghcn_aws_data_2021.date=='2021-06-27')][['element', 'data_value']]