In [4]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, ForeignKey, Date, Text, Boolean
from pprint import pprint as pp
from datetime import datetime

from config import local_mysql_password, local_mysql_user
from helpers import FIPS_10_country_codes, uwm_location_to_fips_country_map

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

## Import FAO crop yield data

In [5]:
fao_file_path = "./data/fao_data/"

# Load the crop yield data
file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized)'
unprocessed_fao_crop_yield_data = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{file_name}.csv", encoding='latin-1')

### Preprocessing

In [6]:
fao_crop_yield_data = unprocessed_fao_crop_yield_data.copy()
# Replace spaces column names with underscores and make lower case
fao_crop_yield_data.columns = fao_crop_yield_data.columns.str.replace(' ','_').str.lower()
# Rename confusing data aggregation flags
fao_crop_yield_data.flag.replace(np.nan, 'O', inplace=True)
fao_crop_yield_data.flag.replace('*', 'U', inplace=True)
# Remove 0 crop yield values to prevent bad training data
fao_crop_yield_data = fao_crop_yield_data[fao_crop_yield_data.value != 0.0]
# Drop the year_code because it is always the same as the year value
fao_crop_yield_data.drop('year_code', axis=1, inplace=True)
# Drop areas which are not countries
regions = [
    'World', 'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Americas',
    'Northern America', 'Central America', 'Caribbean', 'South America', 'Asia', 'Central Asia', 'Eastern Asia',
    'Southern Asia', 'South-eastern Asia', 'Western Asia', 'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe', 'Oceania', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia'
    ]
special_groups = [
    'European Union (28)', 'European Union (27)', 'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States',
    'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries', 'Annex I countries', 'Non-Annex I countries',  'OECD'
    ]
fao_countries = fao_crop_yield_data.area.loc[~fao_crop_yield_data.area.isin(regions+special_groups)].unique().tolist()
fao_crop_yield_data = fao_crop_yield_data[fao_crop_yield_data.area.isin(fao_countries)]
# Rename columns so they aren't confused with the NOAA data
fao_crop_yield_data.rename({
    'area_code': 'fao_country_code', 
    'area': 'fao_country_name', 
    'flag': 'fao_data_quality_id', 
    'item': 'product', 
    'item_code': 'product_id',
    'element': 'production_type',
    'element_code': 'production_type_id'
    }, axis=1, inplace=True)

fao_crop_yield_data.head()

Unnamed: 0,fao_country_code,fao_country_name,product_id,product,production_type_id,production_type,year,unit,value,fao_data_quality_id
1,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,ha,5900.0,F
2,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,ha,6000.0,F
3,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,ha,6000.0,F
4,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,ha,6000.0,F
5,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1980,ha,5800.0,F


In [4]:
fao_crop_yield_data.shape

(2802701, 10)

### Combine UCM, NOAA and FAO country lists to match crop calendars and weather data to crop yields

In [105]:
# Load the fao countries
fao_country_data = fao_crop_yield_data.groupby(['fao_country_name', 'fao_country_code']).year.agg(['min', 'max'])
fao_country_data.reset_index(inplace=True)
# Merge with the FIPS country codes
country_codes = pd.DataFrame(FIPS_10_country_codes)
country_codes = country_codes.merge(fao_country_data, how='inner', left_on='country_name', right_on='fao_country_name')
# Load the noaa ghcnd countries
noaa_file_path = './data/noaa_ghcn_aws_data/'
noaa_countries_file_name = 'ghcnd-countries'
ghcnd_countries = pd.read_fwf(f"{noaa_file_path}{noaa_countries_file_name}.txt", header=None, names = ["noaa_country_code", "noaa_country_name"])
# Join the noaa countries
country_codes = country_codes.merge(ghcnd_countries, how='inner', left_on='noaa_country_code', right_on='noaa_country_code')

country_codes.drop(['noaa_country_name', 'fao_country_name', 'min', 'max'], axis=1, inplace=True)

country_codes.head()

Unnamed: 0,country_name,FIPS_10_country_code,noaa_country_code,fao_country_code
0,Afghanistan,AF,AF,2
1,Albania,AL,AL,3
2,Algeria,AG,AG,4
3,Angola,AO,AO,7
4,Antigua and Barbuda,AC,AC,8


## Import University of Wisconson-Madison crop calendar

In [106]:
crop_calendar_file_path = "./data/"

# Load the crop calendar
file_name = 'crop_calendar_uni_wisconsin_madison'
unprocessed_uwm_crop_calendar = pd.read_csv(f"{crop_calendar_file_path}{file_name}.csv")
unprocessed_uwm_crop_calendar.columns = unprocessed_uwm_crop_calendar.columns.str.replace('.','_').str.lower()

unprocessed_uwm_crop_calendar.head()

  unprocessed_uwm_crop_calendar.columns = unprocessed_uwm_crop_calendar.columns.str.replace('.','_').str.lower()


Unnamed: 0,unnamed: 0,data_id,location,level,nation_code,state_code,county_code,crop,qualifier,crop_name_in_original_data,plant_start,plant_start_date,plant_end,plant_end_date,plant_median,plant_range,harvest_start,harvest_start_date,harvest_end,harvest_end_date,harvest_median,harvest_range,source,notes,x,full_crop_name,location_code,tot_days,lat_avg,lon_avg,harvested_area,tmin_day_avg,temp_average,precip_average,temp_min,precip_min,temp_max,precip_max,temp_min_month,precip_min_month,temp_max_month,precip_max_month,temp_at_planting,precip_at_planting,sunfrac_at_planting,daylength_at_planting,dsw_at_planting,precip_over_pet_min,precip_over_pet_min_day,precip_over_pet_max,precip_over_pet_max_day,precip_over_pet_at_planting,apr_to_sept_temp_anomaly,apr_to_sept_precip_frac,gdd_base_0_between_plant_and_harvest,gdd_base_4_between_plant_and_harvest,gdd_base_5_between_plant_and_harvest,gdd_base_8_between_plant_and_harvest,gdd_base_10_between_plant_and_harvest,gdd_base_0,gdd_base_4,gdd_base_5,gdd_base_8,gdd_base_10,gdd_base_0_from_plant_until_cold,gdd_base_4_from_plant_until_cold,gdd_base_5_from_plant_until_cold,gdd_base_8_from_plant_until_cold,gdd_base_10_from_plant_until_cold,ndays_growing_season_below_0,ndays_growing_season_below_5,ndays_growing_season_below_10,ndays_growing_season_below_12,ndays_growing_season_below_15,ndays_growing_season_below_17,ndays_below_0,ndays_below_5,ndays_below_10,ndays_below_12,ndays_below_15,ndays_below_17,lgp,lgp_t,lgp_p,lgp_tmin5_tmax100,lgp_t_tmin5_tmax100,lgp_p_tmin5_tmax100,temp_growing_season,precip_growing_season,ndays_since_snowfrac_0_5,ndays_since_snowfrac_0_1,ndays_since_snowfrac_0_05,ndays_since_snowfrac_0,climate_category,misclassified,climate_category_string
0,1,1,Albania,N,55,,,Wheat,Winter,Wheat,289,10/16,349,12/15,319.0,61,121.0,5/1,181.0,6/30,151.0,61.0,GIEWS,,,Wheat.Winter,55,197.0,40.936144,19.920749,1120.28338,-11.043975,13.298533,93.923606,4.623191,31.523111,22.225507,163.806151,1.0,7.0,7.0,11.0,9.907873,163.806151,0.423464,10.886388,87.250043,0.167999,197.0,6.756057,350.0,4.486921,5.123738,0.616576,1754.963575,966.963575,773.447847,391.263188,238.614476,4869.060207,3409.060207,3047.544478,2161.359819,1672.727825,,,106.211578,14.813273,0.0,0.0,18.0,137.0,153.0,174.0,190.0,0.0,18.0,138.0,167.0,208.0,237.0,252.0,365.0,252.0,233.0,346.0,252.0,8.908445,113.915234,276.0,219.0,-152.0,-160.0,1.0,True,1X
1,2,2,Albania,N,55,,,Maize,,Maize,106,4/16,167,6/16,136.5,62,228.0,8/16,304.0,10/31,266.0,77.0,GIEWS,,,Maize,55,129.5,41.149095,19.968137,540.683231,-12.522727,12.599007,94.823209,3.682332,35.848386,21.646772,160.831472,1.0,7.0,7.0,11.0,15.691555,76.18775,0.543914,15.586047,244.024139,0.19579,197.0,6.959926,350.0,0.559177,5.238819,0.650312,2555.724795,2035.724795,1905.724795,1515.724795,1255.724795,4613.974357,3156.279807,2828.215926,2000.261356,1535.126257,,2629.199423,2400.734745,1789.673591,1422.172811,0.0,0.0,0.0,0.0,0.0,13.0,0.0,57.0,148.0,176.0,216.0,247.0,112.0,218.0,259.0,202.0,308.0,259.0,19.659421,53.091993,94.0,37.0,31.0,23.0,1.0,False,1
2,3,4,Albania,N,55,,,Pulses,,Beans,106,4/16,177,6/26,141.5,72,197.0,7/16,243.0,8/31,220.0,47.0,GIEWS,,,Pulses,55,78.5,40.981883,19.946543,281.843038,-11.520141,13.016876,93.816973,4.271594,32.871042,21.983251,161.873432,1.0,7.0,7.0,11.0,16.504997,69.337616,0.568113,15.733263,253.24726,0.176631,197.0,6.768559,350.0,0.481453,5.157698,0.627937,1588.814649,1272.814649,1193.814649,956.814649,798.814649,4766.339416,3306.339416,2954.086708,2093.502256,1614.442635,,,2434.024415,1821.777486,1456.814408,0.0,0.0,0.0,0.0,0.0,5.0,0.0,35.0,142.0,171.0,211.0,241.0,112.0,224.0,253.0,218.0,330.0,253.0,20.111578,45.102474,99.0,42.0,36.0,28.0,1.0,False,1
3,4,5,Albania,N,55,,,Other,Winter vegetables,Winter vegetables,289,10/16,340,12/6,314.5,52,45.0,2/14,105.0,4/15,75.0,61.0,GIEWS,"Cabbage, cauliflower, spinach, onion, garlic",,Other.Winter vegetables,55,125.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,
4,5,6,Albania,N,55,,,Other,Spring field vegetables,Spring vegetables (field planting),106,4/16,137,5/17,121.5,32,229.0,8/17,288.0,10/15,258.5,60.0,GIEWS,"Fruits, tomato",,Other.Spring field vegetables,55,137.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,


In [107]:
uwm_crop_calendar = unprocessed_uwm_crop_calendar.copy()
# Add the new regions to the existing FIPS data and add to the uwm data
uwm_to_fips_country_map.update(dict(zip(country_codes.country_name, country_codes.FIPS_10_country_code)))
uwm_crop_calendar['country_code'] = uwm_crop_calendar.location.map(uwm_to_fips_country_map)
uwm_crop_calendar

Unnamed: 0,unnamed: 0,data_id,location,level,nation_code,state_code,county_code,crop,qualifier,crop_name_in_original_data,plant_start,plant_start_date,plant_end,plant_end_date,plant_median,plant_range,harvest_start,harvest_start_date,harvest_end,harvest_end_date,harvest_median,harvest_range,source,notes,x,full_crop_name,location_code,tot_days,lat_avg,lon_avg,harvested_area,tmin_day_avg,temp_average,precip_average,temp_min,precip_min,temp_max,precip_max,temp_min_month,precip_min_month,temp_max_month,precip_max_month,temp_at_planting,precip_at_planting,sunfrac_at_planting,daylength_at_planting,dsw_at_planting,precip_over_pet_min,precip_over_pet_min_day,precip_over_pet_max,precip_over_pet_max_day,precip_over_pet_at_planting,apr_to_sept_temp_anomaly,apr_to_sept_precip_frac,gdd_base_0_between_plant_and_harvest,gdd_base_4_between_plant_and_harvest,gdd_base_5_between_plant_and_harvest,gdd_base_8_between_plant_and_harvest,gdd_base_10_between_plant_and_harvest,gdd_base_0,gdd_base_4,gdd_base_5,gdd_base_8,gdd_base_10,gdd_base_0_from_plant_until_cold,gdd_base_4_from_plant_until_cold,gdd_base_5_from_plant_until_cold,gdd_base_8_from_plant_until_cold,gdd_base_10_from_plant_until_cold,ndays_growing_season_below_0,ndays_growing_season_below_5,ndays_growing_season_below_10,ndays_growing_season_below_12,ndays_growing_season_below_15,ndays_growing_season_below_17,ndays_below_0,ndays_below_5,ndays_below_10,ndays_below_12,ndays_below_15,ndays_below_17,lgp,lgp_t,lgp_p,lgp_tmin5_tmax100,lgp_t_tmin5_tmax100,lgp_p_tmin5_tmax100,temp_growing_season,precip_growing_season,ndays_since_snowfrac_0_5,ndays_since_snowfrac_0_1,ndays_since_snowfrac_0_05,ndays_since_snowfrac_0,climate_category,misclassified,climate_category_string,country_code
0,1,1,Albania,N,55,,,Wheat,Winter,Wheat,289,10/16,349,12/15,319.0,61,121.0,5/1,181.0,6/30,151.0,61.0,GIEWS,,,Wheat.Winter,55,197.0,40.936144,19.920749,1120.283380,-11.043975,13.298533,93.923606,4.623191,31.523111,22.225507,163.806151,1.0,7.0,7.0,11.0,9.907873,163.806151,0.423464,10.886388,87.250043,0.167999,197.0,6.756057,350.0,4.486921,5.123738,0.616576,1754.963575,966.963575,773.447847,391.263188,238.614476,4869.060207,3409.060207,3047.544478,2161.359819,1672.727825,,,106.211578,14.813273,0.000000,0.0,18.0,137.0,153.0,174.0,190.0,0.0,18.0,138.0,167.0,208.0,237.0,252.0,365.0,252.0,233.0,346.0,252.0,8.908445,113.915234,276.0,219.0,-152.0,-160.0,1.0,True,1X,AL
1,2,2,Albania,N,55,,,Maize,,Maize,106,4/16,167,6/16,136.5,62,228.0,8/16,304.0,10/31,266.0,77.0,GIEWS,,,Maize,55,129.5,41.149095,19.968137,540.683231,-12.522727,12.599007,94.823209,3.682332,35.848386,21.646772,160.831472,1.0,7.0,7.0,11.0,15.691555,76.187750,0.543914,15.586047,244.024139,0.195790,197.0,6.959926,350.0,0.559177,5.238819,0.650312,2555.724795,2035.724795,1905.724795,1515.724795,1255.724795,4613.974357,3156.279807,2828.215926,2000.261356,1535.126257,,2629.199423,2400.734745,1789.673591,1422.172811,0.0,0.0,0.0,0.0,0.0,13.0,0.0,57.0,148.0,176.0,216.0,247.0,112.0,218.0,259.0,202.0,308.0,259.0,19.659421,53.091993,94.0,37.0,31.0,23.0,1.0,False,1,AL
2,3,4,Albania,N,55,,,Pulses,,Beans,106,4/16,177,6/26,141.5,72,197.0,7/16,243.0,8/31,220.0,47.0,GIEWS,,,Pulses,55,78.5,40.981883,19.946543,281.843038,-11.520141,13.016876,93.816973,4.271594,32.871042,21.983251,161.873432,1.0,7.0,7.0,11.0,16.504997,69.337616,0.568113,15.733263,253.247260,0.176631,197.0,6.768559,350.0,0.481453,5.157698,0.627937,1588.814649,1272.814649,1193.814649,956.814649,798.814649,4766.339416,3306.339416,2954.086708,2093.502256,1614.442635,,,2434.024415,1821.777486,1456.814408,0.0,0.0,0.0,0.0,0.0,5.0,0.0,35.0,142.0,171.0,211.0,241.0,112.0,224.0,253.0,218.0,330.0,253.0,20.111578,45.102474,99.0,42.0,36.0,28.0,1.0,False,1,AL
3,4,5,Albania,N,55,,,Other,Winter vegetables,Winter vegetables,289,10/16,340,12/6,314.5,52,45.0,2/14,105.0,4/15,75.0,61.0,GIEWS,"Cabbage, cauliflower, spinach, onion, garlic",,Other.Winter vegetables,55,125.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,,AL
4,5,6,Albania,N,55,,,Other,Spring field vegetables,Spring vegetables (field planting),106,4/16,137,5/17,121.5,32,229.0,8/17,288.0,10/15,258.5,60.0,GIEWS,"Fruits, tomato",,Other.Spring field vegetables,55,137.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,,AL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,1506,1790,Zimbabwe,N,215,,,Cotton,,Cotton,274,10/1,334,11/30,304.0,61,91.0,4/1,151.0,5/31,121.0,61.0,Crop Explorer,,,Cotton,215,182.0,-17.689197,30.844108,2442.275301,-0.660494,20.853647,67.560166,15.814882,1.834750,23.964363,200.504262,7.0,7.0,11.0,1.0,23.836866,54.013311,0.661419,13.490008,271.628623,0.017050,227.0,1.204739,15.0,0.297463,-2.439231,0.137403,4144.118511,3416.118511,3234.118511,2688.118511,2324.118511,7606.671629,6146.671629,5781.671629,4686.671629,3956.671629,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,129.0,365.0,129.0,129.0,365.0,129.0,22.769882,125.169462,,,,,2.0,False,2,ZI
1506,1507,1791,Zimbabwe,N,215,,,Groundnuts,,Peanuts,152,6/1,198,7/17,175.0,47,274.0,10/1,365.0,12/31,319.5,92.0,Crop Explorer,,,Groundnuts,215,144.5,-19.554047,31.002287,1893.610760,0.777459,20.985006,58.489239,15.632047,3.254618,24.192129,157.423968,7.0,7.0,1.0,1.0,15.744653,5.378879,0.758591,11.747908,178.914896,0.032887,197.0,0.880254,15.0,0.057275,-2.691958,0.199742,2868.037491,2288.037491,2143.037491,1708.037491,1418.037491,7653.609739,6193.609739,5828.609739,4733.609739,4003.609739,,,,,,0.0,0.0,0.0,0.0,0.0,41.0,0.0,0.0,0.0,0.0,0.0,64.0,116.0,365.0,116.0,116.0,365.0,116.0,19.779569,19.324644,,,,,4.0,True,4X,ZI
1507,1508,1792,Zimbabwe,N,215,,,Sunflower,,Sunflower,305,11/1,365,12/31,335.0,61,91.0,4/1,151.0,5/31,121.0,61.0,Crop Explorer,,,Sunflower,215,151.0,-19.085826,31.426816,282.718814,0.597860,20.537690,67.228697,15.424960,4.690041,23.476448,180.749423,7.0,7.0,1.0,1.0,23.364629,132.042596,0.544670,13.992673,255.362251,0.045649,227.0,1.044151,15.0,0.774731,-2.548220,0.202444,3397.636591,2793.636591,2642.636591,2189.636591,1887.636591,7490.694747,6030.694747,5665.694747,4570.694747,3840.694747,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,132.0,365.0,132.0,132.0,365.0,132.0,22.500905,125.010097,,,,,4.0,False,4,ZI
1508,1509,1793,Somalia,N,153,,,Sorghum,,Sorghum (Gu),95,4/5,125,5/5,110.0,31,217.0,8/5,248.0,9/5,232.5,32.0,Crop Explorer,"Data given for ""Millet and Sorghum"" - split in...",,Sorghum,153,122.5,1.814811,42.915878,2062.410789,16.378333,27.527942,36.486114,26.046182,3.194088,29.259843,88.566650,7.0,2.0,3.0,4.0,28.736409,87.387417,0.693988,12.867031,257.994566,0.015827,45.0,0.474292,105.0,0.472696,-0.440521,1.228650,3312.822751,2820.822751,2697.822751,2328.822751,2082.822751,10045.238979,8585.238979,8220.238979,7125.238979,6395.238979,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365.0,0.0,0.0,365.0,0.0,26.933518,48.139166,365.0,365.0,365.0,365.0,3.0,False,3,SO


In [108]:
# Clean up the crop yield data
# Remove rows which belong to dropped countries/regions
fao_crop_yield_data = fao_crop_yield_data[fao_crop_yield_data.fao_country_code.isin(country_codes.fao_country_code)]

# Remove rows relating to unnecessary production_types.
# Yield = production/area_harvested so we can drop those two rows. 
print(fao_crop_yield_data.shape)
fao_crop_yield_data = fao_crop_yield_data[~fao_crop_yield_data.production_type.isin(['Area harvested', 'Production'])]
# Products with other production types are animals, which do not depend on weather.
# Find the corresponding product_ids and drop them
print(fao_crop_yield_data.shape)
production_types_to_drop = ['Stocks', 'Laying', 'Producing Animals/Slaughtered', 'Yield/Carcass Weight', 'Milk Animals', 'Prod Popultn']
products_to_drop = fao_crop_yield_data[fao_crop_yield_data.production_type.isin(production_types_to_drop)].product_id.unique()
fao_crop_yield_data = fao_crop_yield_data[~fao_crop_yield_data.product_id.isin(products_to_drop)]
print(fao_crop_yield_data.shape)

# Removing unnecessary columns
fao_crop_yield_data.drop(['fao_country_name', 'product', 'production_type_id', 'production_type'], axis=1, inplace=True, errors='ignore')

# Rename the products as crops since we have removed all other products
fao_crop_yield_data.rename({'product_id': 'crop_id',}, axis=1, inplace=True)

(2538977, 10)
(1012287, 10)
(481809, 10)


In [109]:
# Split the country_data to create two tables with primary keys
fao_country_codes = country_data[['fao_country_code', 'fao_country_name', 'noaa_country_code', 'EU']]
noaa_country_codes = country_data[['noaa_country_code', 'noaa_country_name', 'fao_country_code', 'EU']]
noaa_country_codes.head()

NameError: name 'country_data' is not defined

### Import FAO data aggregation codes

In [110]:
file_name = 'Production_Crops_Livestock_E_Flags'
fao_data_quality_description = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{file_name}.csv", encoding='latin-1')
# Make column names lower case
fao_data_quality_description.columns = fao_data_quality_description.columns.str.lower()
fao_data_quality_description.rename({'flag': 'fao_data_quality_id'}, axis=1, inplace=True)
# Rename confusing data aggregation flags
fao_data_quality_description.replace('<blank>', 'O', inplace=True)
fao_data_quality_description.replace('*', 'U', inplace=True)
fao_data_quality_description

Unnamed: 0,fao_data_quality_id,description
0,U,Unofficial figure
1,O,Official data
2,A,Aggregate; may include official; semi-official...
3,F,FAO estimate
4,Fc,Calculated data
5,Im,FAO data based on imputation methodology
6,M,Data not available


### Import product item codes

In [166]:

file_name = 'Production_Crops_Livestock_E_ItemCodes'
fao_crop_ids = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{file_name}.csv", encoding='latin-1')
# Replace spaces column names with underscores and make lower case
fao_crop_ids.columns = fao_crop_ids.columns.str.replace(' ','_').str.lower()
# Rename columns so they aren't confused with the NOAA data
fao_crop_ids.rename({'item': 'crop', 'item_code': 'crop_id'}, axis=1, inplace=True)
# Remove product_ids that don't exist in the fao_crop_yield data
fao_crop_ids = fao_crop_ids[fao_crop_ids.crop_id.isin(fao_crop_yield_data.crop_id)]
# Drop the unnecessary cpc_code column
fao_crop_ids.drop('cpc_code', axis=1, inplace=True)
# Clean up the names
fao_crop_ids.crop = fao_crop_ids.crop.str.split(';').str[0]
fao_crop_ids.crop = fao_crop_ids.crop.str.split(' nes').str[0]

fao_crop_ids.shape

(170, 2)

In [161]:
# Find duplicate crops after cleaning the crop names
crop_name_counts = fao_crop_ids.groupby('crop').count().reset_index()
duplicated_crops = crop_name_counts[crop_name_counts.crop_id > 1].crop.tolist()
duplicated_crops

['Beans',
 'Cereals',
 'Cherries',
 'Chillies and peppers',
 'Fruit',
 'Maize',
 'Oilcrops',
 'Onions',
 'Peas',
 'Pulses',
 'Vegetables']

In [137]:
fao_crop_ids[fao_crop_ids.crop.duplicated(keep=False)].sort_values('crop')

sdfdsf

Unnamed: 0,crop_id,crop
84,176,Beans
173,414,Beans
198,531,Cherries
197,530,Cherries
233,689,Chillies and peppers
168,401,Chillies and peppers
225,603,Fruit
202,542,Fruit
201,541,Fruit
226,619,Fruit


In [140]:
fao_crop_yield_data.shape

(481809, 6)

In [163]:
# Add the crop names to combine duplicates
fao_crop_yield_data.merge(fao_crop_ids, on='crop_id')

Unnamed: 0,fao_country_code,crop_id,year,unit,value,fao_data_quality_id,crop
0,2,221,1976,hg/ha,16610.0,Fc,Almonds
1,2,221,1977,hg/ha,15000.0,Fc,Almonds
2,2,221,1978,hg/ha,20000.0,Fc,Almonds
3,2,221,1979,hg/ha,17500.0,Fc,Almonds
4,2,221,1980,hg/ha,17069.0,Fc,Almonds
...,...,...,...,...,...,...,...
481804,170,216,2016,hg/ha,4822.0,Fc,Brazil nuts
481805,170,216,2017,hg/ha,4944.0,Fc,Brazil nuts
481806,170,216,2018,hg/ha,4940.0,Fc,Brazil nuts
481807,170,216,2019,hg/ha,4901.0,Fc,Brazil nuts


In [154]:
fao_crop_yield_data[(fao_crop_yield_data.fao_country_code==351)&(fao_crop_yield_data.year==2020)&(fao_crop_yield_data.crop_id.isin([176, 414]))]

Unnamed: 0,fao_country_code,crop_id,year,unit,value,fao_data_quality_id
484641,351,176,2020,hg/ha,17441.0,Fc
484791,351,414,2020,hg/ha,269875.0,Fc


In [165]:
combined = fao_crop_yield_data.merge(fao_crop_ids, on='crop_id').groupby(['fao_country_code', 'year', 'crop']).sum('value')
combined = combined.reset_index()

combined[(combined.fao_country_code==351)&(combined.year==2020)&(combined.crop=='Beans')]


Unnamed: 0,fao_country_code,year,crop,crop_id,value
444902,351,2020,Beans,590,287316.0


In [None]:
split_vegetables = 'Cabbage, cauliflower, spinach, onion, garlic'
split_fruits = 'Fruits, tomato'

'Rice; paddy': 'Rice',
   'Vegetables Primary': '',
    'Vegetables; fresh nes': '',
    'Vegetables; leguminous nes': '',
    'Mustard seed': '',
    Maize; green'

In [85]:
sorted(fao_product_ids['product'].unique().tolist())[-50:]

['Potatoes',
 'Pulses nes',
 'Pulses; Total',
 'Pumpkins; squash and gourds',
 'Pyrethrum; dried',
 'Quinces',
 'Quinoa',
 'Ramie',
 'Rapeseed',
 'Raspberries',
 'Rice; paddy',
 'Roots and Tubers; Total',
 'Roots and tubers nes',
 'Rubber; natural',
 'Rye',
 'Safflower seed',
 'Seed cotton',
 'Sesame seed',
 'Sisal',
 'Sorghum',
 'Soybeans',
 'Spices nes',
 'Spinach',
 'Strawberries',
 'String beans',
 'Sugar Crops Primary',
 'Sugar beet',
 'Sugar cane',
 'Sugar crops nes',
 'Sunflower seed',
 'Sweet potatoes',
 'Tallowtree seed',
 'Tangerines; mandarins; clementines; satsumas',
 'Taro (cocoyam)',
 'Tea',
 'Tobacco; unmanufactured',
 'Tomatoes',
 'Treenuts; Total',
 'Triticale',
 'Tung nuts',
 'Vanilla',
 'Vegetables Primary',
 'Vegetables; fresh nes',
 'Vegetables; leguminous nes',
 'Vetches',
 'Walnuts; with shell',
 'Watermelons',
 'Wheat',
 'Yams',
 'Yautia (cocoyam)']

In [87]:
uwm_crop_list = ['Wheat', 'Maize', 'Potatoes', 'Barley', 'Rice', 'Sorghum', 'Rye', 'Millet', 'Cassava', 'Soybeans', 'Groundnuts', 'Oats', 'Sunflower', 'Sugarcane', 'Rapeseed', 'Sweet.Potatoes']
uwm_crops_to_keep = uwm_crop_calendar[uwm_crop_calendar.crop.isin(uwm_crop_list)]
uwm_crops_to_keep.shape

(1247, 97)

In [33]:
uwm_crop_types = uwm_crop_calendar[['crop', 'crop_name_in_original_data', 'full_crop_name', 'notes']].copy()
uwm_crop_types.crop.unique()

array(['Wheat', 'Maize', 'Pulses', 'Other', 'Potatoes', 'Multiple',
       'Barley', 'Rice', 'Sorghum', 'Sugarbeets', 'Millet', 'Cotton',
       'Rye', 'Cassava', 'Yams', 'Soybeans', 'Sweet.Potatoes', 'Oats',
       'Groundnuts', 'Sunflower', 'Sugarcane', 'Rapeseed'], dtype=object)

In [91]:
uwm_crop_types[uwm_crop_types.crop.isin(['Multiple'])].sort_values('crop_name_in_original_data')

Unnamed: 0,crop,crop_name_in_original_data,full_crop_name,notes
190,Multiple,All Cereals (Belg),Multiple.2,Belg is the secondary season; WJS: I'm not sur...
276,Multiple,Cereals,Multiple,Not sure what crops to apply this to
350,Multiple,Crops in low lying areas,Multiple.low-lying,Not sure what crops to apply this to
327,Multiple,Forage,Multiple.Forage,
7,Multiple,Forage,Multiple.Forage,
33,Multiple,Spring Cereals,Multiple.Spring,Not sure what crops to apply this to
514,Multiple,Spring Coarse Grains,Multiple.Spring,Not sure what crops to apply this to
182,Multiple,Spring Coarse Grains,Multiple.Spring,Not sure what crops to apply this to
482,Multiple,Spring Coarse Grains,Multiple.Spring,Not sure what crops to apply this to
299,Multiple,Spring Coarse Grains,Multiple.Spring,Not sure what crops to apply this to


In [83]:
# Drop: Cotton 
# Keep crop: Wheat Maize Potatoes Barley Rice Sorghum Rye Millet
# Keep crop_name_in_original_data: Multiple Pulses
# Check Other
other_notes = ['Cabbage', 'cauliflower', 'spinach', 'onion', 'garlic', 'Fruits', 'tomato']
other_notes.extend(uwm_crop_types[uwm_crop_types.crop=='Other'].crop_name_in_original_data.tolist())
other_notes

['Cabbage',
 'cauliflower',
 'spinach',
 'onion',
 'garlic',
 'Fruits',
 'tomato',
 'Winter vegetables',
 'Spring vegetables (field planting)',
 'Spring vegetables (greenhouse planting)',
 'Teff (Meher)',
 'Winter vegetables',
 'Spring vegetables (field planting)',
 'Spring vegetables (greenhouse planting)',
 'Flaxseed',
 'Mustard',
 'Mustard',
 'Mustard',
 'Jute',
 'Mustard (Rabi)',
 'Castor (Kharif)',
 'Tobacco',
 'Tomato',
 'Tapioca',
 'Safflower',
 'Safflower',
 'Mustard',
 'Mustard',
 'Mustard',
 'Mustard',
 'Mustard',
 'Mustard',
 'Mustard',
 'Jute',
 'Jute',
 'Jute',
 'Mustard',
 'Flax',
 'Flax',
 'Buckwheat',
 'Buckwheat',
 'Buckwheat',
 'Buckwheat',
 'Flax',
 'Castorseed',
 'Flaxseed',
 'Nigerseed',
 'Sesame',
 'Bananas',
 'Sesame',
 'Flaxseed',
 'Sesame',
 'Castor beans',
 'Tobacco (long rains)',
 'Tobacco (short rains)',
 'Sesame (Gu)',
 'Sesame (Deyr)',
 'Bananas',
 'Onions',
 'Tomatoes',
 'Tobacco (Gu)',
 'Tobacco (Dyer)',
 'Flaxseed',
 'Sesame',
 'Castor beans',
 'Tobacco

In [31]:
uwm_crop_calendar.full_crop_name.unique()

array(['Wheat.Winter', 'Maize', 'Pulses', 'Other.Winter vegetables',
       'Other.Spring field vegetables',
       'Other.Spring greenhouses vegetables', 'Potatoes',
       'Multiple.Forage', 'Barley.Winter', 'Rice', 'Sorghum',
       'Sugarbeets', 'Millet', 'Multiple.Spring', 'Multiple.Winter',
       'Cotton', 'Potatoes.early', 'Rye.Winter', 'Barley', 'Maize.2',
       'Rice.irrigated', 'Cassava', 'Yams', 'Soybeans', 'Sweet.Potatoes',
       'Pulses.2', 'Sorghum.2', 'Rice.2', 'Wheat', 'Maize.3', 'Sorghum.3',
       'Rice.Transplanted', 'Maize.Transplanted', 'Maize.Exclude',
       'Other.Teff', 'Oats', 'Multiple.2', 'Groundnuts', 'Pulses.3',
       'Multiple', 'Millet.2', 'Barley.2', 'Sunflower',
       'Multiple.low-lying', 'Multiple.Walo', 'Rice.irrigated 1',
       'Rice.irrigated 2', 'Sweet.Potatoes.Exclude', 'Soybeans.2',
       'Sugarcane', 'Rapeseed', 'Rice.3', 'Rapeseed.Winter',
       'Sugarcane.irrigated', 'Rice.Summer-autumn', 'Rice.2.exclude',
       'Maize.2.exclude', '

In [29]:
uwm_crop_calendar[['crop', 'crop_name_in_original_data', 'full_crop_name', 'notes']]

Unnamed: 0,crop,crop_name_in_original_data,full_crop_name,notes
0,Wheat,Wheat,Wheat.Winter,
1,Maize,Maize,Maize,
2,Pulses,Beans,Pulses,
3,Other,Winter vegetables,Other.Winter vegetables,"Cabbage, cauliflower, spinach, onion, garlic"
4,Other,Spring vegetables (field planting),Other.Spring field vegetables,"Fruits, tomato"
...,...,...,...,...
1505,Cotton,Cotton,Cotton,
1506,Groundnuts,Peanuts,Groundnuts,
1507,Sunflower,Sunflower,Sunflower,
1508,Sorghum,Sorghum (Gu),Sorghum,"Data given for ""Millet and Sorghum"" - split in..."


In [None]:
uwm_crop_calendar[['crop', 'crop_name_in_original_data', 'full_crop_name', 'notes']]

uwm_crop_calendar = uwm_crop_calendar[[
    'Location', 'Nation.code', 'Crop', 'Qualifier',
    'Crop.name.in.original.data', 'Plant.start', 'Plant.start.date',
    'Plant.end', 'Plant.end.date', 'Plant.median', 'Plant.range',
    'Harvest.start', 'Harvest.start.date', 'Harvest.end',
    'Harvest.end.date', 'Harvest.median', 'Harvest.range', 'Source',
    'Notes', 'full.crop.name', 'lat.avg',
    'lon.avg']]

In [289]:
yield_product_names = pd.merge(fao_crop_yield_data, fao_product_ids, left_on='product_id', right_on='product_id')
yield_product_names[yield_product_names['product']=='Hides; cattle; fresh']

Unnamed: 0,fao_country_code,product_id,year,unit,value,fao_data_quality_id,cpc_code,product
48663,2,919,1961,hg/An,200.0,Fc,'02951.01,Hides; cattle; fresh
48664,2,919,1962,hg/An,200.0,Fc,'02951.01,Hides; cattle; fresh
48665,2,919,1963,hg/An,200.0,Fc,'02951.01,Hides; cattle; fresh
48666,2,919,1964,hg/An,200.0,Fc,'02951.01,Hides; cattle; fresh
48667,2,919,1965,hg/An,200.0,Fc,'02951.01,Hides; cattle; fresh
...,...,...,...,...,...,...,...,...
58288,181,919,2016,hg/An,260.0,Fc,'02951.01,Hides; cattle; fresh
58289,181,919,2017,hg/An,261.0,Fc,'02951.01,Hides; cattle; fresh
58290,181,919,2018,hg/An,260.0,Fc,'02951.01,Hides; cattle; fresh
58291,181,919,2019,hg/An,260.0,Fc,'02951.01,Hides; cattle; fresh


## Push FAO crop yield data to database

In [13]:
class DatabaseInterface:
    def __init__(self,
                 db_name,
                 user,
                 password,
                 host='localhost',
                 port=3306,
                 driver='mysql+pymysql',
                 show_query_output=False):

        with create_engine(f'{driver}://{user}:{password}@{host}').connect() as db_connection:
            db_connection.execute(f"CREATE DATABASE IF NOT EXISTS {db_name}")
        self.engine = create_engine(f"{driver}://{user}:{password}@{host}:{port}/{db_name}", echo=show_query_output)

    def insert_data(self, df: pd.DataFrame, table_name: str, dtypes: dict = {}, if_exists: str = 'replace', use_df_index: bool = False, chunk_size: int = 500):
        df.to_sql(
            name=table_name, 
            con=self.engine, 
            if_exists=if_exists, 
            index=use_df_index,
            chunksize=chunk_size,
            dtype=dtypes
        )

    def set_existing_field_as_primary_key(self, table_name: str, primary_key: str, constraints: str = ''):
        with self.engine.connect() as con:
            con.execute(f"ALTER TABLE `{table_name}` ADD PRIMARY KEY (`{primary_key}`) {constraints};")

    def add_new_field_as_primary_key(self, table_name: str, primary_key: str, primary_key_dtype: str, constraints: str = ''):
        with self.engine.connect() as con:
            con.execute(f"ALTER TABLE `{table_name}` ADD `{primary_key}` {primary_key_dtype} PRIMARY KEY {constraints};")

    def set_foreign_key(self, table_name: str, foreign_key: str, foreign_table_name: str, foreign_table_key:str):
        with self.engine.connect() as con:
            con.execute(f"ALTER TABLE `{table_name}` ADD FOREIGN KEY (`{foreign_key}`) REFERENCES `{foreign_table_name}`(`{foreign_table_key}`);")

    def close_connection(self):
        self.engine.dispose()

In [331]:
dbi = DatabaseInterface(db_name='crop_yield_prediction', 
                        user=local_mysql_user, 
                        password=local_mysql_password)

# dbi.engine.execute(f"SET FOREIGN_KEY_CHECKS=0;")
# dbi.engine.execute(f"DROP TABLE IF EXISTS fao_data_quality;")
# dbi.engine.execute(f"DROP TABLE IF EXISTS fao_products;")
# dbi.engine.execute(f"DROP TABLE IF EXISTS fao_countries;")
# dbi.engine.execute(f"DROP TABLE IF EXISTS noaa_countries;")
# dbi.engine.execute(f"DROP TABLE IF EXISTS fao_crop_yields;")
# dbi.engine.execute(f"SET FOREIGN_KEY_CHECKS=1;")

print("Inserting fao_data_quality")
dbi.insert_data(
   df=fao_data_quality_description, 
   table_name='fao_data_quality', 
   dtypes={
      'fao_data_quality_id': String(2),
      'description': Text,
      }
)
dbi.set_existing_field_as_primary_key('fao_data_quality', 'fao_data_quality_id')

print("Inserting fao_products")
dbi.insert_data(
   df=fao_product_ids, 
   table_name='fao_products', 
   dtypes={
      'product_id': Integer,
      'cpc_code': String(20),
      'product': Text,
      }
)
dbi.set_existing_field_as_primary_key('fao_products', 'product_id')

print("Inserting fao_countries")
dbi.insert_data(
   df=fao_country_codes, 
   table_name='fao_countries', 
   dtypes={
      'fao_country_code': Integer,
      'fao_country_name': Text,
      'noaa_country_code': String(2),
      'EU': Boolean,
      }
)
dbi.set_existing_field_as_primary_key('fao_countries', 'fao_country_code')

# Uncomment this line if noaa_weather_data table has already been created
dbi.engine.execute(f"ALTER TABLE noaa_weather_data DROP FOREIGN KEY noaa_weather_data_ibfk_1;")
print("Inserting noaa_countries")
dbi.insert_data(
   df=noaa_country_codes, 
   table_name='noaa_countries', 
   dtypes={
      'noaa_country_code': String(2),
      'noaa_country_name': Text,
      'fao_country_code': Integer,
      'EU': Boolean,
      }
)
dbi.set_existing_field_as_primary_key('noaa_countries', 'noaa_country_code')

print("Inserting fao_crop_yields")
dbi.insert_data(
   df=fao_crop_yield_data, 
   table_name='fao_crop_yields', 
   dtypes={
      'id': Integer,
      'fao_country_code': Integer,
      'product_id': Integer,
      'year': Integer,
      'unit': Text,
      'value': Integer,
      'fao_data_quality_id': String(2),
      }
)
dbi.add_new_field_as_primary_key(table_name='fao_crop_yields', primary_key='id', primary_key_dtype='INT', constraints='AUTO_INCREMENT')

print('Setting foreign keys')
dbi.set_foreign_key(table_name='fao_countries', foreign_key='noaa_country_code', foreign_table_name='noaa_countries', foreign_table_key='noaa_country_code')
dbi.set_foreign_key(table_name='noaa_countries', foreign_key='fao_country_code', foreign_table_name='fao_countries', foreign_table_key='fao_country_code')
dbi.set_foreign_key(table_name='fao_crop_yields', foreign_key='fao_country_code', foreign_table_name='fao_countries', foreign_table_key='fao_country_code')
dbi.set_foreign_key(table_name='fao_crop_yields', foreign_key='product_id', foreign_table_name='fao_products', foreign_table_key='product_id')
dbi.set_foreign_key(table_name='fao_crop_yields', foreign_key='fao_data_quality_id', foreign_table_name='fao_data_quality', foreign_table_key='fao_data_quality_id')

# Uncomment this line if noaa_weather_data table has already been created
dbi.set_foreign_key(table_name='noaa_weather_data', foreign_key='noaa_country_code', foreign_table_name='noaa_countries', foreign_table_key='noaa_country_code')
print("Closing connection")
dbi.close_connection()
print("Done")



Inserting fao_data_quality
Inserting fao_products
Inserting fao_countries
Inserting noaa_countries
Inserting fao_crop_yields
Setting foreign keys
Closing connection
Done


# Data exploration and cleaning 
## NOAA Global Historical Climatology Network Daily (GHCN-D) from CSV file
AWS bucket url: https://noaa-ghcn-pds.s3.amazonaws.com/index.html#csv/

## Create dataframes for datatype descriptions

In [7]:
soil_temp_units = 'tenths of degrees C'
# Minimum soil temperature element_id=SN*#, where * corresponds to a code for ground cover and # corresponds to a code for soil depth.
# Maximum soil temperature element_id=SX*#, where * corresponds to a code for ground cover and # corresponds to a code for soil depth.

ground_cover_map = { 
    '0': "unknown",
    '1': "grass",
    '2': "fallow",
    '3': "bare ground",
    '4': "brome grass",
    '5': "sod",
    '6': "straw mulch",
    '7': "grass muck",
    '8': "bare muck",
}
soil_depth_map = {
    '1': "5 cm",
    '2': "10 cm",
    '3': "20 cm",
    '4': "50 cm",
    '5': "100 cm",
    '6': "150 cm",
    '7': "180 cm",
}

min_soil_temp_data_type_ids = []
max_soil_temp_data_type_ids = []

for gc_id, gc_desc in ground_cover_map.items():
    for sd_id, sd_desc in soil_depth_map.items():
        min_soil_temp_data_type_ids.append({
            'data_type_id': 'SN'+gc_id+sd_id, 
            'description': f'Minimum soil temperature: Ground cover = {gc_desc}, Soil depth = {sd_desc}',
            'units': soil_temp_units
            })
        max_soil_temp_data_type_ids.append({
            'data_type_id': 'SX'+gc_id+sd_id, 
            'description': f'Maximum soil temperature: Ground cover = {gc_desc}, Soil depth = {sd_desc}',
            'units': soil_temp_units
            })
pd.DataFrame(min_soil_temp_data_type_ids).head()


Unnamed: 0,data_type_id,description,units
0,SN01,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
1,SN02,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
2,SN03,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
3,SN04,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C
4,SN05,Minimum soil temperature: Ground cover = unkno...,tenths of degrees C


In [8]:
noaa_data_type_ids = [
    {'data_type_id': 'PRCP', 'description': 'Precipitation', 'units': 'tenths of mm'},
    {'data_type_id': 'SNOW', 'description': 'Snowfall', 'units': 'mm'},
    {'data_type_id': 'SNWD', 'description': 'Snow depth', 'units': 'mm'},
    {'data_type_id': 'TMAX', 'description': 'Maximum temperature', 'units': 'tenths of degrees C'},
    {'data_type_id': 'TMIN', 'description': 'Minimum temperature', 'units': 'tenths of degrees C'},
    {'data_type_id': 'ACMC', 'description': 'Average cloudiness midnight to midnight from 30-second ceilometer data', 'units': 'percent'},
    {'data_type_id': 'ACMH', 'description': 'Average cloudiness midnight to midnight from manual observations', 'units': 'percent'},
    {'data_type_id': 'ACSC', 'description': 'Average cloudiness sunrise to sunset from 30-second ceilometer data', 'units': 'percent'},
    {'data_type_id': 'ACSH', 'description': 'Average cloudiness sunrise to sunset from manual observations', 'units': 'percent'},
    {'data_type_id': 'AWDR', 'description': 'Average daily wind direction', 'units': 'degrees'},
    {'data_type_id': 'AWND', 'description': 'Average daily wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'EVAP', 'description': 'Evaporation of water from evaporation pan', 'units': 'tenths of mm'},
    {'data_type_id': 'FRGB', 'description': 'Base of frozen ground layer', 'units': 'cm'},
    {'data_type_id': 'FRGT', 'description': 'Top of frozen ground layer', 'units': 'cm'},
    {'data_type_id': 'FRTH', 'description': 'Thickness of frozen ground layer', 'units': 'cm'},
    {'data_type_id': 'GAHT', 'description': 'Difference between river and gauge height', 'units': 'cm'},
    {'data_type_id': 'MNPN', 'description': 'Daily minimum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'data_type_id': 'MXPN', 'description': 'Daily maximum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'data_type_id': 'PGTM', 'description': 'Peak gust time', 'units': '(hours and minutes, i.e., HHMM)'},
    {'data_type_id': 'PSUN', 'description': 'Daily percent of possible sunshine', 'units': 'percent'},
    {'data_type_id': 'THIC', 'description': 'Thickness of ice on water', 'units': 'tenths of mm'},
    {'data_type_id': 'TOBS', 'description': 'Temperature at the time of observation', 'units': 'tenths of degrees C'},
    {'data_type_id': 'TSUN', 'description': 'Daily total sunshine', 'units': 'minutes'},
    {'data_type_id': 'WDF1', 'description': 'Direction of fastest 1-minute wind', 'units': 'degrees'},
    {'data_type_id': 'WDF2', 'description': 'Direction of fastest 2-minute wind', 'units': 'degrees'},
    {'data_type_id': 'WDF5', 'description': 'Direction of fastest 5-second wind', 'units': 'degrees'},
    {'data_type_id': 'WDFG', 'description': 'Direction of peak wind gust', 'units': 'degrees'},
    {'data_type_id': 'WDFI', 'description': 'Direction of highest instantaneous wind', 'units': 'degrees'},
    {'data_type_id': 'WDFM', 'description': 'Fastest mile wind direction', 'units': 'degrees'},
    {'data_type_id': 'WDMV', 'description': '24-hour wind movement', 'units': 'km'},
    {'data_type_id': 'WESD', 'description': 'Water equivalent of snow on the ground', 'units': 'tenths of mm'},
    {'data_type_id': 'WESF', 'description': 'Water equivalent of snowfall', 'units': 'tenths of mm'},
    {'data_type_id': 'WSF1', 'description': 'Fastest 1-minute wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSF2', 'description': 'Fastest 2-minute wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSF5', 'description': 'Fastest 5-second wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSFG', 'description': 'Peak gust wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSFI', 'description': 'Highest instantaneous wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSFM', 'description': 'Fastest mile wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'TAVG', 'description': "Average temperature", "units": "tenths of degrees C"}, # [Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]
    {'data_type_id': 'WT01', 'description': "Weather type: Fog, ice fog, or freezing fog (may include heavy fog)", 'units': 'Boolean'},
    {'data_type_id': 'WT02', 'description': "Weather type: Heavy fog or heaving freezing fog (not always distinguished from fog)", 'units': 'Boolean'},
    {'data_type_id': 'WT03', 'description': "Weather type: Thunder", 'units': 'Boolean'},
    {'data_type_id': 'WT04', 'description': "Weather type: Ice pellets, sleet, snow pellets, or small hail", 'units': 'Boolean'},
    {'data_type_id': 'WT05', 'description': "Weather type: Hail (may include small hail)", 'units': 'Boolean'},
    {'data_type_id': 'WT06', 'description': "Weather type: Glaze or rime", 'units': 'Boolean'},
    {'data_type_id': 'WT07', 'description': "Weather type: Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction", 'units': 'Boolean'},
    {'data_type_id': 'WT08', 'description': "Weather type: Smoke or haze", 'units': 'Boolean'},
    {'data_type_id': 'WT09', 'description': "Weather type: Blowing or drifting snow", 'units': 'Boolean'},
    {'data_type_id': 'WT10', 'description': "Weather type: Tornado, waterspout, or funnel cloud", 'units': 'Boolean'},
    {'data_type_id': 'WT11', 'description': "Weather type: High or damaging winds", 'units': 'Boolean'},
    {'data_type_id': 'WT12', 'description': "Weather type: Blowing spray", 'units': 'Boolean'},
    {'data_type_id': 'WT13', 'description': "Weather type: Mist", 'units': 'Boolean'},
    {'data_type_id': 'WT14', 'description': "Weather type: Drizzle", 'units': 'Boolean'},
    {'data_type_id': 'WT15', 'description': "Weather type: Freezing drizzle", 'units': 'Boolean'},
    {'data_type_id': 'WT16', 'description': "Weather type: Rain (may include freezing rain, drizzle, and freezing drizzle)", 'units': 'Boolean'},
    {'data_type_id': 'WT17', 'description': "Weather type: Freezing rain", 'units': 'Boolean'},
    {'data_type_id': 'WT18', 'description': "Weather type: Snow, snow pellets, snow grains, or ice crystals", 'units': 'Boolean'},
    {'data_type_id': 'WT19', 'description': "Weather type: Unknown source of precipitation", 'units': 'Boolean'},
    {'data_type_id': 'WT21', 'description': "Weather type: Ground fog", 'units': 'Boolean'},
    {'data_type_id': 'WT22', 'description': "Weather type: Ice fog or freezing fog", 'units': 'Boolean'},
    {'data_type_id': 'WV01', 'description': "Weather in vicinity: Fog, ice fog, or freezing fog (may include heavy fog)", 'units': 'Boolean'},
    {'data_type_id': 'WV03', 'description': "Weather in vicinity: Thunder", 'units': 'Boolean'},
    {'data_type_id': 'WV07', 'description': "Weather in vicinity: Ash, dust, sand, or other blowing obstruction", 'units': 'Boolean'},
    {'data_type_id': 'WV18', 'description': "Weather in vicinity: Snow or ice crystals", 'units': 'Boolean'},
    {'data_type_id': 'WV20', 'description': "Weather in vicinity: Rain or snow shower", 'units': 'Boolean'},
    {'data_type_id': 'FMTM', 'description': 'Time of fastest mile or fastest 1-minute wind', 'units': 'hours and minutes,i.e., HHMM'},
    {'data_type_id': 'DASF', 'description': 'Number of days included in the multiday snowfall total', 'units': 'MDSF'},
    {'data_type_id': 'MDSF', 'description': 'Multiday snowfall total', 'units': 'mm'},
    {'data_type_id': 'DAWM', 'description': 'Number of days included in the multiday wind movement', 'units': 'MDWM'},
    {'data_type_id': 'MDWM', 'description': 'Multiday wind movement', 'units': 'km'},
    {'data_type_id': 'DAEV', 'description': 'Number of days included in the multiday evaporation total', 'units': 'MDEV'},
    {'data_type_id': 'MDEV', 'description': 'Multiday evaporation total; (use with DAEV)', 'units': 'tenths of mm'},
    {'data_type_id': 'DWPR', 'description': 'Number of days with non-zero precipitation included in multiday precipitation total', 'units': 'MDPR'},
    {'data_type_id': 'DAPR', 'description': 'Number of days included in the multiday precipitation total', 'units': 'MDPR'},
    {'data_type_id': 'MDPR', 'description': 'Multiday precipitation total; (use with DAPR and DWPR, if available)', 'units': 'tenths of mm'},
    {'data_type_id': 'DATN', 'description': 'Number of days included in the multiday minimum temperature', 'units': 'MDTN'},
    {'data_type_id': 'MDTN', 'description': 'Multiday minimum temperature; (use with DATN)', 'units': 'tenths of degrees C'},
    {'data_type_id': 'DATX', 'description': 'Number of days included in the multiday maximum temperature', 'units': 'MDTX'},
    {'data_type_id': 'MDTX', 'description': 'Multiday maximum temperature; (use with DATX)', 'units': 'tenths of degrees C'},
]
noaa_data_type_ids.extend(max_soil_temp_data_type_ids)
noaa_data_type_ids.extend(min_soil_temp_data_type_ids)
noaa_data_type_ids = pd.DataFrame(noaa_data_type_ids)
noaa_data_type_ids.head()

Unnamed: 0,data_type_id,description,units
0,PRCP,Precipitation,tenths of mm
1,SNOW,Snowfall,mm
2,SNWD,Snow depth,mm
3,TMAX,Maximum temperature,tenths of degrees C
4,TMIN,Minimum temperature,tenths of degrees C


### Select data types to keep and to drop

In [9]:
data_types_to_keep = [
    {'data_type_id': 'PRCP', 'description': 'Precipitation', 'units': 'tenths of mm'},
    {'data_type_id': 'SNOW', 'description': 'Snowfall', 'units': 'mm'},
    {'data_type_id': 'SNWD', 'description': 'Snow depth', 'units': 'mm'},
    {'data_type_id': 'TMAX', 'description': 'Maximum temperature', 'units': 'tenths of degrees C'},
    {'data_type_id': 'TMIN', 'description': 'Minimum temperature', 'units': 'tenths of degrees C'},
    {'data_type_id': 'ACMH', 'description': 'Average cloudiness midnight to midnight from manual observations', 'units': 'percent'},
    {'data_type_id': 'AWND', 'description': 'Average daily wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'EVAP', 'description': 'Evaporation of water from evaporation pan', 'units': 'tenths of mm'},
    {'data_type_id': 'FRTH', 'description': 'Thickness of frozen ground layer', 'units': 'cm'},
    {'data_type_id': 'PSUN', 'description': 'Daily percent of possible sunshine', 'units': 'percent'},
    {'data_type_id': 'THIC', 'description': 'Thickness of ice on water', 'units': 'tenths of mm'},
    {'data_type_id': 'TOBS', 'description': 'Temperature at the time of observation', 'units': 'tenths of degrees C'},
    {'data_type_id': 'TSUN', 'description': 'Daily total sunshine', 'units': 'minutes'},
    {'data_type_id': 'WDMV', 'description': '24-hour wind movement', 'units': 'km'},
    {'data_type_id': 'TAVG', 'description': "Average temperature', 'units': 'tenths of degrees C"},
    {'data_type_id': 'WT01', 'description': "Weather type: Fog, ice fog, or freezing fog (may include heavy fog)", 'units': 'Boolean'},
    {'data_type_id': 'WT02', 'description': "Weather type: Heavy fog or heaving freezing fog (not always distinguished from fog)", 'units': 'Boolean'},
    {'data_type_id': 'WT03', 'description': "Weather type: Thunder", 'units': 'Boolean'},
    {'data_type_id': 'WT04', 'description': "Weather type: Ice pellets, sleet, snow pellets, or small hail", 'units': 'Boolean'},
    {'data_type_id': 'WT05', 'description': "Weather type: Hail (may include small hail)", 'units': 'Boolean'},
    {'data_type_id': 'WT06', 'description': "Weather type: Glaze or rime", 'units': 'Boolean'},
    {'data_type_id': 'WT07', 'description': "Weather type: Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction", 'units': 'Boolean'},
    {'data_type_id': 'WT08', 'description': "Weather type: Smoke or haze", 'units': 'Boolean'},
    {'data_type_id': 'WT09', 'description': "Weather type: Blowing or drifting snow", 'units': 'Boolean'},
    {'data_type_id': 'WT10', 'description': "Weather type: Tornado, waterspout, or funnel cloud", 'units': 'Boolean'},
    {'data_type_id': 'WT11', 'description': "Weather type: High or damaging winds", 'units': 'Boolean'},
    {'data_type_id': 'WT12', 'description': "Weather type: Blowing spray", 'units': 'Boolean'},
    {'data_type_id': 'WT13', 'description': "Weather type: Mist", 'units': 'Boolean'},
    {'data_type_id': 'WT14', 'description': "Weather type: Drizzle", 'units': 'Boolean'},
    {'data_type_id': 'WT15', 'description': "Weather type: Freezing drizzle", 'units': 'Boolean'},
    {'data_type_id': 'WT16', 'description': "Weather type: Rain (may include freezing rain, drizzle, and freezing drizzle)", 'units': 'Boolean'},
    {'data_type_id': 'WT17', 'description': "Weather type: Freezing rain", 'units': 'Boolean'},
    {'data_type_id': 'WT18', 'description': "Weather type: Snow, snow pellets, snow grains, or ice crystals", 'units': 'Boolean'},
    {'data_type_id': 'WT19', 'description': "Weather type: Unknown source of precipitation", 'units': 'Boolean'},
    {'data_type_id': 'WT21', 'description': "Weather type: Ground fog", 'units': 'Boolean'},
    {'data_type_id': 'WT22', 'description': "Weather type: Ice fog or freezing fog", 'units': 'Boolean'},
]

data_types_to_drop = [
    {'data_type_id': 'AWDR', 'description': 'Average daily wind direction', 'units': 'degrees'},
    {'data_type_id': 'GAHT', 'description': 'Difference between river and gauge height', 'units': 'cm'},
    {'data_type_id': 'MNPN', 'description': 'Daily minimum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'data_type_id': 'MXPN', 'description': 'Daily maximum temperature of water in an evaporation pan', 'units': 'tenths of degrees C'},
    {'data_type_id': 'WESD', 'description': 'Water equivalent of snow on the ground', 'units': 'tenths of mm'},
    {'data_type_id': 'WESF', 'description': 'Water equivalent of snowfall', 'units': 'tenths of mm'},
    {'data_type_id': 'WSFM', 'description': 'Fastest mile wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'FRGB', 'description': 'Base of frozen ground layer', 'units': 'cm'},
    {'data_type_id': 'FRGT', 'description': 'Top of frozen ground layer', 'units': 'cm'},
    {'data_type_id': 'ACMC', 'description': 'Average cloudiness midnight to midnight from 30-second ceilometer data', 'units': 'percent'},
    {'data_type_id': 'ACSC', 'description': 'Average cloudiness sunrise to sunset from 30-second ceilometer data', 'units': 'percent'},
    {'data_type_id': 'ACSH', 'description': 'Average cloudiness sunrise to sunset from manual observations', 'units': 'percent'},
    {'data_type_id': 'PGTM', 'description': 'Peak gust time', 'units': '(hours and minutes, i.e., HHMM)'},
    {'data_type_id': 'WDFG', 'description': 'Direction of peak wind gust', 'units': 'degrees'},
    {'data_type_id': 'WDFI', 'description': 'Direction of highest instantaneous wind', 'units': 'degrees'},
    {'data_type_id': 'WDFM', 'description': 'Fastest mile wind direction', 'units': 'degrees'},
    {'data_type_id': 'WSF1', 'description': 'Fastest 1-minute wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSF2', 'description': 'Fastest 2-minute wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSF5', 'description': 'Fastest 5-second wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSFG', 'description': 'Peak gust wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WSFI', 'description': 'Highest instantaneous wind speed', 'units': 'tenths of meters per second'},
    {'data_type_id': 'WDF1', 'description': 'Direction of fastest 1-minute wind', 'units': 'degrees'},
    {'data_type_id': 'WDF2', 'description': 'Direction of fastest 2-minute wind', 'units': 'degrees'},
    {'data_type_id': 'WDF5', 'description': 'Direction of fastest 5-second wind', 'units': 'degrees'},
    {'data_type_id': 'WV01', 'description': "Weather in vicinity: Fog, ice fog, or freezing fog (may include heavy fog)", 'units': 'Boolean'},
    {'data_type_id': 'WV03', 'description': "Weather in vicinity: Thunder", 'units': 'Boolean'},
    {'data_type_id': 'WV07', 'description': "Weather in vicinity: Ash, dust, sand, or other blowing obstruction", 'units': 'Boolean'},
    {'data_type_id': 'WV18', 'description': "Weather in vicinity: Snow or ice crystals", 'units': 'Boolean'},
    {'data_type_id': 'WV20', 'description': "Weather in vicinity: Rain or snow shower", 'units': 'Boolean'},
    {'data_type_id': 'FMTM', 'description': 'Time of fastest mile or fastest 1-minute wind', 'units': 'hours and minutes,i.e., HHMM'},
    {'data_type_id': 'DASF', 'description': 'Number of days included in the multiday snowfall total', 'units': 'MDSF'},
    {'data_type_id': 'MDSF', 'description': 'Multiday snowfall total', 'units': 'mm'},
    {'data_type_id': 'DAWM', 'description': 'Number of days included in the multiday wind movement', 'units': 'MDWM'},
    {'data_type_id': 'MDWM', 'description': 'Multiday wind movement', 'units': 'km'},
    {'data_type_id': 'DAEV', 'description': 'Number of days included in the multiday evaporation total', 'units': 'MDEV'},
    {'data_type_id': 'MDEV', 'description': 'Multiday evaporation total; (use with DAEV)', 'units': 'tenths of mm'},
    {'data_type_id': 'DWPR', 'description': 'Number of days with non-zero precipitation included in multiday precipitation total', 'units': 'MDPR'},
    {'data_type_id': 'DAPR', 'description': 'Number of days included in the multiday precipitation total', 'units': 'MDPR'},
    {'data_type_id': 'MDPR', 'description': 'Multiday precipitation total; (use with DAPR and DWPR, if available)', 'units': 'tenths of mm'},
    {'data_type_id': 'DATN', 'description': 'Number of days included in the multiday minimum temperature', 'units': 'MDTN'},
    {'data_type_id': 'MDTN', 'description': 'Multiday minimum temperature; (use with DATN)', 'units': 'tenths of degrees C'},
    {'data_type_id': 'DATX', 'description': 'Number of days included in the multiday maximum temperature', 'units': 'MDTX'},
    {'data_type_id': 'MDTX', 'description': 'Multiday maximum temperature; (use with DATX)', 'units': 'tenths of degrees C'},
]
data_ids_to_keep = [data_type['data_type_id'] for data_type in data_types_to_keep]
noaa_data_type_ids['use_in_model'] = noaa_data_type_ids.data_type_id.apply(lambda x: True if x in data_ids_to_keep else False)
noaa_data_type_ids.head()

Unnamed: 0,data_type_id,description,units,use_in_model
0,PRCP,Precipitation,tenths of mm,True
1,SNOW,Snowfall,mm,True
2,SNWD,Snow depth,mm,True
3,TMAX,Maximum temperature,tenths of degrees C,True
4,TMIN,Minimum temperature,tenths of degrees C,True


In [204]:
dbi = DatabaseInterface(db_name='crop_yield_prediction', 
                        user=local_mysql_user, 
                        password=local_mysql_password)

#dbi.engine.execute(f"DROP TABLE IF EXISTS noaa_data_types;")

print("Inserting noaa_data_types")
dbi.insert_data(
   df=noaa_data_type_ids, 
   table_name='noaa_data_types', 
   dtypes={
      'data_type_id': String(4),
      'description': Text,
      'units': Text,
      'use_in_model': Boolean,
      }
)
dbi.set_existing_field_as_primary_key('noaa_data_types', 'data_type_id')

print("Closing connection")
dbi.close_connection()
print("Done")

Inserting noaa_data_types
Closing connection
Done


## Import the NOAA data csv

In [222]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y%m%d")

noaa_ghcnd_file_path = './data/noaa_ghcn_aws_data/'

file_name = '2021'
noaa_ghcn_aws_data_2021 = pd.read_csv(
    f"{noaa_ghcnd_file_path}{file_name}.csv", 
    header=None, 
    usecols=[0,1,2,3],
    names=['station_id', 'date', 'data_type_id', 'data_value'],#, 'measurement_id', 'quality_id', 'source_id', 'observation_time'],
    nrows=100000, 
    parse_dates=['date'],
    date_parser=custom_date_parser,
    # dtype={'observation_time':str}
    )
#noaa_ghcn_aws_data_2021.observation_time.replace({np.nan: '0000', '2400': '0000'}, inplace=True)
#noaa_ghcn_aws_data_2021.observation_time = pd.to_datetime(noaa_ghcn_aws_data_2021.observation_time, format='%H%M').dt.time

# Extract country code from station_id
noaa_ghcn_aws_data_2021['noaa_country_code'] = noaa_ghcn_aws_data_2021.station_id.str[:2]
# Drop countries that are not in the fao dataset
noaa_ghcn_aws_data_2021 = noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.noaa_country_code.isin(noaa_country_codes.noaa_country_code)]
# Drop unnecessary data types
noaa_ghcn_aws_data_2021 = noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.data_type_id.isin(data_ids_to_keep)]
# Pivot the table.
noaa_weather_data = noaa_ghcn_aws_data_2021.pivot(index=['noaa_country_code', 'station_id', 'date'], columns='data_type_id', values='data_value')
# Fill in the weather type columns with zeroes because they are one-hot encoded/binary
# weather_type_cols = noaa_weather_data.columns[noaa_weather_data.columns.str.contains('WT')]
# noaa_weather_data[weather_type_cols] = noaa_weather_data[weather_type_cols].fillna(0)

noaa_weather_data.reset_index(inplace=True)
noaa_weather_data.head()

data_type_id,noaa_country_code,station_id,date,AWND,EVAP,PRCP,PSUN,SNOW,SNWD,TAVG,THIC,TMAX,TMIN,TOBS,TSUN,WDMV,WT01,WT02,WT03,WT04,WT05,WT06,WT08,WT09,WT11
0,AE,AE000041196,2021-01-01,,,0.0,,,,214.0,,278.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AE,AE000041196,2021-01-02,,,0.0,,,,211.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AE,AEM00041194,2021-01-01,,,0.0,,,,217.0,,266.0,178.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AE,AEM00041194,2021-01-02,,,0.0,,,,214.0,,262.0,166.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AE,AEM00041217,2021-01-01,,,,,,,202.0,,262.0,155.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37034,WA,WA007848390,2021-01-01,,,,,,,220.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37035,WA,WA007878380,2021-01-01,,,79.0,,,,209.0,,346.0,189.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37036,WA,WA010517310,2021-01-01,,,,,,,244.0,,332.0,200.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37037,WA,WAM00068006,2021-01-01,,,,,,,283.0,,326.0,199.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
noaa_weather_data.shape

(343038, 25)

In [250]:
[data_type['data_type_id'] for datatype in data_types_to_keep]

NameError: name 'data_type' is not defined

#### Insert data sample to test values

In [251]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y%m%d")

noaa_ghcnd_file_path = './data/noaa_ghcn_aws_data/'

file_name = '2021'
noaa_ghcn_aws_data_2021 = pd.read_csv(
    f"{noaa_ghcnd_file_path}{file_name}.csv", 
    header=None, 
    usecols=[0,1,2,3],
    names=['station_id', 'date', 'data_type_id', 'data_value'],#, 'measurement_id', 'quality_id', 'source_id', 'observation_time'],
    nrows=100000, 
    parse_dates=['date'],
    date_parser=custom_date_parser,
    )
# Extract country code from station_id
noaa_ghcn_aws_data_2021['noaa_country_code'] = noaa_ghcn_aws_data_2021.station_id.str[:2]
# Drop countries that are not in the fao dataset
noaa_ghcn_aws_data_2021 = noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.noaa_country_code.isin(noaa_country_codes.noaa_country_code)]

# Pivot the table.
noaa_weather_data = noaa_ghcn_aws_data_2021.pivot(index=['noaa_country_code', 'station_id', 'date'], columns='data_type_id', values='data_value')
# Fill in the weather type columns with zeroes because they are one-hot encoded/binary
weather_type_cols = noaa_weather_data.columns[noaa_weather_data.columns.str.contains('WT')]
noaa_weather_data[weather_type_cols] = noaa_weather_data[weather_type_cols].fillna(0)

noaa_weather_data.reset_index(inplace=True)

# Create the table with a predefined autoincrementing primary key so further data can be added
dbi = DatabaseInterface(db_name='crop_yield_prediction', 
                        user=local_mysql_user, 
                        password=local_mysql_password)

print("Inserting noaa_weather_data")
dbi.insert_data(noaa_weather_data, 'test_data', if_exists='append')

print("Closing connection")
dbi.close_connection()
print("Done")


Inserting noaa_weather_data
Closing connection
Done


### Insert the NOAA weather data

In [248]:
# Create the table with a predefined autoincrementing primary key so further data can be added
dbi = DatabaseInterface(db_name='crop_yield_prediction', 
                        user=local_mysql_user, 
                        password=local_mysql_password)

#dbi.engine.execute(f"DROP TABLE IF EXISTS noaa_weather_data;")

meta = MetaData()
meta.bind = dbi.engine
meta.reflect()

noaa_weather_data_table = Table(
   'noaa_weather_data', meta, 
   Column('id', Integer, primary_key=True), 
   Column('noaa_country_code', String(2), ForeignKey("noaa_countries.noaa_country_code")), 
   Column('station_id', Text),
   Column('date', Date),
   Column('PRCP', Float(50)),
   Column('SNOW', Float(50)),
   Column('SNWD', Float(50)),
   Column('TMAX', Float(50)),
   Column('TMIN', Float(50)),
   Column('ACMH', Float(50)),
   Column('AWND', Float(50)),
   Column('EVAP', Float(50)),
   Column('FRTH', Float(50)),
   Column('PSUN', Float(50)),
   Column('THIC', Float(50)),
   Column('TOBS', Float(50)),
   Column('TSUN', Float(50)),
   Column('WDMV', Float(50)),
   Column('TAVG', Float(50)),
   Column('WT01', Boolean),
   Column('WT02', Boolean),
   Column('WT03', Boolean),
   Column('WT04', Boolean),
   Column('WT05', Boolean),
   Column('WT06', Boolean),
   Column('WT07', Boolean),
   Column('WT08', Boolean),
   Column('WT09', Boolean),
   Column('WT10', Boolean),
   Column('WT11', Boolean),
   Column('WT12', Boolean),
   Column('WT13', Boolean),
   Column('WT14', Boolean),
   Column('WT15', Boolean),
   Column('WT16', Boolean),
   Column('WT17', Boolean),
   Column('WT18', Boolean),
   Column('WT19', Boolean),
   Column('WT21', Boolean),
   Column('WT22', Boolean)
)
                                  
noaa_weather_data_table.create(dbi.engine)
# meta.create_all(dbi.engine)

print("Inserting noaa_weather_data")
dbi.insert_data(noaa_weather_data, 'noaa_weather_data', if_exists='append')

print("Closing connection")
dbi.close_connection()
print("Done")

Inserting noaa_weather_data
Closing connection
Done


## Combine the process of importing the CSV and exporting to the database and split the process to handle chunks of data

In [None]:
# Create the table with a predefined autoincrementing primary key so further data can be added
dbi = DatabaseInterface(db_name='crop_yield_prediction', 
                        user=local_mysql_user, 
                        password=local_mysql_password)

# dbi.engine.execute(f"DROP TABLE IF EXISTS noaa_weather_data;")

meta = MetaData()
meta.bind = dbi.engine
meta.reflect()

noaa_weather_data_table = Table(
   'noaa_weather_data', meta, 
   Column('id', Integer, primary_key=True), 
   Column('noaa_country_code', String(2), ForeignKey("noaa_countries.noaa_country_code")), 
   Column('station_id', Text),
   Column('date', Date),
   Column('PRCP', Float(50)),
   Column('SNOW', Float(50)),
   Column('SNWD', Float(50)),
   Column('TMAX', Float(50)),
   Column('TMIN', Float(50)),
   Column('ACMH', Float(50)),
   Column('AWND', Float(50)),
   Column('EVAP', Float(50)),
   Column('FRTH', Float(50)),
   Column('PSUN', Float(50)),
   Column('THIC', Float(50)),
   Column('TOBS', Float(50)),
   Column('TSUN', Float(50)),
   Column('WDMV', Float(50)),
   Column('TAVG', Float(50)),
   Column('WT01', Boolean),
   Column('WT02', Boolean),
   Column('WT03', Boolean),
   Column('WT04', Boolean),
   Column('WT05', Boolean),
   Column('WT06', Boolean),
   Column('WT07', Boolean),
   Column('WT08', Boolean),
   Column('WT09', Boolean),
   Column('WT10', Boolean),
   Column('WT11', Boolean),
   Column('WT12', Boolean),
   Column('WT13', Boolean),
   Column('WT14', Boolean),
   Column('WT15', Boolean),
   Column('WT16', Boolean),
   Column('WT17', Boolean),
   Column('WT18', Boolean),
   Column('WT19', Boolean),
   Column('WT21', Boolean),
   Column('WT22', Boolean)
)
noaa_weather_data_table.create(dbi.engine)
# meta.create_all(dbi.engine)

print('noaa_weather_data table created')

noaa_ghcnd_file_path = './data/noaa_ghcn_aws_data/'
for file_name in ['2020', '2019', '2018', '2017', '2016', '2015', 
                  '2014', '2013', '2012', '2011', '2010', '2009', '2008']:

    custom_date_parser = lambda x: datetime.strptime(x, "%Y%m%d")

    with pd.read_csv(
        f"{noaa_ghcnd_file_path}{file_name}.csv", 
        header=None, 
        usecols=[0,1,2,3],
        names=['station_id', 'date', 'data_type_id', 'data_value'],#, 'measurement_id', 'quality_id', 'source_id', 'observation_time'],
        chunksize=100000, 
        parse_dates=['date'],
        date_parser=custom_date_parser,
        ) as reader:

       chunk_num = 1
       for noaa_ghcn_aws_data_2021 in reader: 

          # Extract country code from station_id
          noaa_ghcn_aws_data_2021['noaa_country_code'] = noaa_ghcn_aws_data_2021.station_id.str[:2]
          # Drop countries that are not in the fao dataset
          noaa_ghcn_aws_data_2021 = noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.noaa_country_code.isin(noaa_country_codes.noaa_country_code)]
          # Drop unnecessary data types
          noaa_ghcn_aws_data_2021 = noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.data_type_id.isin(data_ids_to_keep)]
          # Pivot the table.
          noaa_weather_data = noaa_ghcn_aws_data_2021.pivot(index=['noaa_country_code', 'station_id', 'date'], columns='data_type_id', values='data_value')
          # Fill in the weather type columns with zeroes because they are one-hot encoded/binary
          weather_type_cols = noaa_weather_data.columns[noaa_weather_data.columns.str.contains('WT')]
          noaa_weather_data[weather_type_cols] = noaa_weather_data[weather_type_cols].fillna(0)

          noaa_weather_data.reset_index(inplace=True)

          dbi.insert_data(noaa_weather_data, 'noaa_weather_data', if_exists='append')

          print(f'{file_name}: {chunk_num} chunks processed')
          chunk_num += 1

print("Closing connection")
dbi.close_connection()
print("Done")

# 347 chunks = 39min 26sec

### Load noaa countries from db

In [229]:
dbi = DatabaseInterface(db_name='crop_yield_prediction', 
                        user=local_mysql_user, 
                        password=local_mysql_password)

# noaa_countries_df = dbi.db_engine.execute(f"SELECT * FROM noaa_countries")
noaa_countries_df = pd.read_sql_table(
    'noaa_countries',
    con=dbi.engine
)

print("Closing connection")
dbi.close_connection()
print("Done")
noaa_countries_df

Closing connection
Done


Unnamed: 0,noaa_country_code,noaa_country_name,fao_country_code
0,AC,Antigua and Barbuda,8
1,AE,United Arab Emirates,225
2,AF,Afghanistan,2
3,AG,Algeria,4
4,AJ,Azerbaijan,52
5,AL,Albania,3
6,AM,Armenia,1
7,AO,Angola,7
8,AQ,American Samoa [United States],244
9,AR,Argentina,9


In [104]:
noaa_ghcn_aws_data_2021.head()

Unnamed: 0,station_id,date,data_type_id,data_value,noaa_country_code
0,AE000041196,2021-01-01,TMAX,278,AE
1,AE000041196,2021-01-01,PRCP,0,AE
2,AE000041196,2021-01-01,TAVG,214,AE
3,AEM00041194,2021-01-01,TMAX,266,AE
4,AEM00041194,2021-01-01,TMIN,178,AE


In [103]:
noaa_ghcn_aws_data_2021.isna().sum()

station_id           0
date                 0
data_type_id         0
data_value           0
noaa_country_code    0
dtype: int64

In [125]:
pd.DataFrame(data_types_to_keep)

Unnamed: 0,data_type_id,description,units
0,PRCP,Precipitation,tenths of mm
1,SNOW,Snowfall,mm
2,SNWD,Snow depth,mm
3,TMAX,Maximum temperature,tenths of degrees C
4,TMIN,Minimum temperature,tenths of degrees C
5,ACMH,Average cloudiness midnight to midnight from m...,percent
6,AWND,Average daily wind speed,tenths of meters per second
7,EVAP,Evaporation of water from evaporation pan,tenths of mm
8,FRTH,Thickness of frozen ground layer,cm
9,PSUN,Daily percent of possible sunshine,percent


In [133]:
mean_weather_per_country_2021.columns

Index(['noaa_country_code', 'date', 'AWND', 'EVAP', 'PRCP', 'PSUN', 'SNOW',
       'SNWD', 'TAVG', 'THIC', 'TMAX', 'TMIN', 'TOBS', 'TSUN', 'WDMV', 'WT01',
       'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08', 'WT09', 'WT11'],
      dtype='object', name='data_type_id')

In [None]:
# Replace NaN values in measurement_id, quality_id and source_id columns to match id descriptions
# noaa_ghcn_aws_data_2021.measurement_id.replace(np.nan, 'None', inplace=True)
# noaa_ghcn_aws_data_2021.quality_id.replace(np.nan, 'None', inplace=True)
# noaa_ghcn_aws_data_2021.source_id.replace(np.nan, 'None', inplace=True)
# noaa_ghcn_aws_data_2021.head()

In [45]:
noaa_ghcn_aws_data_2021.shape

(1000000, 4)

In [46]:
noaa_ghcn_aws_data_2021.element.unique()

array(['TMAX', 'PRCP', 'TAVG', 'TMIN', 'SNWD', 'AWND', 'WDF2', 'WSF2',
       'DATX', 'MDTX', 'DATN', 'MDTN', 'DAPR', 'MDPR', 'SNOW', 'WESF',
       'WESD', 'TOBS', 'WDF5', 'WSF5', 'WT01', 'EVAP', 'PGTM', 'SN32',
       'SX32', 'MNPN', 'MXPN', 'WT11', 'WDMV', 'SN31', 'SX31', 'WT03',
       'SN52', 'SX52', 'SN33', 'SN35', 'SX33', 'SX35', 'WDFG', 'WSFG',
       'WT06', 'WT04', 'THIC', 'SN51', 'SX51', 'SN53', 'SN55', 'SX53',
       'SX55', 'SN36', 'SN56', 'SX36', 'SX56', 'AWDR', 'WSFI', 'WT08',
       'WT02', 'WT09', 'WT05', 'PSUN', 'TSUN', 'DWPR'], dtype=object)

In [None]:
# Check for multiple measurements at different times
grouped = noaa_ghcn_aws_data_2021.groupby(['station_id', 'date', 'element']).count().sort_values('observation_time')
grouped

In [None]:
# We can drop the observation_time column because there is only one obervation per day
noaa_ghcn_aws_data_2021.drop('observation_time', axis=1, inplace=True)
noaa_ghcn_aws_data_2021.head()

Unnamed: 0,station_id,date,element,data_value,noaa_country_code
0,AE000041196,2021-01-01,TMAX,278,AE
1,AE000041196,2021-01-01,PRCP,0,AE
2,AE000041196,2021-01-01,TAVG,214,AE
3,AEM00041194,2021-01-01,TMAX,266,AE
4,AEM00041194,2021-01-01,TMIN,178,AE


## Check for country codes in the NOAA data against the FAO data

In [59]:
noaa_ghcn_aws_data_2021.shape

(1000000, 5)

(996942, 5)

In [72]:
df = noaa_ghcn_aws_data_2021.copy()
df = df[~df.noaa_country_code.isin(new_country_codes['FIPS10-4'])]
df.noaa_country_code.unique()

array(['DA', 'EU', 'JN', 'TE', 'VM', 'WQ'], dtype=object)

In [76]:
new_country_codes[new_country_codes.country_name=='Denmark']

Unnamed: 0,country_name,FIPS10-4,ISO 3166-1
60,Denmark,DK,DK


In [53]:
# Check for multiple temperature measurements on different soil types and depths
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element.str.contains('SX')]

Unnamed: 0,station_id,date,element,data_value,noaa_country_code
44545,USC00010655,2021-01-01,SX32,111,US
44615,USC00012813,2021-01-01,SX32,172,US
44640,USC00013251,2021-01-01,SX32,206,US
45556,USC00033428,2021-01-01,SX32,78,US
45590,USC00033821,2021-01-01,SX31,72,US
...,...,...,...,...,...
994576,USC00509793,2021-01-11,SX52,-28,US
994593,USC00509891,2021-01-11,SX52,-17,US
994594,USC00509891,2021-01-11,SX53,-11,US
994595,USC00509891,2021-01-11,SX55,-6,US


In [55]:
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element.str.contains('SX')].groupby(['station_id', 'date']).count().sort_values('element')

Unnamed: 0_level_0,Unnamed: 1_level_0,element,data_value,noaa_country_code
station_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USC00010655,2021-01-01,1,1,1
USC00346382,2021-01-04,1,1,1
USC00346382,2021-01-03,1,1,1
USC00346382,2021-01-02,1,1,1
USC00346382,2021-01-01,1,1,1
...,...,...,...,...
USC00218450,2021-01-11,8,8,8
USC00218450,2021-01-02,8,8,8
USC00218450,2021-01-01,8,8,8
USC00218450,2021-01-03,8,8,8


In [57]:
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element.str.contains('SX')].groupby(['station_id', 'date']).mean('data_value').loc['USC00218450']

Unnamed: 0_level_0,data_value
date,Unnamed: 1_level_1
2021-01-01,11.75
2021-01-02,11.75
2021-01-03,11.75
2021-01-04,11.75
2021-01-05,11.75
2021-01-06,11.75
2021-01-07,11.75
2021-01-08,11.75
2021-01-09,10.5
2021-01-10,10.5


In [None]:
grouped[grouped.element > 1].sort_values('element') # We need to create all possible soil temp element_ids in case they show up in other years

In [None]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id=='USC00218450')&(noaa_ghcn_aws_data_2021.date=='2021-06-27')][['element', 'data_value']]

In [None]:
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element=='FMTM'].head(100)

In [None]:
# Fetch all available data categories
endpoint='datatypes'
querystring = {'limit': 1000, 'datacategoryid': 'TMAX'}
url = f"{base_url}{endpoint}"

datatype = requests.request("GET", url, headers=headers, params=querystring)
pp(datatype.json())

In [None]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id.str.contains('AS')) & (noaa_ghcn_aws_data_2021.date=='2021-01-11')].head(500)

In [None]:
noaa_ghcn_aws_data_2021[noaa_ghcn_aws_data_2021.element.str.contains('MD')]

In [None]:
set([item['element_id'] for item in element_ids]) - set(noaa_ghcn_aws_data_2021.element.unique().tolist())

In [None]:
len(noaa_ghcn_aws_data_2021.element.unique().tolist())

In [None]:
noaa_ghcn_aws_data_2021[(noaa_ghcn_aws_data_2021.station_id=='USC00218450')&(noaa_ghcn_aws_data_2021.date=='2021-06-27')][['element', 'data_value']]