In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, ForeignKey, Date, Text, Boolean
from pprint import pprint as pp
from datetime import datetime

from config import local_mysql_password, local_mysql_user, FIPS_10_country_codes, uwm_location_to_fips_country_map, uwm_unnecessary_cols

# Raise the display limits so frames of up to 1500 rows / 200 columns are not truncated.
pd.options.display.max_rows = 1500
pd.options.display.max_columns = 200

## Load the data

In [2]:
# Folders holding the raw input files (relative to the notebook).
fao_file_path = "./data/fao_data/"
crop_calendar_file_path = "./data/"

# Both FAO CSVs live in the same sub-folder of the FAO data directory.
_fao_normalized_dir = fao_file_path + "fao_crop_data/normalized/"

# FAO crop yield data
fao_crop_yields_file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized)'
initial_fao_crop_yields = pd.read_csv(
    _fao_normalized_dir + fao_crop_yields_file_name + ".csv",
    encoding='latin-1',
)

# FAO crop id (item code) lookup
fao_product_ids_file_name = 'Production_Crops_Livestock_E_ItemCodes'
initial_fao_crop_ids = pd.read_csv(
    _fao_normalized_dir + fao_product_ids_file_name + ".csv",
    encoding='latin-1',
)

# University of Wisconsin-Madison crop calendar
file_name = 'crop_calendar_uni_wisconsin_madison'
unprocessed_uwm_crop_calendar = pd.read_csv(crop_calendar_file_path + file_name + ".csv")

## Data preprocessing pipelines

In [4]:
def start_pipeline(df: pd.DataFrame):
    '''Return a copy of ``df`` so later pipeline steps never touch the original frame.'''
    working_copy = df.copy()
    return working_copy

def clean_col_names(df: pd.DataFrame):
    '''Normalise column labels: spaces and dots become underscores, then lower-case.

    The labels are rewritten on the passed frame, which is also returned so the
    function can be used as a ``.pipe`` step.
    '''
    # regex=False makes the literal match explicit: '.' must never be treated as
    # the regex wildcard, whatever the pandas version's default for `regex` is.
    df.columns = (
        df.columns
        .str.replace(' ', '_', regex=False)
        .str.replace('.', '_', regex=False)
        .str.lower()
    )
    return df

def rename_columns(df: pd.DataFrame, column_name_map: dict):
    '''Return a new frame whose columns are renamed according to ``column_name_map`` (old -> new).'''
    renamed = df.rename(columns=column_name_map)
    return renamed

def remove_unnecessary_columns(df: pd.DataFrame, unnecessary_columns: list):
    '''Return a new frame without the columns listed in ``unnecessary_columns``.'''
    trimmed = df.drop(columns=unnecessary_columns)
    return trimmed

# FAO "area" values that are aggregates rather than individual countries;
# drop_fao_non_country_regions filters rows whose area is in either list.
regions = [
    'World',
    'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa',
    'Americas', 'Northern America', 'Central America', 'Caribbean', 'South America',
    'Asia', 'Central Asia', 'Eastern Asia', 'Southern Asia', 'South-eastern Asia', 'Western Asia',
    'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe', 'Western Europe',
    'Oceania', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia',
]
special_groups = [
    'European Union (28)', 'European Union (27)',
    'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States',
    'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries',
    'Annex I countries', 'Non-Annex I countries',
    'OECD',
]

def combine_fips_noaa_fao_country_codes(df, fao_crop_yields):
    '''Attach the FAO country code and first/last FAO year to the country-code table.

    ``df`` must contain a ``fao_country_name`` column. Countries without a match
    in ``fao_crop_yields`` are dropped (inner merge); the name column used for
    the join is removed from the result.
    '''
    # One row per FAO country with the earliest and latest year present in the yields.
    fao_year_span = (
        fao_crop_yields
        .groupby(['fao_country_name', 'fao_country_code'])['year']
        .agg(['min', 'max'])
        .reset_index()
    )
    return (
        df
        .merge(fao_year_span, how='inner', left_on='fao_country_name', right_on='fao_country_name')
        .drop(columns=['fao_country_name'])
        .rename(columns={'min': 'fao_year_min', 'max': 'fao_year_max'})
    )

def rename_fao_data_quality_flags(df: pd.DataFrame):
    '''Rename confusing data aggregation flags.

    Missing flags become 'O' and the '*' flag becomes 'U'. The ``flag`` column
    is overwritten on the passed frame, which is also returned.
    '''
    # Assign the column explicitly: an in-place replace on `df.flag` acts on a
    # temporary Series and does not update the frame under pandas Copy-on-Write.
    df['flag'] = df['flag'].fillna('O').replace('*', 'U')
    return df

def remove_fao_zero_crop_yields(df):
    '''Keep only the rows whose ``value`` is not exactly zero (zero yields are bad training data).'''
    has_nonzero_value = df['value'].ne(0.0)
    return df.loc[has_nonzero_value]

def drop_fao_non_country_regions(df):
    '''Remove rows whose ``area`` is listed in the module-level ``regions`` or ``special_groups``.'''
    non_country_areas = regions + special_groups
    is_country = ~df['area'].isin(non_country_areas)
    return df.loc[is_country]

def remove_fao_non_crop_items(df):
    '''Drop the redundant production elements and every item that has an animal-type element.

    Step 1 removes the 'Area harvested' and 'Production' rows (Yield is
    Production / Area harvested, so only Yield is kept). Step 2 removes all rows
    of any item code that still has one of the animal production types, which
    the author treats as not weather dependent.
    '''
    redundant_elements = ['Area harvested', 'Production']
    remaining = df.loc[~df['element'].isin(redundant_elements)]

    animal_production_types = [
        'Stocks', 'Laying', 'Producing Animals/Slaughtered',
        'Yield/Carcass Weight', 'Milk Animals', 'Prod Popultn',
    ]
    has_animal_type = remaining['element'].isin(animal_production_types)
    animal_item_codes = remaining.loc[has_animal_type, 'item_code'].unique()

    return remaining.loc[~remaining['item_code'].isin(animal_item_codes)]

def combine_similar_fao_crops(crop_yields, crop_ids):
    '''Sum the yields of crops that share a (cleaned) crop name and give each name one id.

    Parameters
    ----------
    crop_yields : DataFrame with at least ``fao_country_code``, ``year``,
        ``crop_id`` and ``value`` columns.
    crop_ids : DataFrame with ``crop_id`` and ``crop`` columns; several ids may
        map to the same crop name.

    Returns
    -------
    DataFrame with one row per (fao_country_code, year, crop): the summed
    ``value`` plus the columns of ``crop_ids`` for the first id carrying that name.
    '''
    # Ignore crop ids that never occur in the yield data.
    crop_ids = crop_ids[crop_ids['crop_id'].isin(crop_yields['crop_id'])]

    # Attach the crop names, then add up the yields of rows sharing a name.
    # Only `value` is summed: summing every column would also add up the ids and,
    # depending on the pandas version, concatenate the text columns.
    # NOTE(review): yields are summed, not averaged, across merged crops — confirm
    # this is the intended aggregation.
    summed_yields = (
        crop_yields
        .merge(crop_ids, on='crop_id')
        .groupby(['fao_country_code', 'year', 'crop'], as_index=False)['value']
        .sum()
    )

    # Keep the first id seen for each crop name and attach it to the summed rows.
    first_id_per_crop = crop_ids[~crop_ids['crop'].duplicated()]
    return summed_yields.merge(first_id_per_crop, on='crop')

def clean_fao_crop_names(df):
    '''Shorten crop names: keep the text before the first ';' and before the first ' nes'.

    The ``crop`` column is overwritten on the passed frame, which is also returned.
    '''
    before_semicolon = df['crop'].str.split(';').str[0]
    df['crop'] = before_semicolon.str.split(' nes').str[0]
    return df

def refactor_fao_crop_names(df):
    '''Group the individual pulse crops under the single crop name 'Pulses'.

    The ``crop`` column is overwritten on the passed frame, which is also returned.
    '''
    pulse_crops = ['Beans', 'Peas', 'Chick peas', 'Cow peas', 'Lentils']
    # Assign the column explicitly: an in-place replace on `df.crop` acts on a
    # temporary Series and does not update the frame under pandas Copy-on-Write.
    df['crop'] = df['crop'].replace(to_replace=pulse_crops, value='Pulses')
    return df
    

def drop_duplicated_fao_crop_names(df):
    '''Keep only the first row for every distinct ``crop`` name.'''
    return df.drop_duplicates(subset='crop', keep='first')

def drop_fao_crop_ids_not_in_crop_yields(df, crop_yields):
    """Remove the crop ids that do not occur in the given crop yield data.

    Parameters
    ----------
    df : DataFrame of crop ids with a ``crop_id`` column.
    crop_yields : DataFrame of crop yields with a ``crop_id`` column.
    """
    # Filter against the `crop_yields` argument, not a notebook-level global, so
    # the result depends only on what the caller passes in.
    return df[df['crop_id'].isin(crop_yields['crop_id'])]

def extract_fao_crop_yield_units(crop_yields):
    '''Return the distinct (crop_id, unit) pairs found in the crop yield data.'''
    id_and_unit = crop_yields.loc[:, ['crop_id', 'unit']]
    return id_and_unit.drop_duplicates()

def add_yield_units_to_fao_crop_ids(crop_ids, crop_yield_units):
    '''Join the yield units onto the crop ids by ``crop_id`` (ids without a unit are dropped).'''
    return crop_ids.merge(crop_yield_units, how='inner', on='crop_id')

def rename_uvm_crops_to_match_fao_yields(df: pd.DataFrame):
    '''Rename the crop calendar's crop names to the names used in the FAO yield data.

    Dots in the names are first replaced with spaces, then each known name is
    mapped to its FAO equivalent; names without a mapping are left unchanged.
    The ``crop`` column is overwritten on the passed frame, which is also returned.
    '''
    # One-to-one renames (calendar name -> FAO name).
    crop_name_map = {
        'Sweet Potatoes': 'Sweet potatoes',
        'Sugarbeets': 'Sugar beet',
        'Sugarcane': 'Sugar cane',
        'Sunflower': 'Sunflower seed',
        'Tomato': 'Tomatoes',
        'Safflower': 'Safflower seed',
        'Spring vegetables (field planting)': 'Vegetables',
    }
    # Many-to-one renames (FAO name -> every calendar variant that maps to it).
    grouped_crop_names = {
        'Grain': ['Spring Coarse Grains', 'Spring Grains', 'Winter Grains'],
        'Cereals': ['All Cereals (Belg)', 'Spring Cereals', 'Spring cereals', 'Winter Cereals', 'Winter cereals'],
        'Castor oil seed': ['Castor (Kharif)', 'Castor beans', 'Castorseed'],
        'Flax fibre and tow': ['Flax', 'Flaxseed', 'Flazseed'],
        'Mustard seed': ['Mustard', 'Mustard (Rabi)'],
        'Sesame seed': ['Sesame', 'Sesame (Deyr)', 'Sesame (Gu)'],
        'Tobacco': ['Tobacco (Dyer)', 'Tobacco (Gu)', 'Tobacco (long rains)', 'Tobacco (short rains)', 'Tobacco (southern highlands)'],
    }
    for fao_name, calendar_names in grouped_crop_names.items():
        crop_name_map.update(dict.fromkeys(calendar_names, fao_name))

    # regex=False: the '.' is a literal dot, not the regex wildcard.
    # No target name is itself a key of the map, so a single replace pass gives
    # the same result as applying the renames one after another.
    # The column is assigned explicitly because in-place replaces on `df.crop`
    # do not update the frame under pandas Copy-on-Write.
    df['crop'] = df['crop'].str.replace('.', ' ', regex=False).replace(crop_name_map)
    return df

def refactor_uvm_winter_vegetables_crop(df):
    '''Replace each 'Winter vegetables' row with one row per concrete vegetable crop.

    Every 'Winter vegetables' row is duplicated once for each replacement crop
    (any field equal to 'Winter vegetables' is set to that crop); the copies are
    appended after the existing rows and the original 'Winter vegetables' rows
    are removed. When copies are appended the index is renumbered before the
    removal, so the returned index can contain gaps.
    '''
    replacement_crops = [
        'Cabbages and other brassicas', 'Cauliflowers and broccoli', 'Spinach',
        'Onions', 'Garlic', 'Vegetables', 'Vegetables Primary',
    ]
    winter_rows = df[df['crop'] == 'Winter vegetables']

    if not winter_rows.empty:
        # Build all copies in one concat rather than appending row by row
        # (DataFrame.append no longer exists in pandas 2 and was quadratic).
        # The stable sort on the source index groups the copies by original row
        # while keeping the replacement-crop order within each group.
        expanded = pd.concat(
            [winter_rows.replace('Winter vegetables', veg) for veg in replacement_crops]
        ).sort_index(kind='mergesort')
        df = pd.concat([df, expanded], ignore_index=True)

    return df[df['crop'] != 'Winter vegetables']

def replace_uvm_crop_name_with_crop_name_in_original_data(df, crop_name):
    '''For rows whose ``crop`` equals ``crop_name``, use ``crop_name_in_original_data`` as the crop.

    The frame is modified in place and also returned.
    '''
    has_placeholder_name = df['crop'] == crop_name
    original_names = df.loc[has_placeholder_name, 'crop_name_in_original_data']
    df.loc[has_placeholder_name, 'crop'] = original_names
    return df

def remove_unnecessary_uvm_crops(df):
    '''Drop the crop calendar rows for crops that are not needed (currently only 'Cotton').'''
    excluded_crops = ['Cotton']
    is_excluded = df['crop'].isin(excluded_crops)
    return df.loc[~is_excluded]

def assign_FIPS_10_country_codes_to_uvm_crop_calendar(crop_calendar, uwm_to_fips_country_map, country_codes_df):
    '''Add a ``FIPS_10_country_code`` column to the crop calendar based on its ``location``.

    Parameters
    ----------
    crop_calendar : DataFrame with a ``location`` column; the new column is
        added to this frame, which is also returned.
    uwm_to_fips_country_map : dict mapping crop calendar locations to FIPS-10 codes.
    country_codes_df : DataFrame with ``country_name`` and
        ``FIPS_10_country_code`` columns; its entries take precedence over
        ``uwm_to_fips_country_map`` for names present in both.

    Locations found in neither source get a missing value.
    '''
    # Work on a copy so the caller's mapping is not changed as a side effect.
    location_to_fips = dict(uwm_to_fips_country_map)
    location_to_fips.update(
        zip(country_codes_df['country_name'], country_codes_df['FIPS_10_country_code'])
    )
    crop_calendar['FIPS_10_country_code'] = crop_calendar['location'].map(location_to_fips)
    return crop_calendar

# Old -> new column names for the FAO crop yield table.
fao_crop_yield_col_name_map = dict(
    area_code='fao_country_code',
    area='fao_country_name',
    flag='fao_data_quality_id',
    item_code='crop_id',
)

# Old -> new column names for the FAO crop id table.
fao_crop_ids_col_name_map = dict(
    item='crop',
    item_code='crop_id',
)

# Data preprocessing pipelines (must be in the correct order)
# Every pipeline starts with start_pipeline so the frames loaded from disk stay untouched.

# Crop id lookup: normalise headers, rename item/item_code to crop/crop_id,
# drop cpc_code, then shorten the crop names and group the pulse crops.
fao_crop_ids = (initial_fao_crop_ids
                .pipe(start_pipeline)
                .pipe(clean_col_names)
                .pipe(rename_columns, fao_crop_ids_col_name_map)
                .pipe(remove_unnecessary_columns, ['cpc_code'])
                .pipe(clean_fao_crop_names)
                .pipe(refactor_fao_crop_names)
               )
# Crop yields: the row filters run on the cleaned original column names
# (flag, value, element, item_code, area); the columns are renamed only at the end.
fao_crop_yields = (initial_fao_crop_yields
                   .pipe(start_pipeline)
                   .pipe(clean_col_names)
                   .pipe(rename_fao_data_quality_flags)
                   .pipe(remove_fao_zero_crop_yields)
                   .pipe(remove_fao_non_crop_items)
                   .pipe(drop_fao_non_country_regions)
                   .pipe(remove_unnecessary_columns, ['year_code', 'element', 'element_code', 'item'])
                   .pipe(rename_columns, fao_crop_yield_col_name_map)
                  )
# Extract the (crop_id, unit) pairs now: in the recorded output fao_crop_yields no
# longer has a 'unit' column once combine_similar_fao_crops has run.
fao_crop_yield_units = (fao_crop_yields
                        .pipe(extract_fao_crop_yield_units)
                       )

# Initial country codes are taken from a dictionary mapping standard 
# FIPS-10 country codes to the NOAA country codes and FAO country names
# This step reads fao_country_name / fao_country_code / year from fao_crop_yields,
# so it has to run before the yields are aggregated below.
country_codes = (pd.DataFrame(FIPS_10_country_codes)
                .pipe(combine_fips_noaa_fao_country_codes, fao_crop_yields)
                )
# Sum the yields of crops sharing a name; fao_crop_yields is rebound to the aggregated frame.
fao_crop_yields = (fao_crop_yields
                   .pipe(combine_similar_fao_crops, fao_crop_ids)
                    )

# Final crop id table: one row per crop name, with the yield unit attached.
fao_crop_ids = (fao_crop_ids
                .pipe(drop_duplicated_fao_crop_names)
                .pipe(drop_fao_crop_ids_not_in_crop_yields, fao_crop_yields)
                .pipe(add_yield_units_to_fao_crop_ids, fao_crop_yield_units)
               )





# Crop calendar: 'Other' / 'Multiple' crops take their name from
# crop_name_in_original_data, so that column is dropped only as the last step.
uwm_crop_calendar = (unprocessed_uwm_crop_calendar
                     .pipe(start_pipeline)
                     .pipe(clean_col_names)
                     .pipe(remove_unnecessary_columns, uwm_unnecessary_cols)
                     .pipe(replace_uvm_crop_name_with_crop_name_in_original_data, 'Other')
                     .pipe(replace_uvm_crop_name_with_crop_name_in_original_data, 'Multiple')
                     .pipe(rename_uvm_crops_to_match_fao_yields)
                     .pipe(remove_unnecessary_uvm_crops)
                     .pipe(refactor_uvm_winter_vegetables_crop)
                     .pipe(assign_FIPS_10_country_codes_to_uvm_crop_calendar, uwm_location_to_fips_country_map, country_codes)
                     .pipe(remove_unnecessary_columns, ['crop_name_in_original_data'])
                    )

# Quick sanity check of the resulting table sizes (rows, columns).
print('fao_crop_yields: ', fao_crop_yields.shape)
print('fao_crop_ids: ', fao_crop_ids.shape)
print('country_codes: ', country_codes.shape)
print('uwm_crop_calendar: ', uwm_crop_calendar.shape)

  df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()


fao_crop_yields:  (468649, 5)
fao_crop_ids:  (151, 3)
country_codes:  (202, 6)
uwm_crop_calendar:  (1451, 11)


  df.crop = df.crop.str.replace('.', ' ')


In [5]:
country_codes.head()

Unnamed: 0,country_name,FIPS_10_country_code,noaa_country_code,fao_country_code,fao_year_min,fao_year_max
0,Afghanistan,AF,AF,2,1961,2020
1,Albania,AL,AL,3,1961,2020
2,Algeria,AG,AG,4,1961,2020
3,Angola,AO,AO,7,1961,2020
4,Antigua and Barbuda,AC,AC,8,1961,2020


In [6]:
uwm_crop_calendar.head()

Unnamed: 0,location,nation_code,state_code,county_code,crop,plant_start_date,harvest_end_date,location_code,lat_avg,lon_avg,FIPS_10_country_code
0,Albania,55,,,Wheat,10/16,6/30,55,40.936144,19.920749,AL
1,Albania,55,,,Maize,4/16,10/31,55,41.149095,19.968137,AL
2,Albania,55,,,Pulses,4/16,8/31,55,40.981883,19.946543,AL
4,Albania,55,,,Vegetables,4/16,10/15,55,,,AL
5,Albania,55,,,Spring vegetables (greenhouse planting),2/15,10/31,55,,,AL


In [7]:
fao_crop_yields.head()

Unnamed: 0,fao_country_code,year,crop,value,crop_id
0,1,1992,Apples,68800.0,515
1,1,1993,Apples,26354.0,515
2,1,1994,Apples,61589.0,515
3,1,1995,Apples,78646.0,515
4,1,1996,Apples,91364.0,515


In [8]:
fao_crop_ids.head()

Unnamed: 0,crop_id,crop,unit
0,101,Canary seed,hg/ha
1,103,Grain,hg/ha
2,108,Cereals,hg/ha
3,116,Potatoes,hg/ha
4,122,Sweet potatoes,hg/ha


In [12]:
''' REMOVE ALL CROPS THAT ARE NOT FAO AND ALL COUNTRIES'''
# Prints every distinct crop calendar crop name (sorted) that is not contained in
# fao_to_uvm_crop_col_matches. Nothing is removed here; the names are only listed.
# NOTE(review): fao_to_uvm_crop_col_matches is not defined in any cell visible
# above (execution counts jump from In [8] to In [12]) — confirm it is still
# created somewhere, otherwise this cell fails on Restart & Run All.
for crop in uwm_crop_calendar.crop.sort_values().unique():
    if crop not in fao_to_uvm_crop_col_matches:
        print(crop)

Birdseed
Crops in low lying areas
Forage
Nigerseed
Poppyseed
Spring vegetables (greenhouse planting)
Tapioca
Teff (Meher)
Walo crops
