In [4]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, ForeignKey, Date, Text, Boolean
from pprint import pprint as pp
from datetime import datetime

from config import local_mysql_password, local_mysql_user 
from helpers import FIPS_10_country_codes, uwm_location_to_fips_country_map, uwm_unnecessary_cols

pd.set_option('display.max_rows', 1500)
pd.set_option('display.max_columns', 200)

## Load the data

In [5]:
fao_file_path = "./data/fao_data/"
crop_calendar_file_path = "./data/"

# Load the fao crop yield data
fao_crop_yields_file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized)'
initial_fao_crop_yields = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{fao_crop_yields_file_name}.csv", encoding='latin-1')

# Load the fao crop id data
fao_product_ids_file_name = 'Production_Crops_Livestock_E_ItemCodes'
initial_fao_crop_ids = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{fao_product_ids_file_name}.csv", encoding='latin-1')

# Load the crop calendar
file_name = 'crop_calendar_uni_wisconsin_madison'
unprocessed_uwm_crop_calendar = pd.read_csv(f"{crop_calendar_file_path}{file_name}.csv")

## Data preprocessing pipelines

In [6]:
def start_pipeline(df: pd.DataFrame):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()

def clean_col_names(df: pd.DataFrame):
    ''' Replace spaces column names with underscores and make lower case '''
    df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()
    return df

def rename_columns(df: pd.DataFrame, column_name_map: dict):
    return df.rename(column_name_map, axis=1)

def remove_unnecessary_columns(df: pd.DataFrame, unnecessary_columns: list):
    return df.drop(unnecessary_columns, axis=1)

regions = [
    'World', 'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Americas',
    'Northern America', 'Central America', 'Caribbean', 'South America', 'Asia', 'Central Asia', 'Eastern Asia',
    'Southern Asia', 'South-eastern Asia', 'Western Asia', 'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe', 'Oceania', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia'
    ]
special_groups = [
    'European Union (28)', 'European Union (27)', 'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States',
    'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries', 'Annex I countries', 'Non-Annex I countries',  'OECD'
    ]

def combine_fips_noaa_fao_country_codes(df, fao_crop_yields):
    '''
    Drop any countries not in the FAO crop yield data and map the country codes from the different sources.
    Initial country codes are taken from a dictionary mapping standard FIPS-10 country codes to the NOAA country codes and FAO country names
    '''
    # Extract the fao countries from the crop yield data
    fao_country_data = fao_crop_yields.groupby(['fao_country_name', 'fao_country_code']).year.agg(['min', 'max'])
    fao_country_data.reset_index(inplace=True)
    # Merge with the FIPS country codes
    df = df.merge(fao_country_data, how='inner', left_on='fao_country_name', right_on='fao_country_name')
    df.drop(['fao_country_name'], axis=1, inplace=True)
    df.rename(columns={'min': 'fao_year_min', 'max': 'fao_year_max'}, inplace=True)
    return df

def rename_fao_data_quality_flags(df: pd.DataFrame):
    '''Rename confusing data aggregation flags'''
    df.flag.replace(np.nan, 'O', inplace=True)
    df.flag.replace('*', 'U', inplace=True)
    return df

def remove_fao_zero_crop_yields(df):
    '''Remove 0 crop yield values to prevent bad training data'''
    return df[df.value != 0.0]

def drop_fao_non_country_regions(df):
    '''Drop areas which are not countries'''
    return df[~df.area.isin(regions+special_groups)]

def remove_fao_non_crop_items(df):
    '''Remove rows relating to unnecessary production_types.'''
    # Yield = Production/Area_harvested so we can drop those two rows and only keep Yield.
    df = df[~df.element.isin(['Area harvested', 'Production'])]
    # Items with other production types are animals, which do not depend on weather.
    production_types_to_drop = ['Stocks', 'Laying', 'Producing Animals/Slaughtered', 'Yield/Carcass Weight', 'Milk Animals', 'Prod Popultn']
    items_to_drop = df[df.element.isin(production_types_to_drop)].item_code.unique()
    return df[~df.item_code.isin(items_to_drop)]

def combine_similar_fao_crops(crop_yields, crop_ids):       
    # Remove crop_ids that don't exist in the crop_yield data
    crop_ids = crop_ids[crop_ids.crop_id.isin(crop_yields.crop_id)]

    # Add the crop names to combine yield values for duplicate crop names
    # Combine the yield values by name (The crop ids are now invalid because they are summed too.)
    summed_yields = crop_yields.merge(crop_ids, on='crop_id').groupby(['fao_country_code', 'year', 'crop']).sum()
    summed_yields.reset_index(inplace=True)
    
    # Drop the duplicated names from the crop_ids df
    crop_ids = crop_ids[~crop_ids.crop.duplicated()]

    # Recombine the dataframes by crop name to replace the ids in the crop_yield df with those from crop_ids
    summed_yields.drop('crop_id', axis=1, inplace=True)
    summed_yields = summed_yields.merge(crop_ids, on='crop')
    
    return summed_yields

def clean_fao_crop_names(df):
    df.crop = df.crop.str.split(';').str[0]
    df.crop = df.crop.str.split(' nes').str[0]
    return df

def refactor_fao_crop_names(df):
    df.crop.replace(to_replace=['Beans', 'Peas', 'Chick peas', 'Cow peas', 'Lentils'], 
                    value='Pulses', inplace=True)
    return df

def drop_duplicated_fao_crop_names(df):
    return df[~df.crop.duplicated()]

def drop_fao_crop_ids_not_in_crop_yields(df, crop_yields):
    """Remove crop_id_ids that don't exist in the fao_crop_yield data"""
    return df[df.crop_id.isin(fao_crop_yields.crop_id)]

def extract_fao_crop_yield_units(crop_yields):
    crop_yield_units = crop_yields[['crop_id', 'unit']].copy()
    return crop_yield_units.drop_duplicates()

def add_yield_units_to_fao_crop_ids(crop_ids, crop_yield_units):
    crop_ids = crop_ids.merge(crop_yield_units, on='crop_id')
    return crop_ids

def rename_uvm_crops_to_match_fao_yields(df: pd.DataFrame):
    df.crop = df.crop.str.replace('.', ' ')
    df.crop.replace('Sweet Potatoes', 'Sweet potatoes', inplace=True)
    df.crop.replace('Sugarbeets', 'Sugar beet', inplace=True)
    df.crop.replace('Sugarcane', 'Sugar cane', inplace=True)
    df.crop.replace('Sunflower', 'Sunflower seed', inplace=True)
    df.crop.replace('Tomato', 'Tomatoes', inplace=True)
    df.crop.replace('Safflower', 'Safflower seed', inplace=True)
    df.crop.replace('Spring vegetables (field planting)', 'Vegetables', inplace=True)
    df.crop.replace(to_replace=['Spring Coarse Grains', 'Spring Grains', 'Winter Grains'],
                        value='Grain', inplace=True)
    df.crop.replace(to_replace=['All Cereals (Belg)', 'Spring Cereals', 'Spring cereals', 'Winter Cereals', 'Winter cereals'],
                        value='Cereals', inplace=True)
    df.crop.replace(to_replace=['Castor (Kharif)', 'Castor beans', 'Castorseed'],
                    value='Castor oil seed', inplace=True)
    df.crop.replace(to_replace=['Flax', 'Flaxseed', 'Flazseed'],
                    value='Flax fibre and tow', inplace=True)
    df.crop.replace(to_replace=['Mustard', 'Mustard (Rabi)'],
                    value='Mustard seed', inplace=True)
    df.crop.replace(to_replace=['Sesame', 'Sesame (Deyr)', 'Sesame (Gu)'],
                    value='Sesame seed', inplace=True)
    df.crop.replace(to_replace=['Tobacco (Dyer)', 'Tobacco (Gu)', 'Tobacco (long rains)', 'Tobacco (short rains)', 'Tobacco (southern highlands)'],
                    value='Tobacco', inplace=True)
    return df

def refactor_uvm_winter_vegetables_crop(df):
    for idx, row in df[df.crop=='Winter vegetables'].iterrows():
        for veg in ['Cabbages and other brassicas', 'Cauliflowers and broccoli', 'Spinach', 'Onions', 'Garlic','Vegetables', 'Vegetables Primary']:
            new_row = row.replace('Winter vegetables', veg)
            df = df.append(new_row, ignore_index=True)
    df = df[df.crop!='Winter vegetables']
    return df

def replace_uvm_crop_name_with_crop_name_in_original_data(df, crop_name):
    df.loc[df.crop==crop_name, 'crop'] = df.loc[df.crop==crop_name, 'crop_name_in_original_data']
    return df

def remove_unnecessary_uvm_crops(df):
    df = df[~df.crop.isin(['Cotton'])]
    return df

def assign_FIPS_10_country_codes_to_uvm_crop_calendar(crop_calendar, uwm_to_fips_country_map, country_codes_df):
    '''Add the new UWM regions to the existing NOAA/FAO FIPS data and add to the uwm data'''
    uwm_to_fips_country_map.update(dict(zip(country_codes_df.country_name, country_codes_df.FIPS_10_country_code)))
    crop_calendar['FIPS_10_country_code'] = crop_calendar.location.map(uwm_to_fips_country_map)
    crop_calendar.drop('nation_code', axis=1, inplace=True)
    return crop_calendar

def add_crop_ids_to_crop_calendar(crop_calendar, crop_ids):
    crop_calendar = crop_calendar.merge(crop_ids, on='crop')
    crop_calendar.drop(['unit', 'crop'], axis=1, inplace=True)
    return crop_calendar

def drop_non_fao_crops(crop_calendar, crop_ids):
    return crop_calendar[crop_calendar.crop.isin(crop_ids.crop)]

def drop_nan_dates(crop_calendar):
    crop_calendar = crop_calendar[~crop_calendar.harvest_end_date.isna()]
    crop_calendar = crop_calendar[~crop_calendar.plant_start_date.isna()]
    return crop_calendar
    
def _get_growing_months(row):
    if row.plant_start_month < row.harvest_end_month:
        return [i for i in range(row.plant_start_month, row.harvest_end_month+1)]
    else:
        growing_months = [i for i in range(row.plant_start_month, 13)]
        growing_months.extend([j for j in range(1, row.harvest_end_month+1)])
        return growing_months
    
def reformat_uvm_dates(crop_calendar):
    return (crop_calendar
            .assign(harvest_end_month = crop_calendar.harvest_end_date.str.split('/')
                    .apply(lambda x: x[0]).astype(int))
            .assign(plant_start_month = crop_calendar.plant_start_date.str.split('/')
                    .apply(lambda x: x[0]).astype(int))
            .assign(growing_months = lambda x: x.apply(_get_growing_months, axis=1))
            #.assign(growing_months = (_get_growing_months(plant_start_month, harvest_end_month)))
            .drop(['harvest_end_date', 'plant_start_date', 'harvest_end_month', 'plant_start_month'], axis=1)
           )
    
def replace_fao_country_codes_with_FIPS(crop_yields, country_codes_df):
    return crop_yields.merge(country_codes_df[['fao_country_code', 'FIPS_10_country_code']], on='fao_country_code').drop('fao_country_code', axis=1)
    
def join_crop_yields_with_crop_calendar(crop_calendar, crop_yields):
    return pd.merge(crop_yields, crop_calendar,  how='inner', on=['FIPS_10_country_code','crop_id'])

def combine_cooardinates(crop_calendar):
    return (crop_calendar
            .merge(crop_calendar
                   .groupby('location')
                   .agg(lat_max=('lat_avg', 'max'), 
                        lat_min=('lat_avg', 'min'), 
                        lon_max=('lon_avg', 'max'), 
                        lon_min=('lat_avg', 'min')),
                   on='location'
                  )
           .drop(['lat_avg', 'lon_avg'], axis=1)
           )

def combine_growing_months(crop_calendar):
    combined_growing_months = (crop_calendar
                               .groupby(['location', 'year', 'crop'])
                               .growing_months.sum()
                               .apply(lambda x: list(set(x)))
                               .reset_index()
                              )
    crop_calendar.drop('growing_months', axis=1, inplace=True)
    return crop_calendar.merge(combined_growing_months,
                               on=['location', 'year', 'crop'])

def drop_NaNs(df):
    df = df.dropna()
    return df

def drop_duplicate_rows_in_crop_calendar(crop_calendar):
    return crop_calendar.drop_duplicates(
        subset=['year', 'crop', 'crop_yield', 'crop_id', 'FIPS_10_country_code', 
                'location'])

fao_crop_yield_col_name_map = {
    'area_code': 'fao_country_code', 
    'area': 'fao_country_name', 
    'flag': 'fao_data_quality_id', 
    'item_code': 'crop_id',
    'value': 'crop_yield',
}

fao_crop_ids_col_name_map = {
    'item': 'crop', 
    'item_code': 'crop_id'
}

# Data preprocessing pipelines (must be in the correct order)
fao_crop_ids = (initial_fao_crop_ids
                .pipe(start_pipeline)
                .pipe(clean_col_names)
                .pipe(rename_columns, fao_crop_ids_col_name_map)
                .pipe(remove_unnecessary_columns, ['cpc_code'])
                .pipe(clean_fao_crop_names)
                .pipe(refactor_fao_crop_names)
               )
fao_crop_yields = (initial_fao_crop_yields
                   .pipe(start_pipeline)
                   .pipe(clean_col_names)
                   .pipe(rename_fao_data_quality_flags)
                   .pipe(remove_fao_zero_crop_yields)
                   .pipe(remove_fao_non_crop_items)
                   .pipe(drop_fao_non_country_regions)
                   .pipe(remove_unnecessary_columns, ['year_code', 'element', 'element_code', 'item'])
                   .pipe(rename_columns, fao_crop_yield_col_name_map)
                  )
fao_crop_yield_units = (fao_crop_yields
                        .pipe(start_pipeline)
                        .pipe(extract_fao_crop_yield_units)
                       )
country_codes = (pd.DataFrame(FIPS_10_country_codes)
                 .pipe(start_pipeline)
                 .pipe(combine_fips_noaa_fao_country_codes, fao_crop_yields)
                )
fao_crop_yields = (fao_crop_yields
                   .pipe(start_pipeline)
                   .pipe(combine_similar_fao_crops, fao_crop_ids)
                   .pipe(replace_fao_country_codes_with_FIPS, country_codes)
                  )
fao_crop_ids = (fao_crop_ids
                .pipe(start_pipeline)
                .pipe(drop_duplicated_fao_crop_names)
                .pipe(drop_fao_crop_ids_not_in_crop_yields, fao_crop_yields)
                .pipe(add_yield_units_to_fao_crop_ids, fao_crop_yield_units)
               )
uwm_fao_crop_calendar = (unprocessed_uwm_crop_calendar
                         .pipe(start_pipeline)
                         .pipe(clean_col_names)
                         .pipe(remove_unnecessary_columns, uwm_unnecessary_cols)
                         .pipe(replace_uvm_crop_name_with_crop_name_in_original_data, 'Other')
                         .pipe(replace_uvm_crop_name_with_crop_name_in_original_data, 'Multiple')
                         .pipe(rename_uvm_crops_to_match_fao_yields)
                         .pipe(remove_unnecessary_uvm_crops)
                         .pipe(refactor_uvm_winter_vegetables_crop)
                         .pipe(assign_FIPS_10_country_codes_to_uvm_crop_calendar, uwm_location_to_fips_country_map, country_codes)
                         .pipe(remove_unnecessary_columns, ['crop_name_in_original_data', 'state_code', 'county_code', 'location_code'])
                         .pipe(drop_non_fao_crops, fao_crop_ids)
                         .pipe(add_crop_ids_to_crop_calendar, fao_crop_ids)
                         .pipe(drop_nan_dates)
                         .pipe(reformat_uvm_dates)
                         .pipe(join_crop_yields_with_crop_calendar, fao_crop_yields)
                         .pipe(combine_cooardinates)
                         .pipe(drop_NaNs)
                         .pipe(combine_growing_months)
                         .pipe(drop_duplicate_rows_in_crop_calendar)
                        )

# Remove any rows left that don't match the uwm_fao_crop_calendar
country_codes = country_codes[country_codes.FIPS_10_country_code.isin(uwm_fao_crop_calendar.FIPS_10_country_code.values)]
fao_crop_ids = fao_crop_ids[fao_crop_ids.crop_id.isin(uwm_fao_crop_calendar.crop_id.values)]
fao_crop_yields = fao_crop_yields.merge(uwm_fao_crop_calendar,
                          on=['year','crop','crop_yield','crop_id','FIPS_10_country_code'], 
                          how='inner')

# Save the data aggregates to use in other scripts
fao_crop_yields.to_csv('fao_crop_yields.csv')
fao_crop_ids.to_csv('fao_crop_ids.csv')
country_codes.to_csv('country_codes.csv')
uwm_fao_crop_calendar.to_csv('uwm_fao_crop_calendar.csv')

print('fao_crop_yields: ', fao_crop_yields.shape)
print('fao_crop_ids: ', fao_crop_ids.shape)
print('country_codes: ', country_codes.shape)
print('uwm_fao_crop_calendar: ', uwm_fao_crop_calendar.shape)
uwm_fao_crop_calendar.head()

  df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()
  df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()
  df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()
  df.crop = df.crop.str.replace('.', ' ')
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)
  df = df.append(new_row, ignore_index=True)


fao_crop_yields:  (54876, 11)
fao_crop_ids:  (38, 3)
country_codes:  (118, 6)
uwm_fao_crop_calendar:  (54876, 11)


Unnamed: 0,year,crop,crop_yield,crop_id,FIPS_10_country_code,location,lat_max,lat_min,lon_max,lon_min,growing_months
0,1992,Wheat,21600.0,15,AM,Armenia,40.186727,40.186727,44.896421,40.186727,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
1,1993,Wheat,22257.0,15,AM,Armenia,40.186727,40.186727,44.896421,40.186727,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
2,1994,Wheat,17842.0,15,AM,Armenia,40.186727,40.186727,44.896421,40.186727,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
3,1995,Wheat,25480.0,15,AM,Armenia,40.186727,40.186727,44.896421,40.186727,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
4,1996,Wheat,23569.0,15,AM,Armenia,40.186727,40.186727,44.896421,40.186727,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"


In [80]:
locations_per_year = {}
for year in uwm_fao_crop_calendar.year.sort_values(ascending=False).unique().tolist():
    crop_yields_this_year = uwm_fao_crop_calendar[uwm_fao_crop_calendar.year==year]
    FIPS_10_country_codes_this_year = crop_yields_this_year.FIPS_10_country_code.unique().tolist()
    locations_this_year = crop_yields_this_year.location.unique().tolist()
    
    noaa_country_codes_this_year = country_codes[country_codes.FIPS_10_country_code.isin(FIPS_10_country_codes_this_year)].noaa_country_code.unique().tolist()
    locations_per_year[year] = {
         'noaa_country_codes': noaa_country_codes_this_year,
         'FIPS_10_country_codes': FIPS_10_country_codes_this_year,
         'crop_locations': locations_this_year,
         'crops': {}
     }
    print(noaa_country_codes_this_year)
    break

['AL', 'AG', 'AO', 'AR', 'AM', 'AS', 'AJ', 'BG', 'BO', 'BN', 'BC', 'BR', 'BU', 'UV', 'BY', 'CB', 'CM', 'CA', 'CT', 'CD', 'CI', 'CH', 'CO', 'CF', 'CG', 'CS', 'IV', 'CU', 'DA', 'DR', 'EC', 'EG', 'ES', 'ER', 'EN', 'ET', 'FI', 'FR', 'GB', 'GA', 'GG', 'GM', 'GH', 'GT', 'GV', 'PU', 'GY', 'HA', 'HU', 'IN', 'ID', 'IZ', 'IS', 'IT', 'JA', 'JO', 'KZ', 'KE', 'KN', 'KS', 'KG', 'LG', 'LE', 'LT', 'LI', 'LY', 'LH', 'MA', 'MI', 'MY', 'ML', 'MR', 'MX', 'MO', 'MZ', 'BM', 'WA', 'NP', 'NU', 'NG', 'NI', 'PK', 'PM', 'PA', 'PE', 'RP', 'PL', 'RO', 'RW', 'SA', 'SG', 'RI', 'SL', 'SO', 'SF', 'OD', 'SP', 'CE', 'SU', 'NS', 'SW', 'TI', 'TZ', 'TH', 'TT', 'TO', 'TD', 'TS', 'TU', 'TX', 'UG', 'UP', 'US', 'UY', 'UZ', 'VM', 'ZA', 'ZI']


In [56]:
year = 2020


Unnamed: 0,country_name,FIPS_10_country_code,noaa_country_code,fao_country_code,fao_year_min,fao_year_max
1,Albania,AL,AL,3,1961,2020
2,Algeria,AG,AG,4,1961,2020
3,Angola,AO,AO,7,1961,2020
5,Argentina,AR,AR,9,1961,2020
6,Armenia,AM,AM,1,1992,2020


In [None]:
country_codes.head()


In [60]:
(uwm_fao_crop_calendar[['FIPS_10_country_code', 'location']]
 .merge(country_codes[['country_name','FIPS_10_country_code','fao_country_code']])
 .drop_duplicates().groupby(['FIPS_10_country_code', 'country_name']).count()
 .sort_values('location', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,location,fao_country_code
FIPS_10_country_code,country_name,Unnamed: 2_level_1,Unnamed: 3_level_1
US,United States of America,45,45
IN,India,12,12
CH,China,6,6
ID,Indonesia,6,6
AS,Australia,5,5
CG,Congo (Democratic Republic of the),5,5
SF,South Africa,4,4
BR,Brazil,4,4
MX,Mexico,4,4
GH,Ghana,3,3


In [37]:
country_codes.head()

Unnamed: 0,country_name,FIPS_10_country_code,noaa_country_code,fao_country_code,fao_year_min,fao_year_max
1,Albania,AL,AL,3,1961,2020
2,Algeria,AG,AG,4,1961,2020
3,Angola,AO,AO,7,1961,2020
5,Argentina,AR,AR,9,1961,2020
6,Armenia,AM,AM,1,1992,2020


In [61]:
fao_file_path = "./data/fao_data/"

# Load the crop yield data
file_name = 'Environment_Temperature_change_E_All_Data_(Normalized)'
unprocessed_fao_temp_data = pd.read_csv(f"{fao_file_path}fao_temperature_change/normalized/{file_name}.csv", encoding='latin-1')
unprocessed_fao_temp_data.shape

(537370, 11)

In [72]:
fao_temp_col_name_map = {
    'area_code': 'fao_country_code', 
    'area': 'fao_country_name', 
}

def filter_out_unnecessary_countries(temp_change):
    

# Data preprocessing pipelines (must be in the correct order)
fao_temp_change = (unprocessed_fao_temp_data
                .pipe(start_pipeline)
                .pipe(clean_col_names)
                .pipe(rename_columns, fao_temp_col_name_map)
                .pipe(remove_unnecessary_columns, ['flag', 'year_code', 'months_code'])
               )

fao_temp_data.fao_country_code.unique().shape

  df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()


(285,)

In [69]:
(fao_temp_data
 .merge(country_codes[['fao_country_code', 'FIPS_10_country_code']], on='fao_country_code')
 .fao_country_name
 .unique()
 .shape
) # 118 countries

(118,)