In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, ForeignKey, Date, Text, Boolean
from pprint import pprint as pp
from datetime import datetime

from config import local_mysql_password, local_mysql_user, FIPS_10_country_codes, uwm_to_fips_country_map

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

## Load the data

In [2]:
fao_file_path = "./data/fao_data/"

# Load the fao crop yield data
fao_crop_yields_file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized)'
initial_fao_crop_yields = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{fao_crop_yields_file_name}.csv", encoding='latin-1')
initial_fao_crop_yields.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1975,1975,ha,0.0,F
1,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,1976,ha,5900.0,F
2,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,1977,ha,6000.0,F
3,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,1978,ha,6000.0,F
4,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,1979,ha,6000.0,F


In [3]:
# Load the fao crop id data
fao_product_ids_file_name = 'Production_Crops_Livestock_E_ItemCodes'
initial_fao_crop_ids = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{fao_product_ids_file_name}.csv", encoding='latin-1')
initial_fao_crop_ids.head()

Unnamed: 0,Item Code,CPC Code,Item
0,101,'01195,Canary seed
1,1016,'02123,Goats
2,1017,'21116,Meat; goat
3,1018,'21156,Offals; edible; goats
4,1019,'21515,Fat; goats


#### Shared pipeline functions 

In [4]:
def start_pipeline(df: pd.DataFrame):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()

def clean_col_names(df: pd.DataFrame):
    ''' Replace spaces column names with underscores and make lower case '''
    df.columns = df.columns.str.replace(' ','_').str.lower()
    return df

def rename_columns(df: pd.DataFrame, column_name_map: dict):
    return df.rename(column_name_map, axis=1)

def remove_unnecessary_columns(df: pd.DataFrame, unnecessary_columns: list):
    return df.drop(unnecessary_columns, axis=1)

def rename_data_quality_flags(df: pd.DataFrame):
    '''Rename confusing data aggregation flags'''
    df.flag.replace(np.nan, 'O', inplace=True)
    df.flag.replace('*', 'U', inplace=True)
    return df



## Initial fao_crop_id data cleaning

In [5]:
def clean_crop_names(df):
    df.crop = df.crop.str.split(';').str[0]
    df.crop = df.crop.str.split(' nes').str[0]
    return df

fao_crop_ids_col_name_map = {
    'item': 'crop', 
    'item_code': 'crop_id'
}

print(initial_fao_crop_ids.shape)
fao_crop_ids = (initial_fao_crop_ids
                .pipe(start_pipeline)
                .pipe(clean_col_names)
                .pipe(rename_columns, fao_crop_ids_col_name_map)
                .pipe(remove_unnecessary_columns, ['cpc_code'])
                .pipe(clean_crop_names)
               )
fao_crop_ids.shape

(306, 3)


(306, 2)

In [6]:
regions = [
    'World', 'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Americas',
    'Northern America', 'Central America', 'Caribbean', 'South America', 'Asia', 'Central Asia', 'Eastern Asia',
    'Southern Asia', 'South-eastern Asia', 'Western Asia', 'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe', 'Oceania', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia'
    ]
special_groups = [
    'European Union (28)', 'European Union (27)', 'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States',
    'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries', 'Annex I countries', 'Non-Annex I countries',  'OECD'
    ]

def remove_zero_crop_yields(df):
    '''Remove 0 crop yield values to prevent bad training data'''
    return df[df.value != 0.0]


def drop_non_country_regions(df):
    '''Drop areas which are not countries'''
    return df[~df.area.isin(regions+special_groups)]

def remove_non_crop_items(df):
    '''Remove rows relating to unnecessary production_types.'''
    # Yield = Production/Area_harvested so we can drop those two rows and only keep Yield.
    df = df[~df.element.isin(['Area harvested', 'Production'])]
    
    # Items with other production types are animals, which do not depend on weather.
    production_types_to_drop = ['Stocks', 'Laying', 'Producing Animals/Slaughtered', 'Yield/Carcass Weight', 'Milk Animals', 'Prod Popultn']
    items_to_drop = df[df.element.isin(production_types_to_drop)].item_code.unique()
    return df[~df.item_code.isin(items_to_drop)]

fao_crop_yield_col_name_map = {
    'area_code': 'fao_country_code', 
    'area': 'fao_country_name', 
    'flag': 'fao_data_quality_id', 
    'item_code': 'crop_id',
}

print(initial_fao_crop_yields.shape)
fao_crop_yields = (initial_fao_crop_yields
                   .pipe(start_pipeline)
                   .pipe(clean_col_names)
                   .pipe(rename_data_quality_flags)
                   .pipe(remove_zero_crop_yields)
                   .pipe(remove_non_crop_items)
                   .pipe(drop_non_country_regions)
                   .pipe(remove_unnecessary_columns, ['year_code', 'element', 'element_code', 'item'])
                   .pipe(rename_columns, fao_crop_yield_col_name_map)
                  )
'''
# Removing unnecessary columns
fao_crop_yield_data.drop(['fao_country_name', 'product', 'production_type_id', 'production_type'], axis=1, inplace=True, errors='ignore')

# Remove countries

# Combine duplicated crops

'''
fao_crop_yields.shape

(3807008, 11)


(528551, 7)

In [7]:
fao_crop_yields.head()

Unnamed: 0,fao_country_code,fao_country_name,crop_id,year,unit,value,fao_data_quality_id
46,2,Afghanistan,221,1976,hg/ha,16610.0,Fc
47,2,Afghanistan,221,1977,hg/ha,15000.0,Fc
48,2,Afghanistan,221,1978,hg/ha,20000.0,Fc
49,2,Afghanistan,221,1979,hg/ha,17500.0,Fc
50,2,Afghanistan,221,1980,hg/ha,17069.0,Fc


In [8]:
fao_crop_yields[(fao_crop_yields.fao_country_code==351)&(fao_crop_yields.year==2020)&(fao_crop_yields.crop_id.isin([176, 414]))]

Unnamed: 0,fao_country_code,fao_country_name,crop_id,year,unit,value,fao_data_quality_id
484641,351,China,176,2020,hg/ha,17441.0,Fc
484791,351,China,414,2020,hg/ha,269875.0,Fc


In [None]:
def combine_similar_crops(crop_yields, crop_ids):
    crop_ids = crop_ids.copy()
    crop_yields = crop_yields.copy()
    
    
    print(crop_yields.shape)
    
    print(crop_ids.shape)
    # Remove crop_ids that don't exist in the crop_yield data
    crop_ids = crop_ids[crop_ids.crop_id.isin(crop_yields.crop_id)]
    print(crop_ids.shape)

    # Add the crop names to combine yield values for duplicate crop names
    # Combine the yield values by name (The crop ids are now invalid because they are summed too.)
    summed_yields = crop_yields.merge(crop_ids, on='crop_id').groupby(['fao_country_code', 'year', 'crop']).sum()
    summed_yields.reset_index(inplace=True)
    
    print(summed_yields.shape)
    
    # Drop the duplicated names from the crop_ids df
    crop_ids = crop_ids[~crop_ids.crop.duplicated()]

    # Recombine the dataframes by crop name to replace the ids in the crop_yield df with those from crop_ids
    summed_yields.drop('crop_id', axis=1, inplace=True)
    summed_yields = summed_yields.merge(crop_ids, on='crop')
    
    
    # Restore the units and fao_data_quality_id
    crop_yield_units = crop_yields[['crop_id', 'unit', 'fao_data_quality_id']]
    summed_yields = summed_yields.merge(crop_yield_units, on='crop_id')
    
    print(summed_yields[(summed_yields.fao_country_code==351)&(summed_yields.year==2020)&(summed_yields.crop_id.isin([176, 414]))])
    print('done')
    
combine_similar_crops(fao_crop_yields, fao_crop_ids)

(528551, 7)
(306, 2)
(170, 2)
(488403, 5)


Unnamed: 0,fao_country_code,year,crop,value,crop_id
12524,1,1992,Beans,16149.0,176
12525,1,1993,Beans,20265.0,176
12526,1,1994,Beans,24000.0,176
12527,1,1995,Beans,15044.0,176
12528,1,1996,Beans,24065.0,176
...,...,...,...,...,...
18865,351,2016,Beans,16439.0,176
18866,351,2017,Beans,17558.0,176
18867,351,2018,Beans,17541.0,176
18868,351,2019,Beans,17402.0,176


In [125]:
summed_yields[(summed_yields.fao_country_code==351)&(summed_yields.year==2020)&(summed_yields.crop_id.isin([176, 414]))]

Unnamed: 0,fao_country_code,year,crop,value,crop_id
18869,351,2020,Beans,17441.0,176


In [112]:
fao_crop_ids[fao_crop_ids.crop.duplicated()]

Unnamed: 0,crop_id,crop
70,1717,Cereals
97,1841,Oilcrops
107,211,Pulses
170,403,Onions
173,414,Beans
174,417,Peas
184,463,Vegetables
198,531,Cherries
201,541,Fruit
202,542,Fruit


In [96]:
# Drop the duplicated names from the crop_ids df
fao_crop_ids = fao_crop_ids[~fao_crop_ids.crop.duplicated()]

# Recombine by name to replace the ids in the crop yield df
combined = combined.merge(fao_crop_ids, on='crop')

print(combined.shape)

(488403, 5)


In [97]:
# Find duplicate crops after cleaning the crop names
combined_counts = combined.groupby('fao_country_code', 'year', 'crop').count().reset_index()
combined_counts
#duplicated_crops = combined_counts[combined_counts.crop_id > 1].crop.tolist()
#duplicated_crops

ValueError: No axis named year for object type DataFrame

In [57]:
fao_crop_ids.shape

(170, 2)

In [43]:
# Find duplicate crops after cleaning the crop names
crop_name_counts = fao_crop_ids.groupby('crop').count().reset_index()
duplicated_crops = crop_name_counts[crop_name_counts.crop_id > 1].crop.tolist()
duplicated_crops

['Beans',
 'Cereals',
 'Cherries',
 'Chillies and peppers',
 'Fruit',
 'Maize',
 'Oilcrops',
 'Onions',
 'Peas',
 'Pulses',
 'Vegetables']