In [143]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, ForeignKey, Date, Text, Boolean
from pprint import pprint as pp
from datetime import datetime

from config import local_mysql_password, local_mysql_user, FIPS_10_country_codes, uwm_location_to_fips_country_map, uwm_unnecessary_cols

pd.set_option('display.max_rows', 1500)
pd.set_option('display.max_columns', 200)

## Load the data

In [2]:
fao_file_path = "./data/fao_data/"
crop_calendar_file_path = "./data/"

# Load the fao crop yield data
fao_crop_yields_file_name = 'Production_Crops_Livestock_E_All_Data_(Normalized)'
initial_fao_crop_yields = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{fao_crop_yields_file_name}.csv", encoding='latin-1')

# Load the fao crop id data
fao_product_ids_file_name = 'Production_Crops_Livestock_E_ItemCodes'
initial_fao_crop_ids = pd.read_csv(f"{fao_file_path}fao_crop_data/normalized/{fao_product_ids_file_name}.csv", encoding='latin-1')

# Load the crop calendar
file_name = 'crop_calendar_uni_wisconsin_madison'
unprocessed_uwm_crop_calendar = pd.read_csv(f"{crop_calendar_file_path}{file_name}.csv")

## Data preprocessing pipelines

In [235]:
def start_pipeline(df: pd.DataFrame):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()

def clean_col_names(df: pd.DataFrame):
    ''' Replace spaces column names with underscores and make lower case '''
    df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()
    return df

def rename_columns(df: pd.DataFrame, column_name_map: dict):
    return df.rename(column_name_map, axis=1)

def remove_unnecessary_columns(df: pd.DataFrame, unnecessary_columns: list):
    return df.drop(unnecessary_columns, axis=1)

regions = [
    'World', 'Africa', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Americas',
    'Northern America', 'Central America', 'Caribbean', 'South America', 'Asia', 'Central Asia', 'Eastern Asia',
    'Southern Asia', 'South-eastern Asia', 'Western Asia', 'Europe', 'Eastern Europe', 'Northern Europe', 'Southern Europe',
    'Western Europe', 'Oceania', 'Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia'
    ]
special_groups = [
    'European Union (28)', 'European Union (27)', 'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States',
    'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries', 'Annex I countries', 'Non-Annex I countries',  'OECD'
    ]

def combine_fips_noaa_fao_country_codes(df, fao_crop_yields):
    '''
    Drop any countries not in the FAO crop yield data and map the country codes from the different sources.
    Initial country codes are taken from a dictionary mapping standard FIPS-10 country codes to the NOAA country codes and FAO country names
    '''
    # Extract the fao countries from the crop yield data
    fao_country_data = fao_crop_yields.groupby(['fao_country_name', 'fao_country_code']).year.agg(['min', 'max'])
    fao_country_data.reset_index(inplace=True)
    # Merge with the FIPS country codes
    df = df.merge(fao_country_data, how='inner', left_on='fao_country_name', right_on='fao_country_name')
    df.drop(['fao_country_name'], axis=1, inplace=True)
    df.rename(columns={'min': 'fao_year_min', 'max': 'fao_year_max'}, inplace=True)
    return df

def rename_fao_data_quality_flags(df: pd.DataFrame):
    '''Rename confusing data aggregation flags'''
    df.flag.replace(np.nan, 'O', inplace=True)
    df.flag.replace('*', 'U', inplace=True)
    return df

def remove_fao_zero_crop_yields(df):
    '''Remove 0 crop yield values to prevent bad training data'''
    return df[df.value != 0.0]

def drop_fao_non_country_regions(df):
    '''Drop areas which are not countries'''
    return df[~df.area.isin(regions+special_groups)]

def remove_fao_non_crop_items(df):
    '''Remove rows relating to unnecessary production_types.'''
    # Yield = Production/Area_harvested so we can drop those two rows and only keep Yield.
    df = df[~df.element.isin(['Area harvested', 'Production'])]
    # Items with other production types are animals, which do not depend on weather.
    production_types_to_drop = ['Stocks', 'Laying', 'Producing Animals/Slaughtered', 'Yield/Carcass Weight', 'Milk Animals', 'Prod Popultn']
    items_to_drop = df[df.element.isin(production_types_to_drop)].item_code.unique()
    return df[~df.item_code.isin(items_to_drop)]

def combine_similar_fao_crops(crop_yields, crop_ids):       
    # Remove crop_ids that don't exist in the crop_yield data
    crop_ids = crop_ids[crop_ids.crop_id.isin(crop_yields.crop_id)]

    # Add the crop names to combine yield values for duplicate crop names
    # Combine the yield values by name (The crop ids are now invalid because they are summed too.)
    summed_yields = crop_yields.merge(crop_ids, on='crop_id').groupby(['fao_country_code', 'year', 'crop']).sum()
    summed_yields.reset_index(inplace=True)
    
    # Drop the duplicated names from the crop_ids df
    crop_ids = crop_ids[~crop_ids.crop.duplicated()]

    # Recombine the dataframes by crop name to replace the ids in the crop_yield df with those from crop_ids
    summed_yields.drop('crop_id', axis=1, inplace=True)
    summed_yields = summed_yields.merge(crop_ids, on='crop')
    
    return summed_yields

def clean_fao_crop_names(df):
    df.crop = df.crop.str.split(';').str[0]
    df.crop = df.crop.str.split(' nes').str[0]
    return df

def refactor_fao_crop_names(df):
    df.crop.replace(to_replace=['Beans', 'Peas', 'Chick peas', 'Cow peas', 'Lentils'], 
                    value='Pulses', inplace=True)
    return df
    

def drop_duplicated_fao_crop_names(df):
    return df[~df.crop.duplicated()]

def drop_fao_crop_ids_not_in_crop_yields(df, crop_yields):
    """Remove crop_id_ids that don't exist in the fao_crop_yield data"""
    return df[df.crop_id.isin(fao_crop_yields.crop_id)]

def extract_fao_crop_yield_units(crop_yields):
    crop_yield_units = crop_yields[['crop_id', 'unit']].copy()
    return crop_yield_units.drop_duplicates()

def add_yield_units_to_fao_crop_ids(crop_ids, crop_yield_units):
    crop_ids = crop_ids.merge(crop_yield_units, on='crop_id')
    return crop_ids

def rename_uvm_crops_to_match_fao_yields(df: pd.DataFrame):
    df.crop = df.crop.str.replace('.', ' ')
    df.crop.replace('Sweet Potatoes', 'Sweet potatoes', inplace=True)
    df.crop.replace('Sugarbeets', 'Sugar beet', inplace=True)
    df.crop.replace('Sugarcane', 'Sugar cane', inplace=True)
    df.crop.replace('Sunflower', 'Sunflower seed', inplace=True)
    df.crop.replace('Tomato', 'Tomatoes', inplace=True)
    df.crop.replace('Safflower', 'Safflower seed', inplace=True)
    df.crop.replace('Spring vegetables (field planting)', 'Vegetables', inplace=True)
    df.crop.replace(to_replace=['Spring Coarse Grains', 'Spring Grains', 'Winter Grains'],
                        value='Grain', inplace=True)
    df.crop.replace(to_replace=['All Cereals (Belg)', 'Spring Cereals', 'Spring cereals', 'Winter Cereals', 'Winter cereals'],
                        value='Cereals', inplace=True)
    df.crop.replace(to_replace=['Castor (Kharif)', 'Castor beans', 'Castorseed'],
                    value='Castor oil seed', inplace=True)
    df.crop.replace(to_replace=['Flax', 'Flaxseed', 'Flazseed'],
                    value='Flax fibre and tow', inplace=True)
    df.crop.replace(to_replace=['Mustard', 'Mustard (Rabi)'],
                    value='Mustard seed', inplace=True)
    df.crop.replace(to_replace=['Sesame', 'Sesame (Deyr)', 'Sesame (Gu)'],
                    value='Sesame seed', inplace=True)
    df.crop.replace(to_replace=['Tobacco (Dyer)', 'Tobacco (Gu)', 'Tobacco (long rains)', 'Tobacco (short rains)', 'Tobacco (southern highlands)'],
                    value='Tobacco', inplace=True)
    return df

def refactor_uvm_winter_vegetables_crop(df):
    for idx, row in df[df.crop=='Winter vegetables'].iterrows():
        for veg in ['Cabbages and other brassicas', 'Cauliflowers and broccoli', 'Spinach', 'Onions', 'Garlic','Vegetables', 'Vegetables Primary']:
            new_row = row.replace('Winter vegetables', veg)
            df = df.append(new_row, ignore_index=True)
    df = df[df.crop!='Winter vegetables']
    return df

def replace_uvm_crop_name_with_crop_name_in_original_data(df, crop_name):
    df.loc[df.crop==crop_name, 'crop'] = df.loc[df.crop==crop_name, 'crop_name_in_original_data']
    return df

def remove_unnecessary_uvm_crops(df):
    df = df[~df.crop.isin(['Cotton'])]
    return df

def assign_FIPS_10_country_codes_to_uvm_crop_calendar(crop_calendar, uwm_to_fips_country_map, country_codes_df):
    '''Add the new UWM regions to the existing NOAA/FAO FIPS data and add to the uwm data'''
    uwm_to_fips_country_map.update(dict(zip(country_codes_df.country_name, country_codes_df.FIPS_10_country_code)))
    crop_calendar['FIPS_10_country_code'] = crop_calendar.location.map(uwm_to_fips_country_map)
    crop_calendar.drop('nation_code', axis=1, inplace=True)
    return crop_calendar

def add_crop_ids_to_crop_calendar(crop_calendar, crop_ids):
    crop_calendar = crop_calendar.merge(crop_ids, on='crop')
    crop_calendar.drop(['unit', 'crop'], axis=1, inplace=True)
    return crop_calendar

def drop_non_fao_crops(crop_calendar, crop_ids):
    return crop_calendar[crop_calendar.crop.isin(crop_ids.crop)]

def drop_nan_dates(crop_calendar):
    crop_calendar = crop_calendar[~crop_calendar.harvest_end_date.isna()]
    crop_calendar = crop_calendar[~crop_calendar.plant_start_date.isna()]
    return crop_calendar
    
def _get_growing_months(row):
    if row.plant_start_month < row.harvest_end_month:
        return [i for i in range(row.plant_start_month, row.harvest_end_month+1)]
    else:
        growing_months = [i for i in range(row.plant_start_month, 13)]
        growing_months.extend([j for j in range(1, row.harvest_end_month+1)])
        return growing_months
    
def reformat_uvm_dates(crop_calendar):
    return (crop_calendar
            .assign(harvest_end_month = crop_calendar.harvest_end_date.str.split('/')
                    .apply(lambda x: x[0]).astype(int))
            .assign(plant_start_month = crop_calendar.plant_start_date.str.split('/')
                    .apply(lambda x: x[0]).astype(int))
            .assign(growing_months = lambda x: x.apply(_get_growing_months, axis=1))
            #.assign(growing_months = (_get_growing_months(plant_start_month, harvest_end_month)))
            .drop(['harvest_end_date', 'plant_start_date', 'harvest_end_month', 'plant_start_month'], axis=1)
           )
    
def replace_fao_country_codes_with_FIPS(crop_yields, country_codes_df):
    return crop_yields.merge(country_codes_df[['fao_country_code', 'FIPS_10_country_code']], on='fao_country_code').drop('fao_country_code', axis=1)
    
def join_crop_yields_with_crop_calendar(crop_calendar, crop_yields):
    return pd.merge(crop_yields, crop_calendar,  how='inner', on=['FIPS_10_country_code','crop_id'])



fao_crop_yield_col_name_map = {
    'area_code': 'fao_country_code', 
    'area': 'fao_country_name', 
    'flag': 'fao_data_quality_id', 
    'item_code': 'crop_id',
    'value': 'yield',
}

fao_crop_ids_col_name_map = {
    'item': 'crop', 
    'item_code': 'crop_id'
}

# Data preprocessing pipelines (must be in the correct order)
fao_crop_ids = (initial_fao_crop_ids
                .pipe(start_pipeline)
                .pipe(clean_col_names)
                .pipe(rename_columns, fao_crop_ids_col_name_map)
                .pipe(remove_unnecessary_columns, ['cpc_code'])
                .pipe(clean_fao_crop_names)
                .pipe(refactor_fao_crop_names)
               )
fao_crop_yields = (initial_fao_crop_yields
                   .pipe(start_pipeline)
                   .pipe(clean_col_names)
                   .pipe(rename_fao_data_quality_flags)
                   .pipe(remove_fao_zero_crop_yields)
                   .pipe(remove_fao_non_crop_items)
                   .pipe(drop_fao_non_country_regions)
                   .pipe(remove_unnecessary_columns, ['year_code', 'element', 'element_code', 'item'])
                   .pipe(rename_columns, fao_crop_yield_col_name_map)
                  )
fao_crop_yield_units = (fao_crop_yields
                        .pipe(start_pipeline)
                        .pipe(extract_fao_crop_yield_units)
                       )
country_codes = (pd.DataFrame(FIPS_10_country_codes)
                 .pipe(start_pipeline)
                 .pipe(combine_fips_noaa_fao_country_codes, fao_crop_yields)
                )
fao_crop_yields = (fao_crop_yields
                   .pipe(start_pipeline)
                   .pipe(combine_similar_fao_crops, fao_crop_ids)
                   .pipe(replace_fao_country_codes_with_FIPS, country_codes)
                  )
fao_crop_ids = (fao_crop_ids
                .pipe(start_pipeline)
                .pipe(drop_duplicated_fao_crop_names)
                .pipe(drop_fao_crop_ids_not_in_crop_yields, fao_crop_yields)
                .pipe(add_yield_units_to_fao_crop_ids, fao_crop_yield_units)
               )
uwm_crop_calendar = (unprocessed_uwm_crop_calendar
                     .pipe(start_pipeline)
                     .pipe(clean_col_names)
                     .pipe(remove_unnecessary_columns, uwm_unnecessary_cols)
                     .pipe(replace_uvm_crop_name_with_crop_name_in_original_data, 'Other')
                     .pipe(replace_uvm_crop_name_with_crop_name_in_original_data, 'Multiple')
                     .pipe(rename_uvm_crops_to_match_fao_yields)
                     .pipe(remove_unnecessary_uvm_crops)
                     .pipe(refactor_uvm_winter_vegetables_crop)
                     .pipe(assign_FIPS_10_country_codes_to_uvm_crop_calendar, uwm_location_to_fips_country_map, country_codes)
                     .pipe(remove_unnecessary_columns, ['crop_name_in_original_data', 'state_code', 'county_code', 'location_code'])
                     .pipe(drop_non_fao_crops, fao_crop_ids)
                     .pipe(add_crop_ids_to_crop_calendar, fao_crop_ids)
                     .pipe(drop_nan_dates)
                     .pipe(reformat_uvm_dates)
                     .pipe(join_crop_yields_with_crop_calendar, fao_crop_yields)
                    )


# fao_crop_yields = (fao_crop_yields
#                   )

print('fao_crop_yields: ', fao_crop_yields.shape)
print('fao_crop_ids: ', fao_crop_ids.shape)
print('country_codes: ', country_codes.shape)
print('uwm_crop_calendar: ', uwm_crop_calendar.shape)



'''
Do I need to drop country-crop yields that don't have a crop calendar?
I cannot use them to train the model if I don't know when the crop is planted in that country
Group yields by country and crop
Group calendars by country and crop

Compare...

Or group all calendars per crop to see how the growth period varies per crop
'''
uwm_crop_calendar.head()

  df.columns = df.columns.str.replace(' ','_').str.replace('.','_').str.lower()


fao_crop_yields:  (455835, 5)
fao_crop_ids:  (151, 3)
country_codes:  (202, 6)
uwm_crop_calendar:  (71686, 9)


  df.crop = df.crop.str.replace('.', ' ')


Unnamed: 0,year,crop,yield,crop_id,FIPS_10_country_code,location,lat_avg,lon_avg,growing_months
0,1992,Wheat,21600.0,15,AM,Armenia,40.186727,44.896421,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]"
1,1993,Wheat,22257.0,15,AM,Armenia,40.186727,44.896421,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]"
2,1994,Wheat,17842.0,15,AM,Armenia,40.186727,44.896421,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]"
3,1995,Wheat,25480.0,15,AM,Armenia,40.186727,44.896421,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]"
4,1996,Wheat,23569.0,15,AM,Armenia,40.186727,44.896421,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]"


In [238]:
new_df = uwm_crop_calendar.copy()
print(ucc.head())

print(new_df.head())
new_df.groupby(['crop','location', 'year']).count().sort_values('crop_id') #[[]]
#new_df[new_df.crop=='Rice'].sort_values('year')

(455835, 5)
   year   crop    yield  crop_id FIPS_10_country_code location    lat_avg  \
0  1992  Wheat  21600.0       15                   AM  Armenia  40.186727   
1  1993  Wheat  22257.0       15                   AM  Armenia  40.186727   
2  1994  Wheat  17842.0       15                   AM  Armenia  40.186727   
3  1995  Wheat  25480.0       15                   AM  Armenia  40.186727   
4  1996  Wheat  23569.0       15                   AM  Armenia  40.186727   

     lon_avg                           growing_months  
0  44.896421  [8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]  
1  44.896421  [8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]  
2  44.896421  [8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]  
3  44.896421  [8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]  
4  44.896421  [8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]  
   year_x crop_x  yield_x  crop_id FIPS_10_country_code  year_y crop_y  \
0    1992  Wheat  21600.0       15                   AM    1992  Wheat   
1    1992  Wheat  21600.0       15       

KeyError: 'crop'

In [183]:
#{'harvest_end_month': 'max', 'harvest_end_day': 'max', 'plant_start_month': 'min', 'plant_start_day': 'min'}
new_df.groupby('location').agg(lat_max=('lat_avg', 'max'), lat_min=('lat_avg', 'min'), lon_max=('lon_avg', 'max'), lon_avg=('lat_avg', 'min')).head()

Unnamed: 0_level_0,lat_max,lat_min,lon_max,lon_avg
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,33.632371,31.294154,-85.831522,31.294154
Alaska,61.347183,61.347183,-149.191885,61.347183
Albania,41.210375,40.881363,20.515232,40.881363
Algeria,36.053175,35.591824,8.102748,35.591824
Andhra Pradesh,18.480246,13.581397,83.47003,13.581397


In [233]:
def combine_cooardinates(df):
    return (df
            .merge(df
                   .groupby('location')
                   .agg(lat_max=('lat_avg', 'max'), 
                        lat_min=('lat_avg', 'min'), 
                        lon_max=('lon_avg', 'max'), 
                        lon_min=('lat_avg', 'min')),
                   on='location'
                  )
           .drop(['lat_avg', 'lon_avg'], axis=1)
           )

def combine_growing_months(df):
    return (df
            .groupby(['location', 'year', 'crop'])
            .growing_months.sum()
            .apply(lambda x: list(set(x)))
            .reset_index()
           )


''' Combine the coordinates. NOT SURE ABOUT THIS YET. CHECK GERMANY'''
new_df = (new_df
          .pipe(combine_cooardinates)
          .pipe(combine_growing_months))
new_df

KeyError: "Column(s) ['lat_avg', 'lon_avg'] do not exist"

In [231]:
new_df

Unnamed: 0,year,crop,yield,crop_id,FIPS_10_country_code,location,growing_months,lat_max,lat_min,lon_max,lon_min
0,1992,Wheat,21600.0,15,AM,Armenia,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]",40.186727,40.186727,44.896421,40.186727
1,1993,Wheat,22257.0,15,AM,Armenia,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]",40.186727,40.186727,44.896421,40.186727
2,1994,Wheat,17842.0,15,AM,Armenia,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]",40.186727,40.186727,44.896421,40.186727
3,1995,Wheat,25480.0,15,AM,Armenia,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]",40.186727,40.186727,44.896421,40.186727
4,1996,Wheat,23569.0,15,AM,Armenia,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]",40.186727,40.186727,44.896421,40.186727
...,...,...,...,...,...,...,...,...,...,...,...
71681,2016,Sorghum,16717.0,83,OD,Sudan (South),"[4, 5, 6, 7, 8, 9]",10.772284,6.965755,32.354555,6.965755
71682,2017,Sorghum,12330.0,83,OD,Sudan (South),"[4, 5, 6, 7, 8, 9]",10.772284,6.965755,32.354555,6.965755
71683,2018,Sorghum,12330.0,83,OD,Sudan (South),"[4, 5, 6, 7, 8, 9]",10.772284,6.965755,32.354555,6.965755
71684,2019,Sorghum,12330.0,83,OD,Sudan (South),"[4, 5, 6, 7, 8, 9]",10.772284,6.965755,32.354555,6.965755


In [226]:
test_df = new_df[(new_df.location=='Germany')&(new_df.crop=='Barley')&(new_df.year==2020)].copy()#.groupby(['crop', 'year']).count().sort_values('value')
test_df

Unnamed: 0,year,crop,yield,crop_id,FIPS_10_country_code,location,growing_months,lat_max,lat_min,lon_max,lon_min
8918,2020,Barley,64591.0,44,GM,Germany,"[8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7]",52.299482,51.134897,11.691701,51.134897
8919,2020,Barley,64591.0,44,GM,Germany,"[3, 4, 5, 6, 7, 8, 9]",52.299482,51.134897,11.691701,51.134897


In [227]:
test_df.groupby(['location', 'year', 'crop']).growing_months.sum().apply(lambda x: list(set(x))).reset_index()# .agg({'growing_months': 'sum'})

Unnamed: 0,location,year,crop,growing_months
0,Germany,2020,Barley,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"


In [153]:
df = uwm_crop_calendar.copy()
df.groupby(['FIPS_10_country_code', 'year','crop_id']).count()

KeyError: 'year'

In [83]:
dm = df.copy()

dm['harvest_end_date'].str.split('/')#.tolist()

0        [6, 30]
1        [8, 21]
2       [10, 31]
3        [7, 31]
4        [4, 30]
5        [6, 21]
6       [10, 31]
7        [2, 19]
8        [9, 20]
9        [6, 30]
10       [6, 30]
11       [9, 20]
12       [6, 10]
13      [12, 31]
14       [8, 22]
15      [12, 31]
16       [8, 15]
17      [11, 20]
18       [7, 20]
19       [7, 21]
20       [5, 31]
21       [7, 21]
22       [7, 21]
23       [9, 30]
24      [11, 30]
25       [8, 17]
26       [8, 20]
27       [8, 21]
28      [11, 30]
29       [6, 20]
30       [8, 21]
31       [7, 31]
32      [11, 30]
33       [8, 31]
34       [8, 21]
35       [6, 21]
36      [10, 31]
37       [5, 11]
38      [11, 15]
39       [6, 30]
40      [11, 20]
41       [9, 19]
42       [5, 21]
43       [6, 30]
44       [3, 31]
45      [11, 30]
46       [7, 20]
47       [8, 15]
48       [6, 30]
49       [7, 31]
50       [6, 30]
51       [8, 10]
52       [8, 15]
53        [1, 8]
54      [11, 14]
55      [11, 14]
56      [11, 30]
57      [12, 30]
58       [8, 1

In [89]:
dm = df.copy()
print(dm.head())

#dm['plant_start_date'] = dm['plant_start_date'].str.split('/')

#df = dm.map(x.reverse())
# print(dm.head())
#dm[0], dm[2] = dm[1], dm[0]
#dm = dm.plant_start_date.str[]
dm

     location  nation_code  state_code  county_code plant_start_date  \
0     Albania           55         NaN          NaN            10/16   
1     Algeria           69         NaN          NaN            10/11   
2      Angola          191         NaN          NaN              5/1   
3     Armenia           61         NaN          NaN              8/1   
4  Bangladesh           92         NaN          NaN            11/11   

  harvest_end_date  location_code    lat_avg    lon_avg FIPS_10_country_code  \
0             6/30             55  40.936144  19.920749                   AL   
1             8/21             69  35.739504   3.232901                   AG   
2            10/31            191 -11.223518  17.831172                   AO   
3             7/31             61  40.186727  44.896421                   AM   
4             4/30             92  24.433967  89.572153                   BG   

   crop_id  
0       15  
1       15  
2       15  
3       15  
4       15  


Unnamed: 0,location,nation_code,state_code,county_code,location_code,lat_avg,lon_avg,FIPS_10_country_code,crop_id,harvest_end_month,harvest_end_day,plant_start_month,plant_start_day
0,Albania,55,,,55,40.936144,19.920749,AL,15,6,30,10,16
1,Algeria,69,,,69,35.739504,3.232901,AG,15,8,21,10,11
2,Angola,191,,,191,-11.223518,17.831172,AO,15,10,31,5,1
3,Armenia,61,,,61,40.186727,44.896421,AM,15,7,31,8,1
4,Bangladesh,92,,,92,24.433967,89.572153,BG,15,4,30,11,11
5,Bolivia,201,,,201,-18.171108,-63.79142,,15,6,21,11,11
6,Botswana,220,,,220,-23.096913,25.148438,BC,15,10,31,5,1
7,Chile,219,,,219,-37.587432,-72.024273,CI,15,2,19,4,11
8,Colombia,148,,,148,4.504946,-75.037596,CO,15,9,20,2,9
9,Cyprus,74,,,74,,,CY,15,6,30,11,9


In [24]:
''' create location codes using country code location and lat lon'''

df[~df.county_code.isna()]


Unnamed: 0,location,nation_code,state_code,county_code,plant_start_date,harvest_end_date,location_code,lat_avg,lon_avg,FIPS_10_country_code,crop_id
134,Gujarat,75,1688.0,-1.0,10/29,4/22,-1,21.50873,71.517409,IN,15
135,Gujarat,75,1688.0,-1.0,11/5,4/15,-1,23.358418,72.802816,IN,15
136,Himachal Pradesh,75,1650.0,-1.0,10/22,5/20,-1,31.580429,77.08855,IN,15
137,Himachal Pradesh,75,1650.0,-1.0,11/19,5/27,-1,31.552314,76.335204,IN,15
138,Himachal Pradesh,75,1650.0,15862.0,12/3,5/27,15862,30.979756,76.859048,IN,15
139,Himachal Pradesh,75,1650.0,15860.0,10/22,5/20,15860,31.08676,77.569119,IN,15
140,Karnataka,75,1752.0,15883.0,10/8,4/1,15883,16.112268,74.853098,IN,15
141,Karnataka,75,1752.0,15890.0,10/1,3/4,15890,15.152535,75.426859,IN,15
142,Karnataka,75,1752.0,15897.0,10/8,3/18,15897,15.803931,76.623709,IN,15
143,Maharashtra,75,1710.0,-1.0,10/29,4/8,-1,20.776713,78.562586,IN,15


In [7]:
fao_crop_yields.head()

Unnamed: 0,fao_country_code,year,crop,value,crop_id
0,1,1992,Apples,68800.0,515
1,1,1993,Apples,26354.0,515
2,1,1994,Apples,61589.0,515
3,1,1995,Apples,78646.0,515
4,1,1996,Apples,91364.0,515


In [8]:
fao_crop_ids.head()

Unnamed: 0,crop_id,crop,unit
0,101,Canary seed,hg/ha
1,103,Grain,hg/ha
2,108,Cereals,hg/ha
3,116,Potatoes,hg/ha
4,122,Sweet potatoes,hg/ha


In [12]:
''' REMOVE ALL CROPS THAT ARE NOT FAO AND ALL COUNTRIES'''
for crop in uwm_crop_calendar.crop.sort_values().unique():
    if crop not in fao_to_uvm_crop_col_matches:
        print(crop)

Birdseed
Crops in low lying areas
Forage
Nigerseed
Poppyseed
Spring vegetables (greenhouse planting)
Tapioca
Teff (Meher)
Walo crops
