# Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
production = pd.read_csv('processed data/production.csv', encoding='ISO-8859-1').drop(['Unnamed: 0'], axis = 1)

balance_historic = pd.read_csv('processed data/balance_historic.csv', encoding='ISO-8859-1').drop(['Unnamed: 0'], axis = 1)

balance = pd.read_csv('processed data/balance.csv', encoding='ISO-8859-1').drop(['Unnamed: 0'], axis = 1)

countries = pd.read_csv('processed data/countries.csv', encoding='ISO-8859-1').drop(['Unnamed: 0'], axis = 1)

print('production shape: ', production.shape)
print('balance shape: ', balance.shape)
print('balance_historic shape: ', balance_historic.shape)

display(production.head())
display(balance.head())
display(balance_historic.head())
countries.head()

production shape:  (4837117, 7)
balance shape:  (1958440, 7)
balance_historic shape:  (8119335, 7)


Unnamed: 0,area,item_code_cpc,item,element,unit,year,value
0,Afghanistan,'01371,"Almonds, in shell",Area harvested,ha,1961,0.0
1,Afghanistan,'01371,"Almonds, in shell",Yield,hg/ha,1961,0.0
2,Afghanistan,'01371,"Almonds, in shell",Production,tonnes,1961,0.0
3,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,ha,1961,0.0
4,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Yield,hg/ha,1961,0.0


Unnamed: 0,area,item_code_cpc,item,element,unit,year,value
0,Afghanistan,'S2501,Population,Total Population - Both sexes,1000 persons,2010,29186.0
1,Afghanistan,'S2901,Grand Total,Food supply,kcal/capita/day,2010,2170.0
2,Afghanistan,'S2901,Grand Total,Protein supply quantity,g/capita/day,2010,59.23
3,Afghanistan,'S2901,Grand Total,Fat supply quantity,g/capita/day,2010,36.69
4,Afghanistan,'S2903,Vegetal Products,Food supply,kcal/capita/day,2010,1964.0


Unnamed: 0,area,item_code_cpc,item,element,unit,year,value
0,Afghanistan,'S2501,Population,Total Population - Both sexes,1000 persons,1961,8954.0
1,Afghanistan,'S2901,Grand Total,Food supply,kcal/capita/day,1961,2999.0
2,Afghanistan,'S2901,Grand Total,Protein supply quantity,g/capita/day,1961,84.91
3,Afghanistan,'S2901,Grand Total,Fat supply quantity,g/capita/day,1961,37.51
4,Afghanistan,'S2903,Vegetal Products,Food supply,kcal/capita/day,1961,2752.0


Unnamed: 0,region_name,sub_region_name,country_or_area,iso_alpha3_code,least_developed_countries_ldc
0,Africa,Northern Africa,Algeria,DZA,0
1,Africa,Northern Africa,Egypt,EGY,0
2,Africa,Northern Africa,Libya,LBY,0
3,Africa,Northern Africa,Morocco,MAR,0
4,Africa,Northern Africa,Sudan,SDN,x


In [4]:
production = pd.merge(left = production,
                                 right = countries,
                                 how = 'inner', 
                                 left_on = "area", 
                                 right_on= "country_or_area").drop('country_or_area', axis=1)

balance = pd.merge(left = balance,
                                 right = countries,
                                 how = 'inner', 
                                 left_on = "area", 
                                 right_on= "country_or_area").drop('country_or_area', axis=1)

balance_historic = pd.merge(left = balance_historic,
                                 right = countries,
                                 how = 'inner', 
                                 left_on = "area", 
                                 right_on= "country_or_area").drop('country_or_area', axis=1)

production.head()

Unnamed: 0,area,item_code_cpc,item,element,unit,year,value,region_name,sub_region_name,iso_alpha3_code,least_developed_countries_ldc
0,Afghanistan,'01371,"Almonds, in shell",Area harvested,ha,1961,0.0,Asia,Southern Asia,AFG,x
1,Afghanistan,'01371,"Almonds, in shell",Yield,hg/ha,1961,0.0,Asia,Southern Asia,AFG,x
2,Afghanistan,'01371,"Almonds, in shell",Production,tonnes,1961,0.0,Asia,Southern Asia,AFG,x
3,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,ha,1961,0.0,Asia,Southern Asia,AFG,x
4,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Yield,hg/ha,1961,0.0,Asia,Southern Asia,AFG,x


In the production and balance datasets, useful *features* are buried within the element column. I will combine these with the unit column and turn into columns themselves.

Further, population data in the balance datsets is an area-level parameter and should be a column itself. We will use it to make sense of the balance features many of which are scaled by population (eg. per capita).

In [5]:
balance['element_unit'] = balance[['element', 'unit']].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
balance_historic['element_unit'] = balance_historic[['element', 'unit']].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
production['element_unit'] = production[['element', 'unit']].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

In [6]:
def restructure_pivot(df):
       
    df_pivot = df.pivot_table(index = ['sub_region_name','area','iso_alpha3_code','least_developed_countries_ldc','year','item_code_cpc','item'], columns=['element_unit'], aggfunc = {'value':'sum'})
    
    col_list = []
    for col in df_pivot.columns:
        col_list.append(col[1])

    df_pivot.columns = col_list

    df_pivot = df_pivot.reset_index()
    
    return df_pivot

In [7]:
balance_pivot = restructure_pivot(balance)
balance_pivot.head()

Unnamed: 0,sub_region_name,area,iso_alpha3_code,least_developed_countries_ldc,year,item_code_cpc,item,Export Quantity_1000 tonnes,Fat supply quantity _g/capita/day,Food supply _kcal/capita/day,Food_1000 tonnes,Import Quantity_1000 tonnes,Losses_1000 tonnes,Production_1000 tonnes,Protein supply quantity _g/capita/day,Total Population - Both sexes_1000 persons
0,Australia and New Zealand,Australia,AUS,0,2010,'S2501,Population,,,,,,,,,22155.0
1,Australia and New Zealand,Australia,AUS,0,2010,'S2511,Wheat and products,16143.0,2.23,539.91,1510.0,170.0,218.0,21834.0,16.96,
2,Australia and New Zealand,Australia,AUS,0,2010,'S2513,Barley and products,4789.0,0.0,0.0,0.0,7.0,33.0,7865.0,0.0,
3,Australia and New Zealand,Australia,AUS,0,2010,'S2514,Maize and products,11.0,0.12,43.7,100.0,16.0,2.0,328.0,0.95,
4,Australia and New Zealand,Australia,AUS,0,2010,'S2515,Rye and products,0.0,0.0,0.39,1.0,1.0,1.0,29.0,0.01,


In [8]:
def population(df):
    pop = df[['area','year','Total Population - Both sexes_1000 persons']].copy()
    pop = pop.dropna()
    
    df = df.drop(['Total Population - Both sexes_1000 persons'],axis=1)
    df.drop(df[df['item'] == 'Population'].index, inplace = True)
    
    df = pd.merge(df, pop,  how='inner', left_on=['area','year'], right_on = ['area','year'])
    
    return df

In [9]:
balance_pivot = population(balance_pivot)
balance_pivot.head()

Unnamed: 0,sub_region_name,area,iso_alpha3_code,least_developed_countries_ldc,year,item_code_cpc,item,Export Quantity_1000 tonnes,Fat supply quantity _g/capita/day,Food supply _kcal/capita/day,Food_1000 tonnes,Import Quantity_1000 tonnes,Losses_1000 tonnes,Production_1000 tonnes,Protein supply quantity _g/capita/day,Total Population - Both sexes_1000 persons
0,Australia and New Zealand,Australia,AUS,0,2010,'S2511,Wheat and products,16143.0,2.23,539.91,1510.0,170.0,218.0,21834.0,16.96,22155.0
1,Australia and New Zealand,Australia,AUS,0,2010,'S2513,Barley and products,4789.0,0.0,0.0,0.0,7.0,33.0,7865.0,0.0,22155.0
2,Australia and New Zealand,Australia,AUS,0,2010,'S2514,Maize and products,11.0,0.12,43.7,100.0,16.0,2.0,328.0,0.95,22155.0
3,Australia and New Zealand,Australia,AUS,0,2010,'S2515,Rye and products,0.0,0.0,0.39,1.0,1.0,1.0,29.0,0.01,22155.0
4,Australia and New Zealand,Australia,AUS,0,2010,'S2516,Oats,347.0,0.01,0.76,3.0,1.0,23.0,1162.0,0.03,22155.0


In [10]:
balance_pivot.isna().sum()

sub_region_name                                    0
area                                               0
iso_alpha3_code                                    0
least_developed_countries_ldc                      0
year                                               0
item_code_cpc                                      0
item                                               0
Export Quantity_1000 tonnes                    27676
Fat supply quantity _g/capita/day              15829
Food supply _kcal/capita/day                   16027
Food_1000 tonnes                               20801
Import Quantity_1000 tonnes                     8030
Losses_1000 tonnes                            132902
Production_1000 tonnes                         70268
Protein supply quantity _g/capita/day          15785
Total Population - Both sexes_1000 persons         0
dtype: int64

I will fill all nulls with 0.

In [11]:
balance_pivot = balance_pivot.fillna(0)

We will need additional columns for analysis:

In [12]:
balance_pivot.columns

Index(['sub_region_name', 'area', 'iso_alpha3_code',
       'least_developed_countries_ldc', 'year', 'item_code_cpc', 'item',
       'Export Quantity_1000 tonnes', 'Fat supply quantity _g/capita/day',
       'Food supply _kcal/capita/day', 'Food_1000 tonnes',
       'Import Quantity_1000 tonnes', 'Losses_1000 tonnes',
       'Production_1000 tonnes', 'Protein supply quantity _g/capita/day',
       'Total Population - Both sexes_1000 persons'],
      dtype='object')

In [13]:
def new_columns(df):
    df['food_supply_kcal_per_day'] = df['Food supply _kcal/capita/day'] * (1000 * df['Total Population - Both sexes_1000 persons'])
    df['kcal_per_1000tonnes'] = 365 * df['food_supply_kcal_per_day'] / df['Food_1000 tonnes']
    df['import_multiple'] = df['Import Quantity_1000 tonnes'] / df['Food_1000 tonnes']
    df['export_multiple'] = df['Export Quantity_1000 tonnes'] / df['Food_1000 tonnes']
    df['loss_multiple'] = df['Losses_1000 tonnes'] / df['Food_1000 tonnes']
    df['losses_kcal_per_day'] = df['loss_multiple'] * df['food_supply_kcal_per_day']
    df['fat_supply_g_per_day'] = df['Fat supply quantity _g/capita/day'] * (1000 * df['Total Population - Both sexes_1000 persons'])
    df['protein_supply_g_per_day'] = df['Protein supply quantity _g/capita/day'] * (1000 * df['Total Population - Both sexes_1000 persons'])
    return df

In [14]:
balance_final = new_columns(balance_pivot)
balance_final

Unnamed: 0,sub_region_name,area,iso_alpha3_code,least_developed_countries_ldc,year,item_code_cpc,item,Export Quantity_1000 tonnes,Fat supply quantity _g/capita/day,Food supply _kcal/capita/day,Food_1000 tonnes,Import Quantity_1000 tonnes,Losses_1000 tonnes,Production_1000 tonnes,Protein supply quantity _g/capita/day,Total Population - Both sexes_1000 persons,food_supply_kcal_per_day,kcal_per_1000tonnes,import_multiple,export_multiple,loss_multiple,losses_kcal_per_day,fat_supply_g_per_day,protein_supply_g_per_day
0,Australia and New Zealand,Australia,AUS,0,2010,'S2511,Wheat and products,16143.00,2.23,539.91,1510.00,170.0,218.0,21834.00,16.96,22155.00,1.196171e+10,2.891406e+09,0.112583,10.690728,0.144371,1.726922e+09,49405650.0,375748800.0
1,Australia and New Zealand,Australia,AUS,0,2010,'S2513,Barley and products,4789.00,0.00,0.00,0.00,7.0,33.0,7865.00,0.00,22155.00,0.000000e+00,,inf,inf,inf,,0.0,0.0
2,Australia and New Zealand,Australia,AUS,0,2010,'S2514,Maize and products,11.00,0.12,43.70,100.00,16.0,2.0,328.00,0.95,22155.00,9.681735e+08,3.533833e+09,0.160000,0.110000,0.020000,1.936347e+07,2658600.0,21047250.0
3,Australia and New Zealand,Australia,AUS,0,2010,'S2515,Rye and products,0.00,0.00,0.39,1.00,1.0,1.0,29.00,0.01,22155.00,8.640450e+06,3.153764e+09,1.000000,0.000000,1.000000,8.640450e+06,0.0,221550.0
4,Australia and New Zealand,Australia,AUS,0,2010,'S2516,Oats,347.00,0.01,0.76,3.00,1.0,23.0,1162.00,0.03,22155.00,1.683780e+07,2.048599e+09,0.333333,115.666667,7.666667,1.290898e+08,221550.0,664650.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234680,Western Europe,Switzerland,CHE,0,2020,'S2946,Animal fats,11.00,17.89,159.00,72.00,26.0,2.0,105.00,0.14,8654.62,1.376085e+09,6.975984e+09,0.361111,0.152778,0.027778,3.822457e+07,154831151.8,1211646.8
234681,Western Europe,Switzerland,CHE,0,2020,'S2948,Milk - Excluding Butter,589.00,30.74,468.00,2535.00,595.0,0.0,3832.00,29.86,8654.62,4.050362e+09,5.831882e+08,0.234714,0.232347,0.000000,0.000000e+00,266043018.8,258426953.2
234682,Western Europe,Switzerland,CHE,0,2020,'S2949,Eggs,0.00,2.98,42.00,93.00,38.0,6.0,66.00,3.43,8654.62,3.634940e+08,1.426616e+09,0.408602,0.000000,0.064516,2.345123e+07,25790767.6,29685346.6
234683,Western Europe,Switzerland,CHE,0,2020,'S2960,"Fish, Seafood",0.82,1.19,29.00,138.43,134.9,0.0,3.43,4.15,8654.62,2.509840e+08,6.617724e+08,0.974500,0.005924,0.000000,0.000000e+00,10298997.8,35916673.0


In [15]:
# balance_final.to_csv('balance_final.csv')

I will just consider the aggregated food groups for the initial national analysis.

In [16]:
balance_national = balance_final[balance_final['item'].isin(['Grand Total','Vegetal Products','Animal Products'])==True]
balance_national.head(10)

Unnamed: 0,sub_region_name,area,iso_alpha3_code,least_developed_countries_ldc,year,item_code_cpc,item,Export Quantity_1000 tonnes,Fat supply quantity _g/capita/day,Food supply _kcal/capita/day,Food_1000 tonnes,Import Quantity_1000 tonnes,Losses_1000 tonnes,Production_1000 tonnes,Protein supply quantity _g/capita/day,Total Population - Both sexes_1000 persons,food_supply_kcal_per_day,kcal_per_1000tonnes,import_multiple,export_multiple,loss_multiple,losses_kcal_per_day,fat_supply_g_per_day,protein_supply_g_per_day
95,Australia and New Zealand,Australia,AUS,0,2010,'S2901,Grand Total,0.0,151.27,3410.0,0.0,0.0,0.0,0.0,110.87,22155.0,75548550000.0,inf,,,,,3351387000.0,2456325000.0
96,Australia and New Zealand,Australia,AUS,0,2010,'S2903,Vegetal Products,0.0,78.17,2363.0,0.0,0.0,0.0,0.0,36.81,22155.0,52352260000.0,inf,,,,,1731856000.0,815525600.0
111,Australia and New Zealand,Australia,AUS,0,2010,'S2941,Animal Products,0.0,73.11,1048.0,0.0,0.0,0.0,0.0,74.07,22155.0,23218440000.0,inf,,,,,1619752000.0,1641021000.0
214,Australia and New Zealand,Australia,AUS,0,2011,'S2901,Grand Total,0.0,155.21,3390.0,0.0,0.0,0.0,0.0,110.66,22538.0,76403820000.0,inf,,,,,3498123000.0,2494055000.0
215,Australia and New Zealand,Australia,AUS,0,2011,'S2903,Vegetal Products,0.0,81.29,2327.0,0.0,0.0,0.0,0.0,34.82,22538.0,52445930000.0,inf,,,,,1832114000.0,784773200.0
230,Australia and New Zealand,Australia,AUS,0,2011,'S2941,Animal Products,0.0,73.93,1063.0,0.0,0.0,0.0,0.0,75.84,22538.0,23957890000.0,inf,,,,,1666234000.0,1709282000.0
333,Australia and New Zealand,Australia,AUS,0,2012,'S2901,Grand Total,0.0,156.43,3437.0,0.0,0.0,0.0,0.0,110.1,22904.0,78721050000.0,inf,,,,,3582873000.0,2521730000.0
334,Australia and New Zealand,Australia,AUS,0,2012,'S2903,Vegetal Products,0.0,81.71,2368.0,0.0,0.0,0.0,0.0,35.3,22904.0,54236670000.0,inf,,,,,1871486000.0,808511200.0
349,Australia and New Zealand,Australia,AUS,0,2012,'S2941,Animal Products,0.0,74.72,1069.0,0.0,0.0,0.0,0.0,74.8,22904.0,24484380000.0,inf,,,,,1711387000.0,1713219000.0
452,Australia and New Zealand,Australia,AUS,0,2013,'S2901,Grand Total,0.0,152.66,3429.0,0.0,0.0,0.0,0.0,109.4,23255.0,79741400000.0,inf,,,,,3550108000.0,2544097000.0


In [17]:
def restructure_pivot2(df):
       
    df_pivot = df.pivot_table(index = ['sub_region_name','area','iso_alpha3_code','least_developed_countries_ldc','year','Total Population - Both sexes_1000 persons'], columns=['item'],
                                                  aggfunc = {'food_supply_kcal_per_day':'sum',
                                                            'fat_supply_g_per_day':'sum',
                                                            'protein_supply_g_per_day':'sum'})
    
    col_list = []
    for i in range(len(df_pivot.columns)):
        col_list.append(' '.join(df_pivot.columns[i]))

    df_pivot.columns = col_list

    df_pivot = df_pivot.reset_index()
    
    return df_pivot

In [18]:
balance_national_p = restructure_pivot2(balance_national)
# balance_national_p.to_csv('national_balance.csv')
balance_national_p

Unnamed: 0,sub_region_name,area,iso_alpha3_code,least_developed_countries_ldc,year,Total Population - Both sexes_1000 persons,fat_supply_g_per_day Animal Products,fat_supply_g_per_day Grand Total,fat_supply_g_per_day Vegetal Products,food_supply_kcal_per_day Animal Products,food_supply_kcal_per_day Grand Total,food_supply_kcal_per_day Vegetal Products,protein_supply_g_per_day Animal Products,protein_supply_g_per_day Grand Total,protein_supply_g_per_day Vegetal Products
0,Australia and New Zealand,Australia,AUS,0,2010,22155.00,1.619752e+09,3.351387e+09,1.731856e+09,2.321844e+10,7.554855e+10,5.235226e+10,1.641021e+09,2.456325e+09,815525550.0
1,Australia and New Zealand,Australia,AUS,0,2011,22538.00,1.666234e+09,3.498123e+09,1.832114e+09,2.395789e+10,7.640382e+10,5.244593e+10,1.709282e+09,2.494055e+09,784773160.0
2,Australia and New Zealand,Australia,AUS,0,2012,22904.00,1.711387e+09,3.582873e+09,1.871486e+09,2.448438e+10,7.872105e+10,5.423667e+10,1.713219e+09,2.521730e+09,808511200.0
3,Australia and New Zealand,Australia,AUS,0,2013,23255.00,1.725754e+09,3.550108e+09,1.824355e+09,2.455728e+10,7.974140e+10,5.518412e+10,1.717149e+09,2.544097e+09,826947800.0
4,Australia and New Zealand,Australia,AUS,0,2014,23596.00,1.798959e+09,3.732887e+09,1.933928e+09,2.520053e+10,8.053315e+10,5.533262e+10,1.728171e+09,2.544357e+09,816185640.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,Western Europe,Switzerland,CHE,0,2016,8380.00,7.507642e+08,1.300492e+09,5.498118e+08,9.511300e+09,2.819032e+10,1.867902e+10,5.121856e+08,8.100946e+08,297825200.0
1976,Western Europe,Switzerland,CHE,0,2017,8456.00,7.565583e+08,1.309073e+09,5.525150e+08,9.580648e+09,2.864893e+10,1.906828e+10,5.123490e+08,8.195555e+08,307206480.0
1977,Western Europe,Switzerland,CHE,0,2018,8526.00,7.563415e+08,1.346341e+09,5.899992e+08,9.591750e+09,2.880935e+10,1.920908e+10,5.225585e+08,8.301766e+08,307532820.0
1978,Western Europe,Switzerland,CHE,0,2019,8591.36,7.291487e+08,1.359067e+09,6.299185e+08,9.313034e+09,2.909034e+10,1.977731e+10,5.136774e+08,8.289803e+08,315302912.0


Do the same for the historic data

In [19]:
balance_historic_pivot = restructure_pivot(balance_historic)
balance_historic_pivot1 = population(balance_historic_pivot)
balance_historic_pivot2 = balance_historic_pivot1.fillna(0)
balance_historic_final = new_columns(balance_historic_pivot2)

balance_historic_national = balance_historic_final[balance_historic_final['item'].isin(['Grand Total','Vegetal Products','Animal Products'])==True]
balance_historic_national_p = restructure_pivot2(balance_historic_national)

balance_historic_national_p.head()

Unnamed: 0,sub_region_name,area,iso_alpha3_code,least_developed_countries_ldc,year,Total Population - Both sexes_1000 persons,fat_supply_g_per_day Animal Products,fat_supply_g_per_day Grand Total,fat_supply_g_per_day Vegetal Products,food_supply_kcal_per_day Animal Products,food_supply_kcal_per_day Grand Total,food_supply_kcal_per_day Vegetal Products,protein_supply_g_per_day Animal Products,protein_supply_g_per_day Grand Total,protein_supply_g_per_day Vegetal Products
0,Australia and New Zealand,Australia,AUS,0,1961,10495.0,995450700.0,1168408000.0,172957600.0,13118750000.0,32440040000.0,19321300000.0,713135250.0,1100821000.0,387685300.0
1,Australia and New Zealand,Australia,AUS,0,1962,10691.0,1024625000.0,1194933000.0,170307630.0,13598950000.0,33281080000.0,19682130000.0,755853700.0,1153345000.0,397384470.0
2,Australia and New Zealand,Australia,AUS,0,1963,10893.0,1028517000.0,1221432000.0,192915030.0,13670720000.0,34519920000.0,20849200000.0,769372590.0,1188208000.0,418835850.0
3,Australia and New Zealand,Australia,AUS,0,1964,11115.0,1022691000.0,1228986000.0,206405550.0,13849290000.0,34967790000.0,21118500000.0,805948650.0,1225540000.0,419591250.0
4,Australia and New Zealand,Australia,AUS,0,1965,11368.0,1038126000.0,1253436000.0,215423600.0,14028110000.0,35183960000.0,21155850000.0,810311040.0,1225584000.0,415273040.0


#### The first notable finding is that in 2020, globally enough food was available to feed 12.8bn people vs a population of 7.8bn. That's a surplus that could support 5bn people.

#### Yet, over 10% population lived in extreme food poverty.

In [20]:
# using WHO daily calory requirement for 'average person'
daily_cal = 1800
potential = balance_national_p[balance_national_p['year']==2020]['food_supply_kcal_per_day Grand Total'].sum() / (cal)
population = balance_national_p[balance_national_p['year']==2020]['Total Population - Both sexes_1000 persons'].sum() *1000

print('Total Population (bn):               ', round(population / 1000000000,2))
print('Population that could be fed (bn):   ', round(potential / 1000000000,2))

NameError: name 'cal' is not defined

### Compare these historic levels against the famine dataset

In [None]:
famines = pd.read_csv('processed data/famines.csv').drop(['Unnamed: 0'], axis = 1)
famines

In [None]:
years_affected = []
for row in range(len(famines)):
    year_list = []
    for i in range(famines['duration'][row]):
        year_list.append(famines['startdate'][row]+i)
    years_affected.append(year_list)

famines['years_affected'] = years_affected
famines

In [None]:
famines.excessmortality_midpoint = famines.excessmortality_midpoint.str.replace(',' , '').astype(int)
famines.dtypes

In [None]:
famines_df = pd.DataFrame(columns = ['area','year','mortality','famine'])

area_rows = []
year_rows = []
mortality_rows = []
famine_rows = []

for row in range(len(famines)):
    for year in famines.years_affected[row]:
        location = famines.location[row]
        year = year
        mortality = famines.excessmortality_midpoint[row] / len(famines.years_affected[row])
        famine = 1
        
        area_rows.append(location)
        year_rows.append(year)
        mortality_rows.append(mortality)
        famine_rows.append(famine)

display(len(area_rows))
display(len(year_rows))
display(len(mortality_rows))
display(len(famine_rows))

In [None]:
famines_df.area = area_rows
famines_df.year = year_rows
famines_df.mortality = mortality_rows
famines_df.famine = famine_rows

famines_df.head(10)

In [None]:
famines_df[famines_df.year > 1960].area.value_counts()

In [None]:
country_list = balance_historic_national_p.area.unique()
country_list

In [None]:
famines_df.area = famines_df.area.replace({"Democratic Republic of Congo":"Congo",
                                           "West Africa (Sahel)":"Niger",
                                           "North Korea":"Democratic People's Republic of Korea",
                                           "Ethiopia (Wallo & Tigray)":"Ethiopia",
                                           "Sudan (Darfur)":"Sudan",
                                           "Nigeria (Biafra)":"Nigeria",
                                           "India (Maharashtra)":"India",
                                           "Sudan (Darfur, Kordofan)":"Sudan",
                                           "Sudan (Baht el Ghazal)":"Sudan",
                                           "Sudan (south)":"Sudan",
                                           "Ethiopia (Wallo)":"Ethiopia"})

famines_df

In [None]:
balance_historic_national_p = pd.merge(left = balance_historic_national_p,
                                       right = famines_df,
                                       how = 'left',
                                       left_on=['area','year'],
                                       right_on = ['area','year'])
balance_historic_national_p = balance_historic_national_p.fillna(0)
balance_historic_national_p.head()

In [None]:
# balance_historic_national_p.to_csv('national_balance_historic.csv')

In [None]:
balance_historic_national_p.dtypes

In [None]:
# just for famine years
famine_balance = balance_historic_national_p[balance_historic_national_p['famine']==1].copy()

famine_balance['kcal_per_capita_per_day'] = famine_balance['food_supply_kcal_per_day Grand Total'] / (1000*famine_balance['Total Population - Both sexes_1000 persons'])
famine_balance

In [None]:
sns.scatterplot(data = famine_balance, x = 'year', y = 'kcal_per_capita_per_day', hue = 'area')

In [None]:
famine_countries = list(famine_balance.area.unique())
famine_countries

In [None]:
famine_balance_historic = balance_historic_national_p[balance_historic_national_p['area'].isin(famine_countries)==True].copy()
# famine_balance_historic.to_csv('famine_balance_historic.csv')

In [None]:
famine_balance_historic.famine.value_counts()

### Recent national food balances

In [None]:
balance_national_p.head()

In [None]:
balance_national_p.least_developed_countries_ldc.value_counts()

In [None]:
balance_national_p['least_developed_countries_ldc'] = balance_national_p['least_developed_countries_ldc'].str.replace('x','1').astype(int)

In [None]:
balance_national_p['kcal_per_capita_per_day'] = balance_national_p['food_supply_kcal_per_day Grand Total'] / (1000 * balance_national_p['Total Population - Both sexes_1000 persons'])
balance_national_p['protein_g_per_capita_per_day'] = balance_national_p['protein_supply_g_per_day Grand Total'] / (1000 * balance_national_p['Total Population - Both sexes_1000 persons'])
balance_national_p['fat_g_per_capita_per_day'] = balance_national_p['fat_supply_g_per_day Grand Total'] / (1000 * balance_national_p['Total Population - Both sexes_1000 persons'])
balance_national_p['animal_proportion'] = balance_national_p['food_supply_kcal_per_day Animal Products'] / balance_national_p['food_supply_kcal_per_day Grand Total']
balance_national_p['fat_g_per_kcal'] = balance_national_p['fat_g_per_capita_per_day'] / balance_national_p['kcal_per_capita_per_day']
balance_national_p['protein_g_per_kcal'] = balance_national_p['protein_g_per_capita_per_day'] / balance_national_p['kcal_per_capita_per_day']

In [None]:
key_columns = ['kcal_per_capita_per_day','protein_g_per_capita_per_day',
               'fat_g_per_capita_per_day','animal_proportion',
               'fat_g_per_kcal','protein_g_per_kcal',
               'least_developed_countries_ldc']

corr_matrix=balance_national_p[key_columns].corr(method='pearson')
fig, ax = plt.subplots(figsize=(16,12))
ax = sns.heatmap(corr_matrix)
plt.show()

In [None]:
corr_matrix

In [None]:
# let's cluster the countries into like groups

In [None]:
cluster_data = balance_national_p.groupby(['area']).agg({'kcal_per_capita_per_day':np.mean,
                                                         'protein_g_per_capita_per_day':np.mean,
                                                         'fat_g_per_capita_per_day':np.mean,
                                                         'animal_proportion':np.mean,
                                                         'fat_g_per_kcal':np.mean,
                                                         'protein_g_per_kcal':np.mean,
                                                         'Total Population - Both sexes_1000 persons':np.mean}).reset_index()
cluster_data

In [None]:
balance_national_p[balance_national_p['area']=='Afghanistan']

In [None]:
balance_national_p[balance_national_p['area']=='France']

In [None]:
from sklearn.preprocessing import MinMaxScaler

X = cluster_data.drop(['area'],axis=1)
y = cluster_data['area']

scaler = MinMaxScaler()

X_scaled = scaler.fit_transform(X)
X_scaled

In [None]:
from sklearn import cluster
from sklearn.cluster import KMeans

K = range(2, 20)
inertia = []

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=1234, n_init = 10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Elbow Method showing the optimal k')

In [None]:
from sklearn.metrics import silhouette_score
K = range(2, 20)
silhouette = []

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=1234, n_init = 10)
    kmeans.fit(X_scaled)
    silhouette.append(silhouette_score(X_scaled, kmeans.predict(X_scaled)))


plt.figure(figsize=(16,8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(np.arange(min(K), max(K)+1, 1.0))
plt.title('Slhouette score showing the optimal k')

In [None]:
kmeans = KMeans(n_clusters=5, random_state=1234, n_init = 10)
kmeans.fit(X_scaled)
cluster = kmeans.predict(X_scaled)

cluster_data['cluster'] = cluster

summary = cluster_data.groupby(['cluster','area']).agg({'kcal_per_capita_per_day':np.mean,
                                              'protein_g_per_capita_per_day':np.mean,
                                              'fat_g_per_capita_per_day':np.mean,
                                              'animal_proportion':np.mean,
                                              'fat_g_per_kcal':np.mean,
                                              'protein_g_per_kcal':np.mean,
                                              'Total Population - Both sexes_1000 persons':np.mean}).reset_index()

pd.set_option('display.max_rows', None)
display(summary)
pd.set_option('display.max_rows', 30)

In [None]:
# summary.to_csv('balance_summary.csv')

### Producers, exporters, importers

In [None]:
data = pd.read_csv('processed data/balance_final.csv').drop(['Unnamed: 0'], axis = 1)

data

In [None]:
producers = data[(data['item_code_cpc'].str[:4]=="'S29")&
                 (data['item'].isin(['Grand Total','Vegetal Products','Animal Products'])==False)]

producers

In [None]:
producers.isna().sum()

In [None]:
producers.dtypes

In [None]:
producers['kcal_per_1000tonnes']=producers['kcal_per_1000tonnes'].replace(np.inf,None)

item_average = pd.DataFrame(producers.groupby('item')['kcal_per_1000tonnes'].mean()).reset_index()

item_average.rename(columns={'kcal_per_1000tonnes': 'average'}, inplace=True)

item_average

In [None]:
producers = pd.merge(producers, item_average, on='item', how='left')
producers["kcal_per_1000tonnes"].fillna(producers["average"], inplace=True)

In [None]:
producers.columns

In [None]:
producers['kcal_produced'] = producers['Production_1000 tonnes'] * producers["kcal_per_1000tonnes"]
producers['kcal_imported'] = producers['Import Quantity_1000 tonnes'] * producers["kcal_per_1000tonnes"]
producers['kcal_exported'] = producers['Export Quantity_1000 tonnes'] * producers["kcal_per_1000tonnes"]
producers['kcal_lost'] = producers['Losses_1000 tonnes'] * producers["kcal_per_1000tonnes"]

In [None]:
trade = producers[['sub_region_name', 'area', 'iso_alpha3_code',
                  'least_developed_countries_ldc', 'year', 'item_code_cpc',
                  'Total Population - Both sexes_1000 persons',
                  'Food supply _kcal/capita/day',
                  'kcal_produced',
                  'kcal_imported',
                  'kcal_exported',
                  'kcal_lost']].copy()

# trade.to_csv('trade.csv')

trade.isna().sum()

In [None]:
trade.groupby('year')['kcal_produced','kcal_imported','kcal_exported','kcal_lost'].sum()

### Country Overview

In [None]:
# look at average of 2010 to 2020 for each country

In [None]:
table = trade.groupby(['iso_alpha3_code','year']).agg({'Total Population - Both sexes_1000 persons':np.mean,'kcal_produced':sum, 'kcal_imported':sum, 'kcal_exported':sum, 'kcal_lost':sum}).reset_index()

totals = table.groupby(['iso_alpha3_code']).agg({'Total Population - Both sexes_1000 persons':np.mean,'kcal_produced':np.mean, 'kcal_imported':np.mean, 'kcal_exported':np.mean, 'kcal_lost':np.mean}).reset_index()
totals

In [None]:
totals['population'] = 1000* totals['Total Population - Both sexes_1000 persons']
totals['requirement'] = daily_cal * totals['population']
totals = totals.drop(['Total Population - Both sexes_1000 persons'], axis=1)
totals.head()

In [None]:
totals['self_sufficient'] = np.where(totals['kcal_produced']>totals['requirement'],1,0)
totals.head(20)