# Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
def _read_processed(name):
    """Read 'processed data/<name>.csv' and drop its saved-index column."""
    frame = pd.read_csv(f'processed data/{name}.csv', encoding='ISO-8859-1')
    return frame.drop(columns=['Unnamed: 0'])


# Every processed CSV carries an 'Unnamed: 0' column that is discarded on load.
production = _read_processed('production')
balance_historic = _read_processed('balance_historic')
balance = _read_processed('balance')
countries = _read_processed('countries')

# Row/column counts of the three fact tables.
for label, frame in (('production', production),
                     ('balance', balance),
                     ('balance_historic', balance_historic)):
    print(f'{label} shape: ', frame.shape)

# Preview two of the fact tables, then the country lookup table.
display(production.head(), balance_historic.head())
countries.head()

production shape:  (4837117, 7)
balance shape:  (1958440, 7)
balance_historic shape:  (8119335, 7)


Unnamed: 0,area,item_code_(cpc),item,element,unit,year,value
0,Afghanistan,'01371,"Almonds, in shell",Area harvested,ha,1961,0.0
1,Afghanistan,'01371,"Almonds, in shell",Yield,hg/ha,1961,0.0
2,Afghanistan,'01371,"Almonds, in shell",Production,tonnes,1961,0.0
3,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,ha,1961,0.0
4,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Yield,hg/ha,1961,0.0


Unnamed: 0,area,item_code_(cpc),item,element,unit,year,value
0,Afghanistan,'S2501,Population,Total Population - Both sexes,1000 persons,1961,8954.0
1,Afghanistan,'S2901,Grand Total,Food supply (kcal/capita/day),kcal/capita/day,1961,2999.0
2,Afghanistan,'S2901,Grand Total,Protein supply quantity (g/capita/day),g/capita/day,1961,84.91
3,Afghanistan,'S2901,Grand Total,Fat supply quantity (g/capita/day),g/capita/day,1961,37.51
4,Afghanistan,'S2903,Vegetal Products,Food supply (kcal/capita/day),kcal/capita/day,1961,2752.0


Unnamed: 0,region_name,sub_region_name,country_or_area,iso_alpha3_code,least_developed_countries_ldc
0,Africa,Northern Africa,Algeria,DZA,
1,Africa,Northern Africa,Egypt,EGY,
2,Africa,Northern Africa,Libya,LBY,
3,Africa,Northern Africa,Morocco,MAR,
4,Africa,Northern Africa,Sudan,SDN,x


In [8]:
# Attach the region / sub-region / ISO code / LDC flag from `countries` to each
# dataset. The join is an inner join on the area name, so rows whose `area`
# has no matching `country_or_area` are dropped. The duplicate key column is
# removed afterwards.
# NOTE: each frame is overwritten with its merged version, so this cell is
# meant to run once per kernel session.
_country_join = dict(how='inner', left_on='area', right_on='country_or_area')

production = production.merge(countries, **_country_join).drop(columns='country_or_area')
balance = balance.merge(countries, **_country_join).drop(columns='country_or_area')
balance_historic = balance_historic.merge(countries, **_country_join).drop(columns='country_or_area')

production.head()

Unnamed: 0,area,item_code_(cpc),item,element,unit,year,value,region_name,sub_region_name,iso_alpha3_code,least_developed_countries_ldc
0,Afghanistan,'01371,"Almonds, in shell",Area harvested,ha,1961,0.0,Asia,Southern Asia,AFG,x
1,Afghanistan,'01371,"Almonds, in shell",Yield,hg/ha,1961,0.0,Asia,Southern Asia,AFG,x
2,Afghanistan,'01371,"Almonds, in shell",Production,tonnes,1961,0.0,Asia,Southern Asia,AFG,x
3,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,ha,1961,0.0,Asia,Southern Asia,AFG,x
4,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Yield,hg/ha,1961,0.0,Asia,Southern Asia,AFG,x


In the production and balance datasets, useful *features* are buried within the `element` column. I will combine each element with its `unit` and turn the combined labels into columns of their own.

In [9]:
# Build an "<element>_<unit>" label for every row of each dataset.
# The concatenation is done column-wise: the previous row-wise
# `apply(..., axis=1)` ran a Python lambda once per row, which is very slow on
# frames with millions of rows. `map(str)` stringifies each value the same way
# the row-wise version did (missing values become the text 'nan').
for _frame in (balance, balance_historic, production):
    _frame['element_unit'] = _frame['element'].map(str) + '_' + _frame['unit'].map(str)

In [11]:
# Wide view of `balance`: one row per area, one column per element, each cell
# holding the sum of `value` over every row that falls in that area/element.
balance_p = pd.DataFrame(
    balance.pivot_table(
        index=['area'],
        columns=['element'],
        aggfunc={'value': 'sum'},
    )
)

Unnamed: 0,area,item_code_(cpc),item,element,unit,year,value,region_name,sub_region_name,iso_alpha3_code,least_developed_countries_ldc,element_unit
0,Afghanistan,'01371,"Almonds, in shell",Area harvested,ha,1961,0.0,Asia,Southern Asia,AFG,x,Area harvested_ha
1,Afghanistan,'01371,"Almonds, in shell",Yield,hg/ha,1961,0.0,Asia,Southern Asia,AFG,x,Yield_hg/ha
2,Afghanistan,'01371,"Almonds, in shell",Production,tonnes,1961,0.0,Asia,Southern Asia,AFG,x,Production_tonnes
3,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,ha,1961,0.0,Asia,Southern Asia,AFG,x,Area harvested_ha
4,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Yield,hg/ha,1961,0.0,Asia,Southern Asia,AFG,x,Yield_hg/ha
5,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Production,tonnes,1961,0.0,Asia,Southern Asia,AFG,x,Production_tonnes
6,Afghanistan,'01341,Apples,Area harvested,ha,1961,2220.0,Asia,Southern Asia,AFG,x,Area harvested_ha
7,Afghanistan,'01341,Apples,Yield,hg/ha,1961,68018.0,Asia,Southern Asia,AFG,x,Yield_hg/ha
8,Afghanistan,'01341,Apples,Production,tonnes,1961,15100.0,Asia,Southern Asia,AFG,x,Production_tonnes
9,Afghanistan,'01343,Apricots,Area harvested,ha,1961,4820.0,Asia,Southern Asia,AFG,x,Area harvested_ha


In [None]:
# Total `value` for every (area, element) pair in the production dataset.
# Fix: the frame loaded and merged above is named `production`; the previous
# `production_data` is never defined in this notebook and raised NameError on
# a fresh kernel. The aggregation is passed by name ('sum') rather than as the
# Python builtin.
production_overview = production.groupby(['area', 'element']).agg({'value': 'sum'})
production_overview

In [None]:
# Wide view of production: one row per area, one column per element, cells are
# the sum of `value`.
# Fix: pivots `production` (the frame defined above) instead of the undefined
# name `production_data`.
df = production.pivot_table(
    index=['area'],
    columns=['element'],
    aggfunc={'value': 'sum'},
)
df.columns

In [None]:
# Each column label of the pivot is a tuple; keep only its second entry as the
# flat column name.
# NOTE: run once after the pivot cell. Re-running on already-flattened string
# labels would take the second character of each name instead.
col_list = [col[1] for col in df.columns]
df.columns = col_list

# Keep the three elements of interest and turn `area` back into a column.
df = df[['Area harvested', 'Producing Animals/Slaughtered', 'Production']].reset_index()
df

In [None]:
# Summed production against summed harvested area, one point per row of `df`.
sns.scatterplot(
    x='Area harvested',
    y='Production',
    data=df,
)

In [None]:
# Fresh copy of the historic food-balance data with country metadata attached.
balance_hist = pd.read_csv(
    'processed data/balance_historic.csv', encoding='ISO-8859-1'
).drop(['Unnamed: 0'], axis=1)

# Inner join on the area name: rows whose `area` has no match in `countries`
# are dropped. The duplicate key column is removed afterwards.
balance_hist = pd.merge(
    left=balance_hist,
    right=countries,
    how='inner',
    left_on='area',
    right_on='country_or_area',
).drop('country_or_area', axis=1)

# Combined labels "<item>_<element>" and "<element>_<unit>".
# These are built column-wise; the previous row-wise `apply(..., axis=1)` ran a
# Python lambda once per row, which is very slow on a frame of this size.
# `map(str)` stringifies values the same way (missing values become 'nan').
balance_hist['item_element'] = balance_hist['item'].map(str) + '_' + balance_hist['element'].map(str)
balance_hist['element_unit'] = balance_hist['element'].map(str) + '_' + balance_hist['unit'].map(str)

balance_hist.head(10)

In [None]:
# Distribution of the 'Grand Total' food supply (kcal/capita/day) values across
# all areas and years.
is_grand_total_kcal = (
    (balance_hist['element'] == 'Food supply (kcal/capita/day)')
    & (balance_hist['item'] == 'Grand Total')
)
balance_hist.loc[is_grand_total_kcal, 'value'].describe().T

In [None]:
# One row per (sub-region, area, year), one column per element, cells are the
# sum of `value`.
#
# Fixes:
# * The sub-region column is named 'sub_region_name' (see the countries table);
#   the previous 'sub-region_name' raised KeyError.
# * Only 'Grand Total' and 'Population' rows are kept. `item` is not part of
#   the pivot index, so every item that is kept gets summed into the same cell.
#   In the FAO food balance sheets 'Grand Total' is the aggregate of 'Vegetal
#   Products' and 'Animal Products', so keeping all three would count the food
#   supply twice. NOTE(review): confirm against the FAOSTAT item definitions.
_totals_only = balance_hist[balance_hist['item'].isin(['Grand Total', 'Population'])]
balance_hist_pivot = _totals_only.pivot_table(
    index=['sub_region_name', 'area', 'year'],
    columns=['element'],
    aggfunc={'value': 'sum'},
)
balance_hist_pivot.columns

In [None]:
# Each column label of the pivot is a tuple; keep only its second entry as the
# flat column name.
# NOTE: run once after the pivot cell. Re-running on already-flattened string
# labels would take the second character of each name instead.
col_list = [col[1] for col in balance_hist_pivot.columns]
balance_hist_pivot.columns = col_list

# Keep per-capita food supply and population, and turn the index levels back
# into ordinary columns.
balance_hist_pivot = balance_hist_pivot[
    ['Food supply (kcal/capita/day)', 'Total Population - Both sexes']
].reset_index()
balance_hist_pivot

In [None]:
# Population is recorded in units of 1000 persons, hence the factor of 1000
# when converting per-capita supply into a total and population into persons.
per_capita_kcal = balance_hist_pivot['Food supply (kcal/capita/day)']
population_thousands = balance_hist_pivot['Total Population - Both sexes']

balance_hist_pivot['Food supply (kcal/day)'] = 1000 * per_capita_kcal * population_thousands
balance_hist_pivot['Total population'] = 1000 * population_thousands

balance_hist_pivot

In [None]:
# Population-weighted food supply per sub-region and year: add up total kcal
# and total population over the member areas, then divide.
# Fixes: group on 'sub_region_name' (the previous 'sub-region_name' raised
# KeyError), and pass the aggregation by name instead of the Python builtin.
summary = balance_hist_pivot.groupby(['sub_region_name', 'year']).agg(
    {'Food supply (kcal/day)': 'sum', 'Total population': 'sum'}
)
summary_df = summary.reset_index()
summary_df['Food supply (kcal/capita/day)'] = (
    summary_df['Food supply (kcal/day)'] / summary_df['Total population']
)
summary_df

In [None]:
# Per-capita food supply over time, one line per sub-region.
# Fix: the hue column is 'sub_region_name'; 'sub-region_name' is not a column
# of `summary_df`.
sns.lineplot(
    data=summary_df,
    x='year',
    y='Food supply (kcal/capita/day)',
    hue='sub_region_name',
)

In [None]:
# Summary statistics of `summary_df`, transposed so each described column is a row.
summary_df.describe().T

In [None]:
# Since 1980, every sub-region has had an average food supply of 2,000+ kcal/capita/day.
# Food supply is increasing in all sub-regions.

In [None]:
# Row counts per value of the least-developed-country flag, including rows
# where the flag is missing.
# Fix: the column is named 'least_developed_countries_ldc' (see the countries
# table); the previous 'least_developed_countries_(ldc)' raised KeyError.
balance_hist['least_developed_countries_ldc'].value_counts(dropna=False)

In [None]:
# 'Grand Total' food supply (kcal/capita/day) for areas flagged 'x' in the
# least-developed-country column, plotted over time with one line per area.
# Fix: the flag column is 'least_developed_countries_ldc'; the previous
# 'least_developed_countries_(ldc)' raised KeyError.
is_ldc_total_kcal = (
    (balance_hist['least_developed_countries_ldc'] == 'x')
    & (balance_hist['element'] == 'Food supply (kcal/capita/day)')
    & (balance_hist['item'] == 'Grand Total')
)
ldc = balance_hist[is_ldc_total_kcal]
sns.lineplot(data=ldc, x='year', y='value', hue='area')

In [None]:
# Summary statistics of the `ldc` subset, transposed so each described column is a row.
ldc.describe().T

In [None]:
# Preview the first rows of `balance_hist` before pivoting it.
balance_hist.head()

In [None]:
# One row per (sub-region, area, year, item), one column per "<element>_<unit>"
# label, cells are the sum of `value`.
# Fix: the sub-region column is 'sub_region_name'; the previous
# 'sub-region_name' raised KeyError.
pivot = balance_hist.pivot_table(
    index=['sub_region_name', 'area', 'year', 'item_code_(cpc)', 'item'],
    columns=['element_unit'],
    aggfunc={'value': 'sum'},
)
pivot.columns

In [None]:
# Each column label of the pivot is a tuple; keep only its second entry as the
# flat column name.
# NOTE: run once after the pivot cell. Re-running on already-flattened string
# labels would take the second character of each name instead.
col_list = [col[1] for col in pivot.columns]
pivot.columns = col_list

# Turn the index levels back into ordinary columns.
pivot = pivot.reset_index()
pivot

In [None]:
# Population lookup: (area, year, population) for the rows of `pivot` that
# actually carry a population value. All other rows are discarded.
population_columns = ['area', 'year', 'Total Population - Both sexes_1000 persons']
pop = pivot[population_columns].dropna()
pop

In [None]:
# Index labels of the rows whose item is 'Population'.
drop_index = pivot.index[pivot['item'] == 'Population']

# Working copy of the pivot without the population column.
pivot_test = pivot.drop(columns=['Total Population - Both sexes_1000 persons'])
pivot_test

In [None]:
# Remove the rows whose item is 'Population' (dropped by index label).
population_rows = pivot_test.index[pivot_test['item'] == 'Population']
pivot_test = pivot_test.drop(index=population_rows)
pivot_test.head()

In [None]:
# (rows, columns) of `pivot_test`.
pivot_test.shape

In [None]:
# Spot check: population value(s) recorded for Australia in 1961.
is_australia_1961 = (pop['area'] == 'Australia') & (pop['year'] == 1961)
pop.loc[is_australia_1961, 'Total Population - Both sexes_1000 persons']

In [None]:
# Display `pivot_test` as it stands before `pop` is merged into it.
pivot_test

In [None]:
# Bring the population back as a column on every item row, matched on area and
# year. The join is an outer join, so unmatched keys from either side are kept.
pivot_test = pivot_test.merge(pop, how='outer', on=['area', 'year'])
pivot_test

In [None]:
# Derived metrics per (area, year, item) row.
#
# Population is stored in units of 1000 persons, so every per-capita quantity
# is multiplied by 1000 * population to obtain a total.
# Fix: the fat and protein totals previously omitted the factor of 1000 that
# the kcal total applies, making them 1000x too small for columns named
# "..._g_per_day".
#
# The ratio columns divide by 'Food_1000 tonnes'. Rows where that value is 0
# or missing yield inf/NaN rather than raising.
population_thousands = pivot_test['Total Population - Both sexes_1000 persons']
food_1000_tonnes = pivot_test['Food_1000 tonnes']

# Total energy supplied per day, and energy per 1000 tonnes of food over a year.
pivot_test['food_supply_kcal_per_day'] = (
    1000 * pivot_test['Food supply (kcal/capita/day)_kcal/capita/day'] * population_thousands
)
pivot_test['kcal_per_1000tonnes'] = 365 * pivot_test['food_supply_kcal_per_day'] / food_1000_tonnes

# Imports, exports and losses expressed as multiples of the food quantity.
pivot_test['import_multiple'] = pivot_test['Import Quantity_1000 tonnes'] / food_1000_tonnes
pivot_test['export_multiple'] = pivot_test['Export Quantity_1000 tonnes'] / food_1000_tonnes
pivot_test['loss_multiple'] = pivot_test['Losses_1000 tonnes'] / food_1000_tonnes

# Losses converted to energy using the same kcal-per-food ratio.
pivot_test['losses_kcal_per_day'] = pivot_test['loss_multiple'] * pivot_test['food_supply_kcal_per_day']

# Total fat and protein supplied per day, in grams.
pivot_test['fat_supply_g_per_day'] = (
    1000 * pivot_test['Fat supply quantity (g/capita/day)_g/capita/day'] * population_thousands
)
pivot_test['protein_supply_g_per_day'] = (
    1000 * pivot_test['Protein supply quantity (g/capita/day)_g/capita/day'] * population_thousands
)