In [2]:
import pandas as pd
import numpy as np
from data_processing import load_dataframe
import os

# Investigating missing values: years

In [11]:
# Build every path with os.path.join so the separators are correct on any OS.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# later cells read it.
dir = os.path.join(os.getcwd(), "data")
countries = load_dataframe(os.path.join(dir, "Countries.csv"))
# The two code columns of the areas table are dropped right after loading.
areas = load_dataframe(os.path.join(dir, "Areas.csv")).drop(['m49code', 'iso2code'], axis=1)

# Looking at countries with missing year records

In [141]:
# Preview the first row of `sample`.
# NOTE(review): in the cells shown here, `sample` is first assigned in a later
# cell, so on a fresh kernel (Restart & Run All) this cell raises NameError —
# run the loading cell first, or move this preview below it.
sample.head(1)

Unnamed: 0,areacode,area,itemcode,item,elementcode,element,yearcode,year,unit,value,flag,note
0,2,Afghanistan,5058,Enteric Fermentation,7225,Emissions (CH4),1961,1961,Gigagrams,240.6831,A,


In [143]:
# Ideally this would use the fully merged dataframe;
# for now a single emissions file is loaded as a sample.
# os.path.join keeps the path valid on every OS.
sample_path = os.path.join(
    dir,
    "emissions_agriculture",
    "Emissions_Agriculture_Agriculture_total_E_All_Data_(Normalized).csv",
)
# Drop the code columns and the note column right after loading.
sample = load_dataframe(sample_path).drop(
    ["areacode", "itemcode", "elementcode", "yearcode", "note"], axis=1
)
# First and last year present in the file
print(sample.year.min(), sample.year.max())

1961 2050


In [152]:
# Remove data points occurring after 2017: keep years up to and including
# LAST_YEAR, so the filter matches the stated cutoff exactly.
LAST_YEAR = 2017
sample = sample[sample.year <= LAST_YEAR]
# First and last year remaining after the filter
print(sample.year.min(), sample.year.max())

1961 2017


In [101]:
# Count the year records of every (area, item, element) series, then
# summarise those counts.
# Clearly, different countries have a different number of values over the
# total year range.
(
    sample
    .groupby(["area", "item", "element"])["year"]
    .count()
    .reset_index()
    .describe()
)

Unnamed: 0,year
count,10601.0
mean,46.935289
std,14.938729
min,1.0
25%,28.0
50%,57.0
75%,57.0
max,57.0


In [104]:
# Length of the full year range, both ends included.
# The 50% quantile of the per-series counts is 57, as expected;
# the series with fewer records still need to be investigated.
first_year = sample.year.min()
last_year = sample.year.max()
last_year - first_year + 1

57

# Explaining all country formation + dissolution

In [21]:
# Areas that have a start year recorded, and areas that have an end year recorded
start = areas[areas.startyear.notnull()]
end = areas[areas.endyear.notnull()]
# Distinct start years and end years found in the table
start_years = start.startyear.unique()
end_years = end.endyear.unique()
print(" start : {} \n \n end : {}".format(start_years, end_years))

 start : [1992. 2000. 1993. 1991. 2006. 2012.] 
 
 end : [1999. 1992. 2010. 1990. 2005. 2011. 1991.]


In [16]:
# Names of the areas with an end year / a start year; these are the ones
# that have to be accounted for.
check_end = [name for name in end.country]
check_start = [name for name in start.country]

In [59]:
# Every split is described by two lists: the original area (suffix `_og`)
# and the new areas that replaced it.

# Yugoslavia split in 1991 --> 1992, new countries
ysfr_og = ['Yugoslav SFR']
ysfr = [
    'Bosnia and Herzegovina',
    'Croatia',
    'North Macedonia',
    'Serbia and Montenegro',
    'Slovenia',
]

# USSR split into different countries in 1991 --> 1992, new countries
ussr_og = ['USSR']
ussr = [
    'Lithuania', 'Estonia', 'Latvia',
    'Armenia', 'Georgia', 'Ukraine', 'Belarus',
    'Republic of Moldova', 'Kyrgyzstan', 'Uzbekistan',
    'Tajikistan', 'Turkmenistan',
    'Kazakhstan', 'Azerbaijan', 'Russian Federation',
]

# Belgium-Luxembourg split in 1999 --> 2000, new countries
belux_og = ['Belgium-Luxembourg']
belux = ['Belgium', 'Luxembourg']

# Serbia and Montenegro split from Yugoslavia, then split again
# in 2005 --> 2006, new countries
serbmont_og = ['Serbia and Montenegro']
serbmont = ['Serbia', 'Montenegro']

# Czechoslovakia dissolution in 1992 --> 1993, new countries
czeslo_og = ['Czechoslovakia']
czeslo = ['Czechia', 'Slovakia']

# South Sudan independence in 2011 --> 2012, new countries
sudan_og = ['Sudan (former)']
sudan = ['South Sudan', 'Sudan']

# 1992 Eritrea independence referendum --> 1993, new countries
ethiopia_og = ['Ethiopia PDR']
ethiopia = ['Ethiopia', 'Eritrea']

# Around 1990 the USA stops administering this territory --> 1991, new countries
pacific_og = ['Pacific Islands Trust Territory']
pacific = [
    'Marshall Islands',
    'Micronesia (Federated States of)',
    'Northern Mariana Islands',
    'Palau',
]

# Netherlands Antilles dissolved in 2010 --> 2011, part of the Netherlands
n_antilles_og = ['Netherlands Antilles (former)']

In [48]:
# Despite their names both are plain lists.
# end_set: every new area created by one of the splits.
end_set = [*ysfr, *ussr, *belux, *serbmont, *czeslo, *sudan, *ethiopia, *pacific]
# start_set: every original area that was split up or dissolved.
start_set = [
    *ysfr_og, *ussr_og, *belux_og, *serbmont_og, *czeslo_og,
    *sudan_og, *ethiopia_og, *pacific_og, *n_antilles_og,
]

In [57]:
# Check that every area with an end year or a start year is explained by
# one of the formations / dissolutions collected in start_set and end_set.
# Areas with an end year that are not in start_set:
exploding = set(check_end) - set(start_set)
# Areas with a start year that are not in end_set:
forming = set(check_start) - set(end_set)
# Fail loudly and name the offending areas if anything is left unexplained.
assert not exploding and not forming, (
    "unexplained areas - with end year: {}, with start year: {}".format(exploding, forming)
)

### Now check the count removing those areas from the list, see how much it impacts the count

In [173]:
# Keep only the rows whose area is in neither end_set nor start_set,
# i.e. drop every area involved in a formation or dissolution.
# Series.isin does the membership test in one vectorised call.
in_split_area = sample.area.isin(end_set + start_set)
new = sample[~in_split_area]

In [174]:
# Same summary as before, on the filtered frame: year records per
# (area, item, element) series.
# Now even the 25th percentile has values for all 57 years.
(
    new
    .groupby(["area", "item", "element"])["year"]
    .count()
    .reset_index()
    .describe()
)

Unnamed: 0,year
count,9011.0
mean,50.876706
std,12.013598
min,3.0
25%,57.0
50%,57.0
75%,57.0
max,57.0


In [175]:
# Number of distinct years observed per area, sorted ascending,
# as a two-column frame: area, yearcount.
years = (
    new
    .groupby('area')['year']
    .nunique()
    .sort_values()
    .reset_index()
    .rename(columns={'year': 'yearcount'})
)

In [177]:
# Areas observed in fewer than 57 distinct years (57 is the length of the
# full year range computed earlier).
missing = years.loc[years['yearcount'] < 57]
missing

Unnamed: 0,area,yearcount
0,Gibraltar,12
1,Monaco,23
2,Holy See,23
3,Norfolk Island,24
4,Central Asia,26
5,Palestine,28
6,Isle of Man,28
7,Svalbard and Jan Mayen Islands,28
8,Aruba,28
9,Mayotte,28


In [194]:
# Aggregates such as "Central Asia" (not a country) were never dropped.
# The other territories are very small, and so presumably is their
# production of anything; collect them (all except Central Asia) so the sum
# of their values can be checked.
check_missing = list(missing.loc[missing.area != "Central Asia", "area"])

In [209]:
# Summed value per item over the areas listed in check_missing.
# The result is indexed by item and has a single column named 'missing'.
in_check_missing = sample.area.isin(check_missing)
prod1 = (
    sample[in_check_missing]
    .groupby(['item'])
    .agg({'value': 'sum'})
    .rename(columns={'value': 'missing'})
)

In [210]:
# Summed value per item over every area that is NOT listed in check_missing.
# The result is indexed by item and has a single column named 'all'.
in_check_missing = sample.area.isin(check_missing)
prod2 = (
    sample[~in_check_missing]
    .groupby(['item'])
    .agg({'value': 'sum'})
    .rename(columns={'value': 'all'})
)

In [225]:
# compare to the rest sum of the rest of the countries...
total = pd.DataFrame(prod2.merge(prod1, how = 'inner', on = 'item').apply(lambda x: x['missing'] / x['all'], axis = 1).reset_index())
total.columns = ['item', 'ratio']
total
# so we can probably not worry about the rest of these. Although if we are meticulous, we should investigate further

Unnamed: 0,item,ratio
0,Agricultural Soils,7.31886e-06
1,Agriculture total,4.951478e-06
2,Burning - Crop residues,1.6207e-06
3,Burning - Savanna,8.924623e-08
4,Crop Residues,2.224675e-06
5,Cultivation of Organic Soils,5.441293e-06
6,Enteric Fermentation,5.228226e-06
7,Manure Management,2.456088e-06
8,Manure applied to Soils,3.402434e-06
9,Manure left on Pasture,1.429941e-05


# Conclusion

This allows us to explain the "blank" areas when we are plotting the worldwide, country-by-country evolution of values: some countries do not exist in certain time-frames (e.g. the Russian Federation before 1991).

We could look at regions instead of countries, and use e.g. the "USSR" to define a fixed geographic boundary over the whole period. However, this is not a good idea, and would not help our analysis and conclusions.

A more suitable idea would be to define geographical areas based on climate, however for large countries (e.g. Russia, China, USA), different climates arise within a single country, so this approach is limited as well.

Therefore we have decided to keep all countries as they are, and will take into account in our analyses that some countries have fewer data points than others (because they have existed for a shorter amount of time).