In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw EPI observation table from its relative CSV path.
epi = pd.read_csv(
    '../data/EPI_data/ObservationData_rqridaf.csv'
)

***Data Import***

In [3]:
def epi_to_country_data(country_name, data=None):
    """Reshape one country's long-format observations into a wide year table.

    Parameters
    ----------
    country_name : str
        Value to match against the ``location`` column.
    data : pandas.DataFrame, optional
        Long-format observations with ``location``, ``indicator``, ``Date``
        and ``Value`` columns. Defaults to the notebook-level ``epi`` frame,
        so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per year from the country's first to last observed ``Date``
        (inclusive), with columns ``date``, ``country_name`` and then one
        column per indicator. Years with no observation for an indicator
        hold NaN.

    Raises
    ------
    ValueError
        If no row of ``data`` matches ``country_name``.
    """
    source = epi if data is None else data
    country = source[source['location'] == country_name]
    if country.empty:
        raise ValueError(f"no observations found for location {country_name!r}")

    # Indicator columns keep the value_counts() order used previously.
    indicators = country['indicator'].value_counts().index.tolist()
    years = list(range(int(country['Date'].min()), int(country['Date'].max()) + 1))

    wide = (
        country
        .dropna(subset=['Date', 'indicator'])
        # When a (Date, indicator) pair repeats, the last row wins -- the same
        # outcome as the former row-by-row overwrite.
        .drop_duplicates(subset=['Date', 'indicator'], keep='last')
        .pivot(index='Date', columns='indicator', values='Value')
        # Insert the years that have no observations and fix the column order.
        .reindex(index=years, columns=indicators)
    )

    transpose_country = pd.DataFrame({'date': years})
    transpose_country['country_name'] = country_name
    for indicator in indicators:
        # Positional assignment: `wide` is indexed by year, the result by 0..n-1.
        transpose_country[indicator] = wide[indicator].to_numpy()

    return transpose_country

In [4]:
# Keep only indicators that have more than 500 observations in the dataset.
best_indicators = (
    epi['indicator']
    .value_counts()
    .loc[lambda counts: counts > 500]
    .index
    .tolist()
)

In [5]:
# Restrict the observations to the well-populated indicators selected above.
epi = epi.loc[epi['indicator'].isin(best_indicators)]

In [6]:
# Build one wide table per country and concatenate them in a single call.
# Concatenating inside the loop re-copies the growing frame on every pass, and
# iterating a set() gives a row order that can change between kernel sessions;
# .unique() keeps countries in order of first appearance instead.
country_frames = []
for country_name in epi['location'].dropna().unique():
    country_df = epi_to_country_data(country_name)
    country_frames.append(country_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
epi_new = pd.concat(country_frames, ignore_index=True) if country_frames else pd.DataFrame()

In [7]:
epi_new.head(50)

Unnamed: 0,date,country_name,Unsafe drinking water,Environmental Performance Index,Fish Stock Status,Agriculture,Air Quality,Wastewater Treatment,Sanitation & Drinking Water,Unsafe sanitation,...,Trend in Carbon Intensity,Household Air Quality,Child Mortality,Pesticide Regulation,Agricultural Subsidies,Change in Forest Cover,Coastal Shelf Fishing Pressure,Change of Trend in Carbon Intensity,Climate,Critical Habitat Protection
0,2002,Sweden,100.0,77.09,13.79,50.68,97.25,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,9.37,14.35,29.3,29.51,77.34,
1,2003,Sweden,100.0,77.14,8.16,50.48,96.77,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,8.96,14.35,37.14,29.51,77.34,
2,2004,Sweden,100.0,77.29,8.42,51.36,97.13,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,10.73,14.35,37.05,29.51,77.34,
3,2005,Sweden,100.0,77.46,10.71,54.62,97.13,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,17.23,14.35,37.9,29.51,77.34,
4,2006,Sweden,100.0,77.48,11.48,55.55,96.53,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,19.1,14.35,38.83,29.51,77.34,
5,2007,Sweden,100.0,77.47,5.41,60.24,96.53,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,28.48,14.35,38.83,29.51,77.34,
6,2008,Sweden,100.0,77.69,5.76,61.26,97.37,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,30.52,14.35,38.83,29.51,77.34,
7,2009,Sweden,100.0,77.94,11.29,60.73,97.49,87.86,100.0,100.0,...,77.67,95.0,99.93,92.0,29.46,14.35,38.83,29.51,77.34,
8,2010,Sweden,100.0,77.93,8.38,63.23,97.25,87.86,100.0,100.0,...,77.67,95.0,99.91,92.0,34.46,14.35,38.83,29.51,77.34,
9,2011,Sweden,100.0,78.03,11.76,65.18,96.65,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,38.35,14.35,38.83,29.51,77.34,


In [8]:
# Countries that have more than 5 yearly rows in the reshaped table.
high_data = (
    epi_new['country_name']
    .value_counts()
    .loc[lambda row_counts: row_counts > 5]
    .index
    .tolist()
)

In [9]:
high_data

['Hungary',
 'Zambia',
 'Niger',
 'Mali',
 'Equatorial Guinea',
 'Guyana',
 'Senegal',
 'Nepal',
 'Cambodia',
 'Dominica',
 'Bolivia',
 'Turkey',
 'Lesotho',
 'Costa Rica',
 'Luxembourg',
 'Vanuatu',
 'Nigeria',
 'Liberia',
 'Gabon',
 'Uganda',
 'Trinidad and Tobago',
 'Georgia',
 'Madagascar',
 'Russia',
 'Iraq',
 'Albania',
 'Kenya',
 'Malaysia',
 'Jamaica',
 'Malta',
 'Estonia',
 'Chad',
 'Switzerland',
 'Thailand',
 'Greece',
 'Tajikistan',
 'Denmark',
 'Ethiopia',
 'Kiribati',
 'Austria',
 'Laos',
 'Republic of Congo',
 'Uruguay',
 'Colombia',
 'Algeria',
 'Mauritius',
 'Mongolia',
 'Zimbabwe',
 'Benin',
 'Lebanon',
 'Cabo Verde',
 'Belgium',
 'Haiti',
 'Cuba',
 'Solomon Islands',
 'Dem. Rep. Congo',
 'Barbados',
 'Timor-Leste',
 'Paraguay',
 'South Korea',
 'Azerbaijan',
 'Belize',
 'Bulgaria',
 'Morocco',
 'Rwanda',
 'Chile',
 'Eswatini',
 'Cyprus',
 'Indonesia',
 'Sudan',
 'Ireland',
 'Sierra Leone',
 'Israel',
 'Peru',
 'Ghana',
 'Ukraine',
 'Mauritania',
 'Bahrain',
 'Latvia'

In [10]:
# Keep only the countries that passed the row-count threshold.
epi_new = epi_new.loc[epi_new['country_name'].isin(high_data)]

In [11]:
epi_new

Unnamed: 0,date,country_name,Unsafe drinking water,Environmental Performance Index,Fish Stock Status,Agriculture,Air Quality,Wastewater Treatment,Sanitation & Drinking Water,Unsafe sanitation,...,Trend in Carbon Intensity,Household Air Quality,Child Mortality,Pesticide Regulation,Agricultural Subsidies,Change in Forest Cover,Coastal Shelf Fishing Pressure,Change of Trend in Carbon Intensity,Climate,Critical Habitat Protection
0,2002,Sweden,100,77.09,13.79,50.68,97.25,87.86,100,100,...,77.67,95,100,92,9.37,14.35,29.3,29.51,77.34,
1,2003,Sweden,100,77.14,8.16,50.48,96.77,87.86,100,100,...,77.67,95,100,92,8.96,14.35,37.14,29.51,77.34,
2,2004,Sweden,100,77.29,8.42,51.36,97.13,87.86,100,100,...,77.67,95,100,92,10.73,14.35,37.05,29.51,77.34,
3,2005,Sweden,100,77.46,10.71,54.62,97.13,87.86,100,100,...,77.67,95,100,92,17.23,14.35,37.9,29.51,77.34,
4,2006,Sweden,100,77.48,11.48,55.55,96.53,87.86,100,100,...,77.67,95,100,92,19.1,14.35,38.83,29.51,77.34,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3461,2016,Tajikistan,47.42,73.05,,89.97,74,49.33,70.28,90.25,...,96.6184,83.33,,,,,,,,
3462,2017,Tajikistan,,,,,,,,,...,,,,,,,,,,
3463,2018,Tajikistan,32.84,47.85,,33.47,23.22,58.85,29.9,26.96,...,,,,,,,,,,
3464,2019,Tajikistan,,,,,,,,,...,,,,,,,,,,


In [12]:
epi_new['country_name'].value_counts()

Hungary                  19
Cambodia                 19
Bolivia                  19
Turkey                   19
Lesotho                  19
                         ..
Sao Tome and Principe     9
Maldives                  9
Samoa                     9
Marshall Islands          9
Micronesia                9
Name: country_name, Length: 185, dtype: int64

In [13]:
def impute_values(country_name, data=None):
    """Linearly interpolate missing indicator values for a single country.

    Parameters
    ----------
    country_name : str
        Value to match against the ``country_name`` column.
    data : pandas.DataFrame, optional
        Wide table to read from. Defaults to the notebook-level ``epi_new``
        frame, so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of the country's rows (original index preserved). In every
        numeric column, missing values and the literal string ``'None'`` are
        filled by linear interpolation in row order. With pandas' default
        forward direction, gaps before the first valid value stay NaN and
        gaps after the last valid value repeat it. Text columns are returned
        with ``'None'`` replaced by NaN and are otherwise untouched.
    """
    source = epi_new if data is None else data
    country = source[source['country_name'] == country_name].copy()

    for col in country.columns:
        raw = country[col]
        is_missing = raw.isna() | (raw == 'None')
        numeric = pd.to_numeric(raw, errors='coerce')

        if not numeric.isna().equals(is_missing):
            # Coercion lost real values, so this is a text column (e.g. the
            # country name): only normalise the 'None' placeholder.
            if is_missing.any():
                country[col] = raw.where(~is_missing, np.nan)
            continue

        # Assign the result back rather than interpolating in place on a
        # column selection, which does not reliably write through.
        if numeric.isna().any():
            numeric = numeric.interpolate(method='linear')
        country[col] = numeric

    return country
        

In [14]:
# Impute each country separately, then concatenate once. Concatenating inside
# the loop re-copies the growing frame on every pass, and iterating a set()
# gives a row order that can change between kernel sessions; .unique() keeps
# countries in order of first appearance instead.
imputed_frames = []
for country_name in epi_new['country_name'].dropna().unique():
    country_df = impute_values(country_name)
    imputed_frames.append(country_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
epi_new_new = pd.concat(imputed_frames, ignore_index=True) if imputed_frames else pd.DataFrame()

In [15]:
epi_new_new.head()

Unnamed: 0,date,country_name,Unsafe drinking water,Environmental Performance Index,Fish Stock Status,Agriculture,Air Quality,Wastewater Treatment,Sanitation & Drinking Water,Unsafe sanitation,...,Trend in Carbon Intensity,Household Air Quality,Child Mortality,Pesticide Regulation,Agricultural Subsidies,Change in Forest Cover,Coastal Shelf Fishing Pressure,Change of Trend in Carbon Intensity,Climate,Critical Habitat Protection
0,2002,Sweden,100.0,77.09,13.79,50.68,97.25,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,9.37,14.35,29.3,29.51,77.34,
1,2003,Sweden,100.0,77.14,8.16,50.48,96.77,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,8.96,14.35,37.14,29.51,77.34,
2,2004,Sweden,100.0,77.29,8.42,51.36,97.13,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,10.73,14.35,37.05,29.51,77.34,
3,2005,Sweden,100.0,77.46,10.71,54.62,97.13,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,17.23,14.35,37.9,29.51,77.34,
4,2006,Sweden,100.0,77.48,11.48,55.55,96.53,87.86,100.0,100.0,...,77.67,95.0,100.0,92.0,19.1,14.35,38.83,29.51,77.34,


In [16]:
epi_new_new.shape

(3425, 35)

In [17]:
# Missing-value count per column, largest first; display only columns with gaps.
nulls = epi_new_new.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

Critical Habitat Protection                        1930
Climate                                             959
Change of Trend in Carbon Intensity                 959
Trend in CO2 Emissions per KWH                      957
Marine Protected Areas                              868
Trend in Carbon Intensity                           853
Coastal Shelf Fishing Pressure                      845
Fish Stock Status                                   842
Fisheries                                           842
Change in Forest Cover                              826
Climate & Energy                                    801
Forests                                             668
Agricultural Subsidies                              466
Environmental Performance Index                      38
Ecosystem Vitality                                   38
Wastewater Treatment                                 26
Water Resources                                      26
Environmental Health                            

In [18]:
# Indicators removed from the table in a later cell.
drop_list = [
    'Climate',
    'Forests',
    'Critical Habitat Protection',
    'Change in Forest Cover',
    'Change of Trend in Carbon Intensity',
    'Coastal Shelf Fishing Pressure',
    'Trend in CO2 Emissions per KWH',
    'Trend in Carbon Intensity',
    'Climate & Energy',
]

In [19]:
len(drop_list)

9

Still to investigate: Marine Protected Areas, Fisheries, Fish Stock Status, and Agricultural Subsidies.

In [20]:
# Rebind instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
epi_new_new = epi_new_new.drop(columns=drop_list, errors='ignore')

In [21]:
# Recount the missing values per column after the drop (largest first).
nulls = epi_new_new.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

Marine Protected Areas                             868
Fisheries                                          842
Fish Stock Status                                  842
Agricultural Subsidies                             466
Environmental Performance Index                     38
Ecosystem Vitality                                  38
Water Resources                                     26
Wastewater Treatment                                26
Environmental Health                                12
Health Impacts                                       9
Child Mortality                                      9
Access to Electricity                                9
Agriculture                                          8
Terrestrial biome protection (global weights)        8
Terrestrial biome protection (national weights)      8
Biodiversity & Habitat                               8
Air Quality                                          4
PM2.5 Exposure                                       4
PM2.5 Exce

In [22]:
# NOTE: from this cell on, `epi` is rebound to `epi_new_new` (an alias, not a
# copy), i.e. the wide table with `date` / `country_name` columns rather than
# the long observation table read from CSV at the top. Earlier code that reads
# epi['location'] or epi['indicator'] (e.g. epi_to_country_data) would no
# longer find those columns if re-run after this point without reloading.
epi = epi_new_new

In [23]:
# Missing-value count per column of `epi`, largest first; show columns with gaps.
nulls = epi.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

Marine Protected Areas                             868
Fisheries                                          842
Fish Stock Status                                  842
Agricultural Subsidies                             466
Environmental Performance Index                     38
Ecosystem Vitality                                  38
Water Resources                                     26
Wastewater Treatment                                26
Environmental Health                                12
Health Impacts                                       9
Child Mortality                                      9
Access to Electricity                                9
Agriculture                                          8
Terrestrial biome protection (global weights)        8
Terrestrial biome protection (national weights)      8
Biodiversity & Habitat                               8
Air Quality                                          4
PM2.5 Exposure                                       4
PM2.5 Exce

Checking the nulls for the indicators with high null counts: Marine Protected Areas, Fisheries, Fish Stock Status, and Agricultural Subsidies.

In [24]:
# Rows where 'Marine Protected Areas' is missing.
marine_nulls = epi.loc[epi['Marine Protected Areas'].isna()]

In [25]:
# Country -> number of rows with a missing 'Marine Protected Areas' value.
marine_dic = (
    marine_nulls
    .loc[:, 'country_name']
    .value_counts()
    .to_dict()
)

In [26]:
# Countries with at least one missing 'Marine Protected Areas' value.
marine_countries = list(marine_dic)

In [27]:
# Rows where 'Fisheries' is missing.
fish_nulls = epi.loc[epi['Fisheries'].isna()]

In [28]:
# Country -> number of rows with a missing 'Fisheries' value.
fish_dic = (
    fish_nulls
    .loc[:, 'country_name']
    .value_counts()
    .to_dict()
)

In [29]:
# Countries with at least one missing 'Fisheries' value.
fish_countries = list(fish_dic)

Compare the two lists and decide what to do with the non-landlocked countries.

In [30]:
[x for x in marine_countries if x not in fish_countries]

['Brunei Darussalam', 'Guinea-Bissau', 'Maldives']

In [31]:
[x for x in fish_countries if x not in marine_countries]

['Saint Vincent and the Grenadines']

Drop: Maldives, Saint Vincent and the Grenadines, Saint Lucia, Marshall Islands, Micronesia, Sao Tome and Principe, Samoa

Pad: Guinea-Bissau, Brunei Darussalam

In [32]:
# Update the lists by filtering rather than calling list.remove(), so this
# cell can be re-run: remove() raises ValueError once the name is gone.
# Unlike remove(), filtering stays silent if a name was never in the list.
fish_countries = [
    country for country in fish_countries
    if country != 'Saint Vincent and the Grenadines'
]
marine_countries = [
    country for country in marine_countries
    if country not in ('Maldives', 'Guinea-Bissau', 'Brunei Darussalam')
]

In [33]:
# Rows where 'Fish Stock Status' is missing.
fish_stock_nulls = epi.loc[epi['Fish Stock Status'].isna()]

In [34]:
# Country -> number of rows with a missing 'Fish Stock Status' value.
fish_stock_dic = (
    fish_stock_nulls
    .loc[:, 'country_name']
    .value_counts()
    .to_dict()
)

In [35]:
# Countries with at least one missing 'Fish Stock Status' value.
fish_stock_countries = list(fish_stock_dic)

In [36]:
# Saint Vincent and the Grenadines is not used. Filter instead of calling
# list.remove(), so re-running this cell does not raise ValueError.
fish_stock_countries = [
    country for country in fish_stock_countries
    if country != 'Saint Vincent and the Grenadines'
]

In [37]:
#look for differences in the list
fish_stock_countries == fish_countries

True

In [38]:
#make sure not to leave out any countries from the list
sorted(marine_countries) == sorted(fish_countries)

True

In [39]:
# Alias, not a copy: `land_locked` and `fish_countries` are the same list
# object, so mutating one also changes the other.
# NOTE(review): the name presumes every country still in this list is
# landlocked -- confirm before filling its marine/fishery values with zeros.
land_locked = fish_countries

In [40]:
land_locked[:3] #fill in with zeros for "marine protected status, fisheries, and fish stock"

['Hungary', 'Dem. Rep. Congo', 'Serbia']

High-null indicators that are not ocean-related

In [41]:
# Rows where 'Agricultural Subsidies' is missing.
ag_nulls = epi.loc[epi['Agricultural Subsidies'].isna()]

In [42]:
# Country -> number of rows with a missing 'Agricultural Subsidies' value.
ag_dic = (
    ag_nulls
    .loc[:, 'country_name']
    .value_counts()
    .to_dict()
)

In [43]:
# Countries with at least one missing 'Agricultural Subsidies' value.
ag_countries = list(ag_dic)

In [44]:
# Filter instead of calling list.remove(), so re-running this cell does not
# raise ValueError once the names are gone.
ag_countries = [
    country for country in ag_countries
    if country not in ('Marshall Islands', 'Saint Lucia')
]

In [45]:
# Missing-value count per column of `epi`, largest first; show columns with gaps.
nulls = epi.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

Marine Protected Areas                             868
Fisheries                                          842
Fish Stock Status                                  842
Agricultural Subsidies                             466
Environmental Performance Index                     38
Ecosystem Vitality                                  38
Water Resources                                     26
Wastewater Treatment                                26
Environmental Health                                12
Health Impacts                                       9
Child Mortality                                      9
Access to Electricity                                9
Agriculture                                          8
Terrestrial biome protection (global weights)        8
Terrestrial biome protection (national weights)      8
Biodiversity & Habitat                               8
Air Quality                                          4
PM2.5 Exposure                                       4
PM2.5 Exce

In [46]:
# Rows where 'Ecosystem Vitality' is missing.
eco_vitality = epi.loc[epi['Ecosystem Vitality'].isna()]

In [47]:
# Rows where 'Ecosystem Vitality' is missing, then the number of such rows per country.
eco_vitality = epi.loc[epi['Ecosystem Vitality'].isna()]
eco_vitality.loc[:, 'country_name'].value_counts()

Marshall Islands                    8
Micronesia                          6
Saint Lucia                         6
Saint Vincent and the Grenadines    6
Sao Tome and Principe               4
Maldives                            4
Samoa                               4
Name: country_name, dtype: int64

* pad: 

* drop: Micronesia, Sao Tome and Principe, Samoa

In [48]:
# Countries excluded from the analysis table.
drop_countries = [
    'Maldives',
    'Saint Vincent and the Grenadines',
    'Saint Lucia',
    'Marshall Islands',
    'Micronesia',
    'Sao Tome and Principe',
    'Samoa',
]

In [49]:
# Everything except the excluded countries.
is_dropped = epi['country_name'].isin(drop_countries)
epi_2 = epi.loc[~is_dropped]
del is_dropped

In [50]:
# Missing-value count per column of `epi_2`, largest first; show columns with gaps.
nulls = epi_2.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

Marine Protected Areas    864
Fisheries                 836
Fish Stock Status         836
Agricultural Subsidies    448
dtype: int64

In [51]:
def error_country_list(indicator):
    """Count, per country, the rows of ``epi`` where ``indicator`` is missing.

    Reads the notebook-level ``epi`` frame and returns the ``value_counts()``
    of ``country_name`` over the rows whose ``indicator`` column is null.
    """
    missing_mask = epi[indicator].isnull()
    affected_rows = epi[missing_mask]
    return affected_rows['country_name'].value_counts()
    

In [52]:
error_country_list('Environmental Performance Index')

Marshall Islands                    8
Micronesia                          6
Saint Lucia                         6
Saint Vincent and the Grenadines    6
Sao Tome and Principe               4
Maldives                            4
Samoa                               4
Name: country_name, dtype: int64

In [53]:
error_country_list('Water Resources')

Saint Vincent and the Grenadines    6
Micronesia                          6
Saint Lucia                         6
Samoa                               4
Sao Tome and Principe               4
Name: country_name, dtype: int64

In [54]:
error_country_list('Wastewater Treatment')

Saint Vincent and the Grenadines    6
Micronesia                          6
Saint Lucia                         6
Samoa                               4
Sao Tome and Principe               4
Name: country_name, dtype: int64

In [55]:
error_country_list('Environmental Health')

Marshall Islands    8
Maldives            4
Name: country_name, dtype: int64

In [56]:
error_country_list('Child Mortality')

Marshall Islands    9
Name: country_name, dtype: int64

In [57]:
error_country_list('Health Impacts')

Marshall Islands    9
Name: country_name, dtype: int64

In [58]:
error_country_list('Access to Electricity')

Micronesia    9
Name: country_name, dtype: int64

In [59]:
error_country_list('Terrestrial biome protection (global weights)')

Sao Tome and Principe    4
Maldives                 4
Name: country_name, dtype: int64

In [60]:
epi[epi['country_name'] == 'Micronesia'].head(50)

Unnamed: 0,date,country_name,Unsafe drinking water,Environmental Performance Index,Fish Stock Status,Agriculture,Air Quality,Wastewater Treatment,Sanitation & Drinking Water,Unsafe sanitation,...,Ecosystem Vitality,Water Resources,PM2.5 Exposure,PM2.5 Exceedance,Access to Electricity,Health Impacts,Household Air Quality,Child Mortality,Pesticide Regulation,Agricultural Subsidies
418,2012,Micronesia,40.69,,15.84,56.0,86.33,,27.52,14.34,...,,,100.0,100.0,,59.97,59.0,59.97,12.0,100.0
419,2013,Micronesia,41.346667,,13.2,50.505,83.121667,,30.665,19.973333,...,,,100.0,100.0,,59.97,59.0,59.97,12.0,100.0
420,2014,Micronesia,42.003333,,10.56,45.01,79.913333,,33.81,25.606667,...,,,100.0,100.0,,59.97,59.0,59.97,12.0,100.0
421,2015,Micronesia,42.66,,7.92,39.515,76.705,,36.955,31.24,...,,,100.0,100.0,,59.97,59.0,59.97,12.0,100.0
422,2016,Micronesia,43.316667,,5.28,34.02,73.496667,,40.1,36.873333,...,,,100.0,100.0,,59.97,59.0,59.97,12.0,100.0
423,2017,Micronesia,43.973333,,2.64,28.525,70.288333,,43.245,42.506667,...,,,100.0,100.0,,59.97,59.0,59.97,12.0,100.0
424,2018,Micronesia,44.63,49.8,0.0,23.03,67.08,39.76,46.39,48.14,...,43.07,39.76,100.0,100.0,,59.97,59.0,59.97,12.0,100.0
425,2019,Micronesia,38.765,41.4,0.0,21.215,48.04,19.93,40.945,43.77,...,38.785,19.93,65.5,100.0,,59.97,59.0,59.97,12.0,100.0
426,2020,Micronesia,32.9,33.0,0.0,19.4,29.0,0.1,35.5,39.4,...,34.5,0.1,31.0,100.0,,59.97,59.0,59.97,12.0,100.0


In [66]:
ag_countries

['Equatorial Guinea',
 'Saudi Arabia',
 'Bahamas',
 'Barbados',
 'Mauritius',
 'Djibouti',
 'Singapore',
 'Kuwait',
 'Trinidad and Tobago',
 'Qatar',
 'Panama',
 'Jordan',
 'Botswana',
 'Bahrain',
 'Seychelles',
 'Antigua and Barbuda',
 'Gabon',
 'Oman',
 'Brunei Darussalam',
 'United Arab Emirates',
 'Republic of Congo',
 'Cuba',
 'Libya',
 'Palau']

In [62]:
# Load the EPI 2020 regional results table from its relative CSV path.
regional = pd.read_csv(
    "../data/EPI_data/epi2020resultsregions20200604.csv"
)

In [89]:
# Country name alongside the 'AGR.rgn.mean' column.
regional_means = regional.loc[:, ['country', 'AGR.rgn.mean']]

In [91]:
# Keep only the countries that are missing 'Agricultural Subsidies' values.
regional_means = regional_means.loc[regional_means['country'].isin(ag_countries)]

In [92]:
regional_means

Unnamed: 0,country,AGR.rgn.mean
3,United Arab Emirates,36.5125
6,Antigua and Barbuda,32.653125
16,Bahrain,36.5125
17,Bahamas,32.653125
23,Barbados,32.653125
24,Brunei Darussalam,35.94
26,Botswana,32.882609
35,Republic of Congo,32.882609
40,Cuba,32.653125
44,Djibouti,32.882609


In [None]:
pd.DataFrame(regional.loc[regional['country'].isin(ag_countries), 'AGR.rgn.mean'])

In [70]:
regional.columns

Index(['code', 'iso', 'country', 'region', 'EPI.new', 'HLT.new', 'AIR.new',
       'PMD.new', 'HAD.new', 'OZD.new',
       ...
       'NXA.rgn.mean', 'NXA.rgn.rank', 'AGR.rgn.mean', 'AGR.rgn.rank',
       'SNM.rgn.mean', 'SNM.rgn.rank', 'WRS.rgn.mean', 'WRS.rgn.rank',
       'WWT.rgn.mean', 'WWT.rgn.rank'],
      dtype='object', length=142)

In [67]:
regional[regional['country'] == "Saudi Arabia"]

Unnamed: 0,code,iso,country,region,EPI.new,HLT.new,AIR.new,PMD.new,HAD.new,OZD.new,...,NXA.rgn.mean,NXA.rgn.rank,AGR.rgn.mean,AGR.rgn.rank,SNM.rgn.mean,SNM.rgn.rank,WRS.rgn.mean,WRS.rgn.rank,WWT.rgn.mean,WWT.rgn.rank
140,682,SAU,Saudi Arabia,Greater Middle East,44.0,47.2,37.4,10.0,76.2,29.0,...,60.26875,12,36.5125,1,36.5125,1,36.7,13,36.7,13


In [68]:
regional['region'].value_counts()

Sub-Saharan Africa           46
Latin America & Caribbean    32
Asia-Pacific                 25
Global West                  22
Eastern Europe               19
Greater Middle East          16
Former Soviet States         12
Southern Asia                 8
Name: region, dtype: int64

In [61]:
# NOTE(review): not valid Python -- this cell raises SyntaxError. Presumably a
# deliberate stop so a full run halts before the scratch cells below; confirm,
# or delete this cell and the scratch cells.
stop error

SyntaxError: invalid syntax (<ipython-input-61-b6379532b430>, line 1)

In [None]:
# Load the EPI 2020 variable-attributes table.
# NOTE(review): this path points outside the ../data/EPI_data folder used by the other loads -- confirm.
attributes = pd.read_csv(
    '../../project_5/epi2020variableattributes20200604.csv'
)

In [None]:
# Missing-value count per column of `epi_new`, largest first; show columns with gaps.
nulls = epi_new.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

In [None]:
# Toy series ending in a run of NaNs, used to try out interpolation.
s = pd.Series([1, 2, 3] + [np.nan] * 3)
s

In [None]:
s.interpolate(method='spline', order=1)