In [1]:
import pandas as pd
import numpy as np

***Data Cleaning and Exploration***

In [2]:
# Read the pre-wrangled EPI table from its project-relative CSV location.
epi_source_csv = 'data/dataframes/epi_wrangled.csv'
epi = pd.read_csv(epi_source_csv)

- Keep only countries with more than 5 records.

In [3]:
# Names of the countries that have more than 5 rows, in value_counts() order.
record_counts = epi['country_name'].value_counts()
high_data = record_counts[record_counts > 5].index.tolist()

In [4]:
# Restrict the table to rows whose country is listed in high_data.
has_enough_records = epi['country_name'].isin(high_data)
epi = epi.loc[has_enough_records]

In [5]:
# Number of rows per country after the filter above.
epi.loc[:, 'country_name'].value_counts()

Sudan                               19
Croatia                             19
Cyprus                              19
Kyrgyzstan                          19
Grenada                             19
                                    ..
Marshall Islands                     9
Maldives                             9
Saint Vincent and the Grenadines     9
Samoa                                9
Saint Lucia                          9
Name: country_name, Length: 185, dtype: int64

- Fill nulls by linear interpolation, one country at a time.

In [6]:
def impute_values(country_name, frame=None):
    """Return one country's rows with missing values linearly interpolated.

    Parameters
    ----------
    country_name : str
        Value of the ``country_name`` column that selects the rows to process.
    frame : pandas.DataFrame, optional
        Table to read from. Defaults to the notebook-level ``epi`` frame, so
        existing ``impute_values(name)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows; the source table is not modified.
    """
    source = epi if frame is None else frame
    country = source[source['country_name'] == country_name].copy()
    for col in country.columns:
        # Missing readings may be stored as the literal string 'None'.
        cleaned = country[col].where(country[col] != 'None')
        if pd.api.types.is_object_dtype(cleaned) or pd.api.types.is_string_dtype(cleaned):
            # A 'None' placeholder leaves the column non-numeric, and linear
            # interpolation only acts on numbers. Convert when every remaining
            # value parses as a number; genuine text columns are left alone.
            try:
                cleaned = pd.to_numeric(cleaned)
            except (ValueError, TypeError):
                pass
        if pd.api.types.is_float_dtype(cleaned):
            # Assign the result back instead of interpolating a column
            # selection in place, which does not reliably update the frame.
            # NOTE(review): interpolation follows row order; this assumes each
            # country's rows are in chronological order - confirm.
            cleaned = cleaned.interpolate(method='linear')
        country[col] = cleaned
    return country
        

In [7]:
# Impute each country separately, then stitch the pieces together once.
# unique() keeps first-appearance order, so the resulting row order is the same
# on every run (iterating a set of strings is not), and a single concat avoids
# re-copying the accumulated frame on every iteration.
country_frames = [
    impute_values(country_name)
    for country_name in epi['country_name'].unique()
]
# pd.concat raises on an empty list; fall back to an empty frame in that case.
epi_new = pd.concat(country_frames, ignore_index=True) if country_frames else pd.DataFrame()

In [8]:
# Rebind `epi` to the imputed table; both names now refer to the same object.
epi = epi_new

* Drop columns that have too many nulls or no valuable trends.

In [9]:
# Indicator columns to remove from the table.
drop_list = [
    'Climate',
    'Forests',
    'Critical Habitat Protection',
    'Change in Forest Cover',
    'Change of Trend in Carbon Intensity',
    'Coastal Shelf Fishing Pressure',
    'Trend in CO2 Emissions per KWH',
    'Trend in Carbon Intensity',
    'Climate & Energy',
]

In [10]:
# Build the trimmed table from the imputed frame instead of dropping in place:
# an in-place drop raises KeyError when this cell is re-run (the columns are
# already gone) and also strips the columns from `epi_new`, which is the same
# object as `epi` at this point.
epi = epi_new.drop(columns=drop_list)

In [12]:
# Missing-value count per column, largest first; display only columns with any.
nulls = epi.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

Marine Protected Areas                             868
Fisheries                                          842
Fish Stock Status                                  842
Agricultural Subsidies                             466
Environmental Performance Index                     38
Ecosystem Vitality                                  38
Water Resources                                     26
Wastewater Treatment                                26
GDP                                                 13
Environmental Health                                12
Child Mortality                                      9
Access to Electricity                                9
Health Impacts                                       9
Terrestrial biome protection (national weights)      8
Agriculture                                          8
Biodiversity & Habitat                               8
Terrestrial biome protection (global weights)        8
Air Quality                                          4
PM2.5 Exce

Checking the nulls for the indicators with high null counts: Marine Protected Areas, Fisheries, Fish Stock Status, and Agricultural Subsidies.

In [13]:
# Rows where the 'Marine Protected Areas' indicator is missing.
marine_missing_mask = epi['Marine Protected Areas'].isna()
marine_nulls = epi[marine_missing_mask]

In [14]:
# country -> number of rows missing 'Marine Protected Areas'
marine_null_counts = marine_nulls['country_name'].value_counts()
marine_dic = marine_null_counts.to_dict()

In [15]:
# List of countries with 'Marine Protected Areas' nulls (dict key order).
marine_countries = [*marine_dic]

In [16]:
# Rows where the 'Fisheries' indicator is missing.
fisheries_missing_mask = epi['Fisheries'].isna()
fish_nulls = epi[fisheries_missing_mask]

In [17]:
# country -> number of rows missing 'Fisheries'
fish_null_counts = fish_nulls['country_name'].value_counts()
fish_dic = fish_null_counts.to_dict()

In [18]:
# List of countries with 'Fisheries' nulls (dict key order).
fish_countries = [*fish_dic]

Compare the two lists and decide what to do with the non-landlocked countries.

In [19]:
# Countries with marine nulls that do not appear in the fisheries list.
[country for country in marine_countries if country not in fish_countries]

['Guinea-Bissau', 'Brunei Darussalam', 'Maldives']

In [20]:
# Countries with fisheries nulls that do not appear in the marine list.
[country for country in fish_countries if country not in marine_countries]

['Saint Vincent and the Grenadines']

Drop: Maldives, Saint Vincent and the Grenadines, Saint Lucia, Marshall Islands.
Pad: Guinea-Bissau, Brunei Darussalam.

In [21]:
# Update the lists: take out the countries that appear in only one of them.
# Filtering (rather than list.remove) keeps this cell safe to re-run; remove()
# raises ValueError on a second execution because the name is already gone.
fish_countries = [
    name for name in fish_countries
    if name != 'Saint Vincent and the Grenadines'
]
marine_excluded = {'Maldives', 'Guinea-Bissau', 'Brunei Darussalam'}
marine_countries = [
    name for name in marine_countries
    if name not in marine_excluded
]

In [22]:
# Rows where the 'Fish Stock Status' indicator is missing.
fish_stock_missing_mask = epi['Fish Stock Status'].isna()
fish_stock_nulls = epi[fish_stock_missing_mask]

In [23]:
# country -> number of rows missing 'Fish Stock Status'
fish_stock_null_counts = fish_stock_nulls['country_name'].value_counts()
fish_stock_dic = fish_stock_null_counts.to_dict()

In [24]:
# List of countries with 'Fish Stock Status' nulls (dict key order).
fish_stock_countries = [*fish_stock_dic]

In [25]:
# We are not using Saint Vincent and the Grenadines.
# Filter instead of calling list.remove so the cell can be re-run without a
# ValueError once the name has already been taken out.
fish_stock_countries = [
    name for name in fish_stock_countries
    if name != 'Saint Vincent and the Grenadines'
]

In [26]:
# Check that both lists hold the same countries. Compare sorted copies so the
# result does not depend on the order in which each list happened to be built.
sorted(fish_stock_countries) == sorted(fish_countries)

True

In [27]:
# Confirm the marine and fisheries lists cover exactly the same countries.
marine_sorted = sorted(marine_countries)
fish_sorted = sorted(fish_countries)
marine_sorted == fish_sorted

True

In [28]:
# Countries with fisheries nulls are carried forward under the name land_locked.
# Copy the list so later edits to either name do not silently change the other.
land_locked = list(fish_countries)

In [29]:
# Preview; fill in with zeros for "marine protected status, fisheries, and fish stock".
land_locked[0:3]

['Zambia', 'Slovakia', 'Paraguay']

High-null indicators that are not ocean-related.

In [30]:
# Rows where the 'Agricultural Subsidies' indicator is missing.
ag_missing_mask = epi['Agricultural Subsidies'].isna()
ag_nulls = epi[ag_missing_mask]

In [31]:
# country -> number of rows missing 'Agricultural Subsidies'
ag_null_counts = ag_nulls['country_name'].value_counts()
ag_dic = ag_null_counts.to_dict()

In [32]:
# List of countries with 'Agricultural Subsidies' nulls (dict key order).
ag_countries = [*ag_dic]

In [33]:
# Leave Marshall Islands and Saint Lucia out of the list. Filtering (rather
# than list.remove) keeps the cell safe to re-run: remove() raises ValueError
# on a second execution because the names are already gone.
ag_excluded = {'Marshall Islands', 'Saint Lucia'}
ag_countries = [name for name in ag_countries if name not in ag_excluded]

In [38]:
# Number of countries left in ag_countries after the removals above.
len(ag_countries)

24

In [None]:
# Preview; list of countries to use the regional values for agriculture subsidies.
ag_countries[0:3]

checking other nulls.

In [34]:
# Rows missing 'Ecosystem Vitality', then the number of such rows per country.
eco_vitality_missing_mask = epi['Ecosystem Vitality'].isna()
eco_vitality = epi[eco_vitality_missing_mask]
eco_vitality['country_name'].value_counts()

Marshall Islands                    8
Saint Vincent and the Grenadines    6
Micronesia                          6
Saint Lucia                         6
Samoa                               4
Sao Tome and Principe               4
Maldives                            4
Name: country_name, dtype: int64

In [35]:
# Countries to exclude: those missing 'Ecosystem Vitality' plus the ones
# flagged earlier in the notebook.
drop_countries = [
    'Maldives',
    'Saint Vincent and the Grenadines',
    'Saint Lucia',
    'Marshall Islands',
    'Micronesia',
    'Sao Tome and Principe',
    'Samoa',
]

In [36]:
# Keep every row whose country is not in drop_countries.
is_dropped_country = epi['country_name'].isin(drop_countries)
epi_2 = epi.loc[~is_dropped_country]

In [37]:
# Recount missing values per column on the reduced table; show non-zero counts.
nulls = epi_2.isna().sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]

Marine Protected Areas    864
Fisheries                 836
Fish Stock Status         836
Agricultural Subsidies    448
GDP                        13
dtype: int64

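Nulls are resolved for every indicator except Marine Protected Areas, Fisheries, Fish Stock Status and Agricultural Subsidies, which still need the fills planned above (zeros for landlocked countries, regional values for agricultural subsidies), and GDP, which still has 13 nulls.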