In [1]:
import pandas as pd
import numpy as np

In [2]:
epi = pd.read_csv('../data/EPI_data/ObservationData_rqridaf.csv')

***Data Import***

In [3]:
def epi_to_country_data(country_name):
    country = epi[epi['location'] == country_name]

    indicators = list(country['indicator'].value_counts().to_dict().keys())

    min_date = country['Date'].min()
    max_date = country['Date'].max()

    dates = pd.Series(list(range(min_date, max_date + 1)))
    transpose_country = pd.DataFrame()
    transpose_country['date'] = dates
    transpose_country['country_name'] = country_name

    for indicator in indicators:
        indicator_df = country[country['indicator'] == indicator]

        date_values = {key: None for key in list(range(min_date, max_date + 1))}
        for row in indicator_df.iterrows():
            date_values[row[1]['Date']] = row[1]['Value']

        transpose_country[indicator] = date_values.values()
        
    return transpose_country

In [4]:
best_indicators = [key for key, value in epi['indicator'].value_counts().to_dict().items() if value > 500]

In [5]:
epi = epi[epi['indicator'].isin(best_indicators)]

In [6]:
epi_new = pd.DataFrame()

for country_name in  set(epi['location'].values):
    country_df = epi_to_country_data(country_name)
    epi_new = pd.concat([epi_new, country_df], ignore_index = True)

In [7]:
epi_new.head(50)

Unnamed: 0,date,country_name,PM2.5 Exposure,Ecosystem Vitality,Unsafe sanitation,Terrestrial biome protection (global weights),Water Resources,Wastewater Treatment,Fisheries,Terrestrial biome protection (national weights),...,Critical Habitat Protection,Agricultural Subsidies,Change in Forest Cover,Coastal Shelf Fishing Pressure,Pesticide Regulation,Trend in CO2 Emissions per KWH,Climate & Energy,Trend in Carbon Intensity,Change of Trend in Carbon Intensity,Climate
0,2002,Comoros,100.0,22.45,4.73,0.0,3.96,3.96,63.25,0.0,...,0.0,100.0,68.23,97.93,8.0,,,,,
1,2003,Comoros,100.0,23.02,5.11,0.0,3.96,3.96,67.49,0.0,...,0.0,100.0,68.23,95.58,8.0,,,,,
2,2004,Comoros,100.0,23.09,5.49,0.0,3.96,3.96,68.05,0.0,...,0.0,100.0,68.23,96.7,8.0,,,,,
3,2005,Comoros,100.0,23.71,5.89,0.0,3.96,3.96,72.72,0.0,...,0.0,100.0,68.23,93.83,8.0,,,,,
4,2006,Comoros,100.0,22.49,6.29,0.0,3.96,3.96,63.54,0.0,...,0.0,100.0,68.23,96.52,8.0,,,,,
5,2007,Comoros,100.0,21.33,6.29,0.0,3.96,3.96,52.81,0.0,...,0.0,100.0,68.23,96.52,16.0,,,,,
6,2008,Comoros,100.0,21.33,6.29,0.0,3.96,3.96,52.81,0.0,...,0.0,100.0,68.23,96.52,16.0,,,,,
7,2009,Comoros,100.0,20.72,6.29,0.0,3.96,3.96,48.26,0.0,...,0.0,100.0,68.23,96.52,16.0,,,,,
8,2010,Comoros,100.0,20.72,6.3,0.0,3.96,3.96,48.26,0.0,...,0.0,100.0,68.23,96.52,16.0,,,,,
9,2011,Comoros,100.0,20.72,6.3,0.0,3.96,3.96,48.26,0.0,...,0.0,100.0,68.23,96.52,16.0,,,,,


In [8]:
high_data = [key for key, value in epi_new['country_name'].value_counts().to_dict().items() if value >5]

In [9]:
high_data

['Ethiopia',
 'Sri Lanka',
 'Eritrea',
 'Belize',
 'Gambia',
 'Uganda',
 'Uzbekistan',
 'Greece',
 'Myanmar',
 'Nigeria',
 'Republic of Congo',
 'Rwanda',
 'Belgium',
 'Burkina Faso',
 'Kuwait',
 'Switzerland',
 'Latvia',
 'Angola',
 'Brunei Darussalam',
 'Iran',
 'Russia',
 'Lebanon',
 'Ukraine',
 'Togo',
 'Australia',
 'Djibouti',
 'India',
 'Trinidad and Tobago',
 'Romania',
 'Poland',
 'Oman',
 'Iceland',
 'Norway',
 'Iraq',
 'Belarus',
 'Venezuela',
 'Czech Republic',
 'Cyprus',
 'Algeria',
 'Botswana',
 'Fiji',
 'Grenada',
 'Kazakhstan',
 'Cambodia',
 'Guinea',
 'Seychelles',
 'Finland',
 'Nicaragua',
 'Austria',
 'Guatemala',
 "Cote d'Ivoire",
 'Netherlands',
 'Eswatini',
 'Indonesia',
 'Gabon',
 'Portugal',
 'Bhutan',
 'Italy',
 'South Africa',
 'Mozambique',
 'Zambia',
 'Serbia',
 'Kenya',
 'Costa Rica',
 'Bolivia',
 'Egypt',
 'Denmark',
 'Mongolia',
 'Turkmenistan',
 'Cameroon',
 'Antigua and Barbuda',
 'Barbados',
 'Tunisia',
 'Lesotho',
 'Pakistan',
 'Kyrgyzstan',
 'China',

In [10]:
epi_new = epi_new[epi_new['country_name'].isin(high_data)]

In [11]:
epi_new

Unnamed: 0,date,country_name,PM2.5 Exposure,Ecosystem Vitality,Unsafe sanitation,Terrestrial biome protection (global weights),Water Resources,Wastewater Treatment,Fisheries,Terrestrial biome protection (national weights),...,Critical Habitat Protection,Agricultural Subsidies,Change in Forest Cover,Coastal Shelf Fishing Pressure,Pesticide Regulation,Trend in CO2 Emissions per KWH,Climate & Energy,Trend in Carbon Intensity,Change of Trend in Carbon Intensity,Climate
0,2002,Comoros,100,22.45,4.73,0,3.96,3.96,63.25,0,...,0,100,68.23,97.93,8,,,,,
1,2003,Comoros,100,23.02,5.11,0,3.96,3.96,67.49,0,...,0,100,68.23,95.58,8,,,,,
2,2004,Comoros,100,23.09,5.49,0,3.96,3.96,68.05,0,...,0,100,68.23,96.7,8,,,,,
3,2005,Comoros,100,23.71,5.89,0,3.96,3.96,72.72,0,...,0,100,68.23,93.83,8,,,,,
4,2006,Comoros,100,22.49,6.29,0,3.96,3.96,63.54,0,...,0,100,68.23,96.52,8,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3460,2016,Venezuela,100,71.24,88.79,99.39,70.23,70.23,14.09,99.39,...,,,,,,76.6719,70.24,72.0458,,
3461,2017,Venezuela,,,,,,,,,...,,,,,,,,,,
3462,2018,Venezuela,100,55.99,52.4,99.99,80.74,80.74,56.16,99.95,...,,,,,,,37.8,,,
3463,2019,Venezuela,,,,,,,,,...,,,,,,,,,,


In [12]:
epi_new['country_name'].value_counts()

Venezuela                19
Papua New Guinea         19
Republic of Congo        19
Rwanda                   19
Belgium                  19
                         ..
Saint Lucia               9
Micronesia                9
Samoa                     9
Sao Tome and Principe     9
Maldives                  9
Name: country_name, Length: 185, dtype: int64

In [13]:
def impute_values(country_name):
    country = epi_new[epi_new['country_name'] == country_name].copy()
    for col in country.columns:
        country[col] = country[col].map(lambda x: np.nan if x == 'None' else x)
        country[col].interpolate(method='linear',order=1,inplace=True, axis=0)
    return country
        

In [14]:
epi_new_new = pd.DataFrame()

for country_name in  set(epi_new['country_name'].values):
    country_df = impute_values(country_name)
    epi_new_new = pd.concat([epi_new_new, country_df], ignore_index = True)

In [15]:
epi_new_new.head()

Unnamed: 0,date,country_name,PM2.5 Exposure,Ecosystem Vitality,Unsafe sanitation,Terrestrial biome protection (global weights),Water Resources,Wastewater Treatment,Fisheries,Terrestrial biome protection (national weights),...,Critical Habitat Protection,Agricultural Subsidies,Change in Forest Cover,Coastal Shelf Fishing Pressure,Pesticide Regulation,Trend in CO2 Emissions per KWH,Climate & Energy,Trend in Carbon Intensity,Change of Trend in Carbon Intensity,Climate
0,2002,Comoros,100.0,22.45,4.73,0.0,3.96,3.96,63.25,0.0,...,0.0,100.0,68.23,97.93,8.0,,,,,
1,2003,Comoros,100.0,23.02,5.11,0.0,3.96,3.96,67.49,0.0,...,0.0,100.0,68.23,95.58,8.0,,,,,
2,2004,Comoros,100.0,23.09,5.49,0.0,3.96,3.96,68.05,0.0,...,0.0,100.0,68.23,96.7,8.0,,,,,
3,2005,Comoros,100.0,23.71,5.89,0.0,3.96,3.96,72.72,0.0,...,0.0,100.0,68.23,93.83,8.0,,,,,
4,2006,Comoros,100.0,22.49,6.29,0.0,3.96,3.96,63.54,0.0,...,0.0,100.0,68.23,96.52,8.0,,,,,


In [16]:
epi_new_new.shape

(3425, 35)

In [24]:
nulls = epi_new_new.isnull().sum().sort_values(ascending =False)
nulls[nulls > 0]

Climate                                            959
Change of Trend in Carbon Intensity                959
Trend in CO2 Emissions per KWH                     957
Marine Protected Areas                             868
Trend in Carbon Intensity                          853
Coastal Shelf Fishing Pressure                     845
Fisheries                                          842
Fish Stock Status                                  842
Change in Forest Cover                             826
Climate & Energy                                   801
Forests                                            668
Agricultural Subsidies                             466
Ecosystem Vitality                                  38
Environmental Performance Index                     38
Water Resources                                     26
Wastewater Treatment                                26
Environmental Health                                12
Health Impacts                                       9
Access to 

In [40]:
drop_list = ['Climate','Forests', 'Critical Habitat Protection', 'Change in Forest Cover','Change of Trend in Carbon Intensity', 'Coastal Shelf Fishing Pressure',
'Trend in CO2 Emissions per KWH','Trend in Carbon Intensity','Climate & Energy']

In [42]:
len(drop_list)

9

In [None]:
more to look at: marine, Fisheries, fish stock, agricultural subsidies.

In [39]:
epi_new_new['Trend in Carbon Intensity'][1000:1050].head(50)

1000    57.190000
1001    60.000000
1002    60.000000
1003    60.000000
1004    60.000000
1005    60.000000
1006    60.000000
1007    60.000000
1008    60.000000
1009    60.000000
1010    60.000000
1011    60.000000
1012    60.000000
1013    60.000000
1014    73.511300
1015    87.022599
1016    87.022599
1017    87.022599
1018    87.022599
1019    87.022599
1020    54.700000
1021    54.700000
1022    54.700000
1023    54.700000
1024    54.700000
1025    54.700000
1026    54.700000
1027    54.700000
1028    54.700000
1029    54.700000
1030    54.700000
1031    54.700000
1032    54.700000
1033    68.152785
1034    81.605571
1035    81.605571
1036    81.605571
1037    81.605571
1038    81.605571
1039    90.000000
1040    90.000000
1041    90.000000
1042    90.000000
1043    90.000000
1044    90.000000
1045    90.000000
1046    90.000000
1047    90.000000
1048    90.000000
1049    90.000000
Name: Trend in Carbon Intensity, dtype: float64

In [30]:
epi_new['Coastal Shelf Fishing Pressure'].head(50)

0     97.93
1     95.58
2      96.7
3     93.83
4     96.52
5     96.52
6     96.52
7     96.52
8     96.52
9     96.52
10    96.52
11     None
12    96.52
13     None
14     None
15     None
16     None
17     None
18     None
19    21.76
20    22.41
21     22.5
22    26.71
23    26.76
24    26.76
25    26.76
26    26.76
27    26.76
28    26.76
29    26.76
30     None
31    26.76
32     None
33     None
34     None
35     None
36     None
37     None
38    63.18
39     None
40     None
41     None
42     None
43     None
44     None
45     None
46     None
47        0
48        0
49        0
Name: Coastal Shelf Fishing Pressure, dtype: object

In [26]:
epi_new_new['Climate'].value_counts()

43.49    38
51.02    19
56.45    19
44.06    19
48.29    19
         ..
59.10    15
1.02     13
14.45     9
16.38     9
11.15     9
Name: Climate, Length: 131, dtype: int64

In [18]:
epi_new.shape

(3425, 35)

In [19]:
epi_new.columns

Index(['date', 'country_name', 'PM2.5 Exposure', 'Ecosystem Vitality',
       'Unsafe sanitation', 'Terrestrial biome protection (global weights)',
       'Water Resources', 'Wastewater Treatment', 'Fisheries',
       'Terrestrial biome protection (national weights)',
       'Sanitation & Drinking Water', 'Unsafe drinking water',
       'Environmental Performance Index', 'Fish Stock Status', 'Air Quality',
       'Biodiversity & Habitat', 'Marine Protected Areas', 'Agriculture',
       'Environmental Health', 'Forests', 'PM2.5 Exceedance',
       'Household Air Quality', 'Health Impacts', 'Access to Electricity',
       'Child Mortality', 'Critical Habitat Protection',
       'Agricultural Subsidies', 'Change in Forest Cover',
       'Coastal Shelf Fishing Pressure', 'Pesticide Regulation',
       'Trend in CO2 Emissions per KWH', 'Climate & Energy',
       'Trend in Carbon Intensity', 'Change of Trend in Carbon Intensity',
       'Climate'],
      dtype='object')

In [20]:
nulls = epi_new.isnull().sum().sort_values(ascending =False)
nulls[nulls > 0]

Critical Habitat Protection                        2497
Climate                                            2003
Trend in CO2 Emissions per KWH                     1890
Change of Trend in Carbon Intensity                1874
Coastal Shelf Fishing Pressure                     1811
Change in Forest Cover                             1781
Trend in Carbon Intensity                          1737
Climate & Energy                                   1581
Agricultural Subsidies                             1572
Forests                                            1494
Fish Stock Status                                  1428
Marine Protected Areas                             1427
Fisheries                                          1403
Child Mortality                                    1283
Pesticide Regulation                               1282
Health Impacts                                     1103
Access to Electricity                              1103
Household Air Quality                           

In [21]:
s = pd.Series([1,2,3,np.nan,np.nan,np.nan])
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    NaN
dtype: float64

In [22]:
s.interpolate(method='spline', order=1)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
dtype: float64