In [1]:
import pandas as pd
import numpy as np

***Data Cleaning and Exploration***

In [2]:
# Load the wrangled EPI table from the project's data folder.
epi = pd.read_csv(filepath_or_buffer='data/dataframes/epi_wrangled.csv')

- Keep only countries with more than 5 records (drop those with 5 or fewer).

In [3]:
# Names of the countries that occur in more than 5 rows of `epi`,
# in the order produced by value_counts().
high_data = (
    epi['country_name']
    .value_counts()
    .loc[lambda counts: counts > 5]
    .index
    .tolist()
)

In [4]:
# Restrict `epi` to the rows whose country is listed in `high_data`.
epi = epi.loc[epi['country_name'].isin(high_data)]

In [5]:
# Show how many rows remain for each country after filtering.
epi.country_name.value_counts()

Cameroon                            19
South Africa                        19
Afghanistan                         19
Dem. Rep. Congo                     19
Tunisia                             19
                                    ..
Sao Tome and Principe                9
Saint Lucia                          9
Saint Vincent and the Grenadines     9
Maldives                             9
Samoa                                9
Name: country_name, Length: 185, dtype: int64

- Linearly impute values in place of nulls.

In [6]:
def impute_values(country_name, df=None):
    """Return one country's rows with nulls filled by linear interpolation.

    Parameters
    ----------
    country_name : str
        Value of the ``country_name`` column selecting the rows to process.
    df : pandas.DataFrame, optional
        Frame to read the rows from. Defaults to the notebook-level ``epi``.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows (the source frame is not modified) in
        which the string ``'None'`` has been replaced by NaN and numeric
        columns have been linearly interpolated along the row order.

    Notes
    -----
    Interpolation follows the existing row order; this presumably expects
    each country's rows to be sorted chronologically -- TODO confirm.
    """
    source = epi if df is None else df
    country = source[source['country_name'] == country_name].copy()
    for col in country.columns:
        # The literal string 'None' is treated as a missing value.
        values = country[col].map(lambda x: np.nan if x == 'None' else x)

        # A column that held 'None' strings is read as text; convert it to
        # numbers only when every remaining non-null entry is numeric, so
        # genuinely textual columns (e.g. country_name) are left untouched.
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            as_numbers = pd.to_numeric(values, errors='coerce')
            if values.notna().any() and as_numbers.notna().sum() == values.notna().sum():
                values = as_numbers

        # Only numeric data can be interpolated; text columns are skipped.
        if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
            values = values.interpolate(method='linear')

        # Assign the result back explicitly: calling an inplace method on
        # `country[col]` acts on a temporary selection and is not guaranteed
        # to update `country`.
        country[col] = values
    return country
        

In [7]:
# Impute every country separately, then stitch the results together with a
# single concat (re-concatenating inside the loop copies the accumulated
# frame on every iteration).
# `unique()` yields the countries in order of first appearance, so the row
# order of `epi_new` is the same on every run; iterating a `set` of strings
# gives an order that can change between interpreter sessions.
country_frames = [impute_values(name) for name in epi['country_name'].unique()]

# pd.concat raises on an empty list, so fall back to an empty frame.
epi_new = pd.concat(country_frames, ignore_index=True) if country_frames else pd.DataFrame()

In [8]:
# Rebind `epi` to the imputed frame; from here on `epi` and `epi_new`
# refer to the same DataFrame object.
epi = epi_new

* Drop columns that have too many nulls or no valuable trends.

In [11]:
# Column labels to remove from `epi`.
drop_list = [
    'Climate',
    'Forests',
    'Critical Habitat Protection',
    'Change in Forest Cover',
    'Change of Trend in Carbon Intensity',
    'Coastal Shelf Fishing Pressure',
    'Trend in CO2 Emissions per KWH',
    'Trend in Carbon Intensity',
]

In [12]:
# Remove the columns named in `drop_list` from `epi`, modifying it in place.
epi.drop(labels=drop_list, axis='columns', inplace=True)