In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# EPI observations; later cells read its 'location', 'indicator', 'Date' and 'Value' columns.
epi_csv_path = 'data/EPI_data/ObservationData_rqridaf.csv'
epi = pd.read_csv(epi_csv_path)

In [3]:
# Demographic table.
# NOTE(review): `demo` is not referenced again in the cells shown here — confirm it is used later.
demo_csv_path = 'data/Demographic_data/demograpic_data.csv'
demo = pd.read_csv(demo_csv_path)

In [4]:
# GDP table used for years before 2016; looked up by 'Country Name' and a plain year column such as '2015'.
gdp_pre_16_csv_path = 'data/GDP_data/world_gdp.csv'
gdp_pre_16 = pd.read_csv(gdp_pre_16_csv_path)

In [5]:
# GDP table used for 2016-2019; looked up by 'Country Name' and columns named like '2016 [YR2016]'.
gdp_post_16_csv_path = 'data/GDP_data/World_Bank_GDP.csv'
gdp_post_16 = pd.read_csv(gdp_post_16_csv_path)

***Data Import***

- Convert the EPI indicators into columns of a new data frame with one row per country and year.

In [6]:
def epi_to_country_data(country_name, data=None):
    """Reshape one country's EPI observations into a wide, one-row-per-year frame.

    Parameters
    ----------
    country_name : str
        Value matched against the ``location`` column.
    data : pandas.DataFrame, optional
        Observations with ``location``, ``indicator``, ``Date`` and ``Value``
        columns, one observation per row. Defaults to the notebook-level
        ``epi`` frame, so existing one-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``year`` and ``country_name`` followed by one column per
        indicator, most frequent indicator first. ``year`` covers every
        integer from the country's earliest to its latest ``Date``; a year
        with no observation for an indicator holds a missing value. When no
        row matches ``country_name`` an empty frame with just the ``year``
        and ``country_name`` columns is returned.
    """
    observations = epi if data is None else data
    country = observations[observations['location'] == country_name]

    # Without this guard min()/max() are NaN and range() fails with an
    # unhelpful conversion error.
    if country.empty:
        return pd.DataFrame({
            'year': pd.Series(dtype='int64'),
            'country_name': pd.Series(dtype='object'),
        })

    years = list(range(country['Date'].min(), country['Date'].max() + 1))

    # Collect every column first and build the frame once, instead of
    # inserting columns into an existing frame one at a time.
    columns = {'year': years, 'country_name': country_name}

    # value_counts() yields indicators by descending frequency, which fixes
    # the column order of the result.
    for indicator in country['indicator'].value_counts().index:
        indicator_rows = country[country['indicator'] == indicator]
        # dict() keeps the last value when a year occurs more than once.
        value_by_year = dict(zip(indicator_rows['Date'], indicator_rows['Value']))
        columns[indicator] = [value_by_year.get(year) for year in years]

    return pd.DataFrame(columns)

In [7]:
# Indicators that occur in more than 500 rows of the EPI table, most frequent first.
indicator_counts = epi['indicator'].value_counts()
best_indicators = indicator_counts[indicator_counts > 500].index.tolist()

In [8]:
# Restrict the EPI table to rows whose indicator is listed in best_indicators.
is_best_indicator = epi['indicator'].isin(best_indicators)
epi = epi[is_best_indicator]

In [9]:
# Build one wide frame per country and concatenate them in a single call.
# - unique() walks countries in order of first appearance, so the row order of
#   epi_new is the same on every run (iterating a set of strings is not).
# - dropna() skips rows with no location, which cannot match any country.
# - A single concat avoids re-copying the accumulated frame on every iteration.
country_frames = [
    epi_to_country_data(country_name)
    for country_name in epi['location'].dropna().unique()
]

# pd.concat raises on an empty list; fall back to an empty frame in that case.
epi_new = pd.concat(country_frames, ignore_index=True) if country_frames else pd.DataFrame()

- Add GDP data: years before 2016 come from `world_gdp.csv`, years 2016–2019 from `World_Bank_GDP.csv`.

In [10]:
# Attach a GDP figure to every (country, year) row of epi_new.
# Years before GDP_SPLIT_YEAR are read from gdp_pre_16 (plain year columns),
# years from GDP_SPLIT_YEAR up to but excluding GDP_END_YEAR from gdp_post_16
# ('YYYY [YRYYYY]' columns). A row keeps the default 0 when its year is outside
# both ranges, its year column is absent, or its country is not in the table.
GDP_SPLIT_YEAR = 2016
GDP_END_YEAR = 2020


def _first_row_per_country(gdp_frame):
    """Index a GDP table by 'Country Name', keeping the first row for each country."""
    return gdp_frame.drop_duplicates(subset='Country Name').set_index('Country Name')


def _gdp_for(country_name, year, pre_16_rows, post_16_rows):
    """Return the GDP cell for one country and year, or 0 when none is available."""
    year = int(year)
    if year < GDP_SPLIT_YEAR:
        table, column = pre_16_rows, f'{year}'
    elif year < GDP_END_YEAR:
        table, column = post_16_rows, f'{year} [YR{year}]'
    else:
        # No GDP source covers this year; never reuse a value from another row.
        return 0

    if column not in table.columns or country_name not in table.index:
        return 0
    return table.at[country_name, column]


_gdp_pre_16_rows = _first_row_per_country(gdp_pre_16)
_gdp_post_16_rows = _first_row_per_country(gdp_post_16)

# Assign the whole column at once so pandas infers a single dtype, rather than
# writing floats and strings cell by cell into an integer column.
epi_new['GDP'] = [
    _gdp_for(country_name, year, _gdp_pre_16_rows, _gdp_post_16_rows)
    for country_name, year in zip(epi_new['country_name'], epi_new['year'])
]

In [11]:
# Five most frequent GDP values, to spot placeholder entries.
epi_new['GDP'].value_counts().head(5)

0                 285
..                 15
115000000000.0      9
189000000000.0      5
199000000000.0      5
Name: GDP, dtype: int64

In [12]:
def _placeholder_to_zero(gdp_value):
    """Return 0 for the '..' placeholder and every other value unchanged."""
    if gdp_value == '..':
        return 0
    return gdp_value


epi_new['GDP'] = epi_new['GDP'].map(_placeholder_to_zero)

- Save data.

In [14]:
# Write the wrangled frame to disk without the row index.
wrangled_csv_path = 'data/dataframes/epi_wrangled.csv'
epi_new.to_csv(wrangled_csv_path, index=False)

- To Step 2 ->