In [6]:
import sys
!apk add build-essential
!pip install fbprophet

/bin/sh: apk: command not found
Collecting fbprophet
  Using cached https://files.pythonhosted.org/packages/f7/86/4509e952f9724f084625e93e0bf8d8519b25c79029a0a916b0f996644c75/fbprophet-0.6.tar.gz
Building wheels for collected packages: fbprophet
  Running setup.py bdist_wheel for fbprophet ... [?25l-^C
error
[31m  Failed building wheel for fbprophet[0m
[?25h  Running setup.py clean for fbprophet


In [1]:
import os
from pathlib import Path
import urllib.request

import pandas as pd
import numpy as np
import io
import requests


def download_world_bank_indicator(indicator_name: str, directory: str = "."):
    """Download the CSV export of one World Bank indicator.

    The file is written to ``<directory>/<indicator_name>.csv``. The target
    directory (including missing parents) is created when it does not exist,
    and an existing file with the same name is overwritten.

    Args:
        indicator_name: Indicator code inserted into the request URL and used
            as the stem of the saved file.
        directory: Folder that receives the file; anything accepted by
            ``pathlib.Path`` works. Defaults to the current directory.

    Returns:
        None. The only effect is the file written to disk.
    """
    target_dir = Path(directory)
    destination = target_dir / (indicator_name + ".csv")

    # Create the folder first so urlretrieve can open the destination file.
    os.makedirs(target_dir, exist_ok=True)

    source_url = f"https://api.worldbank.org/indicator/{indicator_name}?format=csv"
    urllib.request.urlretrieve(source_url, destination)


def extract_series_of_newest_data(csv_path: str):
    """Return the last non-missing value of every record in an indicator CSV.

    The CSV is read with its first two columns as a (two-level) index and its
    first line as the header, then transposed: every original row becomes a
    column and the original header labels become the row index. For each
    column, the value at the last row holding a non-missing entry is taken.
    Columns without any data are left out of the result.

    ``DataFrame.lookup`` (deprecated in pandas 1.2, removed in 2.0) is
    replaced by positional NumPy indexing so the function works on both old
    and current pandas versions.

    Args:
        csv_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.Series indexed by the (first column, second column) pairs of
        the CSV, holding the last valid value of each record.
    """
    # NOTE(review): presumably the header labels are years in ascending
    # order, so "last valid" means "most recent" -- confirm against the feed.
    df = pd.read_csv(csv_path, index_col=[0, 1], header=0).T

    # Row label of the last non-NaN cell per column; missing (None/NaN) when
    # the column holds no data at all.
    last_valid_label = df.apply(pd.Series.last_valid_index)
    has_data = last_valid_label.notna().to_numpy()

    no_na_df = df.loc[:, has_data]

    # Translate (row label, column) pairs into positions and pick one cell
    # per column with paired fancy indexing.
    row_positions = no_na_df.index.get_indexer(last_valid_label[has_data])
    column_positions = np.arange(no_na_df.shape[1])
    recent_data = no_na_df.to_numpy()[row_positions, column_positions]

    # infer_objects() restores a numeric dtype when the frame was object-typed.
    return pd.Series(recent_data, index=no_na_df.columns).infer_objects()


def add_new_feature(
    main_df: pd.DataFrame, additional_feature: pd.Series, name: str
) -> pd.DataFrame:
    """Attach a per-country feature to ``main_df`` as a new column.

    Rows of ``main_df`` are matched to ``additional_feature`` through the
    ``countryterritoryCode`` column on one side and the index level named
    ``"Country Code"`` on the other. Rows whose code has no counterpart get
    NaN. ``main_df`` itself is not modified.

    The feature is re-keyed by its ``"Country Code"`` level directly instead
    of going through ``reset_index()`` and reading column ``0``; that column
    only exists for an unnamed Series, so a named Series used to raise
    ``KeyError``.

    Args:
        main_df: Frame containing a ``countryterritoryCode`` column.
        additional_feature: Series whose index has a level named
            ``"Country Code"``.
        name: Label of the column to add.

    Returns:
        A new frame with a fresh integer index, ``countryterritoryCode`` as
        the first column and ``name`` as the last one.
    """
    tmp_df = main_df.set_index("countryterritoryCode")

    # Same values, but indexed by country code only so the assignment below
    # aligns on it.
    feature_by_code = additional_feature.copy()
    feature_by_code.index = additional_feature.index.get_level_values("Country Code")

    tmp_df[name] = feature_by_code
    return tmp_df.reset_index()

In [2]:
# World Bank indicators to merge into the case data.
# Key: label used as the column name of the added feature.
# Value: indicator code passed to download_world_bank_indicator, which also
#        uses it as the stem of the saved CSV file.
# Insertion order is the order in which the columns are appended.
indicator_names = {
    "GDP (current US$)": "NY.GDP.MKTP.CD",
    "GDP per capita (current US$)": "NY.GDP.PCAP.CD",
    "Access to electricity (% of population)": "EG.ELC.ACCS.ZS",
    "Current health expenditure per capita (current US$)": "SH.XPD.CHEX.PC.CD",
    "Current health expenditure (% of GDP)": "SH.XPD.CHEX.GD.ZS",
    "Hospital beds (per 1,000 people)": "SH.MED.BEDS.ZS",
}

In [7]:
# Show the kernel's working directory: the relative dataset paths configured
# in this notebook are resolved against it.
%pwd

'/home/ec2-user/SageMaker'

In [10]:
# Folder that receives one downloaded CSV per World Bank indicator
# (relative to the working directory).
data_dir = Path("Datasets/DLL/World_Bank")
# Destination of the merged case + indicator table written by df.to_csv.
dataset_output_path = "Datasets/DLL/ECDC/DLL_COVID_TRAIN.csv"

In [11]:
# Fetch the ECDC case-distribution CSV. A timeout keeps the cell from hanging
# forever on a stalled connection.
response = requests.get(
    "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv",
    timeout=60,
)
# Stop on an HTTP error instead of handing an error page to the CSV parser.
response.raise_for_status()

df = pd.read_csv(
    io.StringIO(response.content.decode("utf-8")),
    usecols=[
        "dateRep",
        "cases",
        "deaths",
        "countriesAndTerritories",
        "popData2018",
        "countryterritoryCode",
    ],
    parse_dates=["dateRep"],
    # NOTE(review): ECDC publishes dateRep day-first (DD/MM/YYYY) -- confirm
    # against the feed. Stating it explicitly avoids depending on format
    # inference from the first row, which can pick month-first when that
    # row's day is <= 12. `infer_datetime_format` is deprecated in pandas 2
    # and therefore no longer passed.
    dayfirst=True,
)

# Append one column per World Bank indicator, matched on the country code.
for key, value in indicator_names.items():
    download_world_bank_indicator(value, directory=data_dir)
    csv_path = data_dir / (value + ".csv")
    new_feature = extract_series_of_newest_data(csv_path)
    df = add_new_feature(df, new_feature, key)

# Rows without a country code cannot be matched to any indicator.
df = df.dropna(subset=["countryterritoryCode"])

# Only the World Bank folder is created by the download helper; make sure the
# output folder exists before writing.
Path(dataset_output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(dataset_output_path, index=False)

In [12]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,countryterritoryCode,dateRep,cases,deaths,countriesAndTerritories,popData2018,GDP (current US$),GDP per capita (current US$),Access to electricity (% of population),Current health expenditure per capita (current US$),Current health expenditure (% of GDP),"Hospital beds (per 1,000 people)"
0,AFG,2020-04-18,51,1,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
1,AFG,2020-04-17,10,4,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
2,AFG,2020-04-16,70,2,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
3,AFG,2020-04-15,49,2,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
4,AFG,2020-04-14,58,3,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
5,AFG,2020-04-13,52,0,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
6,AFG,2020-04-12,34,3,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
7,AFG,2020-04-11,37,0,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
8,AFG,2020-04-10,61,1,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
9,AFG,2020-04-09,56,3,Afghanistan,37172386.0,1.936297e+10,520.896603,97.700000,67.122650,11.777194,0.5
