In [78]:
import os
from pathlib import Path
import urllib.request

import pandas as pd
import numpy as np
import io
import requests


def download_world_bank_indicator(indicator_name: str, directory: str = "."):
    """Download one World Bank indicator as ``<directory>/<indicator_name>.csv``.

    Args:
        indicator_name: Indicator code inserted into the request URL and used
            as the file stem, e.g. ``"NY.GDP.PCAP.CD"``.
        directory: Folder that receives the file. It is created, together with
            any missing parents, when it does not exist yet.

    Returns:
        None. The CSV is written to disk as a side effect.
    """
    target_dir = Path(directory)
    destination = target_dir / (indicator_name + ".csv")

    # Make sure the folder exists before urlretrieve tries to write into it.
    os.makedirs(target_dir, exist_ok=True)

    source_url = f"https://api.worldbank.org/indicator/{indicator_name}?format=csv"
    urllib.request.urlretrieve(source_url, destination)


def extract_series_of_newest_data(csv_path: str):
    """Return the last non-missing value of every row of an indicator CSV.

    The CSV is read with its first two columns as the row index and the first
    line as the header. For each CSV row the right-most non-missing cell is
    selected (presumably the most recent year, assuming the value columns are
    in chronological order -- TODO confirm against the downloaded files).

    Args:
        csv_path: Path (or any object ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A ``pandas.Series`` indexed by the CSV's first two columns, holding
        the selected value for each row. Rows without any non-missing value
        are left out; if no row has data, an empty float Series is returned.
    """
    # Transpose so every CSV row becomes one column of observations.
    observations = pd.read_csv(csv_path, index_col=[0, 1], header=0).T
    observations = observations.dropna(axis=1, how="all")
    if observations.shape[1] == 0:
        return pd.Series([], index=observations.columns, dtype=float)

    # ``DataFrame.lookup`` was removed in pandas 2.0, so pick the cells by
    # position instead: argmax on the bottom-up mask finds, per column, the
    # first valid cell counted from the last row.
    present = observations.notna().to_numpy()
    last_valid_row = present.shape[0] - 1 - present[::-1].argmax(axis=0)
    column_positions = np.arange(present.shape[1])
    newest_values = observations.to_numpy()[last_valid_row, column_positions]

    return pd.Series(newest_values, index=observations.columns)


def add_new_feature(
    main_df: pd.DataFrame, additional_feature: pd.Series, name: str
) -> pd.DataFrame:
    """Attach a per-country feature to ``main_df`` as a new column.

    Rows are matched by comparing ``main_df["countryterritoryCode"]`` with the
    ``"Country Code"`` level of ``additional_feature``'s index. Rows whose
    code has no match receive NaN.

    Args:
        main_df: Frame containing a ``"countryterritoryCode"`` column. It is
            not modified.
        additional_feature: Series whose index has a level (or name)
            ``"Country Code"``. The Series' own name is irrelevant.
        name: Label of the column to add.

    Returns:
        A new frame with ``"countryterritoryCode"`` moved to the first column,
        the remaining original columns, the new ``name`` column, and a fresh
        ``RangeIndex``.

    Raises:
        KeyError: If ``main_df`` lacks ``"countryterritoryCode"`` or the
            feature index has no ``"Country Code"`` level.
    """
    keyed_df = main_df.set_index("countryterritoryCode")

    # Re-key the feature by country code only. Taking values and the level
    # directly avoids depending on the column label that ``reset_index()``
    # would generate (0 for an unnamed Series, the Series name otherwise).
    country_codes = additional_feature.index.get_level_values("Country Code")
    feature_by_code = pd.Series(additional_feature.to_numpy(), index=country_codes)

    # Assignment aligns on the index, i.e. on the country code.
    keyed_df[name] = feature_by_code
    return keyed_df.reset_index()

In [79]:
# Folder that receives the downloaded World Bank CSV files.
data_dir = Path("data")

# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether it is still needed.
dataset_output_path = "DLL_COVID_TRAIN.csv"

# Output column label -> World Bank indicator code, listed in the order in
# which the columns are appended to the feature table.
indicator_names = dict(
    [
        ("GDP per capita (current US$)", "NY.GDP.PCAP.CD"),
        ("Current health expenditure per capita (current US$)", "SH.XPD.CHEX.PC.CD"),
        ("Population ages 65 and above (% of total population)", "SP.POP.65UP.TO.ZS"),
        ("Population density", "EN.POP.DNST"),
        ("Urban population (% of total population)", "SP.URB.TOTL.IN.ZS"),
    ]
)

In [80]:
# Fetch the ECDC case-distribution table; only the date and the two country
# identifier columns are kept.
ecdc_response = requests.get(
    "https://opendata.ecdc.europa.eu/covid19/casedistribution/csv",
    timeout=60,  # do not hang the kernel forever on a stalled connection
)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
ecdc_response.raise_for_status()

df = pd.read_csv(
    io.StringIO(ecdc_response.content.decode("utf-8")),
    usecols=[
        "dateRep",
        "countriesAndTerritories",
        "countryterritoryCode",
    ],
    # ``infer_datetime_format`` is deprecated since pandas 2.0 (format
    # inference is the default there), so it is no longer passed.
    parse_dates=["dateRep"],
)

# For every configured indicator: download its CSV, reduce it to one value per
# country and append it as a column named after the dictionary key.
for key, value in indicator_names.items():
    download_world_bank_indicator(value, directory=data_dir)
    csv_path = data_dir / (value + ".csv")
    new_feature = extract_series_of_newest_data(csv_path)
    df = add_new_feature(df, new_feature, key)

# Rows without a country code cannot be matched to any indicator.
df = df.dropna(subset=["countryterritoryCode"])

In [81]:
# Keep only the first row found for each country code, then renumber rows from 0.
df = (
    df.drop_duplicates(subset="countryterritoryCode", keep="first")
    .reset_index(drop=True)
)

In [82]:
# Remove the report-date column from the per-country table.
df = df.drop("dateRep", axis="columns")

In [83]:
# Preview the first five rows of the assembled feature table.
df.head(n=5)

Unnamed: 0,countryterritoryCode,countriesAndTerritories,GDP per capita (current US$),Current health expenditure per capita (current US$),Population ages 65 and above (% of total population),Population density,Urban population (% of total population)
0,AFG,Afghanistan,520.896603,67.12265,2.584927,56.93776,25.495
1,ALB,Albania,5268.848504,,13.744736,104.612263,60.319
2,DZA,Algeria,4114.715061,258.494293,6.362497,17.730075,72.629
3,AND,Andorra,42029.762737,4040.786621,,163.842553,88.062
4,AGO,Angola,3432.385736,114.459641,2.216374,24.713052,65.514


In [84]:
# Write the feature table to disk without the positional row index.
df.to_csv(path_or_buf="../Datasets/world_bank_features.csv", index=False)