In [1]:
import altair as alt
import numpy as np
import pandas as pd

In [2]:
# Data sources for preprocessing.
# Health indicators hosted on GitHub. Note: this data has the population percentage
# for ages 10-14 for males, but no matching age range for females.
health_data_url = "https://raw.githubusercontent.com/ZeningQu/World-Bank-Data-by-Indicators/master/health/health.csv"
# Local file downloaded from https://data.worldbank.org/indicator/SP.POP.0014.TO.ZS?view=chart
# and converted to a readable CSV.
population_distribution_file = "API_SP.POP.0014.TO.ZS_DS2_en_csv_v2_1500178/API_SP.POP.0014.TO.ZS_DS2_en_csv_v2_1500178 (copy).csv"

# Region metadata source (currently not used):
# regions_url = "https://raw.githubusercontent.com/ZeningQu/World-Bank-Data-by-Indicators/master/health/Metadata_Country_API_8_DS2_en_csv_v2_10138079.csv"

In [3]:
def load_data(source):
    """Read a CSV into a DataFrame.

    The first line of the input is used as the header row, and whitespace
    that follows a delimiter is skipped.

    Parameters
    ----------
    source
        Anything ``pandas.read_csv`` accepts as its first argument
        (URL, local path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    read_options = {"header": 0, "skipinitialspace": True}
    return pd.read_csv(source, **read_options)

In [4]:
health_df = load_data(health_data_url)

# Remove regions and other aggregates; only keep countries.
# Any row whose 'Country Name' contains one of these substrings (case-sensitive) is dropped.
names_exclude = ['Sub-Saharan Africa', 'IDA', 'income', 'Fragile and conflict affected situations', 'IBRD',
                 'poor countries', 'demographic', 'developed countries: UN classification', 'Europe', 'Asia',
                 'Middle East', 'Pacific', 'North America', 'small states', 'members', 'World', 'Small states',  'Latin America & Caribbean']

# Build a single boolean mask instead of re-filtering the frame once per name.
# regex=False matches each name literally (no accidental regex metacharacters);
# na=False treats a missing country name as "no match" so the mask never contains NaN,
# which could not be negated or used for boolean filtering.
country_names = health_df['Country Name']
is_aggregate = pd.Series(False, index=health_df.index)
for name in names_exclude:
    is_aggregate |= country_names.str.contains(name, regex=False, na=False)
health_df = health_df[~is_aggregate]

print(health_df['Country Name'].unique())

['Albania' 'Azerbaijan' 'Burkina Faso' 'Bulgaria' 'Belarus'
 'Brunei Darussalam' 'Central African Republic' 'Chile' 'China' 'Cyprus'
 'Dominica' 'Dominican Republic' 'Algeria' 'Estonia' 'Fiji'
 'United Kingdom' 'Ghana' 'Gibraltar' 'Gambia, The' 'Equatorial Guinea'
 'Guyana' 'Haiti' 'Isle of Man' 'India' 'Iran, Islamic Rep.' 'Iraq'
 'Israel' 'Jamaica' 'Cambodia' 'Kiribati' 'Kuwait' 'Libya' 'Morocco'
 'Maldives' 'Mexico' 'Mali' 'Malta' 'Myanmar' 'Mauritania' 'Namibia'
 'New Caledonia' 'Netherlands' 'Nauru' 'Panama' 'Philippines'
 'Korea, Dem. People’s Rep.' 'Portugal' 'Qatar' 'Saudi Arabia' 'Senegal'
 'South Sudan' 'Slovenia' 'Togo' 'Tonga' 'Kosovo' 'Yemen, Rep.' 'Liberia'
 'Mozambique' 'Malawi' 'Poland' 'Thailand' 'Zimbabwe' 'Afghanistan'
 'Angola' 'Andorra' 'Antigua and Barbuda' 'Belgium' 'Belize' 'Bermuda'
 'Botswana' 'Channel Islands' "Cote d'Ivoire" 'Curacao' 'Germany'
 'Djibouti' 'Denmark' 'Eritrea' 'Faroe Islands' 'Gabon' 'Grenada' 'Jordan'
 'Lao PDR' 'Lebanon' 'St. Lucia' 'Liecht

In [5]:
# Drop the columns that are not used: the country code, growth rate, the coarse
# 15-64 / 65-and-above aggregates, the male-only 10-14 bracket and the sex ratio at birth.
unused_columns = [
    "Country Code",
    "Population growth (annual %)",
    'Population ages 15-64 (% of total)',
    'Population ages 15-64, female',
    'Population ages 15-64, female (% of total)',
    'Population ages 15-64, male',
    'Population ages 15-64, male (% of total)',
    'Population ages 65 and above (% of total)',
    'Population ages 65 and above, female',
    'Population ages 65 and above, female (% of total)',
    'Population ages 65 and above, male',
    'Population ages 65 and above, male (% of total)',
    'Population ages 10-14, male (% of male population)',
    'Population ages 65 and above, total',
    "Sex ratio at birth (male births per female births)",
]
health_df = health_df.drop(columns=unused_columns)

In [6]:
# Load the locally stored population-distribution table.
missing_df = load_data(population_distribution_file)
# The indicator name found in its first row becomes a new column of the health data,
# initialised to NaN for every row.
missing_col = missing_df['Indicator Name'].head(1).item()
health_df[missing_col] = np.nan

In [7]:
# Copy the indicator values from the wide table (one column per year) into the
# matching (country, year) rows of health_df.
#
# Columns from position 4 up to (but excluding) the last one are treated as year labels.
year_cols = list(missing_df.columns)[4:-1]
years = [int(i) for i in year_cols]

# Reshape wide -> long: one row per (country, year) with the indicator value.
indicator_long = missing_df.melt(id_vars=['Country Name'], value_vars=year_cols,
                                 var_name='Year', value_name='_indicator_value')
indicator_long['Year'] = indicator_long['Year'].map(int)
# A missing country name must never match anything, and if a country appears more
# than once the last occurrence wins (same outcome as overwriting row by row).
indicator_long = (indicator_long
                  .dropna(subset=['Country Name'])
                  .drop_duplicates(subset=['Country Name', 'Year'], keep='last'))

# Left merge keeps one output row per health_df row, in the same order, so the result
# can be written back positionally.
# NOTE(review): assumes health_df['Year'] holds integer years -- confirm against the source CSV.
aligned = health_df[['Country Name', 'Year']].merge(
    indicator_long, how='left', on=['Country Name', 'Year'], indicator=True)
has_match = (aligned['_merge'] == 'both').to_numpy()

# Only rows with a (country, year) match are written; all other rows keep their current value.
health_df.loc[has_match, missing_col] = aligned.loc[has_match, '_indicator_value'].to_numpy()

In [8]:
# Drop every row in which any of the per-sex age-bracket percentages below is exactly 0.
# The row count is printed before and after the filter.
print(len(health_df))
force_non_zero_cols = ['Population ages 15-19, female (% of female population)',
       'Population ages 15-19, male (% of male population)',
       'Population ages 20-24, female (% of female population)',
       'Population ages 20-24, male (% of male population)',
       'Population ages 25-29, female (% of female population)',
       'Population ages 25-29, male (% of male population)',
       'Population ages 30-34, female (% of female population)',
       'Population ages 30-34, male (% of male population)',
       'Population ages 35-39, female (% of female population)',
       'Population ages 35-39, male (% of male population)',
       'Population ages 40-44, female (% of female population)',
       'Population ages 40-44, male (% of male population)',
       'Population ages 45-49, female (% of female population)',
       'Population ages 45-49, male (% of male population)',
       'Population ages 50-54, female (% of female population)',
       'Population ages 50-54, male (% of male population)',
       'Population ages 55-59, female (% of female population)',
       'Population ages 55-59, male (% of male population)',
       'Population ages 60-64, female (% of female population)',
       'Population ages 60-64, male (% of male population)',
       'Population ages 65-69, female (% of female population)',
       'Population ages 65-69, male (% of male population)',
       'Population ages 70-74, female (% of female population)',
       'Population ages 70-74, male (% of male population)',
       'Population ages 75-79, female (% of female population)',
       'Population ages 75-79, male (% of male population)',
       'Population ages 80 and above, female (% of female population)',
       'Population ages 80 and above, male (% of male population)']
# A row survives only when all listed columns differ from 0.0 (NaN counts as non-zero).
every_col_non_zero = (health_df[force_non_zero_cols] != 0.0).all(axis=1)
health_df = health_df[every_col_non_zero]
print(len(health_df))

12607
11244


In [9]:
# Calculate, for each age range, the share of the total population from the
# per-sex shares:
#   total % = (male % * male population + female % * female population) / total population
age_ranges = ['15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 and above']

male_pop = health_df['Population, male']
female_pop = health_df['Population, female']
total_pop = health_df['Population, total']
# Rows where the total, male or female population is 0 are not computed and stay NaN.
has_population = ~((total_pop == 0) | (male_pop == 0) | (female_pop == 0))

# Whole-column arithmetic replaces the row-by-row iterrows()/.at loop.
for r in age_ranges:
    male_frac = health_df['Population ages {}, male (% of male population)'.format(r)]
    female_frac = health_df['Population ages {}, female (% of female population)'.format(r)]
    total_frac = (male_frac * male_pop + female_frac * female_pop) / total_pop
    health_df['Population ages {} (% of total population)'.format(r)] = total_frac.where(has_population)

In [10]:
# Save the preprocessed table as health.csv, leaving out the DataFrame index.
health_df.to_csv(path_or_buf="health.csv", index=False)