In [3]:
import pandas as pd
from pathlib import Path

In [4]:
DATA_DIR = Path('../data/raw')

In [3]:
world_data = pd.read_csv('../data/processed/final-assignment-gapminder.csv')

In [22]:
def process_raw_csv(filename, value_name):
    dt = pd.read_csv(filename)
    if 'country' not in dt.columns:
        try:
            dt = dt.rename(columns={'geo': 'country'})
        except:
            print("no geo column")
            print(dt.columns)
    dtMlt = dt.melt(id_vars='country', var_name='year', value_name=value_name)
    dtMlt['year'] = dtMlt['year'].astype(int)
    dtMlt = dtMlt.loc[dtMlt['year'] > 1950] # exlude pre1950
    print(dtMlt.head(2))
    return dtMlt

In [23]:
pop_melt = process_raw_csv(DATA_DIR / 'population_total.csv', 'population')

           country  year  population
29445  Afghanistan  1951     7840000
29446      Albania  1951     1290000


In [24]:
malnutrition = process_raw_csv(DATA_DIR / 'malnutrition_weight_for_age_percent_of_children_under_5.csv', 'malnutrition_children')

       country  year  malnutrition_children
0  Afghanistan  1983                    NaN
1      Albania  1983                    NaN


In [54]:
# percent of children under 5 years old that are 2 stdev under average weight
underweight = process_raw_csv(DATA_DIR / 'underweight_children.csv', 'underweight_children')

       country  year  underweight_children
0  Afghanistan  1970                   NaN
1      Albania  1970                   NaN


In [25]:
children_born = process_raw_csv(DATA_DIR / 'children_per_woman_total_fertility.csv', 'children_per_woman')

           country  year  children_per_woman
27784  Afghanistan  1951                7.56
27785      Albania  1951                5.97


In [26]:
# child mortality = 0-5 years olds deaths per 1000 newborns
child_mortality = process_raw_csv(DATA_DIR / 'child_mortality_0_5_year_olds_dying_per_1000_born.csv', 'child_mortality')

           country  year  child_mortality
29143  Afghanistan  1951            421.0
29144      Albania  1951            269.0


In [27]:
# Population density
pop_dens =  process_raw_csv(DATA_DIR / 'population_density_per_square_km.csv', 'pop_density')

         country  year  pop_density
195  Afghanistan  1951         12.0
196      Albania  1951         47.0


In [28]:
# Life expectancy
life_exp = process_raw_csv(DATA_DIR / 'life_expectancy_years.csv', 'life_expectancy')

           country  year  life_expectancy
28237  Afghanistan  1951             32.4
28238      Albania  1951             54.7


In [29]:
# Income per person GDP per capita
income = process_raw_csv(DATA_DIR / 'income_per_person_gdppercapita_ppp_inflation_adjusted.csv', 'income')

           country  year  income
29143  Afghanistan  1951    1060
29144      Albania  1951    2000


In [30]:
# Education men
education_men = process_raw_csv(DATA_DIR / 'mean_years_in_school_men_25_to_34_years.csv', 'years_in_school_men')

       country  year  years_in_school_men
0  Afghanistan  1970                 1.36
1      Albania  1970                 6.10


In [31]:
# Education women
education_women = process_raw_csv(DATA_DIR / 'mean_years_in_school_women_25_to_34_years.csv', 'years_in_school_women')

       country  year  years_in_school_women
0  Afghanistan  1970                   0.21
1      Albania  1970                   5.33


In [32]:
# Education percent women/men
education_women_men = process_raw_csv(DATA_DIR / 'mean_years_in_school_women_percent_men_25_to_34_years.csv', 'education_gender_ratio')

       country  year  education_gender_ratio
0  Afghanistan  1970                    15.4
1      Albania  1970                    87.4


In [33]:
# Blood pressure mmhg (men)
bp_men = process_raw_csv(DATA_DIR / 'blood_pressure_sbp_men_mmhg.csv', 'bp_men')

       country  year  bp_men
0  Afghanistan  1980     125
1      Albania  1980     133


In [34]:
# Blood pressure mmhg (women)
bp_women = process_raw_csv(DATA_DIR / 'blood_pressure_sbp_women_mmhg.csv', 'bp_women')

       country  year  bp_women
0  Afghanistan  1980       122
1      Albania  1980       132


In [35]:
health_spending_pp = process_raw_csv(DATA_DIR / 'government_health_spending_per_person_us.csv', 'gov_health_spending')

       country  year  gov_health_spending
0  Afghanistan  1995                  NaN
1      Albania  1995                 13.9


In [36]:
# Total health spending as percent of GDP
health_budget = process_raw_csv(DATA_DIR / 'total_health_spending_percent_of_gdp.csv', 'gov_health_spending_ratio')

       country  year  gov_health_spending_ratio
0  Afghanistan  1995                        NaN
1      Albania  1995                       2.56


In [37]:
# maternal deaths within 42 days of birth, total number
maternal_deaths_total = process_raw_csv(DATA_DIR / 'maternal_deaths_total_number.csv',
                                      'maternal_deaths_total')

           country  year  maternal_deaths_total
27784  Afghanistan  1951                    NaN
27785      Albania  1951                    NaN


In [38]:
# maternal deaths within 42 days of birth, ratio per 100,000 live births
maternal_mortality_ratio = process_raw_csv(DATA_DIR / 'maternal_mortality_ratio_per_100000_live_births.csv', 'maternal_mortality_ratio')

           country  year  maternal_mortality_ratio
28237  Afghanistan  1951                       NaN
28238      Albania  1951                       NaN


In [39]:
# percent of births attended to by skilled health professionals
births_skilled = process_raw_csv(DATA_DIR / 'births_attended_by_skilled_health_staff_percent_of_total.csv',
                                 'births_by_health_staff')

       country  year  births_by_health_staff
0  Afghanistan  1984                     NaN
1      Albania  1984                     NaN


In [40]:
# maternal deaths within 42 days of birth, total number
contraceptive_use = process_raw_csv(DATA_DIR / 'births_attended_by_skilled_health_staff_percent_of_total.csv',
                                    'contraceptive_use_percent_women')

       country  year  contraceptive_use_percent_women
0  Afghanistan  1984                              NaN
1      Albania  1984                              NaN


In [41]:
# sugar (g) per person per day
sugar_intake = process_raw_csv(DATA_DIR / 'sugar_per_person_g_per_day.csv', 'sugar_intake_daily')

       country  year  sugar_intake_daily
0  Afghanistan  1961                14.4
1      Albania  1961                31.0


In [42]:
# Income groups world bank 2018
# temp_df['Income group'] = temp_df['Income group'].str.rpartition()[0]
# temp_df.drop('X', axis=1).to_csv('../data/raw/income-group.csv', index=False)
income_group = pd.read_csv('../data/raw/income-group.csv')
income_group = income_group.rename(columns={'Economy': 'country', 'Income group': 'income_group'})
income_group.head()

Unnamed: 0,country,Code,Region,income_group
0,Afghanistan,AFG,South Asia,Low
1,Albania,ALB,Europe & Central Asia,Upper middle
2,Algeria,DZA,Middle East & North Africa,Upper middle
3,American Samoa,ASM,East Asia & Pacific,Upper middle
4,Andorra,AND,Europe & Central Asia,High


In [43]:
country_regions = pd.read_csv('../data/raw/countries-regions.csv')
country_regions = country_regions.rename(columns={'name': 'country', 'sub-region': 'sub_region'})
country_regions.head()

Unnamed: 0,country,alpha-2,alpha-3,country-code,iso_3166-2,region,sub_region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


Fix country names to be the same from different sources... yawn...

In [44]:
health_budget['country'].unique().shape

(190,)

In [46]:
income_group['country'].shape

(218,)

In [47]:
pop_melt['country'].unique().shape

(195,)

## Normalize names of the population regions data frame to the melted data frames

In [48]:
pop_melt.loc[pop_melt['year'] == 2018, ['country', 'population']].loc[~pop_melt['country'].isin(country_regions['country'])]

Unnamed: 0,country,population
42530,Bolivia,11200000
42534,Brunei,434000
42541,Cape Verde,553000
42548,"Congo, Dem. Rep.",84000000
42549,"Congo, Rep.",5400000
42551,Cote d'Ivoire,24900000
42555,Czech Republic,10600000
42588,Iran,82000000
42600,Kyrgyz Republic,6130000
42601,Lao,6960000


In [49]:
# country_regions.loc[country_regions['country'] == 'Bahamas', 'country'] = 'Bahamas, The'
country_regions.loc[country_regions['country'].str.startswith('Bolivia'), 'country'] = 'Bolivia'
country_regions.loc[country_regions['country'].str.startswith('Congo (Dem'), 'country'] = 'Congo, Dem. Rep.'
country_regions.loc[country_regions['country'] == 'Congo', 'country'] = 'Congo, Rep.'
country_regions.loc[country_regions['country'].str.startswith('Czhechia'), 'country'] = 'Czech Republic'
country_regions.loc[country_regions['country'].str.startswith('Czechia'), 'country'] = 'Czech Republic'
country_regions.loc[country_regions['country'].str.startswith('Iran'), 'country'] = 'Iran'
country_regions.loc[country_regions['country'].str.startswith('Kyrgyzstan'), 'country'] = 'Kyrgyz Republic'
country_regions.loc[country_regions['country'].str.startswith('Korea (Rep'), 'country'] = 'South Korea'
country_regions.loc[country_regions['country'].str.startswith('Korea (Dem'), 'country'] = 'North Korea'
country_regions.loc[country_regions['country'].str.startswith('Macedonia'), 'country'] = 'Macedonia, FYR'
country_regions.loc[country_regions['country'].str.startswith('Moldova'), 'country'] = 'Moldova'
country_regions.loc[country_regions['country'].str.startswith('Slovakia'), 'country'] = 'Slovak Republic'
country_regions.loc[country_regions['country'].str.startswith('Russian'), 'country'] = 'Russia'
country_regions.loc[country_regions['country'].str.startswith('Palestine'), 'country'] = 'Palestine'
country_regions.loc[country_regions['country'].str.startswith('Syria'), 'country'] = 'Syria'
country_regions.loc[country_regions['country'].str.startswith('United Kingdom'), 'country'] = 'United Kingdom'
country_regions.loc[country_regions['country'].str.startswith('United States'), 'country'] = 'United States'
country_regions.loc[country_regions['country'].str.startswith('Venezuela'), 'country'] = 'Venezuela'
country_regions.loc[country_regions['country'].str.startswith('Viet Nam'), 'country'] = 'Vietnam'
country_regions.loc[country_regions['country'].str.startswith('Tanzania'), 'country'] = 'Tanzania'
country_regions.loc[country_regions['country'].str.startswith('Lao'), 'country'] = 'Lao'
country_regions.loc[country_regions['country'].str.startswith('Côte '), 'country'] = "Cote d'Ivoire"
country_regions.loc[country_regions['country'].str.startswith('Eswatini'), 'country'] = 'Swaziland'

In [51]:
country_regions.loc[~country_regions['country'].isin(pop_melt['country'])].head()

Unnamed: 0,country,alpha-2,alpha-3,country-code,iso_3166-2,region,sub_region,intermediate-region,region-code,sub-region-code,intermediate-region-code
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,
7,Anguilla,AI,AIA,660,ISO 3166-2:AI,Americas,Latin America and the Caribbean,Caribbean,19.0,419.0,29.0
8,Antarctica,AQ,ATA,10,ISO 3166-2:AQ,,,,,,
12,Aruba,AW,ABW,533,ISO 3166-2:AW,Americas,Latin America and the Caribbean,Caribbean,19.0,419.0,29.0


## Merge data frames

In [55]:
world_data = pd.merge(pop_melt, country_regions[['country', 'region', 'sub_region']], on='country')
world_data = pd.merge(world_data, income_group[['country', 'income_group']], on=['country'])
world_data = pd.merge(world_data, life_exp, on=['country', 'year'])
world_data = pd.merge(world_data, income, on=['country', 'year'])
world_data = pd.merge(world_data, children_born, on=['country', 'year'])
world_data = pd.merge(world_data, child_mortality, on=['country', 'year'])
world_data = pd.merge(world_data, pop_dens, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, education_men, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, education_women_men, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, education_women, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, bp_men, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, bp_women, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, sugar_intake, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, health_budget, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, health_spending_pp, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, maternal_deaths_total, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, maternal_mortality_ratio, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, contraceptive_use, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, births_skilled, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, malnutrition, on=['country', 'year'], how='left')
world_data = pd.merge(world_data, underweight, on=['country', 'year'], how='left')
world_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11288 entries, 0 to 11287
Data columns (total 25 columns):
country                            11288 non-null object
year                               11288 non-null int64
population                         11288 non-null int64
region                             11288 non-null object
sub_region                         11288 non-null object
income_group                       11288 non-null object
life_expectancy                    11288 non-null float64
income                             11288 non-null int64
children_per_woman                 11288 non-null float64
child_mortality                    11286 non-null float64
pop_density                        11288 non-null float64
years_in_school_men                7636 non-null float64
education_gender_ratio             7636 non-null float64
years_in_school_women              7636 non-null float64
bp_men                             4785 non-null float64
bp_women                           

In [56]:
world_data.to_csv('../data/processed/final-assignment-gapminder.csv', index=False)