In [18]:
# NOTE: everything in this cell is commented out and kept for reference only; none of it runs.
# It records the column list of `gm` and a one-off export that wrote 'gapminder2.csv' after
# dropping every row whose values in all columns except 'year' repeat an earlier row.
# gm.columns
# Index(['country', 'year', 'region', 'population', 'life_expectancy',
#        'age5_surviving', 'babies_per_woman', 'gdp_per_capita'],
#       dtype='object')
# gm[~gm.duplicated(['country', 'region', 'population', 'life_expectancy',
#                    'age5_surviving', 'babies_per_woman', 'gdp_per_capita'])].to_csv('gapminder2.csv', index=False)

# 09_01: exploring data

In [1]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

Let's look at our dataset. For all the countries of the world, and for years starting in 1800, it shows us basic facts about life in those countries: the approximate population, the expected lifetime (i.e., the average age of death for all those born in that year), the percentage of children surviving to age 5, the average number of babies per woman, and the gross domestic product divided by population, i.e., the income available on average to each citizen each year (the unit is 2017-equivalent dollars).

The Gapminder website includes a number of beautiful and powerful visualizations ([browse them here](https://www.gapminder.org/tools/#$chart-type=bubbles&url=v2)). We will use Python to reproduce some of them.

In [79]:
# Read the gapminder table from the CSV file in the current working directory.
gm = pd.read_csv(filepath_or_buffer='gapminder2.csv')

In [80]:
# Show the loaded table; a bare name as the last expression of the cell renders the
# DataFrame (pandas truncates long frames to the first and last rows).
gm

Unnamed: 0,country,year,region,population,life_expectancy,age5_surviving,babies_per_woman,gdp_per_capita
0,Afghanistan,1800,Asia,3280000.0,28.2,53.10,7.00,599.0
1,Afghanistan,1806,Asia,3280000.0,28.1,53.00,7.00,599.0
2,Afghanistan,1816,Asia,3280000.0,28.1,52.90,7.00,599.0
3,Afghanistan,1817,Asia,3280000.0,28.0,52.90,7.00,599.0
4,Afghanistan,1820,Asia,3290000.0,28.0,52.90,7.00,599.0
...,...,...,...,...,...,...,...,...
41817,Zimbabwe,2019,Africa,15300000.0,61.0,95.56,3.75,2200.0
41818,Zimbabwe,2020,Africa,15500000.0,60.9,95.69,3.75,1990.0
41819,Zimbabwe,2021,Africa,15800000.0,59.0,95.80,3.77,2120.0
41820,Zimbabwe,2022,Africa,16100000.0,59.1,95.87,3.77,2140.0


In [61]:
# Reference only (commented out, not executed): read the five source tables from ../gdata and
# melt each one into long form with the columns 'country', 'year' and a single value column.
# (Presumably each raw file has one column per year -- TODO confirm against the files.)
# cm = pd.read_csv('../gdata/childmortality.csv').melt(id_vars=['country'], var_name='year', value_name='child_mortality')
# le = pd.read_csv('../gdata/lifeexp.csv').melt(id_vars=['country'], var_name='year', value_name='life_expectancy')
# fe = pd.read_csv('../gdata/fertility.csv').melt(id_vars=['country'], var_name='year', value_name='babies_per_woman')
# gd = pd.read_csv('../gdata/gdp_pcap.csv').melt(id_vars=['country'], var_name='year', value_name='gdp_per_capita')
# po = pd.read_csv('../gdata/population.csv').melt(id_vars=['country'], var_name='year', value_name='population')

In [62]:
# Reference only (commented out, not executed): country -> continent lookup as a pandas Series.
# After the mapping, 'North America' and 'South America' are both relabelled 'America', and the
# Series and its index are named 'region' and 'country' respectively.
# countries_to_continent = pd.Series({
#     'Afghanistan': 'Asia', 'Albania': 'Europe', 'Algeria': 'Africa', 'Andorra': 'Europe', 'Angola': 'Africa',
#     'Antigua and Barbuda': 'North America', 'Argentina': 'South America', 'Armenia': 'Asia', 'Australia': 'Oceania',
#     'Austria': 'Europe', 'Azerbaijan': 'Asia', 'Bahamas': 'North America', 'Bahrain': 'Asia', 'Bangladesh': 'Asia',
#     'Barbados': 'North America', 'Belarus': 'Europe', 'Belgium': 'Europe', 'Belize': 'North America', 'Benin': 'Africa',
#     'Bhutan': 'Asia', 'Bolivia': 'South America', 'Bosnia and Herzegovina': 'Europe', 'Botswana': 'Africa', 'Brazil': 'South America',
#     'Brunei': 'Asia', 'Bulgaria': 'Europe', 'Burkina Faso': 'Africa', 'Burundi': 'Africa', 'Cambodia': 'Asia',
#     'Cameroon': 'Africa', 'Canada': 'North America', 'Cape Verde': 'Africa', 'Central African Republic': 'Africa', 'Chad': 'Africa',
#     'Chile': 'South America', 'China': 'Asia', 'Colombia': 'South America', 'Comoros': 'Africa',
#     'Congo, Dem. Rep.': 'Africa', 'Congo, Rep.': 'Africa', 'Costa Rica': 'North America', "Cote d'Ivoire": 'Africa',
#     'Croatia': 'Europe', 'Cuba': 'North America', 'Cyprus': 'Asia', 'Czech Republic': 'Europe', 'Denmark': 'Europe',
#     'Djibouti': 'Africa', 'Dominica': 'North America', 'Dominican Republic': 'North America', 'Ecuador': 'South America', 'Egypt': 'Africa',
#     'El Salvador': 'North America', 'Equatorial Guinea': 'Africa', 'Eritrea': 'Africa', 'Estonia': 'Europe',
#     'Eswatini': 'Africa', 'Ethiopia': 'Africa', 'Fiji': 'Oceania', 'Finland': 'Europe', 'France': 'Europe', 'Gabon': 'Africa',
#     'Gambia': 'Africa', 'Georgia': 'Asia', 'Germany': 'Europe', 'Ghana': 'Africa', 'Greece': 'Europe', 'Grenada': 'North America',
#     'Guatemala': 'North America', 'Guinea': 'Africa', 'Guinea-Bissau': 'Africa', 'Guyana': 'South America', 'Haiti': 'North America',
#     'Holy See': 'Europe', 'Honduras': 'North America', 'Hong Kong, China': 'Asia', 'Hungary': 'Europe', 'Iceland': 'Europe',
#     'India': 'Asia', 'Indonesia': 'Asia', 'Iran': 'Asia', 'Iraq': 'Asia', 'Ireland': 'Europe', 'Israel': 'Asia', 'Italy': 'Europe',
#     'Jamaica': 'North America', 'Japan': 'Asia', 'Jordan': 'Asia', 'Kazakhstan': 'Asia', 'Kenya': 'Africa', 'Kiribati': 'Oceania',
#     'Kuwait': 'Asia', 'Kyrgyz Republic': 'Asia', 'Lao': 'Asia', 'Latvia': 'Europe', 'Lebanon': 'Asia', 'Lesotho': 'Africa',
#     'Liberia': 'Africa', 'Libya': 'Africa', 'Liechtenstein': 'Europe', 'Lithuania': 'Europe', 'Luxembourg': 'Europe',
#     'Madagascar': 'Africa', 'Malawi': 'Africa', 'Malaysia': 'Asia', 'Maldives': 'Asia', 'Mali': 'Africa', 'Malta': 'Europe',
#     'Marshall Islands': 'Oceania', 'Mauritania': 'Africa', 'Mauritius': 'Africa', 'Mexico': 'North America',
#     'Micronesia, Fed. Sts.': 'Oceania', 'Moldova': 'Europe', 'Monaco': 'Europe', 'Mongolia': 'Asia',
#     'Montenegro': 'Europe', 'Morocco': 'Africa', 'Mozambique': 'Africa', 'Myanmar': 'Asia', 'Namibia': 'Africa',
#     'Nauru': 'Oceania', 'Nepal': 'Asia', 'Netherlands': 'Europe', 'New Zealand': 'Oceania', 'Nicaragua': 'North America',
#     'Niger': 'Africa', 'Nigeria': 'Africa', 'North Korea': 'Asia', 'North Macedonia': 'Europe', 'Norway': 'Europe',
#     'Oman': 'Asia', 'Pakistan': 'Asia', 'Palau': 'Oceania', 'Palestine': 'Asia', 'Panama': 'North America',
#     'Papua New Guinea': 'Oceania', 'Paraguay': 'South America', 'Peru': 'South America', 'Philippines': 'Asia', 'Poland': 'Europe',
#     'Portugal': 'Europe', 'Qatar': 'Asia', 'Romania': 'Europe', 'Russia': 'Europe', 'Rwanda': 'Africa', 'Samoa': 'Oceania',
#     'San Marino': 'Europe', 'Sao Tome and Principe': 'Africa', 'Saudi Arabia': 'Asia', 'Senegal': 'Africa',
#     'Serbia': 'Europe', 'Seychelles': 'Africa', 'Sierra Leone': 'Africa', 'Singapore': 'Asia',
#     'Slovak Republic': 'Europe', 'Slovenia': 'Europe', 'Solomon Islands': 'Oceania', 'Somalia': 'Africa',
#     'South Africa': 'Africa', 'South Korea': 'Asia', 'South Sudan': 'Africa', 'Spain': 'Europe', 'Sri Lanka': 'Asia',
#     'St. Kitts and Nevis': 'North America', 'St. Lucia': 'North America',
#     'St. Vincent and the Grenadines': 'North America', 'Sudan': 'Africa', 'Suriname': 'South America', 'Sweden': 'Europe',
#     'Switzerland': 'Europe', 'Syria': 'Asia', 'Taiwan': 'Asia', 'Tajikistan': 'Asia', 'Tanzania': 'Africa',
#     'Thailand': 'Asia', 'Timor-Leste': 'Asia', 'Togo': 'Africa', 'Tonga': 'Oceania', 'Trinidad and Tobago': 'North America',
#     'Tunisia': 'Africa', 'Turkey': 'Asia', 'Turkmenistan': 'Asia', 'Tuvalu': 'Oceania', 'UAE': 'Asia', 'UK': 'Europe', 'USA': 'North America',
#     'Uganda': 'Africa', 'Ukraine': 'Europe', 'Uruguay': 'South America', 'Uzbekistan': 'Asia', 'Vanuatu': 'Oceania',
#     'Venezuela': 'South America', 'Vietnam': 'Asia', 'Yemen': 'Asia', 'Zambia': 'Africa', 'Zimbabwe': 'Africa'
# })

# countries = countries_to_continent.replace({'North America': 'America', 'South America': 'America'})

# countries.name = 'region'
# countries.index.name = 'country'


In [74]:
# Reference only (commented out, not executed): how 'gapminder2.csv' was assembled.
#  - merge the five long tables on their shared columns (pandas' default inner join), then attach
#    the 'region' Series by matching the 'country' column against its index
#  - derive age5_surviving as 0.1 * (1000 - child_mortality)
#    (assumes child_mortality is expressed per 1000 -- TODO confirm)
#  - convert gdp_per_capita strings with a 'k' or 'M' suffix, and population strings with a
#    'k', 'M' or 'B' suffix, to plain floats
#  - keep only years before 2024 and drop rows whose values in every column except 'year'
#    repeat an earlier row, then write the result without the index
# gm2 = cm.merge(le).merge(fe).merge(gd).merge(po).merge(countries, left_on='country', right_index=True)

# gm2['age5_surviving'] = 0.1 * (1000 - gm2['child_mortality'])

# gm2 = gm2.convert_dtypes(dtype_backend='pyarrow').sort_values(['country','year'])

# gm2 = gm2[['country', 'year', 'region', 'population', 'life_expectancy',
#            'age5_surviving', 'babies_per_woman', 'gdp_per_capita']].copy()

# gm2['year'] = gm2['year'].astype('int64[pyarrow]') 

# gm2['gdp_per_capita'] = gm2['gdp_per_capita'].apply(lambda v: float(v) if isinstance(v, int) else float(v[:-1]) * 1e3 if v[-1] == 'k' else
#                                                     float(v[:-1]) * 1e6 if v[-1] == 'M' else float(v))

# gm2['population'] = gm2['population'].apply(lambda v: float(v) if isinstance(v, int) else float(v[:-1]) * 1e3 if v[-1] == 'k' else
#                                                       float(v[:-1]) * 1e6 if v[-1] == 'M' else
#                                                       float(v[:-1]) * 1e9 if v[-1] == 'B' else float(v))
# gm2 = gm2[gm2.year < 2024]
# gm2 = gm2[~gm2.duplicated(['country', 'region', 'population', 'life_expectancy',
#                            'age5_surviving', 'babies_per_woman', 'gdp_per_capita'])]

# gm2.to_csv('gapminder2.csv', index=False)