# WPP: total population and fertility by age

## Parameters

In [1]:
# Directory handed to Dataset.create_empty below; the output dataset is created there.
dest_dir = '/tmp/wpp_2019_total_population'

## Walden

In [2]:
from owid import walden

In [3]:
# Look up the WPP 2019 "standard_projections" entry in the Walden catalog.
walden_catalog = walden.Catalog()
walden_ds = walden_catalog.find_one('wpp', '2019', 'standard_projections')

In [4]:
# Show the resolved catalog entry (source, URL, checksum, licence) for reference.
walden_ds

Dataset(namespace='wpp', short_name='standard_projections', name='UN World Population Prospects 2019 - Standard Projections', description='The 2019 Revision of World Population Prospects is the twenty-sixth round of official United Nations population estimates and projections that have been prepared by the Population Division of the Department of Economic and Social Affairs of the United Nations Secretariat.', source_name='United Nations', url='https://population.un.org/wpp2019/Download/Standard/Population/', date_accessed='2021-10-08', file_extension='zip', license_url='https://data.un.org/Host.aspx?Content=UNdataUse', source_data_url=None, md5='03884d5712560092a7f93469f5d048bb', publication_year=2019, publication_date=None, owid_data_url='https://walden.nyc3.digitaloceanspaces.com/wpp/2019/WPP2019.zip', license_name='CC-BY (custom)', access_notes=None)

## Unzip

In [5]:
import tempfile
import shutil

In [6]:
# Scratch directory that receives the extracted archive; it is removed again
# by the shutil.rmtree call in the final "Clean up" cell.
temp_dir = tempfile.mkdtemp()

In [7]:
import zipfile

In [8]:
# Unpack the Walden archive into the scratch directory. The context manager
# closes the archive's file handle as soon as extraction finishes, instead of
# leaving it open until the ZipFile object happens to be garbage collected.
with zipfile.ZipFile(walden_ds.local_path) as archive:
    archive.extractall(temp_dir)

In [9]:
# List what the archive contained (IPython shell escape; {temp_dir} is interpolated).
!ls {temp_dir}/WPP2019

WPP2019_Fertility_by_Age.csv
WPP2019_Life_Table_Medium.csv
WPP2019_Life_Table_OtherVariants.csv
WPP2019_Period_Indicators_Medium.csv
WPP2019_Period_Indicators_OtherVariants.csv
WPP2019_PopulationByAgeSex_Medium.csv
WPP2019_PopulationByAgeSex_OtherVariants.csv
WPP2019_PopulationBySingleAgeSex_1950-2019.csv
WPP2019_PopulationBySingleAgeSex_2020-2100.csv
WPP2019_TotalPopulationBySex.csv


## Make dataset

In [10]:
from owid.catalog import Dataset

In [11]:
# Create the output dataset at dest_dir; the tables built below are added to it.
ds = Dataset.create_empty(dest_dir)

## Add tables

In [12]:
from owid.catalog import Table
import pandas as pd

### Total population

In [13]:
# Load the total-population-by-sex CSV from the extracted archive.
total_population_csv = f'{temp_dir}/WPP2019/WPP2019_TotalPopulationBySex.csv'
df = pd.read_csv(total_population_csv)

In [14]:
# Preview the raw columns before they are renamed.
df.head()

Unnamed: 0,LocID,Location,VarID,Variant,Time,MidPeriod,PopMale,PopFemale,PopTotal,PopDensity
0,4,Afghanistan,2,Medium,1950,1950.5,4099.243,3652.874,7752.117,11.874
1,4,Afghanistan,2,Medium,1951,1951.5,4134.756,3705.395,7840.151,12.009
2,4,Afghanistan,2,Medium,1952,1952.5,4174.45,3761.546,7935.996,12.156
3,4,Afghanistan,2,Medium,1953,1953.5,4218.336,3821.348,8039.684,12.315
4,4,Afghanistan,2,Medium,1954,1954.5,4266.484,3884.832,8151.316,12.486


In [15]:
# Rename columns by their source name rather than by position, so that a change
# in the CSV's column order cannot silently attach the wrong label to a column.
# Re-running this cell is also harmless: already-renamed columns are left as is.
df = df.rename(columns={
    'LocID': 'loc_id',
    'Location': 'location',
    'VarID': 'var_id',
    'Variant': 'variant',
    'Time': 'year',
    'MidPeriod': 'mid_period',
    'PopMale': 'population_male',
    'PopFemale': 'population_female',
    'PopTotal': 'population_total',
    'PopDensity': 'population_density',
})

In [16]:
# Lookup table of location names keyed by their numeric location code.
location_lookup = df[['loc_id', 'location']].drop_duplicates().set_index('loc_id')
t = Table(location_lookup)
t.metadata.short_name = 'location_codes'
ds.add(t)

In [17]:
# Lookup table of variant names keyed by their numeric variant code.
variant_lookup = df[['var_id', 'variant']].drop_duplicates().set_index('var_id')
t = Table(variant_lookup)
t.metadata.short_name = 'variant_codes'
ds.add(t)

In [18]:
# The numeric codes were exported to their own lookup tables above, so the
# main frame only keeps the human-readable names.
df = df.drop(columns=['loc_id', 'var_id'])

In [38]:
# Store the repetitive string columns as pandas categoricals.
df = df.astype({'location': 'category', 'variant': 'category'})

In [19]:
# Key each row by (variant, location, year).
index_columns = ['variant', 'location', 'year']
df = df.set_index(index_columns)

In [20]:
# Display the indexed frame (pandas truncates the rendered output).
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mid_period,population_male,population_female,population_total,population_density
variant,location,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Medium,Afghanistan,1950,1950.5,4099.243,3652.874,7752.117,11.874
Medium,Afghanistan,1951,1951.5,4134.756,3705.395,7840.151,12.009
Medium,Afghanistan,1952,1952.5,4174.450,3761.546,7935.996,12.156
Medium,Afghanistan,1953,1953.5,4218.336,3821.348,8039.684,12.315
Medium,Afghanistan,1954,1954.5,4266.484,3884.832,8151.316,12.486
...,...,...,...,...,...,...,...
Lower 95 PI,Zimbabwe,2080,2080.5,10576.533,11255.983,21836.893,56.448
Lower 95 PI,Zimbabwe,2085,2085.5,10293.349,11050.875,21355.988,55.205
Lower 95 PI,Zimbabwe,2090,2090.5,9920.336,10767.709,20689.956,53.483
Lower 95 PI,Zimbabwe,2095,2095.5,9503.711,10412.184,19892.080,51.421


In [21]:
# First index level = the distinct values of `variant` present in the data.
df.index.levels[0]

Index(['Constant fertility', 'Constant mortality', 'High',
       'Instant replacement', 'Low', 'Lower 80 PI', 'Lower 95 PI', 'Median PI',
       'Medium', 'Momentum', 'No change', 'Upper 80 PI', 'Upper 95 PI',
       'Zero migration'],
      dtype='object', name='variant')

In [22]:
# Wrap the indexed frame in a catalog Table, name it 'total_population',
# and add it to the output dataset.
t = Table(df)
t.metadata.short_name = 'total_population'
ds.add(t)

### Fertility by age

In [23]:
# Load the fertility-by-age CSV from the extracted archive.
fertility_csv = f'{temp_dir}/WPP2019/WPP2019_Fertility_by_Age.csv'
df = pd.read_csv(fertility_csv)

In [24]:
# Preview the raw columns before the code columns are dropped and the rest renamed.
df.head()

Unnamed: 0,LocID,Location,VarID,Variant,Time,MidPeriod,AgeGrp,AgeGrpStart,AgeGrpSpan,ASFR,PASFR,Births
0,4,Afghanistan,2,Medium,1950-1955,1953,15-19,15,5,145.245,9.748,280.906
1,4,Afghanistan,2,Medium,1950-1955,1953,20-24,20,5,313.809,21.061,522.501
2,4,Afghanistan,2,Medium,1950-1955,1953,25-29,25,5,339.005,22.752,481.946
3,4,Afghanistan,2,Medium,1950-1955,1953,30-34,30,5,290.535,19.499,351.097
4,4,Afghanistan,2,Medium,1950-1955,1953,35-39,35,5,233.111,15.645,238.645


In [25]:
# Discard the numeric location/variant codes; only the names are kept in this table.
df = df.drop(columns=['LocID', 'VarID'])

In [30]:
# Rename columns by their source name rather than by position, so that a change
# in the CSV's column order cannot silently attach the wrong label to a column.
# Re-running this cell is also harmless: already-renamed columns are left as is.
df = df.rename(columns={
    'Location': 'location',
    'Variant': 'variant',
    'Time': 'year_range',
    'MidPeriod': 'mid_period',
    'AgeGrp': 'age_group',
    'AgeGrpStart': 'age_group_start',
    'AgeGrpSpan': 'age_group_span',
    'ASFR': 'asfr',
    'PASFR': 'pasfr',
    'Births': 'births',
})

In [31]:
# Check the renamed columns.
df.head()

Unnamed: 0,location,variant,year_range,mid_period,age_group,age_group_start,age_group_span,asfr,pasfr,births
0,Afghanistan,Medium,1950-1955,1953,15-19,15,5,145.245,9.748,280.906
1,Afghanistan,Medium,1950-1955,1953,20-24,20,5,313.809,21.061,522.501
2,Afghanistan,Medium,1950-1955,1953,25-29,25,5,339.005,22.752,481.946
3,Afghanistan,Medium,1950-1955,1953,30-34,30,5,290.535,19.499,351.097
4,Afghanistan,Medium,1950-1955,1953,35-39,35,5,233.111,15.645,238.645


In [38]:
# Store the repetitive string columns as pandas categoricals.
df = df.astype({
    'location': 'category',
    'variant': 'category',
    'year_range': 'category',
    'age_group': 'category',
})

In [33]:
# Key each row by (variant, location, year_range, age_group).
index_columns = ['variant', 'location', 'year_range', 'age_group']
df = df.set_index(index_columns)

In [41]:
# Wrap the indexed frame in a catalog Table, name it 'fertility_by_age',
# and add it to the output dataset.
t = Table(df)
t.metadata.short_name = 'fertility_by_age'
ds.add(t)

## Clean up

In [42]:
# Remove the scratch directory created with tempfile.mkdtemp, including the extracted files.
shutil.rmtree(temp_dir)