In [1]:
%matplotlib inline

from pathlib import Path
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from unidecode import unidecode

# Absolute, normalised path of the ``data/external`` directory that sits one
# level above the current working directory.
DATA_EXT = Path.cwd().joinpath(os.pardir, 'data', 'external').resolve()

In [2]:
import sys
# Put the ``src`` directory (a sibling of the working directory's parent) on
# the import path; presumably this is what lets the project-local imports
# that follow resolve -- confirm against the repository layout.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

%load_ext autoreload

%autoreload 2
from visualization.visualize import *
from data.geo_utils import canonicalize_dataframe_geographies

### DEPRECATED: This notebook loads the raw NASA GPW v4.10 data and then writes the estimated population at the admin2 and country levels to `interim`.

In [3]:
# Load the raw NASA GPW v4 (rev 10) admin-unit centre-point population
# estimates from the external data directory.
#
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the truncated warning left in this
# cell's saved output looks like the mixed-type DtypeWarning that chunked
# inference emits -- confirm, and prefer an explicit `dtype=` mapping if the
# extra memory use of a single-pass read is a concern.
df = pd.read_csv(
    DATA_EXT / 'nasa-gpw' / 'gpw_v4_admin_unit_center_points_population_estimates_rev10_global.csv',
    low_memory=False,
)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,GUBID,ISOALPHA,COUNTRYNM,NAME1,NAME2,NAME3,NAME4,NAME5,NAME6,CENTROID_X,...,A60_64M,A65PLUSM,A65_69M,A70PLUSM,A70_74M,A75PLUSM,A75_79M,A80PLUSM,A80_84M,A85PLUSM
0,{F1A487E2-8EA6-4934-AE6E-F3B7BF0A6DAE},ABW,Aruba,Paradera,Piedra Plat,,,,,-69.991539,...,57.419963,115.923321,52.002985,63.920336,30.335075,33.585261,19.501119,14.084142,7.583769,6.500373
1,{344FA064-8E7A-4884-A9F9-3824164F34AE},ABW,Aruba,San Nicolas North,Rooi Congo,,,,,-69.913123,...,56.343235,121.354661,46.591522,74.763139,36.839808,37.923332,20.586951,17.33638,13.002285,4.334095
2,{0E49304A-B532-4842-9E47-540C5D5DFFBD},ABW,Aruba,San Nicolas South,Seroe Colorado,,,,,-69.877344,...,4.329681,15.153885,4.329681,10.824204,5.412102,5.412102,4.329681,1.08242,0.0,1.08242
3,{89AD49DF-E7F8-43B0-A2F1-4CB018B87CA4},ABW,Aruba,San Nicolas South,Pastoor Hendriksstraat,,,,,-69.907521,...,29.22535,73.604585,24.895669,48.708917,12.989044,35.719872,11.906624,23.813248,10.824204,12.989044
4,{DF116CBC-DDCD-4A53-9B8D-81C65DE10BE0},ABW,Aruba,San Nicolas South,San Nicolas South Other,,,,,-69.901461,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Sum the `UN_2015_E` column (presumably the UN-adjusted 2015 population
# estimate -- confirm against the GPW documentation) within every
# (country code, admin1 name, admin2 name) combination.
admin2_keys = ['ISOALPHA', 'NAME1', 'NAME2']
admin_2_populations = df.groupby(admin2_keys)['UN_2015_E'].sum()
admin_2_populations

ISOALPHA  NAME1                  NAME2                    
ABW       Noord/ Tanki Leendert  Alto Vista                     5503.0
                                 Moko/Tanki Flip                4071.0
                                 Noord Other                       0.0
                                 Palm Beach/Malmok              5499.0
                                 Tanki Leendert                 3958.0
                                 Washington                     4163.0
          Oranjestad East        Dacota/Potrero                 2480.0
                                 Klip/Mon Plaisir               1240.0
                                 Nassaustraat                    714.0
                                 Oranjestad East Other             0.0
                                 Sabana Blanco/Mahuma           3000.0
                                 Seroe Blanco/Cumana            2553.0
                                 Simeon Antonio                 1001.0
                  

In [5]:
# Country-level totals: sum `UN_2015_E` over all rows sharing an ISO alpha code.
country_populations = df.groupby('ISOALPHA')['UN_2015_E'].sum()

In [6]:
# Persist the admin2-level population totals as CSV.
admin_population_csv = Path("../data/interim/external-processed/admin_population_nasa.csv")
# to_csv does not create missing parent directories, so make sure the output
# folder exists (a no-op when it is already there).
admin_population_csv.parent.mkdir(parents=True, exist_ok=True)
admin_2_populations.to_frame().to_csv(admin_population_csv)

In [7]:
# Persist the country-level population totals as CSV.
# NOTE(review): the file name is misspelt ("counrty"); it is kept as-is because
# other code may read this exact path -- rename both sides together if at all.
country_population_csv = Path("../data/interim/external-processed/counrty_population_nasa.csv")
# to_csv does not create missing parent directories, so make sure the output
# folder exists (a no-op when it is already there).
country_population_csv.parent.mkdir(parents=True, exist_ok=True)
country_populations.to_frame().to_csv(country_population_csv)

Now that `admin_2_populations` holds the estimates from NASA, we need to match these to the regions in the WHO dataset.

# Load immunization data

The immunization data is from the WHO. We subset it to just the following columns:
 - Iso Code (for a country)
 - Admin 1 (the first level administrative region)
 - Admin 2 (the second level administrative region)
 - `available_admin`: calculated to be one or both of Admin1/Admin2, depending on what is available
 - Denominator: The number of children that could potentially be vaccinated

In [8]:
# Read the interim WHO immunization table; its first CSV column is the index.
who_df = pd.read_csv(Path('../data/interim/calc_cols_added.csv'), index_col=0)

# Keep only the geography identifiers plus the denominator, and collapse
# rows that repeat the same combination of those five values.
denom_columns = ['Iso Code', 'Admin1', 'Admin2', 'available_admin', 'Denominator']
denoms = who_df.loc[:, denom_columns].drop_duplicates()

print(denoms.shape)
denoms.head()

(38259, 5)


Unnamed: 0,Iso Code,Admin1,Admin2,available_admin,Denominator
0,AFG,,Aab Band,Aab Band,1266.0
1,AFG,,Aab Kamari,Aab Kamari,4599.0
2,AFG,,Aaqcha,Aaqcha,5674.0
3,AFG,,Acheen,Acheen,4846.0
4,AFG,,Adraskan,Adraskan,3557.0


In [9]:
# Turn the grouped Series (indexed by ISOALPHA / NAME1 / NAME2) into a flat
# DataFrame whose index levels become ordinary columns.
print(admin_2_populations.shape)
# Guard on the type so the cell is safe to re-run: once the name is bound to a
# DataFrame, a second `.to_frame()` call would raise AttributeError.
if isinstance(admin_2_populations, pd.Series):
    admin_2_populations = admin_2_populations.to_frame().reset_index()
admin_2_populations.head()

(42230,)


Unnamed: 0,ISOALPHA,NAME1,NAME2,UN_2015_E
0,ABW,Noord/ Tanki Leendert,Alto Vista,5503.0
1,ABW,Noord/ Tanki Leendert,Moko/Tanki Flip,4071.0
2,ABW,Noord/ Tanki Leendert,Noord Other,0.0
3,ABW,Noord/ Tanki Leendert,Palm Beach/Malmok,5499.0
4,ABW,Noord/ Tanki Leendert,Tanki Leendert,3958.0


In [19]:
# Call out to the project's geocoding helper (backed by Google) for the first
# 200 denominator rows only.
res = canonicalize_dataframe_geographies(denoms.head(200), 'Iso Code', 'Admin1', 'Admin2', return_failures=True)

# The helper may hand back either a (successes, failures) tuple or just the
# successes. Bind `bad` in both cases so a later reference cannot hit a
# NameError when no failures are returned.
if isinstance(res, tuple):
    good, bad = res
else:
    good, bad = res, None

good

Unnamed: 0,admin1,admin2,country
0,Ghazni,Ab Band,Afghanistan
1,Badghis,Ab Kamari,Afghanistan
2,Jowzjan,Mingajik,Afghanistan
3,Nangarhar,Achin,Afghanistan
4,Herat,Adraskan,Afghanistan
5,Herat,Kohsan,Afghanistan
6,Ghazni,Ajristan,Afghanistan
7,Kapisa,Alasay,Afghanistan
8,Laghman,Alingar,Afghanistan
9,Kunduz,Chahar Dara,Afghanistan


In [15]:
# Pretty-print the geocoding query cache JSON file (needs the `jq`
# command-line tool on the PATH).
!cat ../data/interim/geo_query_cache.json | jq .

[1;39m{
  [0m[34;1m"Aab Band  Afghanistan"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"admin2"[0m[1;39m: [0m[0;32m"Ab Band"[0m[1;39m,
    [0m[34;1m"admin1"[0m[1;39m: [0m[0;32m"Ghazni"[0m[1;39m,
    [0m[34;1m"country"[0m[1;39m: [0m[0;32m"Afghanistan"[0m[1;39m
  [1;39m}[0m[1;39m,
  [0m[34;1m"Aab Kamari  Afghanistan"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"admin2"[0m[1;39m: [0m[0;32m"Ab Kamari"[0m[1;39m,
    [0m[34;1m"admin1"[0m[1;39m: [0m[0;32m"Badghis"[0m[1;39m,
    [0m[34;1m"country"[0m[1;39m: [0m[0;32m"Afghanistan"[0m[1;39m
  [1;39m}[0m[1;39m,
  [0m[34;1m"Aaqcha  Afghanistan"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"admin2"[0m[1;39m: [0m[0;32m"Mingajik"[0m[1;39m,
    [0m[34;1m"admin1"[0m[1;39m: [0m[0;32m"Jowzjan"[0m[1;39m,
    [0m[34;1m"country"[0m[1;39m: [0m[0;32m"Afghanistan"[0m[1;39m
  [1;39m}[0m[1;39m,
  [0m[34;1m"Acheen  Afghanistan"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"admin2"

In [23]:
# Delete the geocoding query cache file. NOTE(review): presumably this forces
# the next canonicalization run to query the geocoder afresh -- confirm before
# running, since the cached results are lost.
!rm ../data/interim/geo_query_cache.json