In [1]:
import os
import json
import pandas as pd
from glob import glob
from tqdm import tqdm
from pathlib import Path
from joblib import Parallel, delayed
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

True

In [2]:
API_KEY = os.environ.get('GMAPS_KEY', '')

In [3]:
def standardize_columns(df):
    df.columns = ['admin2', 'admin1', 'alpha-3']
    df.loc[:, 'alpha-3'] = df['alpha-3'].str.strip()
    return df

def add_country_names(df, iso):
    wanted = iso[['alpha-2', 'name']].rename(columns={'name': 'country'})
    df = df.join(wanted, on='alpha-3')
    return df.rename(columns=lambda c: c.replace('-', ''))

def add_provenance(df, name):
    df = df.reset_index().rename(columns={'index': 'idx'})
    df['provenance'] = name
    df['provenance'] = df.provenance + df.idx.astype(str)
    return df.drop('idx', axis=1).set_index('provenance')

def to_strings(df):
    series = df.apply(_join_row, axis=1)
    return series.drop_duplicates().values.tolist()

def get_search_string(row):
    print(row)
    return ', '.join([row['admin1'], row['admin2']])

In [4]:
ISO = Path('..')/'data'/'external'/'iso-codes.csv'
# this is tricksy because Namibia's alpha-2 is "NA" which gets read as NaN by default
iso = pd.read_csv(ISO, na_values=[], keep_default_na=False).set_index('alpha-3')

WHO = Path('..')/'data'/'interim'/'calc_cols_added.csv'
who = pd.read_csv(WHO, index_col=0).pipe(add_provenance, 'who')
vac = (who[['Admin1', 'Admin2', 'Iso Code']]
       .fillna('')
       .pipe(standardize_columns)
       .pipe(add_country_names, iso)
       .drop_duplicates())

NASA = Path('..')/'data'/'interim'/'external-processed/admin_population_nasa.csv'
nasa = pd.read_csv(NASA).pipe(add_provenance, 'nasa')
pop = (nasa[['NAME1', 'NAME2', 'ISOALPHA']]
       .fillna('')
       .pipe(standardize_columns)
       .pipe(add_country_names, iso)
       .drop_duplicates())

In [5]:
pop.size, vac.size

(211150, 114435)

In [6]:
df = pd.concat([vac, pop]).sort_values(['country', 'admin1', 'admin2'])
df.head(20)

Unnamed: 0_level_0,admin2,admin1,alpha3,alpha2,country
provenance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
who0,,Aab Band,AFG,AF,Afghanistan
who1,,Aab Kamari,AFG,AF,Afghanistan
who2,,Aaqcha,AFG,AF,Afghanistan
nasa162,Ghazni,Ab Band,AFG,AF,Afghanistan
nasa83,Badghis,Ab Kamari,AFG,AF,Afghanistan
who3,,Acheen,AFG,AF,Afghanistan
nasa325,Nangarhar,Acheen,AFG,AF,Afghanistan
who4,,Adraskan,AFG,AF,Afghanistan
nasa204,Hirat,Adraskan,AFG,AF,Afghanistan
nasa380,Paktya,Ahmad Abad,AFG,AF,Afghanistan


In [7]:
!http get https://maps.googleapis.com/maps/api/geocode/json key=="$API_KEY" address=="Hirat, Adraskan" components=="country:AF"

[34mHTTP[39;49;00m/[34m1.1[39;49;00m [34m200[39;49;00m [36mOK[39;49;00m
[36mAccess-Control-Allow-Origin[39;49;00m: [33m*[39;49;00m
[36mAlt-Svc[39;49;00m: [33mhq=":443"; ma=2592000; quic=51303431; quic=51303339; quic=51303338; quic=51303337; quic=51303335,quic=":443"; ma=2592000; v="41,39,38,37,35"[39;49;00m
[36mCache-Control[39;49;00m: [33mpublic, max-age=86400[39;49;00m
[36mContent-Encoding[39;49;00m: [33mgzip[39;49;00m
[36mContent-Length[39;49;00m: [33m428[39;49;00m
[36mContent-Type[39;49;00m: [33mapplication/json; charset=UTF-8[39;49;00m
[36mDate[39;49;00m: [33mMon, 05 Mar 2018 23:38:53 GMT[39;49;00m
[36mExpires[39;49;00m: [33mTue, 06 Mar 2018 23:38:53 GMT[39;49;00m
[36mServer[39;49;00m: [33mmafe[39;49;00m
[36mVary[39;49;00m: [33mAccept-Language[39;49;00m
[36mX-Frame-Options[39;49;00m: [33mSAMEORIGIN[39;49;00m
[36mX-XSS-Protection[39;49;00m: [33m1; mode=block[39;49;00m

{
    [34;01m"results"[39;49;00m: [
        {
          

In [8]:
OUTPUT_DIR = Path('..')/'data'/'geocode'
#geocoder = GoogleV3(api_key=API_KEY)

def _geocode(search_string, alpha2, filename):
    !http get https://maps.googleapis.com/maps/api/geocode/json key=="$API_KEY" address=="$search_string" components=="country:$alpha2" > $filename

def get_filename(index):
    return OUTPUT_DIR/f"{index}.json"

def geocode_if_not_already_here(row):
    index, admin2, admin1, _, alpha2, _ = row
    filename = get_filename(index)
    if os.path.exists(filename):
        return False
    search_string = f"{admin2}, {admin1}" if admin2 else admin1
    _geocode(search_string, alpha2, filename)
    return True

In [9]:
rows = list(df.itertuples(name=None))
r = Parallel(n_jobs=26, verbose=5)(delayed(geocode_if_not_already_here)(row) for row in rows)

[Parallel(n_jobs=26)]: Done  20 tasks      | elapsed:    0.0s


/bin/sh: 1: Syntax error: EOF in backquote substitution
/bin/sh: 1: Syntax error: EOF in backquote substitution


[Parallel(n_jobs=26)]: Done 5496 tasks      | elapsed:    0.4s


/bin/sh: 1: Syntax error: EOF in backquote substitution
/bin/sh: 1: Syntax error: EOF in backquote substitution
/bin/sh: 1: Syntax error: Unterminated quoted string


[Parallel(n_jobs=26)]: Done 65117 out of 65117 | elapsed:    1.2s finished


In [10]:
def read_geocoded_results():
    files = glob('../data/geocode/*.json')
    canonicals = []
    for filepath in files:
        _, filename = os.path.split(filepath)
        index = filename.split('.')[0]
        with open(filepath, 'r') as f:
            try:
                payload = json.load(f)
            except json.JSONDecodeError:
                print('problem:', index)
        payload.update({'provenance': index})
        canonicals.append(payload)
    return canonicals

def canonical_df_from_geocoded_results(canonicals):
    generator = (extract_canonical_location_info(index, canonicals) for index in canonicals)
    return pd.DataFrame.from_records(generator).set_index('provenance')

In [11]:
canonicals = read_geocoded_results()
canonicals[0]

problem: nasa12077
problem: nasa12615
problem: nasa12424
problem: nasa12361
problem: nasa11955
problem: nasa12565
problem: nasa12533
problem: nasa12872
problem: nasa12315
problem: nasa12805


{'provenance': 'who106401',
 'results': [{'address_components': [{'long_name': 'Gaya',
     'short_name': 'Gaya',
     'types': ['locality', 'political']},
    {'long_name': 'Gaya',
     'short_name': 'Gaya',
     'types': ['administrative_area_level_2', 'political']},
    {'long_name': 'Dosso Region',
     'short_name': 'Dosso Region',
     'types': ['administrative_area_level_1', 'political']},
    {'long_name': 'Niger',
     'short_name': 'NE',
     'types': ['country', 'political']}],
   'formatted_address': 'Gaya, Niger',
   'geometry': {'bounds': {'northeast': {'lat': 11.9013684, 'lng': 3.4696809},
     'southwest': {'lat': 11.876374, 'lng': 3.4380523}},
    'location': {'lat': 11.8852599, 'lng': 3.4548833},
    'location_type': 'APPROXIMATE',
    'viewport': {'northeast': {'lat': 11.9013684, 'lng': 3.4696809},
     'southwest': {'lat': 11.876374, 'lng': 3.4380523}}},
   'place_id': 'ChIJmUvhbjQYzBER5QY6VBoqTio',
   'types': ['locality', 'political']}],
 'status': 'OK'}

In [12]:
no_results = [c['provenance'] for c in canonicals if not c['results']]

In [13]:
len(no_results)

13643

In [14]:
df.loc[df.index.isin(no_results)]

Unnamed: 0_level_0,admin2,admin1,alpha3,alpha2,country
provenance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nasa380,Paktya,Ahmad Abad,AFG,AF,Afghanistan
nasa381,Paktya,Ali Khail (Jaji),AFG,AF,Afghanistan
who11,,Almaar,AFG,AF,Afghanistan
nasa55,Badakhshan,Arghanj Khaw,AFG,AF,Afghanistan
nasa56,Badakhshan,Argo,AFG,AF,Afghanistan
who20,,Asl-e-Chakhansor,AFG,AF,Afghanistan
who24,,Baak,AFG,AF,Afghanistan
nasa422,Takhar,Baharak,AFG,AF,Afghanistan
nasa165,Ghazni,Bahrami Shahid (Jaghatu),AFG,AF,Afghanistan
who31,,Bala Bolook,AFG,AF,Afghanistan
