In [1]:
import os
import re
import json
import time
import requests
import pandas as pd
from glob import glob
from tqdm import tqdm
from joblib import Parallel, delayed
from pathlib import Path
from unidecode import unidecode
from dotenv import load_dotenv, find_dotenv
from collections import defaultdict, ChainMap

# Load variables from whatever .env file find_dotenv() locates, so the key
# lookup below can see them.
load_dotenv(find_dotenv())

# Google Maps API key; an empty string is used when GMAPS_KEY is not set.
API_KEY = os.getenv('GMAPS_KEY', '')

# Matches an optional single space followed by everything from the first "("
# to the last ")", e.g. the " (Punjab)" in "Pakistan (Punjab)".
PARENTHESES_PATTERN = re.compile(r'\ ?\(.*\)')

In [2]:
def _geocode(search_string, timeout=10):
    """Query the Google Maps Geocoding API for ``search_string``.

    Parameters
    ----------
    search_string : str
        Free-text address sent as the ``address`` query parameter.
    timeout : float, optional
        Seconds to wait on the HTTP request before giving up. Without a
        timeout ``requests`` waits indefinitely, which would stall a worker.

    Returns
    -------
    dict
        The decoded JSON body when the server answers with HTTP 200.
        On any failure (network error, timeout, non-200 status, body that is
        not valid JSON) a message is printed and ``{'results': []}`` is
        returned, so callers can always read the ``'results'`` key.
    """
    payload = {'key': API_KEY, 'address': search_string}
    try:
        r = requests.get(
            'https://maps.googleapis.com/maps/api/geocode/json',
            params=payload,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Best-effort: one unreachable request must not abort the whole batch.
        print("Problem: ", search_string, exc)
        return {'results': []}

    if r.status_code != 200:
        print("Problem: ", search_string)
        return {'results': []}

    try:
        body = r.json()
    except ValueError:
        # Body was not valid JSON (requests' JSON errors derive from ValueError).
        print("Problem: ", search_string)
        return {'results': []}

    # The API signals quota/key/request errors via a "status" field in an
    # HTTP 200 body; surface anything other than a match or a clean miss.
    status = body.get('status')
    if status not in ('OK', 'ZERO_RESULTS'):
        print("Problem: ", search_string, status)
    return body

def geocode_from_tuple(tup):
    """Geocode one ``(country, admin1)`` pair.

    Any parenthesised qualifier is stripped from the country name, the query
    is built as ``"<admin1>, <country>"``, and the geocoder's response dict is
    returned with an added ``'_meta'`` entry holding the original tuple and
    the query string that was sent.
    """
    country, admin1 = tup
    cleaned_country = PARENTHESES_PATTERN.sub('', country)
    query = ', '.join([admin1, cleaned_country])

    response = _geocode(query)
    response['_meta'] = dict(key=tup, search_string=query)
    return response

In [6]:
# Read every survey's ch.csv into a DataFrame, keyed by the stem of its
# directory name. Hidden entries and non-directories are skipped.
childrens_surveys = {
    survey_dir.stem: pd.read_csv(survey_dir / 'ch.csv', index_col=0, low_memory=False)
    for survey_dir in Path("../data/interim/MICS").iterdir()
    if not survey_dir.name.startswith('.') and survey_dir.is_dir()
}

In [7]:
# One (survey country, HH7 value) pair per distinct HH7 value in each survey.
to_geocode = [
    (country, admin1)
    for country, cdf in childrens_surveys.items()
    for admin1 in cdf.HH7.unique()
]

# Geocode every pair across 4 workers; results come back in input order.
r = Parallel(n_jobs=4, verbose=5)(
    delayed(geocode_from_tuple)(tup) for tup in to_geocode
)

[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    7.2s
[Parallel(n_jobs=4)]: Done 281 tasks      | elapsed:   18.2s
[Parallel(n_jobs=4)]: Done 322 out of 322 | elapsed:   20.3s finished


In [9]:
# Keys whose lookup produced no match. .get() is used because a response can
# lack the 'results' key entirely when the HTTP request itself failed; plain
# indexing would raise KeyError for those entries instead of listing them.
[d['_meta']['key'] for d in r if not d.get('results')]

[('Sao Tome and Principe', 'RegiÃ£o Centro Este'),
 ('Sao Tome and Principe', 'RegiÃ£o Sul Este'),
 ('Sao Tome and Principe', 'RegiÃ£o Norte Oeste'),
 ('Pakistan (Punjab)', 'B. Nagar'),
 ('Pakistan (Punjab)', 'M. Bahaudin'),
 ('Dominican Republic', 'Cibao Nordeste'),
 ('Viet Nam', 'Northern Midlands and Mountain area'),
 ('Sudan', 'West Kordofan'),
 ('Nepal', 'Eastern Terai'),
 ('Nepal', 'Western  Terai'),
 ('Nepal', 'Far-WesternTerai'),
 ('Guinea Bissau', 'SAB')]

In [10]:
# Nest the raw responses as lookups[country][admin1] using the key each
# response carries in its '_meta' entry.
lookups = defaultdict(dict)
for result in r:
    country, admin1 = result['_meta']['key']
    lookups[country][admin1] = result

# Persist the nested mapping as pretty-printed JSON with sorted keys.
geocodes_path = Path('../data/interim/MICS/geocodes.json')
with geocodes_path.open('w') as f:
    json.dump(lookups, f, indent=2, sort_keys=True)

In [11]:
# Spot-check one stored response: the abbreviated label 'M. Garh' from the
# 'Pakistan (Punjab)' survey.
lookups['Pakistan (Punjab)']['M. Garh']

{'_meta': {'key': ('Pakistan (Punjab)', 'M. Garh'),
  'search_string': 'M. Garh, Pakistan'},
 'results': [{'address_components': [{'long_name': 'Muzaffargarh',
     'short_name': 'Muzaffargarh',
     'types': ['locality', 'political']},
    {'long_name': 'Muzaffargarh',
     'short_name': 'Muzaffargarh',
     'types': ['administrative_area_level_2', 'political']},
    {'long_name': 'Punjab',
     'short_name': 'Punjab',
     'types': ['administrative_area_level_1', 'political']},
    {'long_name': 'Pakistan',
     'short_name': 'PK',
     'types': ['country', 'political']}],
   'formatted_address': 'Muzaffargarh, Pakistan',
   'geometry': {'bounds': {'northeast': {'lat': 30.1144161, 'lng': 71.2265209},
     'southwest': {'lat': 30.0286753, 'lng': 71.15027549999999}},
    'location': {'lat': 30.07360869999999, 'lng': 71.1804988},
    'location_type': 'APPROXIMATE',
    'viewport': {'northeast': {'lat': 30.1144161, 'lng': 71.2265209},
     'southwest': {'lat': 30.0286753, 'lng': 71.15027

In [12]:
def pluck(result):
    """Flatten the first geocoding match into a small dict.

    Parameters
    ----------
    result : dict
        A geocoding response; its ``'results'`` entry, when present, is a
        list of matches each carrying ``'place_id'`` and
        ``'address_components'``.

    Returns
    -------
    dict
        ``{}`` when there is no match. Otherwise ``'place_id'`` plus whichever
        of ``'country'``, ``'alpha2'``, ``'admin1'`` and ``'admin2'`` are
        present in the first match. Each field is extracted independently, so
        a missing component (e.g. no admin1) never suppresses the others.
    """
    d = {}
    matches = result.get('results') or []
    if not matches:
        return d

    loc = matches[0]
    d['place_id'] = loc.get('place_id')
    components = loc.get('address_components') or []

    def first_of_type(component_type):
        # First address component tagged with component_type, or None.
        return next(
            (c for c in components if component_type in c.get('types', ())),
            None,
        )

    country_dict = first_of_type('country')
    if country_dict is not None:
        d['country'] = country_dict.get('long_name')
        d['alpha2'] = country_dict.get('short_name')

    admin1_dict = first_of_type('administrative_area_level_1')
    if admin1_dict is not None:
        d['admin1'] = admin1_dict.get('long_name')

    admin2_dict = first_of_type('administrative_area_level_2')
    if admin2_dict is not None:
        d['admin2'] = admin2_dict.get('long_name')

    return d

In [13]:
# Try pluck() on one stored response to see the flattened fields it yields.
pluck(lookups['Pakistan (Punjab)']['M. Garh'])

{'admin1': 'Punjab',
 'admin2': 'Muzaffargarh',
 'alpha2': 'PK',
 'country': 'Pakistan',
 'place_id': 'ChIJFfokmf_XOjkRKhoRC085PB0'}

In [16]:
# One record per geocoded pair: the survey country ('c') and HH7 label ('hh7')
# layered over the fields pluck() extracts from the response.
geocode_records = [
    ChainMap(
        {'c': d['_meta']['key'][0], 'hh7': d['_meta']['key'][1]},
        pluck(d),
    )
    for d in r
]
pd.DataFrame.from_records(geocode_records)

Unnamed: 0,admin1,admin2,alpha2,c,country,hh7,place_id
0,Principe,,ST,Sao Tome and Principe,São Tomé and Príncipe,RegiÃ£o Autonoma de Principe,ChIJYRBve0mNcBAR-ejxSaiODH4
1,,,,Sao Tome and Principe,,RegiÃ£o Centro Este,
2,,,,Sao Tome and Principe,,RegiÃ£o Sul Este,
3,,,,Sao Tome and Principe,,RegiÃ£o Norte Oeste,
4,Ashgabat,,TM,Turkmenistan,Turkmenistan,Ashgabat city,ChIJ2YQ2qxv-bz8R6a3TYj8B3jw
5,Ahal,,TM,Turkmenistan,Turkmenistan,Ahal velayat,ChIJEX3vZloKZT8RLueEJqTQ7pU
6,Balkan,,TM,Turkmenistan,Turkmenistan,Balkan velayat,ChIJaeolWIv_KkARdJtx7a_WlxM
7,Dashoguz Province,,TM,Turkmenistan,Turkmenistan,Dashoguz velayat,ChIJeSEwf20D2UERbIbWneItyOE
8,Lebap,,TM,Turkmenistan,Turkmenistan,Lebap velayat,ChIJ-z2ejiPgRD8R8ynVfO4H4gs
9,Mary,,TM,Turkmenistan,Turkmenistan,Mary velayat,ChIJt4hgKiSOQT8Rw5qCF8eRX5Q
