Geocoding with additional metadata
==================================
*JSON-style data format*  
This geocoding process is based on the old, table-based one, but uses JSON data structures instead. With any luck, this will make the eventual goal of putting all of this in MongoDB easier.

In [1]:
import pandas as pd
import numpy as np
import requests, re, json, os, time, datetime

In [2]:
# Read the previously saved dataset file and parse its JSON content into market_db.
with open("output/enhanced_dataset.json", encoding="utf-8", mode="r") as fp:
    market_db = json.loads(fp.read())

In [3]:
import os
def Ensure_dir(d):
    """
    Makes sure a directory 'd' exists, creating it (and any missing parents) if it doesn't.

    Returns 0 on success. Raises FileExistsError if 'd' exists but is not a directory.
    """
    # exist_ok=True replaces the former "check, then create" sequence, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(d, exist_ok=True)
    return 0

Geocoding begins here!
======================

In [4]:
def geonames_request(placename,placeid,east=180,west=-180,north=90,south=-90,fuzzy=1,timeout=30):
    """
    Takes a placename and optional bounding box, returns GeoNames response for query in dict form.

    placename -- name to search for (sent as the 'name' query parameter)
    placeid   -- identifier of the place being geocoded; stored as an int under 'hgr_id'
    east, west, north, south -- bounding box for the search; the defaults span the whole globe
    fuzzy     -- value sent as the GeoNames 'fuzzy' parameter
    timeout   -- seconds to wait for the HTTP response before requests raises a timeout
                 error, so a stalled connection cannot hang the run indefinitely

    Returns the decoded JSON response with an added 'searchinfo' key. 'searchinfo' holds
    the query parameters that were sent, plus 'url' (the full request URL) and 'hgr_id'.
    Raises ValueError if placeid cannot be converted to int or the body is not JSON, and
    whatever requests raises for network failures.
    """
    url = "http://api.geonames.org/search"
    Q = {'type':'json','username':'jaguillette','featureClass':'P','name':placename,'north':north,'south':south,'east':east,'west':west,'fuzzy':fuzzy}
    R = requests.get(url,params=Q,timeout=timeout)
    response = R.json()
    # Record how the search was made alongside the results it produced.
    Q.update({'url':R.url,'hgr_id':int(placeid)})
    response.update({'searchinfo':Q})
    return response

In [5]:
def geo_response_parse(response,save_location=None):
    """
    Takes a dict object from parsing the json of a geonames request, as returned by geonames_request.
    Returns a list of dicts of response properties, one per place found.

    response      -- dict with 'totalResultsCount', 'geonames' and 'searchinfo' keys
    save_location -- optional file path; when given and nothing exists there yet, the
                     places are also written to it as a GeoJSON FeatureCollection

    Each found place becomes one dict whose GeoNames fields are prefixed with 'gname_'.
    A response with zero results yields a single dict without any 'gname_' keys. Every
    dict also carries the search metadata: url, hgr_id, searched_name, fuzzy, the four
    *_bound values and query_datetime (local time, shared by all dicts of one call).

    If an expected key is missing, the request URL (when known) is printed and an
    empty list is returned. The response passed in is not modified.
    """
    try:
        searchinfo = response['searchinfo']
        if response['totalResultsCount'] != 0:
            places = response['geonames']
            returnlist = [{"gname_{}".format(k): p[k] for k in p} for p in places]
            if save_location and not os.path.exists(save_location):
                features = []
                for f in places:
                    # Work on a copy so the caller's response is left untouched.
                    # Search metadata is layered on top, so its 'name' (the searched
                    # name) replaces the place's own 'name' in the saved properties.
                    properties = dict(f)
                    properties.update(searchinfo)
                    features.append({
                        "type":"Feature",
                        "geometry":{'type':'Point','coordinates':[float(f['lng']),float(f['lat'])]},
                        "properties":properties,
                    })
                save_json = {"type":"FeatureCollection","features":features}
                with open(save_location,'w',encoding='utf-8') as fp:
                    json.dump(save_json,fp,sort_keys=True)
        else:
            # Keep one metadata-only record so unsuccessful searches are still logged.
            returnlist = [{}]
        query_datetime = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
        for e in returnlist:
            e['url'] = searchinfo['url']
            e['hgr_id'] = searchinfo['hgr_id']
            e['searched_name'] = searchinfo['name']
            e['fuzzy'] = searchinfo['fuzzy']
            e['north_bound'] = searchinfo['north']
            e['south_bound'] = searchinfo['south']
            e['east_bound'] = searchinfo['east']
            e['west_bound'] = searchinfo['west']
            e['query_datetime'] = query_datetime
        return returnlist
    except KeyError:
        # Look the URL up defensively: the missing key may be 'searchinfo' itself.
        print(response.get('searchinfo',{}).get('url','<unknown url>'))
        return []

In [6]:
def fuzz_geonames(placename,placeid,north,south,east,west,returnlist):
    """
    Request geonames for a place with specified params at increasing 'fuzz',
    which decreases the required similarity between search and place name.

    The fuzzy value starts at 0.9 and drops in steps of 0.1 down to 0.0. The search
    stops at the first step that returns a record with a 'gname_geonameId' key.
    Each response is also saved under geonames_jsons/ by geo_response_parse.

    returnlist -- list that the parsed records of every attempt are appended to;
                  it is modified in place and also returned.
    """
    # Derive each value from an integer step rather than repeatedly subtracting 0.1,
    # which accumulates float error (0.7000000000000001, ..., 1.39e-16 instead of 0.0).
    for step in range(9,-1,-1):
        fuzzy = step/10
        response = geonames_request(placename,placeid,east=east,west=west,north=north,south=south,fuzzy=fuzzy)
        save_loc = "geonames_jsons/{}_{}_f{:.2}.json".format(placeid,placename.replace(" ","_"),fuzzy)
        parsed = geo_response_parse(response,save_location=save_loc)
        returnlist.extend(parsed)
        if any('gname_geonameId' in d for d in parsed):
            break
        # Pause between unsuccessful requests.
        time.sleep(1)
    else:
        print("{} was not found at max fuzz.".format(placename))
    return returnlist

In [7]:
def geonames_lookup(placename,placeid,east=180,west=-180,north=90,south=-90,alt_name=None):
    """
    Queries geonames for given placename within bounding boxes.
    Also queries for alternate spellings if no main spelling is found.

    Search order:
      1. the placename itself;
      2. if that finds nothing, every alternate name: alt_name (when given) followed by
         regex-derived spelling variants of placename;
      3. if still nothing has been found, fuzzy searches for placename and then, when
         given, for alt_name.

    Returns the list of records produced by geo_response_parse for every query made.
    Each response is also saved as a file under geonames_jsons/.
    """
    Ensure_dir("geonames_jsons")
    returnlist = []
    # Spelling variants: word-final "ой" -> "ий", word-final "ск" dropped,
    # and "о" appended to a name ending in "в".
    re_list = [[r'(ой)($|\s)',r'ий\2'],[r'(ск)($|\s)',r'\2'],[r'(в)$',r'\1о']]
    candidates = []
    if alt_name is not None:
        candidates.append(alt_name)
    for pattern,replacement in re_list:
        candidates.append(re.sub(pattern,replacement,placename))
    # De-duplicate while keeping insertion order, and leave out the placename itself.
    # A list (not a set) keeps the query order the same on every run.
    pname_list = []
    for candidate in candidates:
        if candidate != placename and candidate not in pname_list:
            pname_list.append(candidate)
    #First geocoding attempt
    response1 = geonames_request(placename,placeid,east=east,west=west,north=north,south=south)
    time.sleep(1)
    save_loc = "geonames_jsons/{}_{}.json".format(placeid,placename.replace(" ","_"))
    parsed1 = geo_response_parse(response1,save_location=save_loc)
    returnlist.extend(parsed1)
    if not any('gname_geonameId' in d for d in parsed1):
        for p in pname_list:
            time.sleep(1)
            print("Querying alternate name {}".format(p))
            response = geonames_request(p,placeid,east=east,west=west,north=north,south=south)
            save_loc = "geonames_jsons/{}_{}.json".format(placeid,p.replace(" ","_"))
            parsed = geo_response_parse(response,save_location=save_loc)
            returnlist.extend(parsed)
    if not any('gname_geonameId' in d for d in returnlist):
        returnlist = fuzz_geonames(placename,placeid,north,south,east,west,returnlist)
        if alt_name is not None:
            returnlist = fuzz_geonames(alt_name,placeid,north,south,east,west,returnlist)
    return returnlist

In [8]:
# Read the province bounds file and parse its JSON content into bounding_dict.
with open("resources/partof_prov_bounds.json", encoding="utf-8", mode="r") as fp:
    bounding_dict = json.loads(fp.read())

*Old Code*  
no_prov = len(market_df.ix[market_df.admin1_partofID.isnull()])
no_uezd = len(market_df.ix[market_df.admin2_partofID.isnull()])
total = len(market_df)
print("{1} / {0} ({2:.2%}) towns have no Namiestnichestvo recorded\n{3} / {0} ({4:.2%}) towns have no Uyezd recorded".format(total,no_prov,no_prov/total,no_uezd,no_uezd/total))

Market Town Geocoding
=====================

In [10]:
# Geocode every town that has no 'geo' entry yet; all GeoNames result rows are
# collected in `geocoded`.
geocoded = []
# Province ids that have no entry in bounding_dict; towns in them are skipped.
problem_prov = []
# No longer filled by this cell: the handlers that used to append to these lists
# read a non-existent key ('admin1.partofID') and then carried on with an unset or
# stale `loc`. They are left defined, and empty, for any cell that still reads them.
worldsearch = []
bizarro = []
# Half-width of the search box built around a province centre (presumably degrees,
# since it is added to the values passed as north/south/east/west — TODO confirm).
d = 4
for k,v in market_db.items():
    name = v['name_new_orth']
    if 'geo' in v:
        print("{} already geocoded".format(name))
        continue
    alt = v['alt_name_new_orth']
    prov = v["admin1_partofID"]
    if prov is None or prov in ['not_in_text','нот_ин_теxт']:
        # No usable province: search with the default, whole-globe bounding box.
        geocoded.extend(geonames_lookup(name,k,alt_name=alt))
    else:
        try:
            loc = bounding_dict[str(prov)]
        except KeyError:
            print(prov)
            problem_prov.append(prov)
            continue
        if "boundaries" in loc:
            W, S, E, N = loc["boundaries"][0], loc["boundaries"][1], loc["boundaries"][2], loc["boundaries"][3]
        else:
            # presumably center is [lat, lng] — TODO confirm against the bounds file
            N, S, E, W = loc["center"][0]+d, loc["center"][0]-d, loc["center"][1]+d, loc["center"][1]-d
        geocoded.extend(geonames_lookup(name,k,north=N,south=S,west=W,east=E,alt_name=alt))
    print("Looked up {}.".format(name))

Печенеги already geocoded
Looked up Кривцо.
Looked up Терешково.
Рожественское already geocoded
Looked up Симеоновка.
Устюжна Железопольская already geocoded
Looked up Орлов.
Looked up Святое Место.
Looked up Серевка.
Красной Колядин already geocoded
Волынцы already geocoded
Кадом already geocoded
Looked up Семендеево.
Юрьевец Поволгский already geocoded
Седнев already geocoded
Looked up Погромца.
Моцына already geocoded
Looked up Краснополь.
Починки already geocoded
Шатск already geocoded
Троицкое already geocoded
Козельск already geocoded
Бежецк already geocoded
Зарайск already geocoded
Querying alternate name Яблуново
Looked up Яблунов.
Чухлома already geocoded
Оренбург already geocoded
Querying alternate name Цыбулево
Looked up Цыбулев.
Белгород already geocoded
Моршанск already geocoded
Looked up Высокое.
Looked up Кемары.
Суздаль already geocoded
Серега already geocoded
Севск already geocoded
Шадринск already geocoded
Пирятин already geocoded
Синбирск already geocoded
Резыцы alre

In [26]:
# Show the three diagnostic lists, one per line.
for leftovers in (problem_prov, worldsearch, bizarro):
    print(leftovers)

[]
[]
[]


In [27]:
# Turn the collected result dicts into a table, one row per dict.
geodf = pd.DataFrame(geocoded)

In [28]:
# Keep only the columns of interest, in a fixed order.
# (The original line was missing the closing quote after query_datetime.)
geodf_columns = [
    "hgr_id","searched_name","fuzzy",
    "gname_name","gname_adminCode1","gname_adminName1",
    "gname_countryCode","gname_countryId","gname_countryName",
    "gname_fcl","gname_fclName","gname_fcode","gname_fcodeName",
    "gname_geonameId","gname_lat","gname_lng","gname_population","gname_toponymName",
    "north_bound","south_bound","east_bound","west_bound",
    "url","query_datetime",
]
geodf = geodf[geodf_columns]

In [29]:
# Number of result rows collected.
len(geodf)

2629

In [30]:
# Save the result table as both Excel and CSV.
# to_excel is called without `encoding`: pandas documented that keyword as unused,
# and current pandas versions reject it.
geodf.to_excel('output/process/geocoded_new.xlsx')
geodf.to_csv('output/process/geocoded_new.csv',encoding='utf-8')

In [31]:
# Reload the saved results. read_csv with index_col=0 and parse_dates=True is the
# documented equivalent of DataFrame.from_csv, which newer pandas no longer provides.
geodf = pd.read_csv('output/process/geocoded_new.csv',index_col=0,parse_dates=True,encoding='utf-8')

In [32]:
# For every hgr_id, count the distinct latitudes among rows where a place was found.
# .loc replaces the .ix indexer, which newer pandas no longer provides.
found = geodf.loc[geodf.gname_name.notnull()]
finding_counts = found.pivot_table(index='hgr_id',values='gname_lat',aggfunc=lambda x: len(x.unique()))
fdf = pd.DataFrame(finding_counts)

In [33]:
# Keep places whose found rows all share a single latitude, with one row per hgr_id.
# Both frames have a gname_lat column, so the merge renames them: gname_lat_x is the
# distinct-latitude count from fdf and gname_lat_y is the latitude from geodf.
# .loc replaces the .ix indexer, which newer pandas no longer provides.
single_results = pd.merge(fdf.loc[fdf.gname_lat==1],geodf,left_index=True,right_on='hgr_id')
single_results = single_results.loc[single_results.gname_name.notnull()]
single_results = single_results.loc[~single_results.hgr_id.duplicated()]
len(single_results)

64

In [18]:
# How many rows of single_results have fuzzy >= 0.8.
len(single_results[single_results.fuzzy>=.8])

0

In [19]:
# Display the identifying columns of the rows whose fuzzy value is at least 0.8.
single_results.loc[
    single_results.fuzzy>=.8,
    ['hgr_id','searched_name','fuzzy','gname_name','gname_adminName1','gname_countryName','gname_geonameId'],
]

Unnamed: 0,hgr_id,searched_name,fuzzy,gname_name,gname_adminName1,gname_countryName,gname_geonameId


In [20]:
# Row count followed by the number of distinct hgr_id values; equal numbers mean no duplicates.
print(len(single_results), len(single_results.hgr_id.unique()), sep="\n")

64
64


In [21]:
# Index the rows by hgr_id so that to_json(orient='index') emits one entry per place
# keyed by its id, then parse that JSON text back into a plain dict.
single_results.index=single_results.hgr_id
sj = json.loads(single_results.to_json(orient='index'))

In [22]:
# Copy each sufficiently exact match (fuzzy >= 0.8) into market_db as that place's 'geo' entry.
for town_id, hit in sj.items():
    if not hit['fuzzy']>=.8:
        continue
    geo = market_db[town_id]['geo'] = {"coded":"automated"}
    geo["country_code"] = hit['gname_countryCode']
    geo["geonameId"] = "http://geonames.org/{}".format(hit['gname_geonameId'])
    geo["pres_country"] = hit['gname_countryName']
    geo["pres_loc"] = "{}, {}, {}".format(hit['gname_name'],hit['gname_adminName1'],hit['gname_countryName'])
    geo["x_coord"] = hit['gname_lng']
    # gname_lat_y is the latitude column as renamed by the earlier merge.
    geo["y_coord"] = hit['gname_lat_y']

In [23]:
# Write the updated dataset back over the file it was loaded from.
with open('output/enhanced_dataset.json', mode='w', encoding='utf-8') as fp:
    json.dump(market_db, fp, indent=2, sort_keys=True)

In [24]:
# Build a GeoJSON FeatureCollection with one Point feature per place that has a 'geo'
# entry; the whole place record becomes the feature's properties.
geojson = {
    "features": [
        {
            "geometry": {'coordinates': [place['geo']['x_coord'], place['geo']['y_coord']], "type": "Point"},
            "type": "Feature",
            "properties": place,
        }
        for place in market_db.values()
        if 'geo' in place
    ],
    "type": "FeatureCollection",
}

In [25]:
# Write the feature collection to disk.
with open("output/enhanced_dataset.geojson", mode="w", encoding='utf-8') as fp:
    json.dump(geojson, fp, indent=1, sort_keys=True)

In [36]:
# Report how many of the places now have coordinates.
progress_message = "{}/{} geocoded as of now".format(len(geojson['features']),len(market_db))
print(progress_message)

490/802 geocoded as of now


Run all above here
==================

In [29]:
#geodf_merged = pd.merge(geodf,market_df,left_on='hgr_id',right_index=True)

In [31]:
#geodf_merged = geodf_merged[['hgr_id', 'name', 'alt_name', 'partof_id', 'lng', 'lat', '_type', 'source_url', 'text', 'name_modern_sp', 'result_placename','searched_placename', 'admin1', 'country_name', 'src_id', 'country_code', 'url', 'local_result', 'src', 'admin1_old_orth', 'admin1_new_orth', 'admin1_stem', 'admin1_partof', 'admin2_old_orth','admin2_new_orth','admin2_stem', 'admin2_partof', 'page']]

In [32]:
# Cyrillic -> Latin transliteration table, keyed by single character (upper and lower case).
# Hard signs map to a double quote and soft signs to an apostrophe; the pre-reform
# letter yat (U+0462/U+0463) maps to E/e. There is no entry for U+0401/U+0451 (Ё/ё).
cyrillic_translit={u'\u0410': 'A', u'\u0430': 'a',u'\u0411': 'B', u'\u0431': 'b',u'\u0412': 'V', u'\u0432': 'v',u'\u0413': 'G', u'\u0433': 'g',u'\u0414': 'D', u'\u0434': 'd',u'\u0415': 'E', u'\u0435': 'e',u'\u0416': 'Zh', u'\u0436': 'zh',u'\u0417': 'Z', u'\u0437': 'z',u'\u0418': 'I', u'\u0438': 'i',u'\u0419': 'I', u'\u0439': 'i',u'\u041a': 'K', u'\u043a': 'k',u'\u041b': 'L', u'\u043b': 'l',u'\u041c': 'M', u'\u043c': 'm',u'\u041d': 'N', u'\u043d': 'n',u'\u041e': 'O', u'\u043e': 'o',u'\u041f': 'P', u'\u043f': 'p',u'\u0420': 'R', u'\u0440': 'r',u'\u0421': 'S', u'\u0441': 's',u'\u0422': 'T', u'\u0442': 't',u'\u0423': 'U', u'\u0443': 'u',u'\u0424': 'F', u'\u0444': 'f',u'\u0425': 'Kh', u'\u0445': 'kh',u'\u0426': 'Ts', u'\u0446': 'ts',u'\u0427': 'Ch', u'\u0447': 'ch',u'\u0428': 'Sh', u'\u0448': 'sh',u'\u0429': 'Shch', u'\u0449': 'shch',u'\u042a': '"', u'\u044a': '"',u'\u042b': 'Y', u'\u044b': 'y',u'\u042c': "'", u'\u044c': "'",u'\u042d': 'E', u'\u044d': 'e',u'\u042e': 'Iu', u'\u044e': 'iu',u'\u042f': 'Ia', u'\u044f': 'ia',u'\u0462': 'E', u'\u0463': 'e'}

def transliterate(word, translit_table):
    """
    Returns 'word' with every character that is a key of 'translit_table'
    replaced by the table's value; all other characters are kept as they are.
    """
    return ''.join(translit_table.get(symbol, symbol) for symbol in word)

In [33]:
# Add a Latin-script transliteration of each name_modern_sp value.
# NOTE(review): the only visible statement that builds geodf_merged is commented out,
# so this cell presumably raises NameError unless it is defined elsewhere — confirm.
geodf_merged['lc_translit'] = geodf_merged.name_modern_sp.apply(lambda text: transliterate(text,cyrillic_translit))

In [42]:
# Save the merged table as both Excel and CSV.
# to_excel is called without `encoding`: pandas documented that keyword as unused,
# and current pandas versions reject it.
geodf_merged.to_excel('output/geocoded_merged.xlsx')
geodf_merged.to_csv('output/geocoded_merged.csv',encoding = 'utf-8')

In [39]:
# Title-case the name_modern_sp values.
# NOTE(review): name_modern_sp is not one of the columns the visible geocoding cells
# create; presumably left over from the older table-based workflow — confirm.
single_results.name_modern_sp = single_results.name_modern_sp.str.title()

In [41]:
# Save the single-match table as CSV.
single_results.to_csv('output/single_results.csv',encoding='utf-8')

In [37]:
def reverse_geocode(lng,lat,max_retries=5,timeout=30):
    """
    Looks up the present-day locality for a coordinate pair with the Google Geocoding API.

    lng, lat    -- coordinates; sent to the API in "lat,lng" order
    max_retries -- number of requests to attempt when the API returns no results
                   (previously the function retried without limit)
    timeout     -- seconds to wait for each HTTP response

    Returns a tuple ("<locality>, <admin1>", country, countryCode) taken from the
    address components of the first result. Any part the response does not supply
    is the string 'error'; every part is 'error' if all attempts come back empty.
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    # NOTE(review): no API key is sent with the request — confirm the service still
    # accepts keyless requests.
    Q = {'latlng':"{},{}".format(lat,lng)}
    # Placeholders, so a response lacking a component type cannot leave a name unset.
    locality = admin1 = country = countryCode = 'error'
    for attempt in range(max_retries):
        R = requests.get(url, params=Q, timeout=timeout)
        print(R.url)
        results = R.json()['results']
        if not results:
            # Nothing returned: wait a second and try again.
            time.sleep(1)
            continue
        components = results[0]['address_components']
        for component in components:
            if 'locality' in component['types']:
                locality = component['long_name']
            elif 'administrative_area_level_1' in component['types']:
                admin1 = component['long_name']
            elif 'country' in component['types']:
                country = component['long_name']
                countryCode = component['short_name']
        if locality == 'error':
            # No locality component: fall back to the second-level administrative area.
            for component in components:
                if 'administrative_area_level_2' in component['types']:
                    locality = component['long_name']
        print("\"{0}, {1}\",\"{2}\",\"{3}\"".format(locality,admin1,country,countryCode))
        time.sleep(1)
        break
    else:
        print("No reverse geocoding result for {},{} after {} attempts.".format(lat,lng,max_retries))
    return "{}, {}".format(locality,admin1), country, countryCode

In [None]:
# Reverse-geocode every row and store the three returned values as new columns.
# otypes is given explicitly so that np.vectorize does not make an extra trial call to
# reverse_geocode (one more HTTP request) to discover the output types, and so that an
# empty table does not raise.
single_results['pres_loc'], single_results['country'], single_results['country_code'] = np.vectorize(reverse_geocode,otypes=[object,object,object])(single_results.lng,single_results.lat_y)

In [None]:
# Keep the listed columns only, in this order.
# NOTE(review): most of these names are not columns the visible geocoding cells create;
# presumably they come from the older table-based workflow — confirm before running.
single_results = single_results[['lat_x', 'hgr_id', 'name', 'alt_name', 'partof_id', 'lng', 'lat_y', '_type', 'source_url', 'text', 'name_modern_sp', 'lc_translit', 'pres_loc', 'country', 'country_code', 'result_placename', 'searched_placename', 'admin1', 'country_name', 'src_id', 'url', 'local_result', 'src', 'admin1_old_orth', 'admin1_new_orth', 'admin1_stem', 'admin2_old_orth', 'page']]