# Import libraries


In [2]:
import random
import string
import numpy as np
import tensorflow as tf
import pandas as pd

# Prepare data

In [191]:
# Load the merged address table and keep only the columns used by the matcher.
mergeddata = pd.read_csv("datamerge.csv")
mergeddata = mergeddata[["latitude", "longitude", "address", "zip"]]
# NOTE(review): fillna(0) also replaces a missing address with the integer 0,
# which the character-level helpers further down cannot iterate over --
# confirm the CSV has no missing addresses.
# drop=True must be a real boolean: the string "True" only worked because any
# non-empty string is truthy (drop="False" would have dropped the index too).
mergeddata = mergeddata.drop_duplicates().fillna(0).reset_index(drop=True)
# Number of unique rows; used below to size the evaluation.
n = mergeddata.shape[0]

# Set up TensorFlow


In [192]:
sess = tf.Session()
# Addresses are fed as sparse string tensors (one character per entry).
test_address = tf.sparse_placeholder(dtype=tf.string)
# A single test zip code, fed as a 1x1 matrix.
test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)
ref_address = tf.sparse_placeholder(dtype=tf.string)
# The number of reference zips is left open: the leave-one-out reference set
# holds one row fewer than the full table, so pinning this axis to n makes
# the feed fail with a shape mismatch.
ref_zip = tf.placeholder(shape=[None, None], dtype=tf.float32)

# Distances


In [193]:
# Squared difference between every reference zip and the test zip.
zip_dist = tf.square(ref_zip - test_zip)
# Edit distance from the test address to each reference address, normalized
# by the length of the reference (truth) string.
address_dist = tf.edit_distance(hypothesis=test_address,
                                truth=ref_address,
                                normalize=True)

# Similarities

In [194]:
# Min-max scale the zip distances: the largest distance maps to similarity 0
# and the smallest to similarity 1.
flat_zip_dist = tf.squeeze(zip_dist)
zip_max = tf.gather(flat_zip_dist, tf.argmax(zip_dist, 1))
zip_min = tf.gather(flat_zip_dist, tf.argmin(zip_dist, 1))
zip_sim = tf.div(zip_max - zip_dist, zip_max - zip_min)
address_sim = tf.subtract(1., address_dist)

# Weights

In [195]:
# Relative importance of the two signals; the two weights sum to 1.
address_weight = 0.5
zip_weight = 1. - address_weight

# The address term is transposed before it is added to the zip term.
weighted_address_sim = tf.transpose(address_weight * address_sim)
weighted_zip_sim = zip_weight * zip_sim
weighted_sim = weighted_address_sim + weighted_zip_sim

# Position of the highest combined similarity along axis 1.
top_match_index = tf.argmax(weighted_sim, 1)

# Helper functions

In [196]:
def sparse_from_word_vec(word_vec):
    """Encode a list of strings as a character-level sparse tensor value.

    Each character of each word becomes one entry, indexed as
    ``[word position, 0, character position]``.

    Args:
        word_vec: list of strings.

    Returns:
        A ``tf.SparseTensorValue`` with dense shape
        ``[len(word_vec), 1, longest word length]`` (the last axis is at
        least 1, so an empty list or all-empty words still get a valid shape).
    """
    num_words = len(word_vec)
    indices = [[word_idx, 0, char_idx]
               for word_idx, word in enumerate(word_vec)
               for char_idx in range(len(word))]
    chars = list(''.join(word_vec))
    # The dense shape has to bound every index above; a fixed last axis of 1
    # left every character after the first outside the declared shape.
    longest_word = max([len(word) for word in word_vec] + [1])
    return tf.SparseTensorValue(indices, chars, [num_words, 1, longest_word])

def getTestSet(testNumber, maxN):
    """Split ``mergeddata`` into one held-out record and its reference rows.

    Args:
        testNumber: index of the row to hold out as the test record.
        maxN: exclusive upper bound of the slice taken after the held-out row.

    Returns:
        A ``(test, ref)`` tuple of dicts keyed by "address", "zip",
        "latitude" and "longitude". ``test`` holds the held-out row's values;
        ``ref`` holds, per column, a list of the rows before ``testNumber``
        followed by the rows from ``testNumber + 1`` up to ``maxN``.
    """
    rows_after = slice(testNumber + 1, maxN)

    test = {}
    ref = {}
    for column in ("address", "zip", "latitude", "longitude"):
        series = mergeddata[column]
        test[column] = series[testNumber]
        ref[column] = list(series[:testNumber]) + list(series[rows_after])

    return (test, ref)

def customDistance(test_lat, test_long, ref_lat, ref_long):
    """Average of the absolute latitude and longitude differences.

    All four arguments are converted with ``float()`` before use, so numeric
    strings are accepted. The result is in the same unit as the inputs.
    """
    coordinate_pairs = ((test_lat, ref_lat), (test_long, ref_long))
    gaps = [abs(float(test_value) - float(ref_value))
            for test_value, ref_value in coordinate_pairs]
    return sum(gaps) / 2.0


# Run TensorFlow

In [200]:

scores = []

# Leave-one-out evaluation: hold out each of the first 100 rows (fewer when
# the table is smaller) and look up its best match among the other rows.
for i in range(min(100, n)):
    test, ref = getTestSet(i, n + 1)

    reference_addresses = ref["address"]
    reference_zips = np.array([ref["zip"]])  # a single row of reference zips
    sparse_ref_set = sparse_from_word_vec(reference_addresses)
    test_address_entry = test["address"]
    test_zip_entry = [[test["zip"]]]

    # Create sparse address vectors. The test address is repeated once per
    # reference address so both sparse tensors describe the same number of
    # strings (the reference set is one row short of the full table).
    test_address_repeated = [test_address_entry] * len(reference_addresses)
    sparse_test_set = sparse_from_word_vec(test_address_repeated)

    feeddict = {test_address: sparse_test_set,
                test_zip: test_zip_entry,
                ref_address: sparse_ref_set,
                ref_zip: reference_zips}
    best_match = sess.run(top_match_index, feed_dict=feeddict)
    # sess.run returns a one-element array; convert it to a plain int once.
    best_index = int(np.ravel(best_match)[0])

    best_street = reference_addresses[best_index]
    best_zip = reference_zips[0][best_index]
    test_zip_ = test["zip"]

    # Each message is built inside a single print(...) call so the same text
    # is printed under both Python 2 and Python 3.
    print('Address: ' + str(test_address_entry) + ', ' + str(test_zip_)
          + ', ' + str(test["latitude"]))
    print('Match  : ' + str(best_street) + ',' + str(best_zip)
          + ', ' + str(ref["latitude"][best_index]))
    scores.append(customDistance(test["latitude"],
                                 test["longitude"],
                                 ref["latitude"][best_index],
                                 ref["longitude"][best_index]))

# NOTE(review): despite the "neg_" label, the reported value is the plain
# (non-negated) mean of the customDistance scores.
print("%s: %s ± %s" % ("neg_mean_absolute_error (degree)",
                       round(np.mean(scores), 4),
                       round(np.std(scores), 4)))

                     

Address: 17725 BELLECHASE CIR, 92128, 33.03666
Match  : 17705 BELLECHASE CIRCLE,92128, 33.03666
Address: 304 HIDDEN TRAILS RD, 92027, 33.14431
Match  : 374 HIDDEN TRAILS,92027, 33.1431
Address: 1876 ELM AVE., 92154, 32.58095
Match  : 4066 PALM AVE,92154, 32.58401
Address: 877 ISLAND AVE, 92101, 32.71008
Match  : 877 ISLAND AVE,92101, 32.71011
Address: 877 ISLAND AVE, 92101, 32.71011
Match  : 877 ISLAND AVE,92101, 32.71008
Address: 5700 BALTIMORE, 91942, 32.78366
Match  : 5700 BALTIMORE,91942, 32.78451
Address: 5700 BALTIMORE, 91942, 32.78451
Match  : 5700 BALTIMORE,91942, 32.78366
Address: 5700 BALTIMORE, 91942, 32.784420000000004
Match  : 5700 BALTIMORE,91942, 32.78366
Address: 28974 CEDAR LANE, 91962, 32.82307
Match  : 28942 CEDAR LANE,91962, 32.82333
Address: 1930 W SAN MARCOS BLVD., 92078, 33.13589
Match  : 1930 W SAN MARCOS BLVD,92078, 33.13589
Address: 13428 TURLOCK COURT, 92129, 32.96376
Match  : 4281 KERWOOD COURT,92130, 32.95287
Address: 524 VIA DE LA VALLE, 92075, 32.98184000

Address: 355 PLAZA TOLUCA, 91914, 32.66943
Match  : 438 PLAZA TOLUCA,91914, 32.66682
Address: 346 LUSTROSOS, 92057, 33.25503
Match  : 3941 ALBATROSS,92103, 32.74904
Address: 2254 BEAR ROCK GLN, 92026, 33.17168
Match  : 2450 BEAR ROCK GLN,92026, 33.16991
Address: 1154 PACIFIC GROVE LOOP, 91915, 32.638659999999994
Match  : 1111 PACIFIC GROVE LOOP,91915, 32.63985
Address: 4996 MOUNT BIGELOW, 92111, 32.81758
Match  : 4711 MOUNT BIGELOW DRIVE,92111, 32.81509
Address: 893 GINGER AVE, 92011, 33.107620000000004
Match  : 1939 RINCON AVE,92026, 33.1638176
neg_mean_absolute_error (degree): 0.0506 ± 0.0885
