## Collecting Zip codes and mapping them to Lat and Lon

The zillow city dataset does not follow the same naming convention as the Census data. About 30% of the zillow data does not map on the city names of the census data. That being said, we have the lat. and long. for each city in the census data and zillow offers a very comprehensive dataset for housing prices based on zipcodes.

The scope of this notebook is to map Latitude and Longidute to each zipcode in the zillow dataset. We are using a geo zip code data with 43000 zip codes to map to the zillow data. There are about 220 zipcodes in the zillow dataset that do not have a match in the geo zip code dataset. For these remaining zipcodes we will be pulling data from the smartystreets.com api.

In [1]:
import pandas as pd

In [2]:
import os

path = 'data/housing/bedroomzip/'
files = os.listdir(path)
names = [i.split('.csv')[0] for i in files]
names

['Zip_Zhvi_1bedroom',
 'Zip_Zhvi_2bedroom',
 'Zip_Zhvi_3bedroom',
 'Zip_Zhvi_4bedroom',
 'Zip_Zhvi_5Bedroom']

In [3]:
for i in files:
    name = i.split('.csv')[0]
    vars()[name] = pd.read_csv(path + i, encoding='latin')
    print(vars()[name].shape)

(13607, 150)
(21864, 150)
(24076, 150)
(24071, 150)
(20184, 150)


In [5]:
Zip_Zhvi_3bedroom.head()

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,2008-01,2008-02,2008-03,...,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,,,,...,2071432.0,2056974.0,2042780.0,2035366.0,2034423.0,2025748.0,2005972.0,1973243.0,1958949.0,1948357.0
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,,,,...,589483.7,589788.7,591790.7,592776.3,591098.3,590112.7,587178.0,584723.3,582316.7,581551.0
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,,...,3483464.0,3445784.0,3422120.0,3384821.0,3368683.0,3347543.0,3338804.0,3334868.0,3346086.0,3355841.0
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,197135.0,196620.0,196120.333333,...,254559.0,254905.7,254079.7,253359.0,252800.3,253008.3,253191.3,254121.7,254626.7,255515.0
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,,,,...,694226.3,694793.0,698118.0,699796.3,698815.7,696359.0,693426.7,690577.0,689144.3,687817.7


In [8]:
Zip_Zhvi_3bedroom['2008-01'].mean()

205668.97360857067

In [9]:
df = pd.read_csv('data/housing/us-zip-code-latitude-and-longitude.csv', sep=';')

In [14]:
needed = []
num = 0
for i in names:
    out = vars()[i]
    for a in out.RegionName.values:
        if a not in check and a not in needed:
            num += 1
            needed.append(a)
len(needed)

214

In [16]:
import requests

get = requests.get(f"https://us-zipcode.api.smartystreets.com/lookup?auth-id=ffe8db69-6b00-f03d-b5fe-3d39699890ce&auth-token=PQ6B8aOVVhpKgsTcSomX&zipcode=6461")

In [24]:
format_it = get.json()[0]['zipcodes'][0]
lat = format_it['latitude']
lng = format_it['longitude']

In [33]:
def get_data(zipc):
    try:
        get = requests.get(f"https://us-zipcode.api.smartystreets.com/lookup?auth-id=ffe8db69-6b00-f03d-b5fe-3d39699890ce&auth-token=PQ6B8aOVVhpKgsTcSomX&zipcode={str(zipc)}")
        format_it = get.json()[0]['zipcodes'][0]
        lat = format_it['latitude']
        lng = format_it['longitude']
        res = (zipc, lat, lng)
    except:
        res = (zipc, False, False)
    return(res)

In [34]:
it = get_data(39826)

In [35]:
it[1]

31.82074

In [36]:
import time
import concurrent.futures

In [38]:
t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    export_data = executor.map(get_data, needed)


t2 = time.perf_counter()
print(f'Finished in {t2-t1} seconds')

Finished in 1.766184100000146 seconds


In [39]:
loop_data = [i for i in export_data]

In [48]:
remaining_zips = {}
for i in loop_data:
    remaining_zips[int(i[0])] = {'lat':i[1], 'lng':i[2]}

In [43]:
remaining_zips[85142]

{'lat': 33.23384, 'lng': -111.64473}

In [65]:
main_dic = {}
for i in df[['Zip','Latitude','Longitude']].values:
    main_dic[i[0]] = {'lat':i[1], 'lng':i[2]}

In [66]:
lat_place = []
long_place = []
for i in Zip_Zhvi_1bedroom.RegionName.values:
    try:
        lat_place.append(main_dic[i]['lat'])
        long_place.append(main_dic[i]['lng'])
    except:
        lat_place.append(remaining_zips[i]['lat'])
        long_place.append(remaining_zips[i]['lng'])     
    

In [67]:
len(lat_place)

13607

In [68]:
len(Zip_Zhvi_1bedroom)

13607

In [70]:
for a in names:
    lat_place = []
    long_place = []
    for i in vars()[a].RegionName.values:
        try:
            lat_place.append(main_dic[i]['lat'])
            long_place.append(main_dic[i]['lng'])
        except:
            lat_place.append(remaining_zips[i]['lat'])
            long_place.append(remaining_zips[i]['lng']) 
    vars()[a]['lat'] = lat_place
    vars()[a]['lng'] = long_place

In [71]:
names

['Zip_Zhvi_1bedroom',
 'Zip_Zhvi_2bedroom',
 'Zip_Zhvi_3bedroom',
 'Zip_Zhvi_4bedroom',
 'Zip_Zhvi_5Bedroom']

In [72]:
Zip_Zhvi_4bedroom.head()

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,2008-01,2008-02,2008-03,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,lat,lng
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,,,,...,2998417.0,3008020.0,3018742.0,3001803.0,2983767.0,2962636.0,2980488.0,3002555.0,40.798502,-73.96811
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,,,,...,1010288.0,1012454.0,1010338.0,1008764.0,1006188.0,1006068.0,1005388.0,1005377.0,41.940832,-87.65852
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,,...,6367316.0,6409420.0,6425129.0,6446475.0,6487110.0,6514059.0,6543494.0,6524043.0,40.776099,-73.98285
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,267042.0,266167.5,265567.333333,...,331012.0,330225.0,329570.3,329779.0,329879.7,330930.0,331460.3,331845.7,29.760833,-95.81104
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,,,,...,1253091.0,1256735.0,1252838.0,1248100.0,1242790.0,1239398.0,1237450.0,1234638.0,41.922682,-87.65432


In [78]:
new_path = 'data/housing/bedroomziplatlng/'
for i in names:
    vars()[i].to_csv(new_path + i + '_zip'+ '.csv')

In [79]:
os.listdir(new_path)

['Zip_Zhvi_1bedroom_zip.csv',
 'Zip_Zhvi_2bedroom_zip.csv',
 'Zip_Zhvi_3bedroom_zip.csv',
 'Zip_Zhvi_4bedroom_zip.csv',
 'Zip_Zhvi_5Bedroom_zip.csv']

In [80]:
checkdf = pd.read_csv(new_path + names[0] + '_zip' + '.csv')

In [81]:
checkdf.head()

Unnamed: 0.1,Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,2008-01,2008-02,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,lat,lng
0,0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,,,...,724675.0,723230.333333,720543.0,716854.333333,714189.0,710429.666667,709241.333333,707854.666667,40.798502,-73.96811
1,1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,,,...,217307.666667,217699.333333,217167.0,216393.333333,214804.666667,213714.666667,212630.666667,212261.666667,41.940832,-87.65852
2,2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,,,...,826669.333333,825952.333333,822924.0,821563.666667,818487.666667,811886.666667,806982.333333,803770.333333,40.776099,-73.98285
3,3,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,4,,,...,256729.666667,255682.333333,254573.333333,253914.666667,253288.0,252864.666667,252103.666667,251404.666667,41.922682,-87.65432
4,4,91940,77449,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,5,120884.0,120536.5,...,147189.333333,147436.333333,147741.666667,147999.666667,148357.0,148418.666667,148552.666667,148807.0,29.825908,-95.7301
