In [1]:
import uszipcode
from torch import inference_mode
from uszipcode import SearchEngine
import geopy
from geopy.geocoders import Nominatim
import pandas as pd
import os
import csv
from pathlib import Path
import datetime as dt

In [4]:
# Read the reduced sale-price sample, which includes city and state names
city_file = Path("./Resources/Sale_Prices_City_sample.csv")
city_data = pd.read_csv(filepath_or_buffer=city_file, low_memory=False)
city_data

Unnamed: 0,RegionID,RegionName,StateName,SizeRank,2008-03,2008-04,2008-05,2008-06,2008-07,2008-08,...,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02,2020-03
0,17426,Chicago,Illinois,4,325100.0,314800.0,286900.0,274600.0,268500.0,264400.0,...,271500.0,266500.0,264900.0,265000.0,264100.0,264300.0,270000.0,281400.0,302900.0,309200.0
1,18959,Las Vegas,Nevada,8,244400.0,240100.0,234600.0,227200.0,218100.0,206900.0,...,266000.0,268200.0,268400.0,270400.0,272200.0,274400.0,277500.0,278900.0,283000.0,284700.0
2,38128,Dallas,Texas,10,148400.0,142300.0,150900.0,157300.0,164100.0,163400.0,...,300600.0,301100.0,301500.0,310100.0,314600.0,316900.0,307100.0,321500.0,315500.0,321700.0
3,10920,Columbus,Ohio,19,116000.0,112100.0,114900.0,113800.0,111900.0,108200.0,...,152600.0,154800.0,158500.0,157700.0,162900.0,166600.0,172700.0,175300.0,173600.0,177600.0
4,12455,Louisville,Kentucky,20,124800.0,123500.0,122100.0,121900.0,121500.0,122100.0,...,172700.0,175900.0,176000.0,173900.0,175400.0,174600.0,176600.0,170800.0,173100.0,176200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,27950,Wildwood,New Jersey,3649,268300.0,256500.0,265200.0,289400.0,305100.0,294300.0,...,260100.0,260000.0,267200.0,264100.0,263300.0,266200.0,275300.0,274000.0,264500.0,270200.0
858,36173,Miramar Beach,Florida,3661,342200.0,362900.0,394800.0,434700.0,407400.0,364700.0,...,375700.0,384500.0,379300.0,374100.0,368200.0,370700.0,412500.0,408900.0,409000.0,379500.0
859,21021,Vail,Arizona,3681,280700.0,281200.0,267000.0,258700.0,256600.0,253200.0,...,277600.0,277800.0,283100.0,286700.0,297000.0,297200.0,292100.0,275100.0,265400.0,269300.0
860,25643,Longboat Key,Florida,3690,646800.0,613000.0,546400.0,602800.0,643800.0,696400.0,...,567400.0,554700.0,502000.0,495600.0,535300.0,557900.0,577100.0,557900.0,565600.0,579700.0


In [5]:
# Count how many rows share each city name, most frequent first, so that
# repeated names show up at the top of the table.
city_names_pivot = (
    city_data
    .pivot_table(columns=['RegionName'], aggfunc='size')
    .sort_values(ascending=False)
)
city_duplicates_df = city_names_pivot.reset_index()
city_duplicates_df.head(66)

Unnamed: 0,RegionName,0
0,Lexington,3
1,Westfield,3
2,Springfield,3
3,Franklin,3
4,Saint Cloud,2
...,...,...
61,Northglenn,1
62,Northport,1
63,Northville,1
64,Norwalk,1


## Create a search engine to pair cities with zip codes using the uszipcode library

In [6]:
# Pair every city/state row of city_data with a zip code from uszipcode.
#   zipcodes_dict : one entry per RegionID, holding the first zip code returned
#   bad_cities    : every location for which no zip code could be obtained
engine = SearchEngine()
zipcodes_dict = {"RegionID": [], "City": [], "State": [], "Zip Code": []}
bad_cities = {"RegionID": [], "City": [], "State": []}
matched_region_ids = set()  # constant-time "already matched?" check

# Walk the three columns in row order instead of mixing index labels with
# .iloc positions; numbering starts at 1 for the progress printout.
locations = zip(city_data['RegionID'], city_data['RegionName'], city_data['StateName'])
for row_number, (regionId, city, state) in enumerate(locations, start=1):
    if regionId in matched_region_ids:
        # Only the first zip code located for a RegionID is kept
        continue
    try:
        zipcodes = engine.by_city_and_state(city=city, state=state)
    except Exception:
        # A failed lookup is handled like an empty result below. Catching
        # Exception (rather than a bare except) still lets KeyboardInterrupt
        # stop a long-running loop.
        zipcodes = []
    first_match = next(iter(zipcodes), None)
    if first_match is None:
        # Record the location as bad whether the lookup raised or simply
        # returned nothing, so no city is dropped without being counted.
        bad_cities["RegionID"].append(regionId)
        bad_cities["City"].append(city)
        bad_cities["State"].append(state)
        print(f"--------{city}, {state} not found--------")
        continue
    matched_region_ids.add(regionId)
    zipcodes_dict["RegionID"].append(regionId)
    zipcodes_dict["City"].append(city)
    zipcodes_dict['State'].append(state)
    zipcodes_dict['Zip Code'].append(first_match.zipcode)
    print(row_number, first_match.zipcode, first_match.major_city, first_match.state)
    

Download /Users/aryan_sidh/.uszipcode/simple_db.sqlite from https://github.com/MacHu-GWU/uszipcode-project/releases/download/1.0.1.db/simple_db.sqlite ...
  1.00 MB downloaded ...
  2.00 MB downloaded ...
  3.00 MB downloaded ...
  4.00 MB downloaded ...
  5.00 MB downloaded ...
  6.00 MB downloaded ...
  7.00 MB downloaded ...
  8.00 MB downloaded ...
  9.00 MB downloaded ...
  10.00 MB downloaded ...
  11.00 MB downloaded ...
  Complete!
1 60601 Chicago IL
2 89101 Las Vegas NV
3 75201 Dallas TX
4 43085 Columbus OH
5 40202 Louisville KY
6 32801 Orlando FL
7 80202 Denver CO
8 20001 Washington DC
9 97201 Portland OR
10 37201 Nashville TN
11 68102 Omaha NE
12 73102 Oklahoma City OK
13 27601 Raleigh NC
14 80902 Colorado Springs CO
15 55401 Minneapolis MN
16 70801 Baton Rouge LA
17 80010 Aurora CO
18 92801 Anaheim CA
19 27395 Greensboro NC
20 40502 Lexington KY
21 55101 Saint Paul MN
22 34102 Naples FL
23 32301 Tallahassee FL
24 89002 Henderson NV
25 99201 Spokane WA
26 32501 Pensacola FL


In [7]:
# Tabulate the locations that could not be paired with a zip code, to see how
# many are being removed from the data source
bad_city_columns = ["RegionID", "City", "State"]
bad_cities_df = pd.DataFrame(data=bad_cities, columns=bad_city_columns)
bad_cities_df

Unnamed: 0,RegionID,City,State
0,38992,Highlands Ranch,Colorado
1,26561,Plymouth,Minnesota
2,17845,Eagan,Minnesota
3,10264,Bartlett,Tennessee
4,25383,Kentwood,Michigan
5,12751,Milwaukie,Oregon
6,6102,Murray,Utah
7,49352,Dunwoody,Georgia
8,51952,Evesham Township,New Jersey
9,26213,Northglenn,Colorado


In [8]:
# Build the table of located zip codes and keep only the first row for each
# zip code value
zipcodes_df = (
    pd.DataFrame(zipcodes_dict, columns=['RegionID', 'City', 'State', 'Zip Code'])
    .drop_duplicates(subset='Zip Code', keep='first')
)
zipcodes_df

Unnamed: 0,RegionID,City,State,Zip Code
0,17426,Chicago,Illinois,60601
1,18959,Las Vegas,Nevada,89101
2,38128,Dallas,Texas,75201
3,10920,Columbus,Ohio,43085
4,12455,Louisville,Kentucky,40202
...,...,...,...,...
817,27950,Wildwood,New Jersey,08260
818,36173,Miramar Beach,Florida,32550
819,21021,Vail,Arizona,85641
820,25643,Longboat Key,Florida,34228


In [10]:
# Write the zip code table (without the index column) to a temporary CSV that
# feeds the geocoding script
zipcodes_df.to_csv(path_or_buf='./Resources/zipcodes.csv', index=False)

In [11]:
# Count occurrences of each zip code, most frequent first, to spot duplicates.
# NOTE(review): if zipcodes_df is reloaded from the saved CSV instead, read the
# zip column as text so leading zeros are kept (e.g. 01013 otherwise shows as 1013):
# zipcodes_df = pd.read_csv('./Resources/zipcodes.csv', dtype={'Zip Code': str})
zipcodes_pivot = (
    zipcodes_df
    .pivot_table(columns=['Zip Code'], aggfunc='size')
    .sort_values(ascending=False)
)
zipcodes_dup_df = zipcodes_pivot.reset_index()
zipcodes_dup_df.head(10)

Unnamed: 0,Zip Code,0
0,1013,1
1,70433,1
2,66044,1
3,66061,1
4,66206,1
5,66215,1
6,68046,1
7,68102,1
8,70053,1
9,70062,1


In [28]:
# zipcodes_df = zipcodes_df.drop_duplicates(subset='Zip Code', keep='first')
# zipcodes_df