In [9]:
import json
import pandas as pd
import geojson

In [12]:
#load json files
with open("../static/data/redfin_map.json") as fp:
    rf = json.load(fp)
with open("../static/data/us_counties.geojson", encoding="ISO-8859-1") as fp:
    geo = geojson.load(fp)

[{'_id': {'$oid': '60907edf4751e68e9cb9af40'},
  'region_name': 'Iowa County, WI',
  'period_begin': '2020-12-28',
  'median_sale_price': '133000.0',
  'median_sale_ppsf': '89.28571428571429',
  'inventory': '47.0'},
 {'_id': {'$oid': '60907edf4751e68e9cb9af9b'},
  'region_name': 'Shoshone County, ID',
  'period_begin': '2020-12-28',
  'median_sale_price': '240000.0',
  'median_sale_ppsf': '98.07619766125987',
  'inventory': '21.0'},
 {'_id': {'$oid': '60907edf4751e68e9cb9b085'},
  'region_name': 'Washoe County, NV',
  'period_begin': '2020-12-28',
  'median_sale_price': '403997.5',
  'median_sale_ppsf': '242.17854384812048',
  'inventory': '959.0'},
 {'_id': {'$oid': '60907edf4751e68e9cb9b1ae'},
  'region_name': 'Washtenaw County, MI',
  'period_begin': '2020-12-28',
  'median_sale_price': '285090.0',
  'median_sale_ppsf': '163.96946564885496',
  'inventory': '767.0'},
 {'_id': {'$oid': '60907ee04751e68e9cb9b244'},
  'region_name': 'Indiana County, PA',
  'period_begin': '2020-12-28',

### explore county names in the redfin data
I initially tried separating by a space " " but realised that meant the function broke up county names that were multiple words. So I decided to separate via the comma and replacing the words "County" and "Borough" with an empty string to delete them.

In [87]:
region_in_rf = [county["region_name"].split(sep = ",")[0] for county in rf]
region_in_rf = [county.replace(" Parish", "") for county in region_in_rf]
region_in_rf = [county.replace(" Borough", "") for county in region_in_rf]
region_in_rf = [county.replace(" County", "") for county in region_in_rf]

print(f"number of entries: {len(region_in_rf)}")
print(f"number of unique entries: {len(pd.Series(region_in_rf).unique())}")

3604
874


### explore county names in the geojson data
I do a similar exercise on the geojson data to see if county is a good key to try and join on

In [96]:
#see the structure of each feature 
print(json.dumps(geo.features[0], indent=4))

{
    "type": "Feature",
    "geometry": {
        "type": "Polygon",
        "coordinates": [
            [
                [
                    -85.388717,
                    33.913044
                ],
                [
                    -85.380885,
                    33.873508
                ],
                [
                    -85.379455,
                    33.866291
                ],
                [
                    -85.377426,
                    33.856047
                ],
                [
                    -85.376403,
                    33.850656
                ],
                [
                    -85.364595,
                    33.788446
                ],
                [
                    -85.361844,
                    33.773951
                ],
                [
                    -85.360491,
                    33.767958
                ],
                [
                    -85.357402,
                    33.750104
                ],


In [97]:
#see the structure of a feature's property key
print(json.dumps(geo.features[0].properties, indent=4))

{
    "GEO_ID": "0500000US01029",
    "STATE": "01",
    "COUNTY": "029",
    "NAME": "Cleburne",
    "LSAD": "County",
    "CENSUSAREA": 560.1
}


In [100]:
county_in_geo = [county["properties"]["NAME"] for county in geo.features]
print(len(county_in_geo))
print(len(pd.Series(county_in_geo).unique()))
county_in_geo

3221
1909


['Cleburne',
 'Coffee',
 'Coosa',
 'Covington',
 'Crenshaw',
 'Dale',
 'DeKalb',
 'Escambia',
 'Fayette',
 'Geneva',
 'Henry',
 'Lamar',
 'Lauderdale',
 'Lawrence',
 'Autauga',
 'Baldwin',
 'Barbour',
 'Bibb',
 'Blount',
 'Bullock',
 'Choctaw',
 'Clarke',
 'Clay',
 'Colbert',
 'Conecuh',
 'Cullman',
 'Dallas',
 'Elmore',
 'Etowah',
 'Franklin',
 'Greene',
 'Hale',
 'Jackson',
 'Jefferson',
 'Lowndes',
 'Macon',
 'Madison',
 'Marengo',
 'Marshall',
 'Mobile',
 'Monroe',
 'Pickens',
 'Pike',
 'Russell',
 'St. Clair',
 'Shelby',
 'Sumter',
 'Talladega',
 'Washington',
 'Wilcox',
 'Butler',
 'Calhoun',
 'Chambers',
 'Cherokee',
 'Chilton',
 'Tallapoosa',
 'Tuscaloosa',
 'Walker',
 'Winston',
 'Montgomery',
 'Houston',
 'Lee',
 'Limestone',
 'Marion',
 'Morgan',
 'Perry',
 'Randolph',
 'Aleutians East',
 'Aleutians West',
 'Anchorage',
 'Bethel',
 'Bristol Bay',
 'Denali',
 'Dillingham',
 'Fairbanks North Star',
 'Hoonah-Angoon',
 'Juneau',
 'Kenai Peninsula',
 'Ketchikan Gateway',
 'Lake a

In [89]:
len(geo.features)

3221

In [110]:
checker = []
for i in county_in_geo:
    if i in region_in_rf:
        checker.append(1)
        print(f"FOUND: {i}")
    else:
        checker.append(0)
        print(f"NOT FOUND: {i}")    

FOUND: Cleburne
FOUND: Coffee
FOUND: Coosa
NOT FOUND: Covington
NOT FOUND: Crenshaw
NOT FOUND: Dale
FOUND: DeKalb
NOT FOUND: Escambia
FOUND: Fayette
NOT FOUND: Geneva
FOUND: Henry
FOUND: Lamar
FOUND: Lauderdale
FOUND: Lawrence
NOT FOUND: Autauga
NOT FOUND: Baldwin
NOT FOUND: Barbour
FOUND: Bibb
FOUND: Blount
NOT FOUND: Bullock
NOT FOUND: Choctaw
FOUND: Clarke
FOUND: Clay
NOT FOUND: Colbert
NOT FOUND: Conecuh
NOT FOUND: Cullman
FOUND: Dallas
FOUND: Elmore
FOUND: Etowah
FOUND: Franklin
FOUND: Greene
NOT FOUND: Hale
FOUND: Jackson
FOUND: Jefferson
NOT FOUND: Lowndes
FOUND: Macon
FOUND: Madison
NOT FOUND: Marengo
FOUND: Marshall
NOT FOUND: Mobile
FOUND: Monroe
FOUND: Pickens
FOUND: Pike
NOT FOUND: Russell
FOUND: St. Clair
FOUND: Shelby
FOUND: Sumter
FOUND: Talladega
FOUND: Washington
NOT FOUND: Wilcox
FOUND: Butler
FOUND: Calhoun
FOUND: Chambers
FOUND: Cherokee
NOT FOUND: Chilton
NOT FOUND: Tallapoosa
NOT FOUND: Tuscaloosa
FOUND: Walker
NOT FOUND: Winston
FOUND: Montgomery
FOUND: Houston
F

FOUND: Washington
FOUND: Wayne
NOT FOUND: Webster
FOUND: Wheeler
FOUND: York
NOT FOUND: Perkins
FOUND: Phelps
FOUND: Pierce
FOUND: Platte
FOUND: Polk
NOT FOUND: Red Willow
NOT FOUND: Richardson
FOUND: Rock
FOUND: Sarpy
FOUND: Saunders
NOT FOUND: Scotts Bluff
NOT FOUND: Seward
NOT FOUND: Sheridan
FOUND: Churchill
FOUND: Clark
FOUND: Douglas
NOT FOUND: Elko
NOT FOUND: Esmeralda
NOT FOUND: Eureka
FOUND: Humboldt
NOT FOUND: Lander
FOUND: Lincoln
FOUND: Lyon
FOUND: Washoe
NOT FOUND: White Pine
FOUND: Mineral
FOUND: Nye
FOUND: Pershing
FOUND: Storey
FOUND: Carson City
FOUND: Belknap
FOUND: Carroll
FOUND: Cheshire
FOUND: Coos
FOUND: Grafton
FOUND: Hillsborough
FOUND: Merrimack
FOUND: Sullivan
FOUND: Strafford
FOUND: Rockingham
FOUND: Atlantic
FOUND: Bergen
FOUND: Burlington
FOUND: Camden
FOUND: Cape May
FOUND: Cumberland
FOUND: Essex
FOUND: Gloucester
FOUND: Hudson
FOUND: Hunterdon
FOUND: Mercer
FOUND: Middlesex
FOUND: Monmouth
FOUND: Morris
FOUND: Ocean
FOUND: Passaic
FOUND: Salem
FOUND: Som

In [108]:
print(f"For the {len(checker)} counties in the geojson file, {len(checker)-sum(checker)} are missing in the redfin data")

For the 3221 counties in the geojson file, 1230 are missing in the redfin data


In [None]:
# #establish MongoDB connection
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.housing_db

# #load in totals
db.census_housing_age.drop()
db.census_housing_demo.drop()
db.census_housing_demo.find()

db.census_housing_demo.insert_one(age_output_dictionary)
db.census_housing_demo.insert_one(race_output_dictionary)

