In [1]:
# Dependencies
import pandas as pd
import pymongo
import time
import datetime as dt
import numpy as np

In [2]:
# Connect to the local MongoDB server and read the raw Redfin documents
# from `housing_db.redfin` into a DataFrame (capped at 100,000 documents).
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client["housing_db"]

redfin_cursor = db["redfin"].find().limit(100000)
df = pd.DataFrame(list(redfin_cursor))

In [3]:
# Columns needed for the cleaning step; `region_type` is only used for filtering.
sel_cols = [
    'period_begin',
    'region_name',
    'region_type',
    'total_homes_sold',
    'median_sale_price',
]

# Keep county-level rows only, then discard the now-constant `region_type` column.
is_county = df["region_type"] == "county"
df_reduced = df.loc[is_county, sel_cols].drop(columns=["region_type"])
df_reduced

Unnamed: 0,period_begin,region_name,total_homes_sold,median_sale_price
0,2019-09-09,"Haywood County, NC",34.0,240000.0
1,2018-11-05,"Eaton County, MI",19.0,187500.0
2,2020-07-20,"Greenville County, SC",207.0,255000.0
3,2019-02-04,"Dane County, WI",65.0,274000.0
4,2017-05-01,"Aroostook County, ME",8.0,58700.0
...,...,...,...,...
99991,2019-08-12,"Murray County, MN",1.0,124900.0
99992,2017-09-18,"Green Lake County, WI",8.0,160500.0
99994,2017-11-13,"Orange County, VT",7.0,140000.0
99995,2019-07-29,"Pennington County, MN",2.0,238000.0


In [4]:
# Derive calendar fields from the period start date.
# `infer_datetime_format` is deprecated as of pandas 2.0 (format inference is
# the default behaviour), so it is no longer passed.
begin_dates = pd.to_datetime(df_reduced["period_begin"])
df_reduced["year"] = begin_dates.dt.strftime("%Y")
df_reduced["year_month"] = begin_dates.dt.strftime("%Y-%m")


def _state_from_region(region):
    """Extract the state code from a region name such as "Dane County, WI".

    The name is split on commas; when there are exactly two parts, the two
    characters following the leading space of the second part are returned.

    Args:
        region: A region name. Non-string values (e.g. NaN) are tolerated.

    Returns:
        The two-character state code, or "" when the value is not a string or
        does not split into exactly two comma-separated parts.
    """
    if not isinstance(region, str):
        return ""
    parts = region.split(",")
    if len(parts) != 2:
        return ""
    # parts[1] looks like " WI": skip the space at index 0, take the next two characters.
    return parts[1][1:3]


states = [_state_from_region(region) for region in df_reduced["region_name"]]
df_reduced["state"] = states

In [5]:
# Final column set used by the aggregations below.
final_cols = ['year', 'state', 'total_homes_sold', 'median_sale_price']

df_redfinclean = df_reduced.loc[:, final_cols]
df_redfinclean.head()

Unnamed: 0,year,state,total_homes_sold,median_sale_price
0,2019,NC,34.0,240000.0
1,2018,MI,19.0,187500.0
2,2020,SC,207.0,255000.0
3,2019,WI,65.0,274000.0
4,2017,ME,8.0,58700.0


In [6]:
# State-by-state data: one row per (state, year).
# Homes sold are summed; `median_sale_price` is the mean of the county-level
# median prices in each group (not a true state-wide median). Rounded to 2 decimals.
state_year_groups = df_redfinclean.groupby(['state', 'year'])
redfinstatesum = state_year_groups.agg(
    total_homes_sold=('total_homes_sold', 'sum'),
    median_sale_price=('median_sale_price', 'mean'),
).round(2)
redfinstatesum.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_homes_sold,median_sale_price
state,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2017,2138.0,257185.4
AK,2018,1615.0,260942.04
AK,2019,2446.0,273478.86
AK,2020,2047.0,294148.14
AK,2021,553.0,293342.76


In [7]:
# US data: the same aggregation as the per-state table, grouped by year only.
# No explicit dropna is needed: groupby `sum` and `mean` skip NaN values by default.
yearly_aggregations = {'total_homes_sold': "sum", 'median_sale_price': "mean"}
us_data = df_redfinclean.groupby(["year"]).agg(yearly_aggregations)
us_data

Unnamed: 0_level_0,total_homes_sold,median_sale_price
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2017,1027111.0,205786.710458
2018,1032103.0,220678.501349
2019,1036457.0,231978.941045
2020,1082858.0,254373.957034
2021,288175.0,272346.0411


In [8]:
# Assemble the export document:
#   "geographies": "US" followed by the sorted state codes
#   "years":       sorted year labels present in the cleaned data
#   "data":        per-geography lists of yearly totals and mean prices
state_list = sorted(df_redfinclean.state.unique().tolist())
years = sorted(df_redfinclean.year.unique().tolist())

data_dict = {}
for state in state_list:
    # Use a dedicated name here: reusing `df` would overwrite the raw frame
    # loaded from MongoDB and break re-running the earlier cells.
    state_summary = redfinstatesum.loc[state]
    # NOTE(review): each list holds only the years present for this state, so
    # it can be shorter than `years` -- confirm consumers handle that.
    data_dict[state] = {
        "total_homes_sold": state_summary["total_homes_sold"].tolist(),
        "median_sale_price": state_summary["median_sale_price"].tolist(),
    }

data_dict["US"] = {
    "total_homes_sold": us_data["total_homes_sold"].tolist(),
    "median_sale_price": us_data["median_sale_price"].tolist(),
}

# list.insert() mutates in place and returns None, so assigning its result
# stored None under "geographies". Build a new list instead.
geography_list = ["US"] + state_list

output_dictionary = {
    "geographies": geography_list,
    "years": years,
    "data": data_dict,
}

In [9]:
# load data
# Replace any previous export: drop the `redfinclean` collection, then insert
# the freshly built document. The former bare `find()` call was removed: it
# only built a cursor that was never iterated, so it had no effect.
# Note: insert_one adds an `_id` key to `output_dictionary` in place.
db.redfinclean.drop()
db.redfinclean.insert_one(output_dictionary)

<pymongo.results.InsertOneResult at 0x7f8c524b9b08>