In [1]:
import pandas as pd
import numpy as np
import os
import folium
import map_functions

In [2]:
# Project root. Defaults to the original author's checkout; set the
# REALESTATE_ROOT environment variable to run the notebook from another
# machine or location without editing this cell.
ROOT = os.environ.get("REALESTATE_ROOT", "C:/Users/Beau/Documents/GitHub/RealEstate")

# Data directories, all derived from ROOT.
RAW = os.path.join(ROOT, "data", "raw")
# Folder of Redfin CSV exports read by the (disabled) combine step.
RAW_REDFIN = os.path.join(RAW, "Redfin_multifamily_sold_in_2022")
# Where the combined, pickled frame is stored and loaded from.
SERIALIZED = os.path.join(ROOT, "data", "serialized")
# Where the cleaned frame is written at the end of the notebook.
CLEANED = os.path.join(ROOT, "data", "processed")

In [3]:
# combine then serialize data
#
# Disabled step: reads every CSV in RAW_REDFIN, stacks them into one frame,
# resets the index and pickles the result into SERIALIZED. The next cell loads
# that pickle, so this only needs to be re-enabled to rebuild it from the CSVs.
#
# NOTE(review): DataFrame.append was removed in pandas 2.0. If this is
# re-enabled on a current pandas, collect the frames in a list and combine them
# once with pd.concat(frames) instead of appending inside the loop.
# NOTE(review): os.listdir returns names in arbitrary order — confirm the
# combined rows end up in the order later cells expect.
# df = pd.DataFrame()
# for filename in os.listdir(RAW_REDFIN):
#     full_path = os.path.join(RAW_REDFIN, filename)
#     df = df.append(pd.read_csv(full_path))

# df.reset_index(drop=True, inplace=True)
# df.to_pickle(os.path.join(SERIALIZED, "Redfin_multifamily_sold_in_2022.pkl"))

In [4]:
# Load the combined Redfin listings from the pickle stored under SERIALIZED.
serialized_listings_path = os.path.join(SERIALIZED, "Redfin_multifamily_sold_in_2022.pkl")
df = pd.read_pickle(serialized_listings_path)

In [5]:
# Show (rows, columns) of the loaded frame as a quick sanity check.
df.shape

(2533, 27)

In [6]:
# format date features

# Listings are sorted by date, so a missing sold date is estimated by carrying
# the previous row's date forward. ffill() is the vectorized form of that rule.
# Unlike a row loop that tracks a running "previous date" variable, it cannot
# fail on (or silently reuse a stale value for) a missing date in the very
# first row: such leading gaps simply stay missing and become NaT once parsed.
df["SOLD DATE"] = pd.to_datetime(df["SOLD DATE"].ffill())

# separate into year, month, day
df["SOLD_YEAR"] = df["SOLD DATE"].dt.year
df["SOLD_MONTH"] = df["SOLD DATE"].dt.month_name()  # full month name, e.g. "January"
df["SOLD_DAY"] = df["SOLD DATE"].dt.day

In [7]:
# filter irrelevant rows
# Each condition is built as a named boolean mask and all of them are applied
# in a single selection.
multifamily_types = ["Multi-Family (2-4 Unit)", "Multi-Family (5+ Unit)"]

in_chicago = df["CITY"] == "Chicago"
is_multifamily = df["PROPERTY TYPE"].isin(multifamily_types)
below_price_cap = df["PRICE"] < 50000000  # rows priced at 50,000,000 or more are excluded
has_bed_and_bath_counts = df["BEDS"].notnull() & df["BATHS"].notnull()

df = df.loc[in_chicago & is_multifamily & below_price_cap & has_bed_and_bath_counts]

In [8]:
# drop unnecessary columns
unneeded_columns = [
    "SALE TYPE",
    "CITY",
    "STATE OR PROVINCE",
    "NEXT OPEN HOUSE START TIME",
    "NEXT OPEN HOUSE END TIME",
    "STATUS",
    "SOURCE",
    "FAVORITE",
    "INTERESTED",
    "MLS#",
    "DAYS ON MARKET",  # all missing
    "HOA/MONTH",  # almost always missing
    "$/SQUARE FEET",  # not needed; already have price and sqft
]

# Remove the listed columns from df in place.
df.drop(columns=unneeded_columns, inplace=True)

In [9]:
# Create a Map instance for Chicago
chicago_center = [41.8781, -87.6298]
m = folium.Map(location=chicago_center, zoom_start=10)

# Add one marker per listing, passing its coordinates and price to the
# project's map helper.
for latitude, longitude, price in zip(df["LATITUDE"], df["LONGITUDE"], df["PRICE"]):
    map_functions.add_map_marker(m, latitude, longitude, price)
    # map_functions.add_map_circle(latitude, longitude)

# Change the map style
# m.add_tile_layer(tiles='Stamen Toner', name='Stamen Toner')

# Display the map
m

In [10]:
# Replace the existing index with a fresh 0..n-1 one (the old index is
# discarded, not kept as a column), then pickle the cleaned frame into CLEANED.
df.reset_index(drop=True, inplace=True)

cleaned_listings_path = os.path.join(CLEANED, "Redfin_multifamily_sold_in_2022.pkl")
df.to_pickle(cleaned_listings_path)