In [1]:
import real_estate_api as rea

from typing import Union, List
from types import ModuleType
from beartype import beartype
import warnings

import numpy as np
import pandas as pd
import sys
import importlib

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import keys as k
keys = k.getKeys()

import pgeocode

'''
### For pgeocode to work, I am not going to pretend to know what this does ###
# https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error
import certifi
import os
import ssl

openssl_dir, openssl_cafile = os.path.split(
        ssl.get_default_verify_paths().openssl_cafile
)

os.chdir(openssl_dir)
relpath_to_certifi_cafile = os.path.relpath(certifi.where())

try:
    os.remove(openssl_cafile)
except FileNotFoundError:
    pass

os.symlink(relpath_to_certifi_cafile, openssl_cafile)
###
'''

'\n### For pgeocode to work, I am not going to pretend to know what this does ###\n# https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error\nimport certifi\nimport os\nimport ssl\n\nopenssl_dir, openssl_cafile = os.path.split(\n        ssl.get_default_verify_paths().openssl_cafile\n)\n\nos.chdir(openssl_dir)\nrelpath_to_certifi_cafile = os.path.relpath(certifi.where())\n\ntry:\n    os.remove(openssl_cafile)\nexcept FileNotFoundError:\n    pass\n\nos.symlink(relpath_to_certifi_cafile, openssl_cafile)\n###\n'

In [2]:
user_home = rea.get_UserHome(keys['SampleHouse'].strip("\'"))
hoi = rea.get_HousesOfInterest(user_home, n=1000, listed_to_sold_ratio=0.3, verbose=True)
gd = rea.geo_data(hoi['geo'])

In [3]:
# Can this be the first step in the pipeline or no?
fg = rea.FeatureGenerator(
    houses = [rea.house(h) for h in hoi['houses']], 
    gd=gd,
    user_home=user_home
)

In [4]:
# Basic transformer to dataframe.
class ToDataFrame(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.DataFrame(X)

In [5]:
pipe = Pipeline([
    ('format_data', ToDataFrame())
])

output = pipe.fit_transform(fg.features, fg.targets)

In [13]:
to_data_frame = ToDataFrame()
X_t = to_data_frame.fit_transform(fg.features)
X_t

Unnamed: 0,Days_listed,Days_updated,baths_full,baths_3qtr,baths_half,baths_1qtr,year_built,lot_sqft,sqft,garage,stories,beds,type,tags,new_construction,distance_to_home
0,1,1,1,1,0,0,1942,7000,1992,1,1,4,single_family,"[community_outdoor_space, den_or_office, dinin...",False,0.087471
1,1,1,1,0,1,0,1988,1071,1300,0,1,2,single_family,"[city_view, community_outdoor_space, community...",False,0.101667
2,2,2,1,1,0,0,1913,3500,1750,0,1,3,single_family,"[community_security_features, den_or_office, d...",False,0.205025
3,2,2,1,1,1,0,1980,8100,3280,2,1,3,single_family,"[community_outdoor_space, dining_room, dishwas...",False,0.068888
4,2,2,1,1,1,0,1948,7680,2180,2,1,4,single_family,"[central_air, community_outdoor_space, dining_...",False,0.083503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,0,75,1,1,1,0,1906,4000,2260,1,1,3,single_family,"[city_view, community_outdoor_space, den_or_of...",False,0.091754
971,0,74,3,0,1,0,2006,3600,3340,2,1,5,single_family,"[central_air, community_outdoor_space, communi...",False,0.127658
972,0,73,1,1,0,0,1952,5800,2360,1,1,2,single_family,"[community_outdoor_space, community_security_f...",False,0.080205
973,0,73,1,1,0,0,1906,5700,2880,1,1,2,single_family,"[community_outdoor_space, dining_room, dishwas...",False,0.093157


In [29]:
X_t[X_t['lot_sqft'] > 5_000].sort_values('lot_sqft')

Unnamed: 0,Days_listed,Days_updated,baths_full,baths_3qtr,baths_half,baths_1qtr,year_built,lot_sqft,sqft,garage,stories,beds,type,tags,new_construction,distance_to_home
870,0,67,2,0,1,0,2004,5001,2320,2,1,4,single_family,"[community_outdoor_space, dishwasher, fireplac...",False,0.211181
225,66,6,3,0,1,0,1990,5001,3190,2,1,5,single_family,"[community_outdoor_space, den_or_office, dinin...",False,0.070469
927,0,0,4,1,0,0,2013,5002,3040,0,2,4,single_family,[],False,0.183211
327,0,10,1,1,0,0,1950,5002,2150,1,1,4,single_family,"[community_outdoor_space, dining_room, dishwas...",False,0.142171
559,0,32,2,0,0,0,1980,5005,1460,0,1,3,single_family,"[community_clubhouse, community_outdoor_space,...",False,0.145353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,53,5,0,1,0,0,1997,43560,560,0,1,2,single_family,"[city_view, community_outdoor_space, community...",False,0.102533
803,0,62,1,0,0,0,1920,43560,691,0,1,2,single_family,"[community_outdoor_space, forced_air, hardwood...",False,0.107248
85,28,5,0,1,1,0,1997,43560,1000,0,1,2,single_family,"[city_view, community_boat_facilities, communi...",False,0.101667
375,0,19,1,1,1,0,1995,43560,1954,2,1,2,single_family,"[central_air, community_security_features, dis...",False,0.117246


In [14]:
X_t.head()

Unnamed: 0,Days_listed,Days_updated,baths_full,baths_3qtr,baths_half,baths_1qtr,year_built,lot_sqft,sqft,garage,stories,beds,type,tags,new_construction,distance_to_home
0,1,1,1,1,0,0,1942,7000,1992,1,1,4,single_family,"[community_outdoor_space, den_or_office, dinin...",False,0.087471
1,1,1,1,0,1,0,1988,1071,1300,0,1,2,single_family,"[city_view, community_outdoor_space, community...",False,0.101667
2,2,2,1,1,0,0,1913,3500,1750,0,1,3,single_family,"[community_security_features, den_or_office, d...",False,0.205025
3,2,2,1,1,1,0,1980,8100,3280,2,1,3,single_family,"[community_outdoor_space, dining_room, dishwas...",False,0.068888
4,2,2,1,1,1,0,1948,7680,2180,2,1,4,single_family,"[central_air, community_outdoor_space, dining_...",False,0.083503


In [30]:
fg.houses[727].price

2700000

In [56]:
'''
Days Listed - Linear
Days Updated - Linear
*baths - Normalize
year_built - I want to bucketize these, then keep the dummies.
lot_sqft - Normalize
    Feature generation, multiply lot_sqft normalized and the inverse_distance
sqft - Normalize
garage / stories / beds - Normalize
Tags - Do what we did in the nlp homework
'''

from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

class DictEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.Series(map(lambda l : {k : 1 for k in l}, X))

minmax_cols = ['Days_listed', 'Days_listed']
normalize_cols = ['lot_sqft', 'sqft']
bucketize_cols = ['year_built']
dummy_cols = ['baths_full', 'baths_full', 'baths_full', 'baths_1qtr', 'garage', 'stories', 'beds']

preprocess_tags_cols = Pipeline(
    [
        ('dict_encode', DictEncoder()),
        ('dict_vectorize', DictVectorizer())
    ]
)

preprocess_data = ColumnTransformer(
    [
        ('scale', MinMaxScaler(), minmax_cols),
        ('normalize', StandardScaler(), normalize_cols),
        ('bucketize', KBinsDiscretizer(), bucketize_cols),
        ('dummy', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), dummy_cols),
        ('list', preprocess_list_cols, 'tags')
    ]
)

pd.DataFrame(preprocess_data.fit_transform(X_t))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,179,180,181,182,183,184,185,186,187,188
0,0.012048,0.012048,0.062294,-0.124753,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.012048,0.012048,-1.039531,-0.916821,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.024096,0.024096,-0.588134,-0.401748,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.024096,0.024096,0.266715,1.349501,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.024096,0.024096,0.188663,0.090433,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,0.000000,0.000000,-0.495215,0.182002,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
971,0.000000,0.000000,-0.569550,1.418177,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
972,0.000000,0.000000,-0.160710,0.296463,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
973,0.000000,0.000000,-0.179293,0.891658,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [48]:
X_t.tags[0]

['community_outdoor_space',
 'den_or_office',
 'dining_room',
 'dishwasher',
 'family_room',
 'fireplace',
 'forced_air',
 'hardwood_floors',
 'hill_or_mountain_view',
 'ocean_view',
 'recreation_facilities',
 'rv_or_boat_parking',
 'view',
 'washer_dryer',
 'water_view',
 'basement',
 'garage_1_or_more',
 'two_or_more_stories',
 'updated_kitchen',
 'media_room',
 'mount_rainier_view',
 'groundscare']