In [1]:
import real_estate_api as rea

import os
import pickle
import random
import numpy as np
import pandas as pd

from collections import Counter
from itertools import chain

from sklearn.base import TransformerMixin, BaseEstimator, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_absolute_error

import keys as k
keys = k.getKeys()

In [2]:
user_home = rea.get_UserHome(keys['SampleHouse'].strip("\'"))
user_home_details = rea.get_PropertyDetail(user_home['property_id'])
hoi = rea.get_HousesOfInterest(user_home, n=2000, listed_to_sold_ratio=0.3, verbose=True)
gd = rea.geo_data(hoi['geo'])

Shortfall in listed houses detected, appending 114 of current listing to results.


In [3]:
# Can this be the first step in the pipeline or no?
fg = rea.FeatureGenerator(
    houses = hoi['houses'], 
    gd=gd,
    user_home=user_home_details
)

In [4]:
# This will randomly take 20% of the data as a validation set.
test_indicies = random.sample(range(0, len(fg.targets)), int(0.2 * len(fg.targets)))
train_indicies = set(range(0, len(fg.targets))).difference(set(test_indicies))

target_transformer = StandardScaler()
train_targets = target_transformer.fit_transform(np.array(list(map(fg.targets.__getitem__, train_indicies))).reshape(-1,1))
test_targets = target_transformer.transform(np.array(list(map(fg.targets.__getitem__, test_indicies))).reshape(-1,1))

train_features = list(map(fg.features.__getitem__, train_indicies))
test_features = list(map(fg.features.__getitem__, test_indicies))

In [5]:
# This is the whole data set
target_transformer = StandardScaler()
train_targets = target_transformer.fit_transform(np.array(fg.targets).reshape(-1,1))
train_features = fg.features

In [6]:
'''
Days Listed - Linear
Days Updated - Linear
*baths - Normalize
year_built - I want to bucketize these, then keep the dummies.
lot_sqft - Normalize
    Feature generation, multiply lot_sqft normalized and the inverse_distance
        More questions, can you do this?
sqft - Normalize
garage / stories / beds - Normalize
Tags - Do what we did in the nlp homework
'''

class ToDataFrame(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.DataFrame(X).drop_duplicates(subset=['Property_ID'])

class DictEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tags_to_keep = None

    def fit(self, X, y=None):
        tag_frequency = Counter(chain(*X))
        self.tags_to_keep = [k for k, v in tag_frequency.items() if v > 1]
        return self
    
    def transform(self, X):
        return pd.Series(map(lambda l : {k : 1 for k in l if l in self.tags_to_keep}, X))

#minmax_cols = ['distance_to_home'] # 'Days_listed', 'Days_listed', 
normalize_cols = ['lot_sqft', 'sqft']
bucketize_cols = ['year_built', 'distance_to_home']
lat_long_cols = ['lat', 'long']
dummy_cols = ['baths_full', 'baths_3qtr', 'baths_half', 'baths_1qtr', 'garage', 'stories', 'beds']

preprocess_min_max_cols = Pipeline(
    [
        ('impute', SimpleImputer(missing_values=np.nan, strategy="mean")),
        ('min_max_scale', MinMaxScaler())
    ]
)

preprocess_tags_col = Pipeline(
    [
        ('dict_encode', DictEncoder()),
        ('dict_vectorize', DictVectorizer())
    ]
)

preprocess_bucketize_col = Pipeline(
    [
        ('impute', SimpleImputer(missing_values=np.nan, strategy="median")),
        ('bucketize', KBinsDiscretizer(n_bins=20, strategy='uniform'))
    ]
)

preprocess_data = ColumnTransformer(
    [
        #('scale', preprocess_min_max_cols, minmax_cols),
        ('normalize', StandardScaler(), normalize_cols),
        ('bucketize', preprocess_bucketize_col, bucketize_cols),
        ('lat_long', preprocess_bucketize_col, lat_long_cols),
        ('dummy', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), dummy_cols),
        #('list', preprocess_tags_col, 'tags')
    ]
)

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasRegressor

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

class KerasModel(BaseEstimator, RegressorMixin):
    def __init__(self, 
                 user_home, 
                 load_model_if_available : bool = True, 
                 update_model : bool = False, 
                 save_model : bool = False):
        # Interesting note, this instance is created before paramters are passed into the step of the pipeline.
        self.user_home = user_home
        self.load_model_if_available = load_model_if_available
        self.update_model = update_model
        self.save_model = save_model

        self.earlystopping = EarlyStopping(patience=5, verbose=1, min_delta=0.05)

        if update_model and not self.load_model_if_available:
            raise Exception('Can not update a model not loaded.')

        self.model_name = f'{user_home.get("city")}_{user_home.get("state_code")}'

    def _keras_model(self, n_cols):
        km = Sequential()
        km.add(Dense(256, input_shape=(n_cols,), activation='relu', kernel_initializer='normal', name='dense_1'))
        km.add(Dense(128, activation='relu', kernel_initializer='normal', name='dense_2'))
        km.add(Dropout(0.20, name='dropout'))
        km.add(Dense(64, activation='relu', kernel_initializer='normal', name='dense_3'))
        km.add(Dense(1, activation='linear', kernel_initializer='normal', name='output'))
        km.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error', 'mean_absolute_error'])
        return KerasRegressor(model=km)

    def _generate_model(self, X, y):
            if not self.update_model: 
                self.model = self._keras_model(n_cols=X.shape[1])
            self.model.fit(X, y, epochs=100, batch_size = 50, callbacks = self.earlystopping)

    def fit(self, X, y=None):
        model_file_path = f'Saved Results/KerasModel/{self.model_name}.pkl'

        if self.load_model_if_available:
            if os.path.isfile(model_file_path):
                with open(model_file_path, 'rb') as f:
                    self.model = pickle.load(f)
                if self.update_model:
                    self._generate_model(X, y)
            else:
                print(f'No model found, generating {self.model_name}.')
                self._generate_model(X, y)
        else:
            self._generate_model(X, y)

        if self.update_model or self.save_model:
            with open(model_file_path, 'wb') as f:
                pickle.dump(self.model, f)
        
        return self

    def predict(self, X, tt = target_transformer):
        return tt.inverse_transform(self.model.predict(X))

keras_pipeline = Pipeline(
    [
        ('to_data_frame', ToDataFrame()),
        ('preprocess', preprocess_data),
        ('keras_model', KerasModel(user_home))
    ]
)

keras_pipeline.set_params(**{
    'keras_model__load_model_if_available' : True,
    'keras_model__update_model' : False,
    'keras_model__save_model' : False
})

keras_pipeline.fit(train_features, train_targets)

2023-02-27 16:33:04.129494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-27 16:33:11.184224: W tensorflow/core/util/tensor_slice_reader.cc:96] Could not open ram://b8d86bb6c748410b88301df5115c5654: INVALID_ARGUMENT: ram://b8d86bb6c748410b88301df5115c5654 is a directory.


In [8]:
'''
It is unnessacry to initially transform the test_targets and I inverse the predition 
within the predict method in KerasModel()
'''
preds = keras_pipeline.predict(test_features)
mean_absolute_error(preds, target_transformer.inverse_transform(test_targets))



34898.64951419195

In [8]:
user_pred = keras_pipeline.predict(fg.user_features)
user_pred[0][0], fg.user_target



(955466.5, 972500)

In [10]:
fg.user_home_formatted.address

'3416 19th Ave S'

In [18]:
xx = fg.user_features.copy()
xx['baths_half'] += 1
new_worth = keras_pipeline.predict(xx)
new_worth



array([[981661.1]], dtype=float32)

In [69]:
fg.user_features

{'Property_ID': 2949757771,
 'Status': 'sold',
 'Days_listed': 0,
 'Days_updated': 0,
 'baths_full': 2,
 'baths_3qtr': 0,
 'baths_half': 0,
 'baths_1qtr': 0,
 'year_built': 1915,
 'lot_sqft': 4414,
 'sqft': 2500,
 'garage': 1,
 'stories': 2,
 'beds': 3,
 'tags': ['city_view',
  'community_security_features',
  'dining_room',
  'dishwasher',
  'fireplace',
  'hardwood_floors',
  'hill_or_mountain_view',
  'lake_view',
  'ocean_view',
  'spa_or_hot_tub',
  'view',
  'washer_dryer',
  'water_view',
  'basement',
  'garage_1_or_more',
  'fruit_trees',
  'tennis_court',
  'tennis',
  'groundscare',
  'garage_1'],
 'new_construction': False,
 'distance_to_home': 0,
 'lat': 47.572613,
 'long': -122.306358}

In [60]:
worth_change_ratio = new_worth / user_pred
worth_change_ratio

array([[1.0425082]], dtype=float32)

In [61]:
new_worth_to_user = worth_change_ratio * fg.user_target
new_worth_to_user

array([[1013839.26779032]])

In [62]:
worth_delta = new_worth_to_user - fg.user_target
worth_delta

array([[41339.26779032]])

In [13]:
'year_built' in fg.user_features.keys()

True