In [10]:
import BHU
import os
import numpy as np
import keys as k

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Project-specific settings come from the local `keys` module.
keys = k.getKeys()

# NOTE(review): BHU has already been imported when this runs; if BHU imports
# TensorFlow at import time this setting may come too late to take effect — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

# Resolve the sample house (stored with surrounding single quotes) to a home
# record, then fetch the detail record for its property id.
sample_address = keys['SampleHouse'].strip("\'")
user_home = BHU.get_UserHome(sample_address)

property_id = str(user_home['property_id'])
user_home_details = BHU.get_PropertyDetail(property_id)

# Houses of interest around the user's home, plus the geo data returned with them.
hoi = BHU.get_HousesOfInterest(
    user_home,
    n=2000,
    listed_to_sold_ratio=0.3,
    verbose=True,
)
gd = BHU.GeoData(hoi['geo'])

# Feature generator used by every later cell (features, targets, user features).
fg = BHU.FeatureGenerator(houses=hoi['houses'], gd=gd, user_home=user_home_details)

Shortfall in listed houses detected, appending 114 of current listing to results.


In [41]:
# Does the walkscore model exist?
import pickle
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd

# One pickled regressor is kept per "<city>_<state>" of the user's home.
model_name = f'{fg.user_home_formatted.city}_{fg.user_home_formatted.state}'
model_file_path = f'BHU/Saved Results/WalkScoreModel/{model_name}.pkl'

if os.path.isfile(model_file_path):
    # SECURITY: unpickling executes arbitrary code, so this path must only ever
    # point at files written by the else-branch below.
    # NOTE(review): `data` is not created on this branch, but the evaluation
    # cell further down reads it.
    with open(model_file_path, 'rb') as f:
        model = pickle.load(f)
else:
    print(f'No model found, generating {model_name}.')
    fg._sync_walk_score()

    # One row per house: latitude, longitude and walk score.
    lat = pd.Series([house.lat_long[0] for house in fg.houses])
    long = pd.Series([house.lat_long[1] for house in fg.houses])
    ws = pd.Series([house.walk_score for house in fg.houses])
    raw = pd.DataFrame({'lat': lat, 'long': long, 'ws': ws})

    # SimpleImputer's default strategy fills every column with that column's
    # own mean, so a single imputer over the three columns gives the same
    # values as imputing each column separately.
    # NOTE(review): `ws` is the regression target, so houses without a walk
    # score are trained on the mean score — consider dropping those rows instead.
    data = pd.DataFrame(SimpleImputer().fit_transform(raw), columns=raw.columns)

    grf = GradientBoostingRegressor(n_estimators=250,
                                    min_samples_split=3,
                                    min_samples_leaf=3,
                                    max_depth=8)

    # Learn walk score as a function of (lat, long) only.
    model = grf.fit(data.drop('ws', axis=1), data['ws'])

    # The target folder may not exist on a fresh checkout; create it so the
    # save below does not fail with FileNotFoundError after training.
    os.makedirs(os.path.dirname(model_file_path), exist_ok=True)
    with open(model_file_path, 'wb') as f:
        pickle.dump(model, f)

No model found, generating SEATTLE_WA.


In [39]:
from sklearn.model_selection import train_test_split

# `data` is the lat/long/ws frame built by the model-generation branch of the
# walk-score cell; hold out 10% of its rows for evaluation.
coords = data.drop(columns='ws')
walk_scores = data['ws']
X_train, X_test, y_train, y_test = train_test_split(
    coords,
    walk_scores,
    test_size=0.1,
    random_state=55,
)

# Same hyper-parameters as the saved walk-score model.
gbr_params = dict(
    n_estimators=250,
    min_samples_split=3,
    min_samples_leaf=3,
    max_depth=8,
)
grf = GradientBoostingRegressor(**gbr_params)
grf.fit(X_train, y_train)

# Score of the regressor on the held-out rows.
grf.score(X_test, y_test)

0.8117269200523123

In [40]:
# Mean absolute error of the walk-score predictions on the held-out rows.
abs_errors = abs(grf.predict(X_test) - y_test)
sum(abs_errors) / len(y_test)

6.347521493056033

In [7]:
# This is the whole data set
# OneHotEncoder is used by the 'dummy' step below but was never imported.
from sklearn.preprocessing import OneHotEncoder

# Targets are standardised before training. The same scaler instance is handed
# to BHU.KerasModel below — presumably so predictions can be mapped back to the
# original scale; confirm in BHU.KerasModel.
target_transformer = StandardScaler()
train_targets = target_transformer.fit_transform(np.array(fg.targets).reshape(-1,1))
train_features = fg.features

# Column groups, one per preprocessing strategy.
normalize_cols = ['lot_sqft', 'sqft']
bucketize_cols = ['year_built', 'distance_to_home', 'lat_winz', 'long_winz']
dummy_cols = ['baths_full', 'baths_3qtr', 'baths_half', 'baths_1qtr', 'garage', 'stories', 'beds']

preprocess_data = ColumnTransformer(
    [
        #('scale', preprocess_min_max_cols, minmax_cols),
        ('normalize', StandardScaler(), normalize_cols),
        ('bucketize', BHU.KerasTransformers.preprocess_bucketize_col, bucketize_cols),
        ('dummy', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), dummy_cols),
        # Placeholder for a future walk-score feature. The original used the
        # keyword `pass` here, which is a SyntaxError; 'passthrough' over an
        # empty column list is valid and contributes no columns.
        ('walkscore', 'passthrough', [])
    ]
)

# Raw feature records -> DataFrame -> column preprocessing -> Keras model.
keras_pipeline = Pipeline(
    [
        ('to_data_frame', BHU.KerasTransformers.ToDataFrame()),
        ('preprocess', preprocess_data),
        ('keras_model', BHU.KerasModel(user_home, target_transformer))
    ]
)

# Settings forwarded to the 'keras_model' step.
keras_pipeline.set_params(**{
    'keras_model__load_model_if_available' : True,
    'keras_model__update_model' : False,
    'keras_model__save_model' : False
})

keras_pipeline.fit(train_features, train_targets)

In [10]:
# Run the fitted pipeline on the user's own home features, then show the first
# predicted value next to fg.user_target for comparison.
user_pred = keras_pipeline.predict(fg.user_features)
user_pred[0][0], fg.user_target



(955466.5, 972500)

In [11]:
# Display the address stored on the formatted user home.
fg.user_home_formatted.address

'3416 19th Ave S'

In [12]:
# What-if scenario: the same home with one more half bath.
# Work on a copy so the 'baths_half' entry in fg.user_features is left as-is.
xx = fg.user_features.copy()
xx['baths_half'] = xx['baths_half'] + 1

# Prediction for the modified home.
new_worth = keras_pipeline.predict(xx)
new_worth



array([[981661.1]], dtype=float32)

In [13]:
# Display the user's feature record that is fed to keras_pipeline.predict.
fg.user_features

{'Property_ID': 2949757771,
 'Address': None,
 'Status': 'sold',
 'Days_listed': 0,
 'Days_updated': 0,
 'baths_full': 2,
 'baths_3qtr': 0,
 'baths_half': 0,
 'baths_1qtr': 0,
 'year_built': 1915,
 'lot_sqft': 4414,
 'sqft': 2500,
 'garage': 1,
 'stories': 2,
 'beds': 3,
 'tags': ['city_view',
  'community_security_features',
  'dining_room',
  'dishwasher',
  'fireplace',
  'hardwood_floors',
  'hill_or_mountain_view',
  'lake_view',
  'ocean_view',
  'spa_or_hot_tub',
  'view',
  'washer_dryer',
  'water_view',
  'basement',
  'garage_1_or_more',
  'fruit_trees',
  'tennis_court',
  'tennis',
  'groundscare',
  'garage_1'],
 'new_construction': False,
 'distance_to_home': 0,
 'lat': 47.572613,
 'long': -122.306358}

In [14]:
# Ratio of the what-if prediction (new_worth) to the baseline prediction
# (user_pred) for the same home.
worth_change_ratio = new_worth / user_pred
worth_change_ratio

array([[1.0274155]], dtype=float32)

In [15]:
# Scale fg.user_target by the predicted ratio, so the what-if value is
# anchored to the target rather than to the raw model prediction.
new_worth_to_user = worth_change_ratio * fg.user_target
new_worth_to_user

array([[999161.58735752]])

In [16]:
# Difference between the what-if value and fg.user_target.
worth_delta = new_worth_to_user - fg.user_target
worth_delta

array([[26661.58735752]])

In [138]:
# Scratch check: a list of tuples becomes a single column whose cells are tuples.
tuple_rows = [(1, 2), (3, 4)]
pd.DataFrame({'test': tuple_rows})

Unnamed: 0,test
0,"(1, 2)"
1,"(3, 4)"
