In [267]:
from keys import *
from typing import Union, List, Tuple

import requests
from retrying import retry
from ediblepickle import checkpoint
from urllib.parse import quote

import numpy as np
import pandas as pd
import sys
from datetime import datetime
from math import radians, cos, sin, asin, sqrt

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pgeocode

# Load the API credentials once; the request helpers in this notebook read
# their RapidAPI key from this mapping (keys['USRealEstate']).
keys = getKeys()
# Only try to import/enable Lightrun when the module is not already loaded in
# this kernel, so re-running the cell after a successful import is a no-op.
if 'lightrun' not in sys.modules:
    try:
        import lightrun
        lightrun.enable(company_key=keys['LightRunCompanyKey'])
    except ImportError as e:
        # Lightrun is optional: report the failed import and carry on.
        print("Error importing Lightrun: ", e)

In [361]:
def _property_detail_cache_key(args: tuple, kwargs: dict) -> str:
    '''
    Build the checkpoint file name for a get_PropertyDetail call.

    The id may arrive positionally or as `property_id=...`, and it is coerced to
    `str` before quoting because `urllib.parse.quote` rejects non-string input.
    For string ids passed positionally the name is unchanged from before
    (`quote(id) + '.pkl'`), so existing cache files are still found.
    '''
    property_id = args[0] if args else kwargs['property_id']
    return quote(str(property_id)) + '.pkl'


@retry(stop_max_attempt_number=5)
@checkpoint(key=_property_detail_cache_key, work_dir='Saved Results/PropertyDetail/')
def get_PropertyDetail(
        property_id : str
    ) -> dict:
    '''
    Fetch the detail record for a single property from the US Real Estate API.

    Results are pickled by `@checkpoint` under 'Saved Results/PropertyDetail/',
    and the call is attempted up to 5 times by `@retry`.

    Parameters:
        property_id : the listing's property id; non-string values are converted with str().

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError : on a 4xx/5xx response (after the retries are exhausted).
    '''
    property_id = str(property_id)

    url = "https://us-real-estate.p.rapidapi.com/v2/property-detail"

    querystring = {
        "property_id": property_id
    }

    headers = {
        "X-RapidAPI-Key": keys['USRealEstate'],
        "X-RapidAPI-Host": "us-real-estate.p.rapidapi.com"
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
    # Raise on HTTP errors so @retry can re-issue the call and so an error
    # payload (e.g. a rate-limit message) is never pickled as if it were data.
    response.raise_for_status()
    return response.json()

@retry(stop_max_attempt_number=5)
def get_PropertyForSaleByZipCode(
        zip_code       : str, 
        property_type : str = 'single_family',
        n_results     : int = 100
    ) -> dict:
    '''
    Fetch the first page of for-sale listings in a zip code.

    Only one request (offset 0) is made, so at most min(42, n_results) listings
    are requested even when n_results is larger.
    TODO: loop over `offset` until the end of the results or n_results is reached
    (this will cost one API call per page).

    Parameters:
        zip_code      : zip code to search.
        property_type : multi_family|single_family|mobile|land|farm.
        n_results     : upper bound on the number of listings requested.

    Other query string parameters the author noted for this endpoint (not sent here):
        sort = (default: relevant)|newest|lowest_price|highest_price|open_house_date|price_reduced_date|largest_sqft|lot_size|sold_date
        price_min/max = $ USD
        beds_min/max = #
        bath_min/max = #

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError : on a 4xx/5xx response (after the retries are exhausted).
    '''

    url = "https://us-real-estate.p.rapidapi.com/v2/for-sale-by-zipcode"

    # Page size is capped at 42; per the author's note this can be increased
    # to 200 once we get the paid plan.
    limit = min(42, n_results)

    querystring = {
        "zipcode":zip_code,
        "offset":"0",
        "limit":str(limit),
        "property_type":property_type
    }

    headers = {
        "X-RapidAPI-Key": keys['USRealEstate'],
        "X-RapidAPI-Host": "us-real-estate.p.rapidapi.com"
    }

    response = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
    # Without this, an HTTP error body would be returned as data and @retry
    # would never get an exception to retry on.
    response.raise_for_status()
    return response.json()

@retry(stop_max_attempt_number=5)
def get_PropertySoldByZipCode(
        zip_code      : str, 
        n_results     : int,
        property_type : str = 'single_family'
    ) -> dict:
    '''
    Fetch the first page of sold homes in a zip code.

    The previous version sent min(42, n_results) as the `offset` parameter, which
    asks the API to skip that many results rather than to return that many.
    The request now starts at offset 0 and passes the cap as `limit`, matching
    the for-sale helpers.
    NOTE(review): the author was unsure whether this endpoint honours `limit`;
    if it does not, the server's default page size applies - confirm against the API docs.

    Parameters:
        zip_code      : zip code to search.
        n_results     : upper bound on the number of listings requested (capped at 42).
        property_type : multi_family|single_family|mobile|land|farm.

    Other query string parameters the author noted (not sent here):
        sort = (default: relevant)|newest|lowest_price|highest_price|open_house_date|price_reduced_date|largest_sqft|lot_size|sold_date
        price_min/max = $ USD
        beds_min/max = #
        bath_min/max = #

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError : on a 4xx/5xx response (after the retries are exhausted).
    '''

    url = "https://us-real-estate.p.rapidapi.com/v2/sold-homes-by-zipcode"

    # Per the author's note this can be increased to 200 once we get the paid plan.
    limit = min(42, n_results)

    querystring = {
        "zipcode":zip_code,
        "offset":"0",
        "limit":str(limit),
        "property_type":property_type
    }

    headers = {
        "X-RapidAPI-Key": keys['USRealEstate'],
        "X-RapidAPI-Host": "us-real-estate.p.rapidapi.com"
    }

    response = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
    # Surface HTTP errors as exceptions so @retry can act on them.
    response.raise_for_status()
    return response.json()

def get_LocationSuggest(
        search_keyword : str, 
        return_all     : bool = False
    ) -> dict:
    '''
    Look up location suggestions for a free-text keyword.

    Parameters:
        search_keyword : text sent to the API as the `input` parameter.
        return_all     : when True return the whole decoded response; otherwise
                         return only the first entry of its 'data' list.

    Returns:
        The full response dict, or the first suggestion.

    Raises:
        requests.HTTPError   : on a 4xx/5xx response.
        KeyError, IndexError : when return_all is False and the response has no
                               'data' key or the 'data' list is empty.
    '''

    url = "https://us-real-estate.p.rapidapi.com/location/suggest"

    querystring = {"input":search_keyword}

    headers = {
        "X-RapidAPI-Key": keys['USRealEstate'],
        "X-RapidAPI-Host": "us-real-estate.p.rapidapi.com"
    }

    response = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
    # Fail loudly on HTTP errors instead of handing an error payload to the caller.
    response.raise_for_status()
    response_json = response.json()

    return response_json if return_all else response_json['data'][0]


def get_PropertyForSaleByArea(
        city      : str = '',
        state     : str = '',
        n_results : Union[int, None] = 100 # How many houses do you want to get back; None pulls back everything.
    ) -> dict:
    '''
    Fetch for-sale listings for a city/state, newest first, paging through the API.

    Pages of up to 42 listings are requested until `n_results` listings have been
    collected, the API's reported total is reached, or a page comes back empty.

    Parameters:
        city      : city name sent as `city`.
        state     : state code sent as `state_code`.
        n_results : maximum number of listings to return; None means every
                    listing the API reports for the area.

    Returns:
        {'houses': list of listing dicts (response['data']['home_search']['results']),
         'geo'   : response['data']['geo'] taken from the first page}

    Raises:
        requests.HTTPError : on a 4xx/5xx response.
    '''

    # Page size; per the author's note this can be increased to 200 once we move to the paid version.
    page_size = 42 if n_results is None else min(42, n_results)

    url = "https://us-real-estate.p.rapidapi.com/v2/for-sale"

    querystring = {
        "state_code":state,
        "city":city,
        "offset":"0",
        "limit":str(page_size),
        "sort":"newest"
    }

    headers = {
        "X-RapidAPI-Key": keys['USRealEstate'],
        "X-RapidAPI-Host": "us-real-estate.p.rapidapi.com"
    }

    def fetch_page() -> dict:
        # Issues one request with the current querystring and returns its 'data' section.
        response = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
        response.raise_for_status()
        return response.json()['data']

    first_page = fetch_page()
    total_houses_available = int(first_page['home_search']['total'])
    houses_wanted = total_houses_available if n_results is None else min(total_houses_available, n_results)

    print(f'Returning {houses_wanted} out of a possible {total_houses_available}.')

    # The geo block is taken from the first page only (the author's working
    # assumption is that it is identical on every page).
    geo_to_return = first_page['geo']
    houses_to_return = list(first_page['home_search']['results'])

    while len(houses_to_return) < houses_wanted:
        # Resume from the number of listings actually received, so a short
        # page never causes results to be skipped.
        querystring['offset'] = str(len(houses_to_return))
        querystring['limit'] = str(min(page_size, houses_wanted - len(houses_to_return)))

        page = fetch_page()['home_search']['results']
        if not page:
            # The API ran out of results before its reported total; stop rather
            # than loop forever spending API calls.
            break
        houses_to_return.extend(page)

    return {
        'houses' : houses_to_return,
        'geo' : geo_to_return
    }

We are going to digest two parts of the API response:

    - v['data']['geo']
    
    - v['data']['home_search']['results']

In [240]:
# This will be taking in the following: tt['data']['geo']
class geo_data():
    '''
    Organizes the meta information ('geo' block) that comes back with each query.

    Attributes (each is None when the corresponding section is missing or null):
        zip_info, city_info, county_info, neighborhood_info : dicts produced by _parse_areas
        market_stats : dict produced by _parse_statistics

    Every lookup tolerates keys that are absent or explicitly null in the JSON.
    '''
    def __init__(self, 
            stats : dict
        ):

        stats = stats or {}
        self.zip_info = self._parse_areas(self._dig(stats, 'recommended_zips', 'geos'))
        self.city_info = self._parse_areas(self._dig(stats, 'recommended_cities', 'geos'))
        self.county_info = self._parse_areas(self._dig(stats, 'recommended_counties', 'geos'))
        self.neighborhood_info = self._parse_areas(self._dig(stats, 'recommended_neighborhoods', 'geos'))
        self.market_stats = self._parse_statistics(self._dig(stats, 'geo_statistics', 'housing_market'))

    def __repr__(self) -> str:
        # Summarize by size; the parsed dicts themselves can be large.
        def size(section):
            return 0 if section is None else len(section)

        return (
            f'geo_data(zips={size(self.zip_info)}, cities={size(self.city_info)}, '
            f'counties={size(self.county_info)}, neighborhoods={size(self.neighborhood_info)}, '
            f'has_market_stats={self.market_stats is not None})'
        )

    @staticmethod
    def _dig(mapping, *path):
        '''
        Follow `path` through nested dicts; return None as soon as a level is
        missing, null, or not a dict (a plain .get(key, {}) chain breaks on null).
        '''
        current = mapping
        for key in path:
            if not isinstance(current, dict):
                return None
            current = current.get(key)
        return current

    def _parse_areas(self, geos : Union[List[dict], None]) -> Union[dict, None]:
        '''
        Index a list of area records by their identifying value.

        The key of each entry is the record's value for the field named by its
        'geo_type', or '_parse_areas_FAILED' when that field is missing.
        A record without 'geo_type' is keyed by its 'slug_id'.
        Records that resolve to the same key overwrite each other.
        Returns None when `geos` is None.
        '''
        return None if geos is None else {
            v.get(v.get('geo_type', 'slug_id'), '_parse_areas_FAILED') : {
                'slug_id' : v.get('slug_id'),
                'median_listing_price' : self._dig(v, 'geo_statistics', 'housing_market', 'median_listing_price'),
                'state_code' : v.get('state_code'),
                'city_code' : v.get('city'),
                'geo_type' : v.get('geo_type')
            } for v in geos
        }
    
    def _parse_statistics(self, geo_stats : Union[dict, None]) -> Union[dict, None]:
        '''
        Pull the headline housing-market numbers out of a 'housing_market' dict.

        'by_prop_type' maps each property type to a copy of its 'attributes' dict
        (empty when the attributes are missing or null). Returns None when
        `geo_stats` is None.
        '''
        return None if geo_stats is None else {
            'median_days_on_market' : geo_stats.get('median_days_on_market'),
            'median_sold_price' : geo_stats.get('median_sold_price'),
            'median_price_per_sqft' : geo_stats.get('median_price_per_sqft'),
            'median_listing_price' : geo_stats.get('median_listing_price'),
            'month_to_month_metrics' : geo_stats.get('month_to_month'),
            'by_prop_type' : {
                ht.get('type') : dict(ht.get('attributes') or {})
                for ht in (geo_stats.get('by_prop_type') or [])
            }
        }

In [277]:
# This will be taking in the following: tt['data']['home_search']['results'][n]
class house():
    '''
    This is going to be the class that houses (hehe) all the house data. Each house will have its own instance.
    When we use the API, there is a lot of data returned nested in a number of dictionaries. This will take the 'juicy' bit.
    The idea for this class is that it will hold all the needed info for:
         1) the GUI: address, google street view, other photos. This will probably be a flask application to start, but we are 
            far from even thinking about that.
         2) the MODEL: tags, list_prices, other flags. What if we created a word cloud and had the user select key words for 
            their house until they have selected some flat number or % contribution to model from the tags TBD. There will be 
            dates there; we will use days old (or something similar) for the model training, while the actual tool will use zero, 
            as the user is entering 100% correct info. This may or may not be a good idea, as it might have unintended implications
            within the model.

    Only 'property_id' is required in `listing`; every other section may be
    missing or null and falls back to None or the defaults set in _clean_description.

    Interior functions:
        Date Cleaning
        Location Cleaning
        Description Cleaning
    '''
    def __init__(self, 
            listing : dict
        ):

        # Primary photo first, then the gallery; dict.fromkeys de-duplicates
        # while keeping that order (a set would scramble it).
        primary_photo = listing.get('primary_photo') or {}
        photo_urls = [primary_photo.get('href')] + [
            (photo or {}).get('href') for photo in (listing.get('photos') or [])
        ]

        self.reference_info = { # This is stuff not going into the model
            'id' : listing['property_id'],
            'photos' : list(dict.fromkeys(url for url in photo_urls if url))
        }

        self.raw_last_update = listing.get('last_update_date')
        self.raw_list_date = listing.get('list_date')
        self.tags = listing.get('tags')
        self.list_price = listing.get('list_price')
        self.new_construction = (listing.get('flags') or {}).get('is_new_construction') or False

        # Null sections are normalized to {} so the cleaners can use .get freely.
        self.raw_location = listing.get('location') or {}
        self.raw_description = listing.get('description') or {}

        self._clean_dates()
        self._clean_location()
        self._clean_description()

        # This is going to be used to store stuff in the future.
        self.future_stats = {}
        self.features = {}

    def __repr__(self) -> str:
        return (
            f"house(id={self.reference_info.get('id')!r}, "
            f"zip_code={self.reference_info.get('zip_code')!r}, "
            f"list_price={self.list_price!r}, beds={self.beds!r}, sqft={self.sqft!r})"
        )
        
    def _convert_date(self, date : str) -> datetime:
        '''Parse a 'YYYY-MM-DD' string; raises ValueError on any other format.'''
        return datetime.strptime(date, '%Y-%m-%d')

    def _parse_raw_date(self, raw):
        '''
        Turn an API timestamp such as '2022-01-02T03:04:05Z' (or a bare
        'YYYY-MM-DD') into a datetime at midnight of that day.
        Returns None when `raw` is missing, not a string, or not parseable.
        '''
        if not isinstance(raw, str) or not raw:
            return None
        try:
            return self._convert_date(raw.split('T')[0])
        except ValueError:
            return None
    
    def _clean_dates(self) -> None:
        '''Set last_update / list_date and their age in whole days (never negative).'''
        now = datetime.now() # one reference time so both deltas are consistent

        self.last_update = self._parse_raw_date(self.raw_last_update)
        self.list_date = self._parse_raw_date(self.raw_list_date)
        self.last_update_delta = None if self.last_update is None else max((now - self.last_update).days, 0)
        self.list_date_delta = None if self.list_date is None else max((now - self.list_date).days, 0)
        
    def _clean_location(self) -> None:
        '''Copy the address/county fields into reference_info and set lat_long.'''
        address = self.raw_location.get('address') or {}
        county = self.raw_location.get('county') or {}

        self.reference_info.update({
            'zip_code' : address.get('postal_code'),
            'state' : address.get('state'),
            'google_map_street_view' : self.raw_location.get('street_view_url'),
            'fips_code' : county.get('fips_code'),
            'county' : county.get('name'),
            'city' : address.get('city')
        })

        lat_long = address.get('coordinate')
        self.lat_long = (None, None) if lat_long is None else (lat_long.get('lat'), lat_long.get('lon'))     

    def _clean_description(self) -> None:
        '''Copy the description fields, replacing missing/null/zero values with defaults.'''
        description = self.raw_description

        self.baths_full = description.get('baths_full') or 0
        self.baths_3qtr = description.get('baths_3qtr') or 0
        self.baths_half = description.get('baths_half') or 0
        self.baths_1qtr = description.get('baths_1qtr') or 0
        self.year_built = description.get('year_built') or 0
        self.lot_sqft = description.get('lot_sqft') or 0
        self.sqft = description.get('sqft') or 0
        self.garage = description.get('garage') or 0
        self.stories = description.get('stories') or 1 # a house is assumed to have at least one story
        self.beds = description.get('beds') or 0
        self.type = description.get('type') or 'NONE'

    def _validate(self):
        '''
        This will be used to flag anything that looks strange (missing values, etc).
        Not implemented yet.
        '''

In [351]:
# I do not know if this is overkill, or what the best way of doing this is, but I am going to make another class.
# It will take in the houses and the geo data, generate stats from them, and can then output a pd.DataFrame for the pipeline.

# TODO : Find a way to get a zipcode from an address
### Idea, ask user to prompt zipcode for entry or address, figure out what city that is in and query based on that.

class FeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, 
        houses   : List[house], 
        gd       : geo_data, 
        nomi     : pgeocode.Nominatim, 
        geod     : pgeocode.GeoDistance,
        home_zip : str
        ):

        '''
        This is the first step in the pipeline.
        This will import a list of houses, and the geo data.
        The pgeocode helpers (nomi, geod) are passed in because they take some time to load;
        if there are multiple runs we don't want to reinitialize them at each iteration.
        TODO: How to keep data/connections in between runs? Not a now problem, very late game problem.

         - Look at distances between zipcode, take the inverse to create the feature
         - Need to look at listing price / median price of the zipcode.

        All the work happens here: after construction `self.features` holds one
        feature dict per house and `self.targets` the matching list prices.
        Each house is also updated in place (its `future_stats` and `features`).

        Raises:
            ValueError : if nomi or geod were not built for country 'US'.
        '''

        self.houses = houses
        self.gd = gd
        self.nomi = nomi
        self.geod = geod
        self.home_zip = str(home_zip)

        if not self.nomi.country == 'US':
            raise ValueError(f'This will only work with US data. nomi.country = {self.nomi.country}')
        if not self.geod.country == 'US':
            raise ValueError(f'This will only work with US data. geod.country = {self.geod.country}')

        self._get_unique_zip_codes()
        self._calc_distance_between_zip_codes(self.home_zip)
        self.houses = list(map(self._generate_distance_stats, self.houses))
        
        self.features = list(map(self._generate_features, self.houses))
        self.targets = list(map(self._generate_targets, self.houses))

    def __repr__(self) -> str:
        return f'FeatureGenerator(n_houses={len(self.houses)}, home_zip={self.home_zip!r})'

    def _get_unique_zip_codes(self) -> None:
        '''
        Collect the distinct zip codes of all houses, in first-seen order.
        Missing zip codes and the '00000' placeholder are dropped entirely
        (list.remove only dropped the first occurrence).
        '''
        zip_codes = (h.reference_info.get('zip_code') for h in self.houses)
        self.unique_zip_codes = list(dict.fromkeys(z for z in zip_codes if z and z != '00000'))
            
    def _calc_distance_between_zip_codes(self, home_zip_code : str) -> None:
        '''Map every unique zip code to its distance from `home_zip_code` as reported by geod.'''
        if not self.unique_zip_codes:
            # Nothing to look up; avoid calling pgeocode with an empty list.
            self._distance_between_zip_codes = {}
            return

        # atleast_1d guards against a scalar being returned for a single zip code.
        distances = np.atleast_1d(self.geod.query_postal_code(home_zip_code, self.unique_zip_codes))
        self._distance_between_zip_codes = dict(zip(self.unique_zip_codes, distances))

    def _generate_distance_stats(self, h : house) -> house:
        '''
        Store the house's distance to the home zip and its inverse, 1 / (1 + distance).

        NOTE(review): a house whose zip is unknown (or whose distance came back
        NaN) is given distance 0, i.e. the maximum inverse distance of 1.0 -
        confirm this is the intended default for the model.
        '''
        distance = self._distance_between_zip_codes.get(h.reference_info.get('zip_code'))
        # `nan or 0` evaluates to nan, so NaN has to be checked explicitly.
        if distance is None or np.isnan(distance):
            distance = 0

        h.future_stats['distance_to_other_zips'] = float(distance)
        h.future_stats['inverse_distance'] = 1.0 / (1.0 + h.future_stats['distance_to_other_zips'])
        return h

    def _generate_features(self, h) -> dict:
        '''
        Build the model feature dict for one house and store it on `h.features`.
        Date deltas that could not be computed (None) are reported as 0.
        Note that 'distance_to_home' carries the inverse distance, not the raw distance.
        '''
        h.features = {
            'Days_listed' : int(h.list_date_delta or 0),
            'Days_updated' : int(h.last_update_delta or 0),
            'baths_full' : int(h.baths_full),
            'baths_3qtr' : int(h.baths_3qtr),
            'baths_half' : int(h.baths_half),
            'baths_1qtr' : int(h.baths_1qtr),
            'year_built' : int(h.year_built),
            'lot_sqft' : int(h.lot_sqft),
            'sqft' : int(h.sqft),
            'garage' : int(h.garage),
            'stories' : int(h.stories),
            'beds' : int(h.beds),
            'type' : str(h.type),
            'tags' : h.tags or [],
            'new_construction' : bool(h.new_construction),
            'distance_to_home' : float(h.future_stats.get('inverse_distance', 1) or 1)
        }
        return h.features
    
    def _generate_targets(self, h) -> int:
        '''Return the list price as an int; raises TypeError if the house has no list price.'''
        return int(h.list_price)


In [346]:
# NOTE(review): `tt` is not defined in any visible cell. Its 'houses'/'geo' keys
# match the dict returned by get_PropertyForSaleByArea - confirm and add the
# defining call so the notebook survives Restart & Run All.
hh = house(tt['houses'][0])
gd = geo_data(tt['geo'])
# The pgeocode helpers are slow to load, so build them once per kernel.
# They were commented out here but are required by the FeatureGenerator cell below.
if 'nomi' not in globals():
    nomi = pgeocode.Nominatim('US')
if 'geod' not in globals():
    geod = pgeocode.GeoDistance('US')
hh.reference_info.get('zip_code')

'98119'

In [359]:
# Wrap every listing in `tt['houses']` in a `house` and build the features/targets
# relative to home zip 98144. `tt`, `gd`, `nomi` and `geod` must already exist in the kernel.
fg = FeatureGenerator(
    houses = [house(h) for h in tt['houses']], 
    gd=gd, 
    nomi = nomi,
    geod=geod,
    home_zip='98144'
)

In [353]:
class ToDataFrame(BaseEstimator, TransformerMixin):
    '''Pipeline step that wraps its input in a pandas DataFrame.'''

    def fit(self, X, y=None):
        '''Nothing to learn; returns the transformer itself.'''
        return self

    def transform(self, X):
        '''Return `pd.DataFrame(X)`.'''
        frame = pd.DataFrame(X)
        return frame

In [354]:
# Single-step pipeline for now: turn the list of feature dicts into a DataFrame.
pipe = Pipeline(steps=[('format_data', ToDataFrame())])

output = pipe.fit_transform(X=fg.features, y=fg.targets)

  if LooseVersion(joblib_version) < '0.12':


In [355]:
# Run the DataFrame step on its own (outside the pipeline) and display the result.
to_data_frame = ToDataFrame()
X_t = to_data_frame.fit_transform(fg.features)
X_t

Unnamed: 0,Days_listed,Days_updated,baths_full,baths_3qtr,baths_half,baths_1qtr,year_built,lot_sqft,sqft,garage,stories,beds,type,tags,new_construction,distance_to_home
0,0,0,3,0,1,0,2022,1493,1912,0,1,4,single_family,"[city_view, community_outdoor_space, dining_ro...",True,0.116067
1,0,0,2,1,0,0,1947,6934,1950,0,1,5,single_family,"[community_outdoor_space, den_or_office, dinin...",False,0.082262
2,0,0,1,1,1,0,2022,1029,950,0,1,2,condos,"[dishwasher, new_construction, ocean_view, vie...",True,0.124124
3,1,1,1,1,0,0,2022,900,1540,1,1,3,townhomes,"[city_view, community_outdoor_space, dining_ro...",True,0.132968
4,1,1,1,1,0,0,2022,900,1540,1,1,3,townhomes,"[city_view, community_outdoor_space, dining_ro...",True,0.132968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,11,11,1,1,0,0,2022,939,1456,0,1,3,townhomes,"[community_outdoor_space, dishwasher, energy_e...",True,0.161516
96,11,2,0,1,0,0,1958,0,722,0,1,2,condos,"[carport, community_gym, dishwasher, fireplace...",False,0.158773
97,12,5,1,1,0,0,1925,4260,2879,1,1,3,single_family,"[dining_room, dishwasher, fireplace, forced_ai...",False,0.164869
98,12,12,1,0,0,0,1985,0,868,0,1,2,condos,"[community_golf, dishwasher, fireplace, hardwo...",False,1.000000


In [362]:
# Query sold homes for zip code 98144, passing n_results=100.
psbz = get_PropertySoldByZipCode('98144', 100)

In [378]:
# Inspect which fields the first sold listing carries.
psbz['data']['home_search']['results'][0].keys()

dict_keys(['primary_photo', 'last_update_date', 'source', 'tags', 'permalink', 'status', 'list_date', 'open_houses', 'tax_record', 'branding', 'photos', 'coming_soon_date', 'list_price', 'matterport', 'property_id', 'flags', 'lead_attributes', 'community', 'products', 'virtual_tours', 'description', 'listing_id', 'price_reduced_amount', 'location', 'other_listings'])

In [389]:
# Build a `house` from the first sold listing returned above.
qq = house(psbz['data']['home_search']['results'][0])
# This is looking promising; let's see if this can get all the features we expect.

0