[Reference](https://levelup.gitconnected.com/how-to-get-property-data-to-analyze-your-market-using-python-e9ef5336d141)

In [1]:
!pip install googlesearch-python -q

     |████████████████████████████████| 61 kB 2.8 MB/s 
     |████████████████████████████████| 115 kB 48.0 MB/s 
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.25.1 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.

In [3]:
from google.colab import drive, files
from googlesearch import search
import pandas as pd
import requests
import json
import time
import io
import plotly.express as px
# lift pandas' display limit on the number of columns so wide frames are shown in full
pd.set_option('display.max_columns',None)

# Functions


In [4]:
def get_zpid(street=None, city=None, state=None, zip_code=None, full_address=None):
  """Find the Zillow property id (zpid) of an address through a Google search.

  The address is searched together with the phrase 'zillow home details' and
  the zpid is read from the URL of the first search result.

  Args:
    street, city, state, zip_code: address components, used only when
      `full_address` is not given.
    full_address: complete address as a single string; takes precedence over
      the individual components.

  Returns:
    The zpid as a string, or None when the search returns no result or the
    first result's URL contains no 'zpid' path segment. When neither
    `full_address` nor any address component is supplied, the message
    'Please enter a query string or address details' is returned instead.

  Exceptions raised by `search` itself (e.g. network errors) propagate to the
  caller.
  """
  # get search query string
  if full_address is None:
    # str.format never fails on None values, so the "nothing supplied" case
    # has to be detected explicitly
    if all(part is None for part in (street, city, state, zip_code)):
      return 'Please enter a query string or address details'
    query = '{0}, {1}, {2} {3} zillow home details'.format(street, city, state, str(zip_code))
  else:
    query = full_address + ' zillow home details'

  # get google search results
  # NOTE(review): these keyword arguments must match the signature of the
  # installed `googlesearch.search` -- confirm against the package in use
  search_results = search(query, tld='com', lang='en', num=3, start=0, stop=1, pause=0)

  # extract first returned result; an empty search yields None instead of an IndexError
  url = next(iter(search_results), None)
  if url is None:
    return None

  # return zpid: the text before the first '_' of the first path segment containing 'zpid'
  for segment in url.split('/'):
    if 'zpid' in segment:
      return segment.split('_')[0]
  return None

In [6]:
def get_property_detail(rapid_api_key, zpid, timeout=30):
  """Request the details of one property from the Zillow endpoint on RapidAPI.

  Args:
    rapid_api_key: RapidAPI key, sent in the 'X-RapidAPI-Key' header.
    zpid: Zillow property id, sent as the 'zpid' query parameter.
    timeout: seconds to wait for the server before giving up. Without a
      timeout `requests` waits indefinitely on an unresponsive server.

  Returns:
    The `requests` response object. The HTTP status is not checked here;
    callers inspect `status_code` / `json()` themselves.
  """
  # get property details from API
  url = "https://zillow-com1.p.rapidapi.com/property"

  querystring = {"zpid":zpid} # zpid

  headers = {
    "X-RapidAPI-Host": "zillow-com1.p.rapidapi.com",
    "X-RapidAPI-Key": rapid_api_key # your key here
  }

  # request data
  return requests.get(url, headers=headers, params=querystring, timeout=timeout)

# Locals & Constants


In [9]:
# ------------------------------------------------------------------
# OPTIONAL: this cell is only needed when input files live on Drive
# ------------------------------------------------------------------

# attach Google Drive under /content/drive without forcing a remount
drive.mount("/content/drive", force_remount=False)

# folder that holds the input files (optional)
file_dir = "/content/drive/My Drive/Colab Data/input/"

In [8]:
# load the table of API keys (columns used below: 'API' and 'KEY')
df_api_keys = pd.read_csv(file_dir + 'api_keys.csv')

# take the key from the first row whose 'API' value is 'rapid'
# (replace this with your own key)
is_rapid_row = df_api_keys['API'] == 'rapid'
rapid_api_key = df_api_keys.loc[is_rapid_row, 'KEY'].iloc[0]

# Data

In [10]:
# address to look up, and the search phrase derived from it
property_address = "11622 Pure Pebble Dr, RIVERVIEW, FL 33569"
query = f"{property_address} zillow home details"
print('Search this phrase in Google Search:', query)

In [11]:
# run the Google search for the phrase built above
search_results = search(query, tld='com', lang='en', num=3, start=0, stop=3, pause=0)
# materialise every returned URL into a list
search_results_list = list(search_results)
search_results_list

In [12]:
# get the first search result
# (indexing with [0] raises IndexError when the search returned no results)
url = search_results_list[0] # extract first returned result
# show the selected URL as the cell output
url

In [13]:
# the zpid is the text before the first '_' in the first URL path segment
# that contains 'zpid'
zpid_segments = [segment for segment in url.split('/') if 'zpid' in segment]
zpid = zpid_segments[0].partition('_')[0]
print('Zpid of the property is:', zpid)

In [14]:
# get property details from the API through the get_property_detail helper
# defined earlier in this notebook, instead of repeating its URL / header /
# request code inline
response = get_property_detail(rapid_api_key, zpid)
# show success
response.status_code # 200 is success!

In [15]:
# display the decoded JSON body of the API response as the cell output
response.json()

In [16]:
# flatten the JSON payload into a pandas dataframe
df_property_detail = pd.json_normalize(response.json())
row_count, column_count = df_property_detail.shape
print('Num of rows:', row_count)
print('Num of cols:', column_count)
df_property_detail.head()

In [17]:
def _first_value(df, column):
  """Return the first-row value of `column`, or None when the column is absent."""
  return df[column].iloc[0] if column in df.columns else None


def _format_usd(value):
  """Format a dollar amount as '$1,234'; return 'N/A' when the value is missing."""
  # a null value cannot be rendered with the ',.0f' format spec (TypeError on None)
  return 'N/A' if pd.isna(value) else '${:,.0f}'.format(value)


# retrieve property detail elements (None when the API response lacks the field)
bedrooms = _first_value(df_property_detail, 'bedrooms')
bathrooms = _first_value(df_property_detail, 'bathrooms')
year_built = _first_value(df_property_detail, 'yearBuilt')
property_type = _first_value(df_property_detail, 'homeType')
living_area = _first_value(df_property_detail, 'resoFacts.livingArea')
lot_size = _first_value(df_property_detail, 'resoFacts.lotSize')
lot_dimensions = _first_value(df_property_detail, 'resoFacts.lotSizeDimensions')
zoning = _first_value(df_property_detail, 'resoFacts.zoning')
# estimates
zestimate = _first_value(df_property_detail, 'zestimate')
rent_zestimate = _first_value(df_property_detail, 'rentZestimate')

print('PROPERTY DETAILS FOR: ', property_address)
print('------------------------------------------------------------------')
print('  Bedrooms: {}'.format( bedrooms))
print('  Bathrooms: {}'.format( bathrooms))
print('  Year Built: {}'.format( year_built))
print('  Living Area: {}'.format( living_area))
print('  Lot Size: {}'.format( lot_size))
print('  Lot Dimensions: {}'.format( lot_dimensions))
print('  Zoning: {}'.format( zoning))
print('  Property Type: {}'.format( property_type))
print('  Zestimate: {}'.format( _format_usd(zestimate)))
print('  Rent Zestimate: {}'.format( _format_usd(rent_zestimate)))

In [18]:
# write the property details to a CSV file, then send that file to the browser
detail_csv_name = 'output.csv'
df_property_detail.to_csv(detail_csv_name, index=False)
files.download(detail_csv_name)

In [19]:
# upload document
# Ariel's example - PropStream Woodbridge Tax Liens (sample of 5 properties)
# the result is indexed by file name in the next cell to obtain the raw bytes
# of the uploaded file
uploaded = files.upload()

In [20]:
# name of the first uploaded file
file_name = list(uploaded.keys())[0]

# wrap the uploaded bytes in an in-memory buffer and parse them as CSV
csv_buffer = io.BytesIO(uploaded[file_name])
df_upload = pd.read_csv(csv_buffer)
print('Num of rows:', df_upload.shape[0])
df_upload.head()

In [21]:
# lists that collect, per uploaded row, the API response frame and the zpid
df_list = []
zpid_list = []


def _normalize_zip(raw_zip):
  """Return the ZIP code as text, left-padded with zeros to 5 characters.

  Leading zeros are lost when a ZIP column is parsed as a number; padding
  restores them without corrupting ZIP codes that already have 5 digits.
  """
  if pd.isna(raw_zip):
    return ''
  zip_text = str(raw_zip).strip()
  if zip_text.endswith('.0'):
    # column was parsed as float (e.g. 7095.0): drop the decimal part
    zip_text = zip_text[:-2]
  return zip_text.zfill(5)


# iterate through dataframe
for index, row in df_upload.iterrows():
  # get property address attributes by column name
  street = row['Address']
  city = row['City']
  state = row['State']
  zip_code = _normalize_zip(row['Zip'])

  # get zpid; exactly one entry is appended per row so that zpid_list stays
  # aligned with df_upload even when the search fails
  zpid = None
  try:
    zpid = get_zpid(street=street, city=city, state=state, zip_code=zip_code)
  except Exception as err:
    print('Search failed for : {0}, {1}, {2}, {3} ({4})'.format(street, city, state, zip_code, err))
  zpid_list.append(zpid)

  # wait 1.5 seconds to not overload Google Search
  time.sleep(1.5)

  # no zpid found: skip the API call instead of spending a request on it
  if zpid is None:
    print('Unable to retrieve data for : {0}, {1}, {2}, {3}'.format(street, city, state, zip_code))
    continue

  # get property details
  try:
    response = get_property_detail(rapid_api_key, zpid)
    # treat HTTP error statuses as failures so error payloads are not stored as data
    response.raise_for_status()
    _df = pd.json_normalize(data=response.json())
    # append data to list of property details
    df_list.append(_df)
  except Exception as err:
    print('Unable to retrieve data for : {0}, {1}, {2}, {3} ({4})'.format(street, city, state, zip_code, err))

In [22]:
# stack the per-property frames into one table; ignore_index gives the rows a
# fresh 0..n-1 index instead of repeating each source frame's own index labels
df_prop_detail = pd.concat(df_list, ignore_index=True)
df_prop_detail.head()

In [23]:
# attach the zpid found for each uploaded row as a new column
df_upload['zpid'] = zpid_list

# make the join key a string on both sides
for frame in (df_upload, df_prop_detail):
  frame['zpid'] = frame['zpid'].astype(str)

# left join: keep every uploaded row, add property details where the zpid matches
df_join = df_upload.merge(df_prop_detail, how='left', on='zpid')
print('Num of columns:', df_join.shape[1])
df_join.head()

In [24]:
# same left join, but carrying over only the join key and the two estimate columns
estimate_columns = ['zpid', 'zestimate', 'rentZestimate']
df_mod_output = df_upload.merge(df_prop_detail[estimate_columns], how='left', on='zpid')
print('Num of columns:', df_mod_output.shape[1])
df_mod_output.head()

In [25]:
# write the reduced join to a CSV file, then send that file to the browser
joined_csv_name = 'output2.csv'
df_mod_output.to_csv(joined_csv_name, index=False)
files.download(joined_csv_name)

In [26]:
# box plot of the 'zestimate' column across the joined properties
fig = px.box(data_frame=df_mod_output, y="zestimate")
fig.show()

# Automation

In [27]:
# embed the YouTube video with id 'Dh09SWvCuiM' as the cell output
from IPython.display import YouTubeVideo
YouTubeVideo('Dh09SWvCuiM')