/
realtorca.py
61 lines (56 loc) · 2.65 KB
/
realtorca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
""" Wrapper the queries module to get property data from realtor.ca. """
from time import sleep
from math import ceil
import os
from random import randint
from requests import HTTPError
import pandas as pd
from queries import get_coordinates, get_property_list, get_property_details
def get_property_list_by_city(city):
    """Download every listing page for a city and save the rows to a CSV file.

    The CSV is named after the city with spaces and commas removed
    (e.g. "Ottawa, ON" -> "OttawaON.csv") and is rewritten after every page,
    so an interrupted run can be resumed by calling the function again.

    Args:
        city: City name handed to ``get_coordinates`` to obtain the bounding
            box that is queried.

    Returns:
        None. The listings are written to the CSV file as a side effect.
    """
    # Page size used to estimate how many pages an existing CSV already holds.
    # NOTE(review): presumably matches the API's RecordsPerPage -- confirm.
    resume_page_size = 200

    coords = get_coordinates(city)  # bounding box for the city
    filename = city.replace(" ", "").replace(",", "") + ".csv"

    results_df = pd.DataFrame()
    if os.path.exists(filename):
        try:
            results_df = pd.read_csv(filename)
        except pd.errors.EmptyDataError:
            # An earlier run left an empty file behind; start from scratch.
            results_df = pd.DataFrame()

    # Resume after the last page already stored (page 1 when nothing is
    # stored). max_pages is only a placeholder that lets the loop start; the
    # first successful response replaces it with the real page count.
    current_page = ceil(results_df.shape[0] / resume_page_size) + 1
    max_pages = current_page

    while current_page <= max_pages:
        try:
            data = get_property_list(
                coords[0], coords[1],
                coords[2], coords[3],
                current_page=current_page)
        except HTTPError:
            print("Error occurred on city: " + city)
            sleep(randint(3000, 3600))  # sleep for 50-60 minutes if limited
            continue  # retry the same page

        # Round the total record count up to whole pages.
        paging = data["Paging"]
        max_pages = ceil(paging["TotalRecords"] / paging["RecordsPerPage"])

        # DataFrame.append was removed in pandas 2.0; collect the page's rows
        # and concatenate them once instead.
        page_frames = [pd.json_normalize(record) for record in data["Results"]]
        if page_frames:
            if results_df.empty:
                frames = page_frames
            else:
                frames = [results_df, *page_frames]
            results_df = pd.concat(frames, ignore_index=True, sort=False)

        # Persist after every page so progress survives an interruption.
        results_df.to_csv(filename, index=False)
        current_page += 1
        sleep(randint(600, 900))  # sleep 10-15 minutes to avoid rate-limit
def get_property_details_from_csv(filename):
    """Fetch details for every listing in a CSV and store them in the same file.

    The CSV must contain ``Id`` and ``MlsNumber`` columns (as written by
    ``get_property_list_by_city``). A ``HasDetails`` column is added if
    missing; rows where it equals 1 are skipped, so an interrupted run can be
    resumed by calling the function again. The detail fields of a listing are
    written into that listing's own row, in columns prefixed with
    ``Details.`` so they never overwrite or rename the listing columns.

    Args:
        filename: Path of the CSV file to read and update in place.

    Returns:
        None. The CSV file is rewritten after each successful query.
    """
    details_prefix = "Details."

    results_df = pd.read_csv(filename)
    if "HasDetails" not in results_df.columns:
        results_df["HasDetails"] = 0

    # Row labels still lacking details, fixed up front so that adding columns
    # inside the loop cannot disturb the iteration.
    pending = results_df.index[results_df["HasDetails"] != 1]

    for index in pending:
        property_id = str(results_df.at[index, "Id"])
        mls_reference_number = str(results_df.at[index, "MlsNumber"])
        try:
            data = get_property_details(property_id, mls_reference_number)
        except HTTPError:
            print("Error occurred on propertyID: " + property_id)
            sleep(randint(3000, 3600))  # sleep for 50-60 minutes if limited
            continue  # row keeps HasDetails != 1 and is retried on the next run

        # NOTE(review): assumes the response is a single JSON object, i.e.
        # one normalized row per property -- confirm against queries module.
        details = pd.json_normalize(data).add_prefix(details_prefix)

        if not details.empty:
            # Add all not-yet-seen detail columns in one go (object dtype so
            # they can hold strings, numbers or lists alike).
            new_columns = [
                column for column in details.columns
                if column not in results_df.columns
            ]
            if new_columns:
                blank = pd.DataFrame(
                    index=results_df.index, columns=new_columns, dtype=object)
                results_df = pd.concat([results_df, blank], axis=1)

            for column in details.columns:
                # Columns re-read from CSV may have a numeric dtype; widen to
                # object so any detail value can be stored in the cell.
                if results_df[column].dtype != object:
                    results_df[column] = results_df[column].astype(object)
                # Write into this property's row (a plain index join would
                # always target row 0, the index of the normalized frame).
                results_df.at[index, column] = details[column].iloc[0]

        results_df.at[index, "HasDetails"] = 1
        # Persist after every property so progress survives an interruption.
        results_df.to_csv(filename, index=False)
        sleep(randint(600, 900))  # sleep 10-15 minutes to avoid rate-limit