### Imports & Inputs

In [None]:
# Current as of 02-Jan-2023

In [None]:
import re
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions

In [None]:
# Inputs: replace these with the coordinates of the neighbourhoods you care about.
# Every entry is a [latitude, longitude] pair.
central_frederic_coords = [43.450936, -80.481639]
belmont_village_coords = [43.453351, -80.517898]
vincienzos_coords = [43.459222, -80.519409]

### Retrieving Data

In [None]:
# Launch a headless Firefox session with Selenium and capture the search results page.
# The URL carries the search filters (max price 800k, walk score >= 70, region ids).
options = FirefoxOptions()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
try:
    driver.get("https://www.redfin.ca/on/guelph/filter/max-price=800k,mr=33:2996+33:3316+33:3472,walk-score=70")

    # Visible text of the page body (kept for inspection; the parsing below uses src1)
    text_only = driver.find_element(By.TAG_NAME, "body").text

    # Full page source, which embeds the per-listing data parsed further down
    src1 = driver.page_source
finally:
    # Always shut the browser down, even if the page load fails; otherwise every
    # run of this cell leaves a headless Firefox process behind.
    driver.quit()

In [None]:
# Write the page source to a local .txt file
from pathlib import Path

# Build the path from its components instead of embedding a backslash:
# 'data\houses.txt' relies on the invalid escape sequence '\h' and only means
# "houses.txt inside data" on Windows.
path = Path('data') / 'houses.txt'

# Make sure the target directory exists so the write does not fail on a fresh checkout
path.parent.mkdir(parents=True, exist_ok=True)

with path.open(mode='w', encoding='utf-8') as file:
    file.write(src1)

In [None]:
# Split the page source into a list with one element per house matching the filters.
# Each house's data starts with "mlsId", so that term is used as the separator;
# element 0 is whatever precedes the first house.
# The isinstance guard makes the cell safe to re-run: once src1 is a list it is left alone.
if isinstance(src1, str):
    src1 = src1.split('mlsId')
    # Trim everything from 'isViewedListing' onwards off the last chunk
    src1[-1] = src1[-1].split('isViewedListing')[0]
len(src1)

In [None]:
# One accumulator list per extracted feature; each list receives one entry per house
prices, addresses, property_types = [], [], []
square_footages, prices_per_sqft = [], []
n_beds, n_baths = [], []
cities, zip_codes = [], []
times_on_redfin = []
latitudes, longitudes = [], []

# Placeholder for walk scores
walk_scores = []

def add_value(feature, feature_list):
    """Append the first captured group of a regex match to a list.

    Parameters
    ----------
    feature : re.Match or None
        Result of a regex search; falsy (None) when nothing matched.
    feature_list : list
        List that receives the value; it is modified in place.

    When there is no match, ``np.nan`` is appended instead so that the list
    still gains exactly one entry per call.
    """
    value = feature.group(1) if feature else np.nan
    feature_list.append(value)

# (target list, compiled pattern) pairs. Every pattern captures the wanted value in
# group 1 and is compiled once here rather than on every loop iteration.
# The page source stores keys as escaped JSON (\"key\"), hence the \\\" sequences.
# Numeric captures require at least one digit (\d+) so that a missing value yields
# "no match" -> NaN instead of an empty string, and decimal points are escaped (\.)
# so they no longer match arbitrary characters.
feature_patterns = [
    (prices, re.compile(r'\\\"price\\\":{\\\"value\\\":(\d+)')),
    (addresses, re.compile(r'\\\"streetLine\\\":{\\\"value\\\":\\\"(.*?)\\\",')),
    # Numeric property-type code; mapped to readable categories later on
    (property_types, re.compile(r'\\\"propertyType\\\":(\d+)')),
    (square_footages, re.compile(r'\\\"sqFt\\\":{\\\"value\\\":(\d+),')),
    (prices_per_sqft, re.compile(r'\\\"pricePerSqFt\\\":{\\\"value\\\":(\d+),')),
    (n_beds, re.compile(r'\\\"beds\\\":(\d+),')),
    # Optional fractional part: matches whole counts ("2") as well as halves ("2.5")
    (n_baths, re.compile(r'\\\"baths\\\":(\d+(?:\.\d+)?),')),
    (cities, re.compile(r'\\\"city\\\":\\\"(\D*?)\\\"')),
    (zip_codes, re.compile(r'\\\"zip\\\":\\\"(\w\w\w \w\w\w)\\\"')),
    (times_on_redfin, re.compile(r'\\\"timeOnRedfin\\\":{\\\"value\\\":(\d+)')),
    # Same latLong structure searched twice, capturing a different coordinate each
    # time; the sign is optional so either hemisphere is accepted.
    (latitudes, re.compile(r'\\\"latLong\\\":{\\\"value\\\":{\\\"latitude\\\":(-?\d+(?:\.\d+)?),\\\"longitude\\\":-?\d+(?:\.\d+)?}')),
    (longitudes, re.compile(r'\\\"latLong\\\":{\\\"value\\\":{\\\"latitude\\\":-?\d+(?:\.\d+)?,\\\"longitude\\\":(-?\d+(?:\.\d+)?)}')),
]

# Element 0 of src1 is the text before the first house, so start at index 1.
for house_data in src1[1:]:
    # Every list gets exactly one entry per house (the value or NaN), which keeps
    # all feature lists the same length.
    for feature_list, pattern in feature_patterns:
        add_value(pattern.search(house_data), feature_list)

    # TODO: walk scores do not appear to be present in the page source

In [None]:
# Troubleshooting aid: every feature list should report the same length
length_report = [
    f'Num prices: {len(prices)}',
    f'Num addresses: {len(addresses)}',
    f'Num property types: {len(property_types)}',
    f'Num sq footage: {len(square_footages)}',
    f'Num price per sqft: {len(prices_per_sqft)}',
    f'Num n_beds: {len(n_beds)}',
    f'Num n_baths: {len(n_baths)}',
    f'Num city: {len(cities)}',
    f'Num zip code: {len(zip_codes)}',
    f'Num uptime: {len(times_on_redfin)}',
    f'Num latitudes: {len(latitudes)}',
    f'Num longitudes: {len(longitudes)}',
]
print('\n'.join(length_report))

### Data Munging

In [None]:
# Assemble the dataframe from the feature lists, then fix up the dtypes
feature_columns = {
    'address': addresses,
    'unit_type': property_types,
    'price': prices,
    'sqft': square_footages,
    'price_per_sqft': prices_per_sqft,
    'n_beds': n_beds,
    'n_baths': n_baths,
    'city': cities,
    'postal_code': zip_codes,
    'days_on_redfin': times_on_redfin,
    'latitude': latitudes,
    'longitude': longitudes,
}
df = pd.DataFrame(feature_columns)

# The regex captures are strings; convert every numeric column
numeric_columns = [
    'unit_type',
    'price',
    'sqft',
    'price_per_sqft',
    'n_beds',
    'n_baths',
    'days_on_redfin',
    'latitude',
    'longitude',
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column])

# Scale the raw value down by 1000, 3600 and 24
# (presumably milliseconds -> days; confirm against the page data)
df['days_on_redfin'] = df['days_on_redfin'] / 1000 / 3600 / 24

In [None]:
# Calculating distances to user's desired neighbourhoods
import geopy.distance

# Output column name -> [latitude, longitude] of the neighbourhood it measures to
neighbourhood_coords = {
    'km_to_Frederic': central_frederic_coords,
    'km_to_Belmont': belmont_village_coords,
    'km_to_Vincenzos': vincienzos_coords,
}


def _km_between(latitude, longitude, target_coords):
    """Return the geodesic distance in km from (latitude, longitude) to target_coords.

    Returns NaN when either coordinate is missing: listings whose coordinates were
    not found in the page source carry NaN, and geopy rejects non-finite points.
    """
    if pd.isna(latitude) or pd.isna(longitude):
        return np.nan
    return geopy.distance.geodesic([latitude, longitude], target_coords).km


for column, target_coords in neighbourhood_coords.items():
    # Built as an explicit float Series on df's index so the assignment also works
    # when df has no rows.
    df[column] = pd.Series(
        [_km_between(lat, lon, target_coords) for lat, lon in zip(df.latitude, df.longitude)],
        index=df.index,
        dtype=float,
    )

# Despite its name, this column holds the distance (km) to the nearest of the
# neighbourhoods above, not the neighbourhood's name.
df['closest_neighbourhood'] = df[list(neighbourhood_coords)].min(axis=1)
df = df.drop(['latitude', 'longitude'], axis=1) # Not useful once distances are calculated

In [None]:
# Remove repeated listings: keep only the first row seen for each address
df = df.drop_duplicates(subset=['address'], keep='first')

# Round for a cleaner display: column name -> number of decimals
decimals_by_column = {
    'days_on_redfin': 1,
    'km_to_Frederic': 2,
    'km_to_Belmont': 2,
    'km_to_Vincenzos': 2,
    'closest_neighbourhood': 2,
}
for column, decimals in decimals_by_column.items():
    df[column] = df[column].round(decimals)

In [None]:
# Translate the numeric unit-type codes into human-readable labels.
# The mapping is not 100% correct: some apartments are listed as type 6, reason unknown.
unit_types = {
    3: 'Apartment',
    4: 'Townhouse',
    5: 'Townhouse',
    6: 'Detached Single Family',
    8: 'Vacant Land',
    13: 'Condo Townhouse',
}
df['unit_type'] = df['unit_type'].replace(unit_types)

# Show the resulting categories; codes missing from the mapping stay as they were
df['unit_type'].unique()

### Search Statistics

In [None]:
# Numerical summary: descriptive statistics of df using pandas' default column selection
df.describe() # Prices may appear in scientific notation, e.g. 5e05 is 500k

In [None]:
# Categorical summary: descriptive statistics restricted to the object-dtype columns of df
df.describe(include='object')

### Filtering Dataframe

In [None]:
# Narrow the search: detached single-family homes in Kitchener priced under 800k.
# Adjust the conditions below to change the filter.
in_kitchener = df['city'] == 'Kitchener'
is_detached = df['unit_type'] == 'Detached Single Family'
under_budget = df['price'] < 800000

# Ten cheapest matches, renumbered from 0
df1 = (
    df[in_kitchener & is_detached & under_budget]
    .sort_values('price')
    .head(10)
    .reset_index(drop=True)
)
df1

In [None]:
# Narrow the search: anything except apartments, priced under 800k, and at most
# 2 km from the nearest desired neighbourhood. Adjust the conditions to change the filter.
not_apartment = df['unit_type'] != 'Apartment'
under_budget = df['price'] < 800000
within_2_km = df['closest_neighbourhood'] <= 2

# Ten matches nearest to a desired neighbourhood, renumbered from 0
df2 = (
    df[not_apartment & under_budget & within_2_km]
    .sort_values('closest_neighbourhood')
    .head(10)
    .reset_index(drop=True)
)
df2