In [1]:
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict

# Directory whose files are listed, opened as UTF-8 text and parsed as HTML further down.
LOCATION = 'html3'

def parse_features(features):
    """Summarise a listing's feature labels.

    Each label is checked for the substrings 'bed', 'bath', 'park' and
    'study', in that order; only the first match counts. For the three
    counted kinds, every digit in the label is concatenated and added to
    the running total (a label without digits adds 0). A 'study' label
    just sets the flag.

    Returns a tuple ``(bedrooms, bathrooms, study, parking)``.
    """
    counted_keywords = ('bed', 'bath', 'park')
    totals = dict.fromkeys(counted_keywords, 0)
    has_study = False

    for label in features:
        matched = next((kw for kw in counted_keywords if kw in label), None)
        if matched is not None:
            # The leading '0' keeps int() happy when the label has no digits.
            totals[matched] += int(f'0{replace_all_but(label, "1234567890")}')
        elif 'study' in label:
            has_study = True

    return totals['bed'], totals['bath'], has_study, totals['park']

def longest_consec_nums(text):
    """Return the length of the longest unbroken run of digit characters.

    A character counts as a digit when ``str.isdigit()`` says so. Returns 0
    when ``text`` contains no digits at all.
    """
    best = run = 0

    for ch in text:
        # Extend the current run on a digit, otherwise start over.
        run = run + 1 if ch.isdigit() else 0
        if run > best:
            best = run

    return best

def replace_all_but(text, chars):
    """Return ``text`` with every character not found in ``chars`` removed.

    The surviving characters keep their original order.
    """
    return ''.join(ch for ch in text if ch in chars)

def remove_decimal(num):
    """Cut a short trailing ',' / '.' suffix off a numeric string.

    The third-from-last, second-from-last and last characters are examined
    in that order. At the first one that is ',' or '.', that character and
    everything after it are dropped. If none of them is a separator, the
    string is returned unchanged.
    """
    # Start no further back than the string is long, then walk towards the end.
    for offset in range(min(3, len(num)), 0, -1):
        if num[-offset] in ',.':
            return num[:-offset]
    return num

def contains_num(text):
    """Return True if ``text`` contains at least one of the digits 0-9."""
    for digit in '1234567890':
        if digit in text:
            return True
    return False

def parse_price(text):
    """Pull a single integer price out of free-form price text.

    Parameters
    ----------
    text : str
        Raw price text. Any non-string value (for example the NaN used
        when a listing has no price element) yields NaN.

    Returns
    -------
    int or float
        The largest number found in ``text``, or ``np.nan`` when the text
        is not a string, when its longest digit run (spaces ignored) is
        shorter than 3 or longer than 4 characters, or when no number can
        be extracted.
    """
    # Missing prices arrive as NaN/None rather than text; calling .replace on
    # those would raise AttributeError.
    if not isinstance(text, str):
        return np.nan

    # Treat range/currency punctuation as whitespace so "650-700" splits in two.
    for separator in ('/', '-', '\\', '~', '$'):
        text = text.replace(separator, ' ')
    no_space = text.replace(' ', '')

    # Only texts whose longest digit run is 3 or 4 characters long are accepted.
    num_consec_nums = longest_consec_nums(no_space)
    if num_consec_nums <= 2 or num_consec_nums >= 5:
        return np.nan

    just_nums = replace_all_but(text, '1234567890,. ')
    nums = [remove_decimal(num) for num in just_nums.split()]
    nums = [int(num.replace(',', '').replace('.', '')) for num in nums if contains_num(num)]

    # The run-length check counts any str.isdigit() character, but only ASCII
    # digits survive the filtering above, so the list can still end up empty.
    return max(nums) if nums else np.nan

In [2]:
def _first_text(node, tag, css_class):
    """Return the text of the first `tag` under `node` with class `css_class`, or None if absent."""
    match = node.find(tag, class_=css_class)
    return None if match is None else match.text


def extract_data(html):
    """Collect one record per ``<article>`` element of a parsed page.

    Parameters
    ----------
    html : bs4.BeautifulSoup
        Parsed document to search.

    Returns
    -------
    collections.defaultdict
        Maps column name to a list of values, one per kept listing. Columns:
        ``address``, ``suburb``, ``price``, ``property_type``, ``bedrooms``,
        ``bathrooms``, ``study``, ``parking``, ``url``.

    Notes
    -----
    Listings without a content div or an address span are skipped. A missing
    price or property type is stored as NaN, a missing link as ``''``, and
    missing features as zero counts with ``study=False``.
    """
    data = defaultdict(list)

    for prop in html.find_all('article'):
        info = prop.find('div', class_='residential-card__content')
        address = None if info is None else _first_text(info, 'span', '')
        if address is None:
            # If we can't get address or info, then skip this listing.
            continue

        price = _first_text(info, 'span', 'property-price')
        property_type = _first_text(info, 'span', 'residential-card__property-type')

        # class_ often changes, but still starts with "View__PropertyDetail-sc"
        feature_bar = prop.find_all('div', class_='View__PropertyDetail-sc-11ysrk6-0 gIMwxl')
        try:
            features = [feature['aria-label'] for feature in feature_bar]
        except KeyError:
            # A single element without an aria-label discards the whole list.
            features = []

        link = prop.find('a')
        href = None if link is None else link.get('href')
        url = '' if href is None else 'https://www.realestate.com.au' + href

        bedrooms, bathrooms, study, parking = parse_features(features)

        # Everything before the last comma is the street address; the rest is the suburb.
        address_parts = address.replace(',,', ',').split(',')
        data['address'].append(''.join(address_parts[:-1]).strip())
        data['suburb'].append(address_parts[-1].strip())
        # Missing values stay NaN instead of being handed to string-only code.
        data['price'].append(np.nan if price is None else parse_price(price))
        data['property_type'].append(np.nan if property_type is None else property_type.strip())
        data['bedrooms'].append(bedrooms)
        data['bathrooms'].append(bathrooms)
        data['study'].append(study)
        data['parking'].append(parking)
        data['url'].append(url)

    return data

In [3]:
# Parse every file in LOCATION and stack the per-page records into one DataFrame.
dicts = []

# os.listdir returns names in arbitrary order; sorting keeps the row order
# (and therefore the index) the same on every run.
all_html = sorted(os.listdir(LOCATION))
for filename in tqdm(all_html):
    with open(os.path.join(LOCATION, filename), encoding='utf-8') as fp:
        text = fp.read()
    # Only the text is needed from here on, so parse after the file is closed.
    html = BeautifulSoup(text, 'html.parser')
    dicts.append(extract_data(html))

data = pd.concat(map(pd.DataFrame, dicts)).reset_index(drop=True)

100%|██████████████████████████████████████████████████████████████████████████████████| 98/98 [00:13<00:00,  7.53it/s]


In [4]:
# Write the combined table to CSV, leaving the row index out of the file.
data.to_csv('data3.csv', index=False)

In [68]:
# Rows with more than one bedroom or a study, a known price below 750,
# ordered by price; show the last 30 of them.
(
    data
    .loc[lambda df: (df.bedrooms.gt(1) | df.study.eq(True)) & df.price.notna() & df.price.lt(750)]
    .sort_values('price')
    .tail(30)
)

Unnamed: 0,address,suburb,price,property_type,bedrooms,bathrooms,study,parking,url
2069,10/17 Mosman Street,Mosman,725.0,Apartment,2,1,False,1,https://www.realestate.com.au/property-apartme...
891,14.01/1A Lawson Square,Redfern,725.0,Apartment,1,2,True,0,https://www.realestate.com.au/property-apartme...
508,146/267 Bulwara Rd,Ultimo,725.0,Apartment,2,1,False,1,https://www.realestate.com.au/property-apartme...
480,41/1-35 Pine Street,Chippendale,725.0,Apartment,2,1,False,1,https://www.realestate.com.au/property-apartme...
1815,2/143 Willoughby Road,Naremburn,725.0,Unit,2,1,False,2,https://www.realestate.com.au/property-unit-ns...
1159,1/502 New South Head Road,Double Bay,725.0,Apartment,2,1,False,0,https://www.realestate.com.au/property-apartme...
892,10.06/1A Lawson Square,Redfern,725.0,Apartment,1,2,True,0,https://www.realestate.com.au/property-apartme...
1508,106/2 Barr Street,Camperdown,730.0,Apartment,2,1,True,1,https://www.realestate.com.au/property-apartme...
2151,50/10 Drovers Way,Lindfield,730.0,Apartment,2,2,False,1,https://www.realestate.com.au/property-apartme...
966,28/69 Cook Road,Centennial Park,730.0,Apartment,2,1,False,1,https://www.realestate.com.au/property-apartme...
