# Prototyping and testing web scrapers - 28Hse

In [17]:
import requests, bs4
from pprint import pprint
import logging

By default, the base page of 28hse rent/residential shows a number of the latest listings. The idea is to collect all the information about those listings and save it.

In [40]:
def get_domain_from_url(input_url: str) -> str:
    """Return the bare domain of *input_url* using simple string splitting.

    Everything up to the first "//" and up to a "www." marker is dropped,
    then everything from the first remaining "/" onwards is cut off.

    This is a lightweight helper, not a full URL parser: ports, credentials
    and query strings that appear before the first "/" are left in place.

    Args:
        input_url: The URL (or bare host) to reduce to a domain.

    Returns:
        The domain portion, e.g. ``"28hse.com"`` for
        ``"https://www.28hse.com/en/rent/residential"``.
    """
    # prefixes: keep what follows the scheme separator and the "www." label
    for prefix in ["//", "www."]:
        if prefix in input_url:
            input_url = input_url.split(prefix)[1]

    # postfixes: keep what precedes the path separator.
    # The check must look at the value being processed, not at a global URL.
    for postfix in ["/"]:
        if postfix in input_url:
            input_url = input_url.split(postfix)[0]
    return input_url

# Listing page to scrape. `domain` is derived from it and is used as the
# prefix of the log messages emitted by the cells below.
url = "https://www.28hse.com/en/rent/residential"
domain = get_domain_from_url(url)

Fetch the page and raise an error if the request fails. Then parse the HTML with BeautifulSoup ("soup" it) to extract the structure.

In [50]:
# Fetch the listing page. A timeout is passed explicitly because requests
# waits indefinitely by default, which would hang the notebook on a stalled
# connection.
response = requests.get(url, timeout=30)
# Raise requests.HTTPError for 4xx/5xx responses instead of parsing an error page.
response.raise_for_status()
logging.info(f"{domain}: Webpage fetched successfully")

In [53]:
# Parse the fetched HTML text using the 'html.parser' backend.
souped_response = bs4.BeautifulSoup(response.text, 'html.parser')
# Select every element carrying the CSS class 'property_item'.
# NOTE(review): presumably one element per listing card — confirm against the page markup.
recent_listings = souped_response.select('.property_item')
logging.info(f"{domain}: {len(recent_listings)} recent listings found")

Process a listing entry from 28hse into a dictionary, extracting all available data in a (relatively) failure-safe manner.

However this will fail if an "id" is missing for an entry.

In [190]:
def recent_listing_entry_into_dict(entry: bs4.element.Tag) -> dict:
    """Extract the fields of one listing element into a dictionary.

    Each field has its own extractor (a CSS selection followed by light text
    processing). Fields are extracted independently: when an extractor fails
    because an expected element is absent (``IndexError``) or a numeric value
    cannot be parsed (``ValueError``), a warning is logged and the field is
    set to ``None`` rather than aborting the whole listing.

    The warning message uses the module-level ``domain`` and the listing id
    if it has already been extracted; otherwise the id is shown as ``None``.

    Args:
        entry: The tag of a single listing, as selected from the results page.

    Returns:
        A dict with the keys ``listingTitle``, ``listingId``, ``listingUrl``,
        ``listingPostedAgo``, ``listingArea``, ``listingBuilding``,
        ``listingGrossArea``, ``listingSaleableArea``, ``listingCompanyName``,
        ``listingPrice`` and ``listingTags``; failed fields map to ``None``.
    """
    listing_dict = {}
    # Field name -> extractor. The selectors and indices mirror the page
    # markup at the time of writing.
    # NOTE(review): confirm them against the live page if extraction starts failing.
    processing_dict = {
        "listingTitle": lambda entry: entry.select(".detail_page")[1].text,
        "listingId": lambda entry: entry.select(".detail_page")[1].get_attribute_list("attr1")[0],
        "listingUrl": lambda entry: entry.select(".detail_page")[1].get_attribute_list("href")[0],
        "listingPostedAgo": lambda entry: entry.select(".description")[0].select(".ui")[0].text.strip(),
        "listingArea": lambda entry: entry.select(".district_area")[0].select("a")[0].text,
        "listingBuilding": lambda entry: entry.select(".district_area")[0].select("a")[1].text,
        # The third space-separated token of the text is parsed as the number.
        "listingGrossArea": lambda entry: float(entry.select(".areaUnitPrice")[0].select("div")[0].text.split(" ")[2]),
        "listingSaleableArea": lambda entry: float(entry.select(".areaUnitPrice")[0].select("div")[1].text.split(" ")[2]),
        "listingCompanyName": lambda entry: entry.select(".companyName")[0].text.strip(),
        # Take the text after the last "$" and drop thousands separators.
        "listingPrice": lambda entry: float(entry.select(".green")[0].text.split("$")[-1].replace(",", "")),
        "listingTags": lambda entry: [x.text.strip() for x in entry.select(".tagLabels")[0].select(".ui")],
    }

    for field, extract in processing_dict.items():
        try:
            listing_dict[field] = extract(entry)
        except (IndexError, ValueError):
            # .get() keeps the fallback path itself from raising KeyError when
            # the id has not been extracted (or failed to extract).
            logging.warning(f"{domain}: Listing {listing_dict.get('listingId')}: Failed to fetch field '{field}'")
            listing_dict[field] = None

    return listing_dict