In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import time
from tqdm.notebook import tqdm

In [2]:
def _span_text(listing, pattern, **attrs):
    """Return the text of the first matching ``<span>`` with ``pattern`` removed.

    Args:
        listing: Object exposing a BeautifulSoup-style ``find(name, **attrs)``.
        pattern: Regular expression; every match is deleted from the span text.
        **attrs: Attribute filters forwarded unchanged to ``listing.find``.

    Returns:
        The cleaned text, or ``None`` when ``find`` reports no matching span.
    """
    span = listing.find("span", **attrs)
    if span is None:
        # A missing feature is expected for many listings; it is not an error.
        return None
    return re.sub(pattern, "", span.text)


def extract_features(listing):
    """Pull the location and the descriptive attributes out of one listing.

    Each value is read from a ``<span>`` inside ``listing``. A span that is
    absent yields ``None`` for that value. Unlike a bare ``except:``, any other
    failure (including ``KeyboardInterrupt``) propagates to the caller instead
    of being silently turned into ``None``.

    Args:
        listing: Object exposing a BeautifulSoup-style ``find(name, **attrs)``.

    Returns:
        A pair of lists: ``[location]`` and
        ``[bedrooms, bathrooms, parking, floor_size, erf_size, description]``.
        Every entry is a string or ``None``.
    """
    any_whitespace = r"\s+"
    # The description keeps its ordinary spaces; only newlines, tabs and
    # carriage returns are stripped from it.
    layout_characters = r"[\n\t\r]+"

    def compact(**attrs):
        # Text with every whitespace character removed (e.g. "Sea Point" -> "SeaPoint").
        return _span_text(listing, any_whitespace, **attrs)

    def size(title):
        # The final two characters are dropped from size values -- presumably a
        # unit suffix such as "m²"; TODO confirm against the live markup.
        value = compact(title=title)
        return None if value is None else value[:-2]

    location = compact(class_="p24_location")
    bedrooms = compact(title="Bedrooms")
    bathrooms = compact(title="Bathrooms")
    parking = compact(title="Parking Spaces")
    floor_size = size("Floor Size")
    erf_size = size("Erf Size")
    description = _span_text(listing, layout_characters, class_="p24_excerpt")

    return [location], [bedrooms, bathrooms, parking, floor_size, erf_size, description]

In [3]:
def scrape_property_data(property_24_url):
    """Parse one results page and return a feature row for every listing on it.

    Listings are collected from every ``<div class="p24_content">`` first and
    then from every ``<span class="p24_content">``. For each one the price is
    read from a ``p24_price`` element of the same tag type.

    Args:
        property_24_url: The HTML text of a results page. Despite the name,
            this value is handed straight to the HTML parser; it is the page
            markup, not a URL.

    Returns:
        A list of rows shaped
        ``[location, price, bedrooms, bathrooms, parking, floor_size,
        erf_size, description]``. ``price`` is the price text with all
        whitespace and every ``"R"`` removed, or ``None`` when the listing has
        no price element. Duplicate rows are not removed here.
    """
    property_24_page = BeautifulSoup(property_24_url, 'html.parser')
    listings_per_page = []

    # The div and span variants differ only in tag name, so one loop serves
    # both. Iterating "div" before "span" keeps the original row order.
    for tag_name in ("div", "span"):
        for listing in property_24_page.find_all(tag_name, class_="p24_content"):
            price_node = listing.find(tag_name, class_="p24_price")
            if price_node is None:
                price = None
            else:
                price = re.sub(r"\s+", "", price_node.text).replace("R", "")

            location, other_info = extract_features(listing)
            listings_per_page.append(location + [price] + other_info)

    return listings_per_page

In [4]:
# Header row first; every scraped listing row is appended after it.
listings_with_developments_and_poa = [["Location", "Price", "Bedrooms", "Bathrooms", "Parking", "Floor Size", "Erf Size", "Description"]]
# Rows whose price is not a plain amount ("From ..." or "POA"); they are
# collected here so they can be excluded later.
developments_and_poa = []

BASE_URL = "https://www.property24.com/for-sale/cape-town/western-cape/432"
LAST_PAGE = 440
REQUEST_TIMEOUT_SECONDS = 30
PRICE_COLUMN = 1  # position of "Price" in the header row above

# NOTE(review): requests are sent back to back with no delay between pages.
# If the site throttles or blocks the scraper, add a pause inside the loop.
for page_number in tqdm(range(1, LAST_PAGE + 1)):
    # The first results page has no "/pN" suffix.
    page_url = BASE_URL if page_number == 1 else "{}/p{}".format(BASE_URL, page_number)

    try:
        # Without a timeout a stalled connection would hang the run forever.
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        # Report and skip a failed page instead of silently parsing an error
        # page or losing every page scraped so far.
        print("Skipping page {}: {}".format(page_number, error))
        continue

    listings_per_page_dirty = scrape_property_data(response.text)
    # dict.fromkeys drops duplicate rows while keeping first-seen order, so
    # the result is the same on every run (a set would not guarantee order).
    listings_per_page = [
        list(row)
        for row in dict.fromkeys(tuple(listing) for listing in listings_per_page_dirty)
    ]

    listings_with_developments_and_poa.extend(listings_per_page)

    for listing in listings_per_page:
        # Only the price decides whether a row is a development / POA entry.
        # Scanning every column would also flag ordinary listings whose
        # description merely contains the word "From".
        price_text = str(listing[PRICE_COLUMN])
        if "From" in price_text or "POA" in price_text:
            developments_and_poa.append(listing)
    

HBox(children=(FloatProgress(value=0.0, max=440.0), HTML(value='')))




In [5]:
# NOTE(review): machine-specific absolute path; this cell only runs unchanged
# on the original author's computer. Consider a path relative to the notebook.
LISTINGS_CSV_PATH = r"C:\Users\kile\OneDrive - Esri South Africa\DSI Programme\Module 1\Twist Challenge\Data\listings.csv"

# A set of tuples makes each exclusion check a hash lookup instead of a scan
# through the whole developments_and_poa list.
excluded_rows = {tuple(listing) for listing in developments_and_poa}
# `listings` still starts with the header row, exactly as before.
listings = [
    listing
    for listing in listings_with_developments_and_poa
    if tuple(listing) not in excluded_rows
]

column_names, data_rows = listings[0], listings[1:]
# Pass the header as `columns=` rather than loading it as a data row and then
# promoting it. The index still starts at 1, as it did after the old slice.
listings_df = pd.DataFrame(
    data_rows,
    columns=column_names,
    index=pd.RangeIndex(1, len(data_rows) + 1),
)
listings_df.to_csv(LISTINGS_CSV_PATH, index=False)
print("Listings:\n")
listings_df

Listings:



Unnamed: 0,Location,Price,Bedrooms,Bathrooms,Parking,Floor Size,Erf Size,Description
1,Waterfront,28000000,3,3,2,192,,
2,Waterfront,55000000,3,3.5,2,410,,
3,Constantia,5995000,3,3,3,,1487,"If Position, Presentation and Pric..."
4,Constantia,4950000,,,,,2448,
5,Pinelands,1895000,2,1,2,75,,New release - Joint mandate.Move r...
...,...,...,...,...,...,...,...,...
8560,SeaPoint,3600000,1,1,,60,,PALM GARDEN RETREAT - SEA POINTLux...
8561,Observatory,3550000,,,,220,,3 Bedroom Apartment / Flat for Sal...
8562,CapeTownCityCentre,3950000,,,,188,,Selling price includes 4x parking ...
8563,Athlone,750000,,,,,605,Prime vacant land available in a s...
