# Data Scraping

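We're going to scrape property listing data from the web. This notebook originally named the https://www.pamgolding.co.za/ website as the source; the code below actually loads listings from https://www.property24.com/.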

We aim to extract the following features:
- **houseID (int)**: unique property identifier
- **type (string)**: 'house' or 'apartment'
- **location/area (string)**: area of the property; each area contains many suburbs
- **Building size (int)**: size of the building in $m^2$
- **Number of bedrooms (float)**: number of bedrooms on the property
- **Number of Bathrooms (float)**: number of bathrooms on the property
- **Number of Garages (float)**: number of garages on the property
- **price (int)**: price of the property in ZAR



Todo:
- data-date (datetime): date on which property was first listed
- data-isonshow (bool): 'True' if property was on show at the time
    of scraping the data

In [3]:
import requests as rq
import bs4 as bs
import traceback
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
import glob, os, time

In [56]:
# Start a Chrome session through Selenium and open the listings page that
# will be scraped. If this cell raises an error, ChromeDriver most likely
# still needs to be installed and made reachable (here via an explicit path).
chromedriver_path = "/home/cate/Downloads/chromedriver_linux64/chromedriver"
listings_url = "https://www.property24.com/for-sale/cape-town/western-cape/432?PropertyCategory=House%2cApartmentOrFlat%2cTownhouse"
driver = webdriver.Chrome(executable_path=chromedriver_path)
driver.get(listings_url)

In [18]:
# Parse the HTML currently loaded in the browser using the lxml parser.
rendered_html = driver.page_source
page_soup = bs.BeautifulSoup(rendered_html, 'lxml')

In [None]:
#dict_data = {"location" :[], "price":[], "floor_size":[], "bathrooms":[], "bedrooms":[],"parking":[] }
#info = page_soup.find_all("div", class_= "p24_information")
# for values in info:
#     price = values.find('div', class_= 'p24_price').text
#     location = values.find('span', class_= "p24_location").text
    
#     dict_data["location"].append(location)
#     dict_data["price"].append(price)

In [16]:
#icons = page_soup.find_all("div", class_= "p24_icons")

In [54]:
#all_items = page_soup.find_all("div", class_="js_listingResultsContainer")

In [68]:
icons

[]

In [69]:
# Accumulator with one list per scraped field; getValues appends to these lists.
dict_data = {
    field: []
    for field in ("location", "price", "floor_size", "bathrooms", "bedrooms", "parking")
}
# Listing tiles (searched for price/location) and icon groups (searched for
# size/bathrooms/bedrooms/parking) found in the parsed page.
info = page_soup.find_all("div", class_="p24_regularTile js_rollover_container")
icons = page_soup.find_all("span", class_="p24_icons")
def getValues(icons, info):
    """Append the fields of every listing element to the module-level ``dict_data``.

    A field whose element is not found is recorded as ``""`` so that each
    list receives exactly one entry per processed element.

    Args:
        icons: Iterable of parsed elements that are searched for the
            ``p24_size`` span and the spans titled ``Bathrooms``,
            ``Bedrooms`` and ``Parking Spaces``.
        info: Iterable of parsed elements that are searched for the
            ``p24_price`` and ``p24_location`` spans.

    Returns:
        The module-level ``dict_data`` dict (the same object that was
        appended to), with keys ``price``, ``location``, ``floor_size``,
        ``bathrooms``, ``bedrooms`` and ``parking``.
    """
    # NOTE(review): ``info`` and ``icons`` are walked independently, so the
    # price/location lists line up with the other lists only if both
    # iterables have one element per listing, in the same order -- confirm.
    for tile in info:
        dict_data["price"].append(
            _text_or_empty(tile.find('span', class_='p24_price')))
        dict_data["location"].append(
            _text_or_empty(tile.find('span', class_="p24_location")))

    for icon_group in icons:
        dict_data["floor_size"].append(
            _icon_value(icon_group, "span", class_="p24_size"))
        dict_data["bathrooms"].append(
            _icon_value(icon_group, "span", {"title": "Bathrooms"}))
        dict_data["bedrooms"].append(
            _icon_value(icon_group, "span", {"title": "Bedrooms"}))
        dict_data["parking"].append(
            _icon_value(icon_group, "span", {"title": "Parking Spaces"}))
    return dict_data


def _text_or_empty(element):
    """Return ``element.text``, or ``""`` when ``element`` is ``None``."""
    return "" if element is None else element.text


def _icon_value(icon_group, *find_args, **find_kwargs):
    """Return the text of the first ``span`` nested in the matched element, or ``""``."""
    wrapper = icon_group.find(*find_args, **find_kwargs)
    if wrapper is None:
        return ""
    # The value sits in a nested span; treat a missing one as "no value"
    # instead of failing on ``None.text``.
    return _text_or_empty(wrapper.find("span"))

In [70]:
# getValues appends to the module-level dict_data and returns that same dict,
# so this fills dict_data with the fields of the elements found above.
dict_data = getValues(icons, info)

In [71]:
dict_data

{'location': ['Tokai',
  'Camps Bay',
  'Kenilworth Upper',
  'Gardens',
  'Sea Point',
  'Parkwood',
  'Bantry Bay',
  'Tokai',
  'Sea Point',
  'Kenilworth',
  'Sea Point',
  'Zonnebloem',
  'Retreat',
  'Thornton',
  'Waterfront',
  'Tamboerskloof',
  'Langa',
  'Southfield',
  'Pinelands',
  'Claremont Upper',
  'Observatory'],
 'price': ['\n\n            R 4\xa0450\xa0000\n        ',
  '\n\n            R 13\xa0950\xa0000\n        ',
  '\n\n            R 20\xa0000\xa0000\n        ',
  '\n\n            R 4\xa0850\xa0000\n        ',
  '\n\n            R 4\xa0950\xa0000\n        ',
  '\n\n            R 690\xa0000\n        ',
  '\n\n            R 3\xa0500\xa0000\n        ',
  '\n\n            R 4\xa0450\xa0000\n        ',
  '\n\n            R 6\xa0250\xa0000\n        ',
  '\n\n            R 1\xa0500\xa0000\n        ',
  '\n\n            R 2\xa0850\xa0000\n        ',
  '\n\n            R 1\xa0090\xa0000\n        ',
  '\n\n            R 1\xa0695\xa0000\n        ',
  '\n\n            R 72

In [72]:
import csv
from csv import writer
def append_list_as_row(file_name, dict_data, field_names):
    """Append ``dict_data`` as a single CSV row at the end of ``file_name``.

    Args:
        file_name: Path of the CSV file; it is opened in append mode.
        dict_data: Mapping from field name to the value for that column.
        field_names: Field names that fix the column order of the row.

    No header row is written; each call adds exactly one row.
    """
    with open(file_name, 'a+', newline='') as csv_handle:
        row_writer = csv.DictWriter(csv_handle, fieldnames=field_names)
        row_writer.writerow(dict_data)

In [78]:
# Scrape every results page: parse the page, append its fields to the CSV,
# then follow the "Next" link until that fails.
driver = webdriver.Chrome(executable_path="/home/cate/Downloads/chromedriver_linux64/chromedriver")
driver.get("https://www.property24.com/for-sale/cape-town/western-cape/432?PropertyCategory=House%2cApartmentOrFlat%2cTownhouse")
# The implicit wait is a session-level setting, so it is configured once
# instead of on every iteration.
driver.implicitly_wait(10)

count = 0  # number of pages written so far

csv_file = "final.csv"

while True:
    try:
        page_soup = bs.BeautifulSoup(driver.page_source, 'lxml')
        icons = page_soup.find_all("span", class_="p24_icons")
        info = page_soup.find_all("div", class_="p24_regularTile js_rollover_container")
        # getValues appends to the module-level dict_data, so it must be
        # reset here or every page would repeat the previous pages' values.
        dict_data = {"location": [], "price": [], "floor_size": [], "bathrooms": [], "bedrooms": [], "parking": []}
        dict_data = getValues(icons, info)
        field_names = dict_data.keys()
        append_list_as_row(csv_file, dict_data, field_names)
        count += 1
        print(f'{count}\r', end="")
        # Raises when no "Next" link is found, which ends the loop below.
        driver.find_element_by_link_text("Next").click()
        # Give the next page time to load before reading page_source again.
        time.sleep(5)

    except Exception:
        # Any failure stops the scrape, not only a missing "Next" link; the
        # traceback shows which one actually happened.
        print("Reached bottom of page")
        traceback.print_exc()
        break

503

KeyboardInterrupt: 