## Scrape Zillow Housing Data
- Get all the housing information for renting location around UCSD, i.e. in La Jolla 

Part 1: From General Overall Page
- bs4 + request
- Data Features: 
    1. Price
    2. Address
    3. beds, baths, sqft 

Part 2: From Individual Listing Page  
- Features: 
    1. More Info: Pets, Amenities, etc
    2. Rent ZEstimate History (User Table View)

### Part 1

In [53]:
import requests 
from bs4 import BeautifulSoup as soup

In [55]:
# sample url for La Jolla renting list
url = "https://www.zillow.com/la-jolla-san-diego-ca/rentals" # url refers to the location 

In [56]:
# taken header from browser http request
header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53',
    'refer': url} 

In [57]:
# get the website 
html = requests.get(url, headers=header) # don't to run it too much!!!
print(html.status_code)

200


In [58]:
# save it for later experiment
import os
if os.path.exists('zillow_la_jolla.txt'): os.remove('zillow_la_jolla.txt')
with open('zillow_la_jolla.txt', 'w') as file:
    file.write(html.text)

In [59]:
# function to read HTML file back to string and parse it using bs4
def parse_file(fp):
    with open(fp, mode='r') as file:
        html_txt = file.readlines()
    return soup(''.join(html_txt), 'html.parser')
html_txt = parse_file('zillow_la_jolla.txt')

In [60]:
# extract price, and detail info about the house
import re 
pattern=r"([1-9] bd)[s]*([1-9] ba)([0-9,\-]+ sqft)-(.*)"

info=[]
for house in html_txt.findAll('div', {'class':'list-card-heading'}):
    if len(house.text)==0: # if there is nothing (usually happens at the end)
        print('This is the end of the page')
        continue 
    ins={}
    price=house.find('div', {'class': 'list-card-price'}).text
    detail=house.find('ul', {'class': 'list-card-details'}).text
    match=re.match(pattern, detail)
    for index, cat in enumerate(['bds','ba','sqft','house-type']): 
        if match!=None: 
            ins[cat]=match.group(index+1)
        else:
            print("Reach the End")
    info.append(ins)

Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
Reach the End
This is the end of the page


In [61]:
# extract the address of the house 
link_addr = [] # format: (link, addr)
for addr in html_txt.findAll('div', {'class': 'list-card-info'}): 
    ad=addr.find('address', {'class': 'list-card-addr'})
    link=addr.find('a', {'class': 'list-card-link'})
    if ad and link: 
        link_addr.append((link.get('href'), ad.text))
link_addr

[('/b/solazzo-apartments-homes-la-jolla-ca-5Xk5YK/',
  'Solazzo Apartments Homes | 8506 Villa La Jolla Dr, La Jolla, CA'),
 ('/b/ocean-house-on-prospect-apartment-homes-la-jolla-ca-5Xkm98/',
  'Ocean House on Prospect Apartment Homes | 400 Prospect St, La Jolla, CA'),
 ('https://www.zillow.com/homedetails/8446-Via-Sonoma-UNIT-94-La-Jolla-CA-92037/16840698_zpid/',
  '8446 Via Sonoma UNIT 94, La Jolla, CA 92037'),
 ('https://www.zillow.com/homedetails/8524-Via-Mallorca-La-Jolla-Ca-Office-1-BEDROOM-La-Jolla-CA-92037/2067481695_zpid/',
  '8524 Via Mallorca La Jolla Ca Office #1-BEDROOM, La Jolla, CA 92037'),
 ('https://www.zillow.com/homedetails/3161-Morning-Way-La-Jolla-CA-92037/16835830_zpid/',
  '3161 Morning Way, La Jolla, CA 92037'),
 ('https://www.zillow.com/homedetails/356-Playa-Del-Norte-St-La-Jolla-CA-92037/2069746535_zpid/',
  '356 Playa Del Norte St, La Jolla, CA 92037'),
 ('https://www.zillow.com/homedetails/1804-Caminito-Ascua-La-Jolla-CA-92037/16857254_zpid/',
  '1804 Caminit

In [9]:
len(info), len(link_addr)

(9, 9)

In [10]:
# store data in a dataframe 
import pandas as pd 
import numpy as np
combined = []
for i,j in zip(info, link_addr): 
    if len(i)==0 or len(j)==0: 
        continue
    i.update(dict(zip(['link', 'address'], j)))
    combined.append(i)

In [11]:
df = pd.DataFrame(combined)
# remove unit
df['bds'] = df['bds'].str.replace('[bd,]', '', regex=True)
df['ba'] = df['ba'].str.replace('[ba,]', '', regex=True)
df['sqft'] = df['sqft'].str.replace('[sqft,\-]', '', regex=True)
df = df.replace(' ', 0) # should prob leave it as nan
df.astype({'bds': 'int32',
            'ba': 'int32', 
            'sqft': 'int32'})
df

Unnamed: 0,bds,ba,sqft,house-type,link,address
0,3,3,1992,Coming soon,https://www.zillow.com/homedetails/8619-Via-Ma...,"8619 Via Mallorca UNIT C, La Jolla, CA 92037"
1,4,3,2206,Coming soon,https://www.zillow.com/homedetails/8845-Robin-...,"8845 Robin Hood Ln, La Jolla, CA 92037"
2,5,3,2772,House for sale,https://www.zillow.com/homedetails/8269-Sugarm...,"8269 Sugarman Dr, La Jolla, CA 92037"
3,3,3,1799,House for sale,https://www.zillow.com/homedetails/253-Rosemon...,"253 Rosemont St, La Jolla, CA 92037"
4,4,3,1861,Multi-family home for sale,https://www.zillow.com/homedetails/7256-7258-L...,"7256-7258 La Jolla Blvd, La Jolla, CA 92037"
5,3,3,2284,House for sale,https://www.zillow.com/homedetails/5646-Chelse...,"5646 Chelsea Ave, La Jolla, CA 92037"
6,2,2,992,Condo for sale,https://www.zillow.com/homedetails/457-Coast-B...,"457 Coast Blvd UNIT 403, La Jolla, CA 92037"
7,1,1,668,Condo for sale,https://www.zillow.com/homedetails/860-Turquoi...,"860 Turquoise St UNIT 224, San Diego, CA 92109"


In [39]:
# TODO: Automate multipage scraping 
# TODO: Increase search range to include more housing 
# TODO: Filter the rent data

In [66]:
# Using Dataox's approach 
import numpy as np 
import pandas as pd 
import re 
import lxml 
from lxml.html.soupparser import fromstring
import numbers 
import requests 
from bs4 import BeautifulSoup as soup

In [67]:
request_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webq,image/apng,*/*;1=0.8', 
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1', 
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53',
}

url = 'https://www.zillow.com/la-jolla-san-diego-ca/rentals'
with requests.Session() as session: 
    response = session.get(url, headers=request_headers)

In [71]:
soup = soup(response.content, 'html.parser')

In [78]:
addresses = []
for i in soup:
    address = soup.find_all(class_='list-card-addr')
    addresses.append(address)
print(addresses[0])
print(addresses[1])

[<address class="list-card-addr">Solazzo Apartments Homes | 8506 Villa La Jolla Dr, La Jolla, CA</address>, <address class="list-card-addr">2420 Torrey Pines Rd UNIT B303, La Jolla, CA 92037</address>, <address class="list-card-addr">Ocean House on Prospect Apartment Homes | 400 Prospect St, La Jolla, CA</address>, <address class="list-card-addr">8446 Via Sonoma UNIT 94, La Jolla, CA 92037</address>, <address class="list-card-addr">3161 Morning Way, La Jolla, CA 92037</address>, <address class="list-card-addr">356 Playa Del Norte St, La Jolla, CA 92037</address>, <address class="list-card-addr">1804 Caminito Ascua, La Jolla, CA 92037</address>, <address class="list-card-addr">8529 Villa La Jolla Dr APT E, La Jolla, CA 92037</address>, <address class="list-card-addr">3205 Via Alicante, La Jolla, CA 92037</address>]
[<address class="list-card-addr">Solazzo Apartments Homes | 8506 Villa La Jolla Dr, La Jolla, CA</address>, <address class="list-card-addr">2420 Torrey Pines Rd UNIT B303, La

### Part 2 

In [17]:
df.iloc[0]['link']

'https://www.zillow.com/homedetails/8966-Cliffridge-Ave-La-Jolla-CA-92037/16835000_zpid/'

In [120]:
# get detail specifications from specific rent pages 
# user the first link as an example 
ex_link = df.iloc[0]['link']

# get the website 
page = requests.get(ex_link, headers=header) # don't to run it too much!!!
print(page.status_code)

# save it for later experiment
if os.path.exists('page_example.txt'): os.remove('page_example.txt')
with open('page_example.txt', 'w') as file:
    file.write(page.text)

200


In [5]:
page_txt = parse_file('page_example.txt')

The page elements has dynamic class ids which makes it harder to parse

In [38]:
import re

text = page_txt.text
re.match(r'(Overview)+', text)

### Use Selenium to simulate scrolling

In [1]:
from msedge.selenium_tools import Edge
from msedge.selenium_tools import EdgeOptions
from selenium.common.exceptions import NoSuchElementException
import time
import csv
import requests
from bs4 import BeautifulSoup as soup

In [2]:
# initializes the webscraper
options = EdgeOptions()
options.use_chromium = True
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# options.add_argument('--headless')
driver = Edge(options = options)

In [3]:
# open target page
url = "https://www.zillow.com/la-jolla-san-diego-ca/rentals"
header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53', 'refer': url}
driver.get(url)
time.sleep(3)

In [14]:
# incrementally scroll to the bottom of the page then extract the page html
driver.execute_script("document.getElementById('search-page-list-container').scrollTo(0, 600)") # 600 per 3 rows of listings 

In [15]:
driver.execute_script("document.getElementById('search-page-list-container').scrollTo(0, 1200)")

In [16]:
driver.execute_script("document.getElementById('search-page-list-container').scrollTo(0, 1800)")

In [None]:
# parse source elements with bs4