In [1]:
import re as re
import time
import zipcode
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def zipcodes_list(st_items):
    # If st_items is a single zipcode string.
    if isinstance(st_items, str):
        zc_objects = zipcode.islike(st_items)
        output = [str(i).split(" ", 1)[1].split(">")[0] for i in zc_objects]
    # If st_items is a list of zipcode strings.
    elif isinstance(st_items, list):
        zc_objects = [n for i in st_items for n in zipcode.islike(str(i))]
        output = [str(i).split(" ", 1)[1].split(">")[0] for i in zc_objects]
    else:
        raise ValueError("arg 'st_items' must be of type str or list")
    return(output)

In [3]:
def init_driver(file_path):
    # Starting maximized fixes https://github.com/ChrisMuir/Zillow/issues/1
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(executable_path=file_path, 
                              chrome_options=options)
    driver.wait = WebDriverWait(driver, 10)
    return(driver)

In [4]:
def _is_element_displayed(driver, elem_text, elem_type):
    if elem_type == "class":
        try:
            out = driver.find_element_by_class_name(elem_text).is_displayed()
        except (NoSuchElementException, TimeoutException):
            out = False
    elif elem_type == "css":
        try:
            out = driver.find_element_by_css_selector(elem_text).is_displayed()
        except (NoSuchElementException, TimeoutException):
            out = False
    else:
        raise ValueError("arg 'elem_type' must be either 'class' or 'css'")
    return(out)

In [5]:
def _pause_for_captcha(driver):
    while True:
        time.sleep(30)
        if not _is_element_displayed(driver, "captcha-container", "class"):
            break

In [6]:
def check_for_captcha(driver):
    if _is_element_displayed(driver, "captcha-container", "class"):
        print("\nCAPTCHA!\n"\
              "Manually complete the captcha requirements.")
        _pause_for_captcha(driver)

In [7]:
def navigate_to_website(driver, site):
    driver.get(site)
    # Check to make sure a captcha page is not displayed.
    check_for_captcha(driver)

In [8]:
def enter_search_term(driver, search_term):
    if not isinstance(search_term, str):
        search_term = str(search_term)
    try:
        search_bar = driver.wait.until(EC.presence_of_element_located(
            (By.ID, "citystatezip")))
        button = driver.wait.until(EC.element_to_be_clickable(
            (By.CLASS_NAME, "zsg-icon-searchglass")))
        search_bar.clear()
        time.sleep(3)
        search_bar.send_keys(search_term)
        time.sleep(3)
        button.click()
        time.sleep(3)
        return(True)
    except (TimeoutException, NoSuchElementException):
        return(False)
    # Check to make sure a captcha page is not displayed.
    check_for_captcha(driver)

In [9]:
def test_for_no_results(driver):
    # Check to see if the "zoom out" msg exists (an indication that no results
    # were returned from the search).
    no_results = _is_element_displayed(driver, ".zoom-out-message", "css")
    # If the zoom-out msg is not displayed, check for "invalid zip" msg.
    if not no_results:
        no_results = _is_element_displayed(driver, "zsg-icon-x-thick", "class")
    # Check to make sure a captcha page is not displayed.
    check_for_captcha(driver)
    return(no_results)

In [10]:
def get_html(driver):
    output = []
    keep_going = True
    while keep_going:
        # Pull page HTML
        try:
            output.append(driver.page_source)
        except TimeoutException:
            pass
        # Check to see if a "next page" link exists.
        keep_going = _is_element_displayed(driver, "zsg-pagination-next", 
                                           "class")
        if keep_going:
            # Test to ensure the "updating results" image isnt displayed. 
            # Will try up to 5 times before giving up, with a 5 second wait 
            # between each try.             
            tries = 5
            cover = _is_element_displayed(driver, 
                                          "list-loading-message-cover", 
                                          "class")
            while cover and tries > 0:
                time.sleep(5)
                tries -= 1
                cover = _is_element_displayed(driver, 
                                              "list-loading-message-cover", 
                                              "class")
            # If the "updating results" image is confirmed to be gone 
            # (cover == False), click next page. Otherwise, give up on trying 
            # to click thru to the next page of house results, and return the 
            # results that have been scraped up to the current page.
            if not cover:
                try:
                    driver.wait.until(EC.element_to_be_clickable(
                        (By.CLASS_NAME, "zsg-pagination-next"))).click()
                    time.sleep(3)
                    # Check to make sure a captcha page is not displayed.
                    check_for_captcha(driver)
                except TimeoutException:
                    keep_going = False
            else:
                keep_going = False
    return(output)

In [11]:
def get_listings(list_obj):
    output = []
    for i in list_obj:
        htmlSplit = i.split('" id="zpid_')[1:]
        output += htmlSplit
    return(output)

In [12]:
# Helper function for testing if an object is "empty" or not.
def _is_empty(obj):
    if any([len(obj) == 0, obj == "null"]):
        return(True)
    else:
        return(False)

In [13]:
# For most listings, card_info will contain info on number of bedrooms, 
# number of bathrooms, square footage, and sometimes price.
def get_card_info(soup_obj):
    try:
        card = soup_obj.find(
            "span", {"class" : "zsg-photo-card-info"}).get_text().split(u" \xb7 ")
    except (ValueError, AttributeError):
        card = "NA"
    if _is_empty(card):
        card = "NA"
    return(card)

In [14]:
def get_street_address(soup_obj):
    try:
        street = soup_obj.find(
            "span", {"itemprop" : "streetAddress"}).get_text().strip()
    except (ValueError, AttributeError):
        street = "NA"
    if _is_empty(street):
        street = "NA"
    return(street)

In [15]:
def get_city(soup_obj):
    try:
        city = soup_obj.find(
            "span", {"itemprop" : "addressLocality"}).get_text().strip()
    except (ValueError, AttributeError):
        city = "NA"
    if _is_empty(city):
        city = "NA"
    return(city)

In [16]:
def get_state(soup_obj):
    try:
        state = soup_obj.find(
            "span", {"itemprop" : "addressRegion"}).get_text().strip()
    except (ValueError, AttributeError):
        state = "NA"
    if _is_empty(state):
        state = "NA"
    return(state)

In [17]:
def get_zipcode(soup_obj):
    try:
        zipcode = soup_obj.find(
            "span", {"itemprop" : "postalCode"}).get_text().strip()
    except (ValueError, AttributeError):
        zipcode = "NA"
    if _is_empty(zipcode):
        zipcode = "NA"
    return(zipcode)

In [18]:
def get_price(soup_obj, list_obj):
    # Look for price within the BeautifulSoup object.
    try:
        price = soup_obj.find(
            "span", {"class" : "zsg-photo-card-price"}).get_text().strip()
    except (ValueError, AttributeError):
        # If that fails, look for price within list_obj (object "card_info").
        try:
            price = [n for n in list_obj 
                         if any(["$" in n, "K" in n, "k" in n])]
            if len(price) > 0:
                price = price[0].split(" ")
                price = [n for n in price if re.search("\d", n)]
                if len(price[0]) > 0:
                    price = price[0]
                else:
                    price = "NA"
            else:
                price = "NA"
        except (ValueError, AttributeError):
            price = "NA"
    if _is_empty(price):
        price = "NA"
    if price != "NA":
        # Transformations to the price string.
        price = price.replace(",", "").replace("+", "").replace("$", "").lower()
        if "k" in price:
            price = price.split("k")[0].strip()
            price = price + "000"
        if "m" in price:
            price = price.split("m")[0].strip()
            if "." not in price:
                price = price + "000000"
            else:
                pricelen = len(price.split(".")[0]) + 6
                price = price.replace(".", "")
                price = price + ((pricelen - len(price)) * "0")
        if _is_empty(price):
            price = "NA"

In [19]:
def get_sqft(list_obj):
    sqft = [n for n in list_obj if "sqft" in n]
    if len(sqft) > 0:
        try:
            sqft = float(
                sqft[0].split("sqft")[0].strip().replace(",", "").replace("+", "")
            )
        except (ValueError, IndexError):
            sqft = "NA"
        if sqft == 0:
            sqft = "NA"
    else:
        sqft = "NA"
    return(sqft)

In [20]:
def get_bedrooms(list_obj):
    beds = [n for n in list_obj if any(["bd" in n, "tudio" in n])]
    if len(beds) > 0:
        beds = beds[0].lower()
        if beds == "studio":
            return(0.0)
        try:
            beds = float(beds.split("bd")[0].strip())
        except (ValueError, IndexError):
            beds = "NA"
    else:
        beds = "NA"
    return(beds)

In [21]:
def get_bathrooms(list_obj):
    baths = [n for n in list_obj if "ba" in n]
    if len(baths) > 0:
        try:
            baths = float(baths[0].split("ba")[0].strip())
        except (ValueError, IndexError):
            baths = "NA"
        if baths == 0:
            baths = "NA"
    else:
        baths = "NA"
    return(baths)

In [22]:
def get_days_on_market(soup_obj):
    try:
        dom = soup_obj.find_all(
            "ul", {"class" : "zsg-list_inline zsg-photo-card-badge"})
        if dom is not None:
            dom = [n.get_text().strip().lower() for n in dom]
            dom = [n for n in dom if "zillow" in n]
            if len(dom) > 0:
                dom = int(dom[0].split(" ")[0])
            else:
                dom = "NA"
        else:
            dom = "NA"
    except (ValueError, AttributeError):
        dom = "NA"
    return(dom)

In [23]:
def get_sale_type(soup_obj):
    try:
        sale_type = soup_obj.find(
            "span", {"class" : "zsg-photo-card-status"}).get_text().strip()
    except (ValueError, AttributeError):
        sale_type = "NA"
    if _is_empty(sale_type):
        sale_type = "NA"
    return(sale_type)

In [24]:
def get_url(soup_obj):
    # Try to find url in the BeautifulSoup object.
    href = [n["href"] for n in soup_obj.find_all("a", href = True)]
    url = [i for i in href if "homedetails" in i]
    if len(url) > 0:
        url = "http://www.zillow.com/homes/for_sale/" + url[0]
    else:
        # If that fails, contruct the url from the zpid of the listing.
        url = [i for i in href if "zpid" in i and "avorite" not in i]
        if len(url) > 0:
            zpid = re.findall(r"\d{8,10}", url[0])
            if zpid is not None and len(zpid) > 0:
                url = "http://www.zillow.com/homes/for_sale/" \
                        + str(zpid[0]) \
                        + "_zpid/any_days/globalrelevanceex_sort/29.759534," \
                        + "-95.335321,29.675003,-95.502863_rect/12_zm/"
            else:
                url = "NA"
        else:
            url = "NA"
    return(url)

In [25]:
def close_connection(driver):
    driver.quit()

In [26]:
# Create list of search terms.
# Function zipcodes_list() creates a list of US zip codes that will be 
# passed to the scraper. For example, st = zipcodes_list(["10", "11", "606"])  
# will yield every US zip code that begins with "10", begins with "11", or 
# begins with "606", as a list object.
# I recommend using zip codes, as they seem to be the best option for catching
# as many house listings as possible. If you want to use search terms other 
# than zip codes, simply skip running zipcodes_list() function below, and add 
# a line of code to manually assign values to object st, for example:
# st = ["Chicago", "New Haven, CT", "77005", "Jacksonville, FL"]
# Keep in mind that, for each search term, the number of listings scraped is 
# capped at 520, so in using a search term like "Chicago" the scraper would 
# end up missing most of the results.
# Param st_items can be either a list of zipcode strings, or a single zipcode 
# string.
st = zl.zipcodes_list(st_items = ["100", "770"])

# Initialize the webdriver.
driver = zl.init_driver("C:/Users/username/chromedriver.exe")

# Go to www.zillow.com/homes
zl.navigate_to_website(driver, "http://www.zillow.com/homes")

# Click the "buy" button.
zl.click_buy_button(driver)

# Get total number of search terms.
num_search_terms = len(st)

# Initialize list obj that will house all scraped data.
output_data = []

# Start the scraping.
for idx, term in enumerate(st):
    # Enter search term and execute search.
    if zl.enter_search_term(driver, term):
        print("Entering search term %s of %s" % 
              (str(idx + 1), str(num_search_terms)))
    else:
        print("Search term %s failed, moving on to next search term\n***" % 
              str(idx + 1))
        continue

    # Check to see if any results were returned from the search.
    # If there were none, move onto the next search.
    if zl.test_for_no_results(driver):
        print("Search %s returned zero results. Moving on to next search\n***" %
              str(term))
        continue

    # Pull the html for each page of search results. Zillow caps results at 
    # 20 pages, each page can contain 26 home listings, thus the cap on home 
    # listings per search is 520.
    raw_data = zl.get_html(driver)
    print("%s pages of listings found" % str(len(raw_data)))

    # Take the extracted HTML and split it up by individual home listings.
    listings = zl.get_listings(raw_data)
    print("%s home listings scraped\n***" % str(len(listings)))

    # For each home listing, extract the 11 variables that will populate that 
    # specific observation within the output dataframe.
    for home in listings:
        soup = BeautifulSoup(home, "lxml")
        new_obs = []

        # List that contains number of beds, baths, and total sqft (and 
        # sometimes price as well).
        card_info = zl.get_card_info(soup)

        # Street Address
        new_obs.append(zl.get_street_address(soup))
        
        # City
        new_obs.append(zl.get_city(soup))
        
        # State
        new_obs.append(zl.get_state(soup))
        
        # Zipcode
        new_obs.append(zl.get_zipcode(soup))
        
        # Price
        new_obs.append(zl.get_price(soup, card_info))
        
        # Sqft
        new_obs.append(zl.get_sqft(card_info))
        
        # Bedrooms
        new_obs.append(zl.get_bedrooms(card_info))
        
        # Bathrooms
        new_obs.append(zl.get_bathrooms(card_info))
        
        # Days on the Market/Zillow
        new_obs.append(zl.get_days_on_market(soup))
        
        # Sale Type (House for Sale, New Construction, Foreclosure, etc.)
        new_obs.append(zl.get_sale_type(soup))
        
        # URL for each house listing
        new_obs.append(zl.get_url(soup))
        
        # Append new_obs to list output_data.
        output_data.append(new_obs)

# Close the webdriver connection.
zl.close_connection(driver)

# Write data to data frame, then to CSV file.
file_name = "%s_%s.csv" % (str(time.strftime("%Y-%m-%d")), 
                           str(time.strftime("%H%M%S")))
columns = ["address", "city", "state", "zip", "price", "sqft", "bedrooms", 
           "bathrooms", "days_on_zillow", "sale_type", "url"]
pd.DataFrame(output_data, columns = columns).to_csv(
    file_name, index = False, encoding = "UTF-8"
)

NameError: name 'zl' is not defined

In [3]:
df_counties = pd.read_csv('list-counties-us-436j.csv')

In [4]:
df_counties.head()

Unnamed: 0,County or equivalent,State or district,Core Based Statistical Area,Combined Statistical Area
0,Autauga County,Alabama,"Montgomery, AL Metropolitan Statistical Area",
1,Baldwin County,Alabama,"Daphne-Fairhope-Foley, AL Micropolitan Statist...","Mobile-Daphne-Fairhope, AL Combined Statistica..."
2,Barbour County,Alabama,,
3,Bibb County,Alabama,"Birmingham-Hoover, AL Metropolitan Statistical...","Birmingham-Hoover-Talladega, AL Combined Stati..."
4,Blount County,Alabama,"Birmingham-Hoover, AL Metropolitan Statistical...","Birmingham-Hoover-Talladega, AL Combined Stati..."


In [5]:
counties = [num for num in df_counties['County or equivalent']]

In [None]:
r=requests.get()