In [1]:
# -*- coding: utf-8 -*-

import time
import pandas as pd
import zillow_functions as zl
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display


## Create list of search terms.
Function `zipcodes_list()` creates a list of US zip codes that will be passed to the scraper. For example, 
```
st = zipcodes_list(['10', '11', '606'])  
```
will yield every US zip code that begins with '10', begins with "11", or begins with "606" as a single list. I recommend using zip codes, as they seem to be the best option for catching as many house listings as possible. If you want to use search terms other than zip codes, simply skip running zipcodes_list() function below, and add a line of code to manually assign values to object `st`, for example:
```
st = ['Chicago', 'New Haven, CT', '77005', 'Jacksonville, FL']
```
Keep in mind that, for each search term, the number of listings scraped is capped at 520, so in using a search term like "Chicago" the scraper would end up missing most of the results. Param `st_items` can be either a list of zipcode strings, or a single zipcode string.

In [2]:
#zipcodes = ['84001','84002','84003','84004','84005','84006','84007','84008','84010','84011','84013','84014','84015','84016','84017','84018','84020', '84021', '84022', '84023', '84024', '84025', '84026', '84027', '84028', '84029', '84031', '84032', '84033', '84034', '84035', '84036', '84037', '84038', '84039', '84040', '84041', '84042', '84043', '84044', '84045', '84046', '84047', '84049', '84050', '84051', '84052', '84053', '84054', '84055', '84056', '84057', '84058', '84059', '84060', '84061', '84062', '84063', '84064', '84065', '84066', '84067', '84068', '84069', '84070', '84071', '84072', '84073', '84074', '84075', '84076', '84078', '84079', '84080', '84082', '84083', '84084', '84085', '84086', '84087', '84088', '84089', '84090', '84091', '84092', '84093', '84094', '84095', '84096', '84097', '84098', '84101', '84102', '84103', '84104', '84105', '84106', '84107', '84108', '84109', '84110', '84111', '84112', '84113', '84114', '84115', '84116', '84117', '84118', '84119', '84120', '84121', '84122', '84123', '84124', '84125', '84126', '84127', '84128', '84130', '84131', '84132', '84133', '84134', '84136', '84138', '84139', '84141', '84143', '84144', '84145', '84147', '84148', '84150', '84151', '84152', '84157', '84158', '84165', '84170', '84171', '84180', '84184', '84189', '84190', '84199', '84201', '84244', '84301', '84302', '84304', '84305', '84306', '84307', '84308', '84309', '84310', '84311', '84312', '84313', '84314', '84315', '84316', '84317', '84318', '84319', '84320', '84321', '84322', '84323', '84324', '84325', '84326', '84327', '84328', '84329', '84330', '84331', '84332', '84333', '84334', '84335', '84336', '84337', '84338', '84339', '84340', '84341', '84401', '84402', '84403', '84404', '84405', '84407', '84408', '84409', '84412', '84414', '84415', '84501', '84510', '84511', '84512', '84513', '84515', '84516', '84518', '84520', '84521', '84522', '84523', '84525', '84526', '84528', '84529', '84530', '84531', '84532', '84533', '84534', '84535', '84536', '84537', '84539', '84540', '84542', '84601', '84602', '84603', '84604', '84605', '84606', '84620', '84621', '84622', '84623', '84624', '84626', '84627', '84628', '84629', '84630', '84631', '84632', '84633', '84634', '84635', '84636', '84637', '84638', '84639', '84640', '84642', '84643', '84644', '84645', '84646', '84647', '84648', '84649', '84651', '84652', '84653', '84654', '84655', '84656', '84657', '84660', '84662', '84663', '84664', '84665', '84667', '84701', '84710', '84711', '84712', '84713', '84714', '84715', '84716', '84718', '84719', '84720', '84721', '84722', '84723', '84724', '84725', '84726', '84728', '84729', '84730', '84731', '84732', '84733', '84734', '84735', '84736', '84737', '84738', '84739', '84740', '84741', '84742', '84743', '84744', '84745', '84746', '84747', '84749', '84750', '84751', '84752', '84753', '84754', '84755', '84756', '84757', '84758', '84759', '84760', '84761', '84762', '84763', '84764', '84765', '84766', '84767', '84770','84771','84772','84773','84774','84775','84776','84779','84780','84781','84782','84783','84784','84790','84791']


In [10]:
st = zl.zipcodes_list(st_items = ['84058','84002','84003','84004','84005','84006','84007','84008','84010','84011','84013','84014','84015','84016','84017','84018','84020', '84021', '84022', '84023', '84024', '84025', '84026', '84027', '84028', '84029', '84031', '84032', '84033', '84034', '84035', '84036', '84037', '84038', '84039', '84040', '84041', '84042', '84043', '84044', '84045', '84046', '84047', '84049', '84050', '84051', '84052', '84053', '84054', '84055', '84056', '84057', '84058', '84059', '84060', '84061', '84062', '84063', '84064', '84065', '84066', '84067', '84068', '84069', '84070', '84071', '84072', '84073', '84074', '84075', '84076', '84078', '84079', '84080', '84082', '84083', '84084', '84085', '84086', '84087', '84088', '84089', '84090', '84091', '84092', '84093', '84094', '84095', '84096', '84097', '84098', '84101', '84102', '84103', '84104', '84105', '84106', '84107', '84108', '84109', '84110', '84111', '84112', '84113', '84114', '84115', '84116', '84117', '84118', '84119', '84120', '84121', '84122', '84123', '84124', '84125', '84126', '84127', '84128', '84130', '84131', '84132', '84133', '84134', '84136', '84138', '84139', '84141', '84143', '84144', '84145', '84147', '84148', '84150', '84151', '84152', '84157', '84158', '84165', '84170', '84171', '84180', '84184', '84189', '84190', '84199', '84601', '84602', '84603', '84604', '84605', '84606'])


display = Display(visible=0, size=(1200, 900))  
display.start()

# Initialize the webdriver.
driver = zl.init_driver("/usr/local/bin/chromedriver")

# Go to www.zillow.com/homes
zl.navigate_to_website(driver, "http://www.zillow.com/homes")

# Click the "buy" button.
#zl.click_buy_button(driver)

# Create 11 variables from the scrapped HTML data.
# These variables will make up the final output dataframe.
df = pd.DataFrame({'address' : [], 
                   'bathrooms' : [], 
                   'bedrooms' : [], 
                   'city' : [], 
                   'days_on_zillow' : [], 
                   'price' : [], 
                   'sale_type' : [], 
                   'state' : [], 
                   'sqft' : [], 
                   'url' : [], 
                   'zip' : []})

In [4]:
# Get total number of search terms.
num_search_terms = len(st)

In [8]:
import re as re
import time
import zipcode
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

In [11]:
search_bar = driver.wait.until(EC.presence_of_element_located(
            (By.ID, "citystatezip")))

TimeoutException: Message: 


In [None]:
# Start the scraping.
for k in range(num_search_terms):
    # Define search term (must be str object).
    search_term = st[k]

    # Enter search term and execute search.
    if zl.enter_search_term(driver, search_term):
        print("Entering search term number " + str(k+1) + 
              " out of " + str(num_search_terms))
    else:
        print("Search term " + str(k+1) + 
              " failed, moving onto next search term\n***")
        continue
    
    # Check to see if any results were returned from the search.
    # If there were none, move onto the next search.
    if zl.results_test(driver):
        print("Search " + str(search_term) + 
              " returned zero results. Moving onto the next search\n***")
        continue
    
    # Pull the html for each page of search results. Zillow caps results at 
    # 20 pages, each page can contain 26 home listings, thus the cap on home 
    # listings per search is 520.
    raw_data = zl.get_html(driver)
    print(str(len(raw_data)) + " pages of listings found")
    
    # Take the extracted HTML and split it up by individual home listings.
    listings = zl.get_listings(raw_data)
    
    # For each home listing, extract the 11 variables that will populate that 
    # specific observation within the output dataframe.
    for n in range(len(listings)):
        soup = BeautifulSoup(listings[n], "lxml")
        new_obs = []
        
        # List that contains number of beds, baths, and total sqft (and 
        # sometimes price as well).
        card_info = zl.get_card_info(soup)
        
        # Street Address
        new_obs.append(zl.get_street_address(soup))
        
        # Bathrooms
        new_obs.append(zl.get_bathrooms(card_info))
        
        # Bedrooms
        new_obs.append(zl.get_bedrooms(card_info))
        
        # City
        new_obs.append(zl.get_city(soup))
        
        # Days on the Market/Zillow
        new_obs.append(zl.get_days_on_market(soup))
        
        # Price
        new_obs.append(zl.get_price(soup, card_info))
        
        # Sale Type (House for Sale, New Construction, Foreclosure, etc.)
        new_obs.append(zl.get_sale_type(soup))
        
        # Sqft
        new_obs.append(zl.get_sqft(card_info))
        
        # State
        new_obs.append(zl.get_state(soup))
        
        # URL for each house listing
        new_obs.append(zl.get_url(soup))
        
        # Zipcode
        new_obs.append(zl.get_zipcode(soup))
    
        # Append new_obs to df as a new observation
        if len(new_obs) == len(df.columns):
            df.loc[len(df.index)] = new_obs

In [None]:
# Close the webdriver connection.
zl.close_connection(driver)

# Write df to CSV.
columns = ['address', 'city', 'state', 'zip', 'price', 'sqft', 'bedrooms', 
           'bathrooms', 'days_on_zillow', 'sale_type', 'url']
df = df[columns]
dt = time.strftime("%Y-%m-%d") + "_" + time.strftime("%H%M%S")
file_name = str(dt) + ".csv"
df.to_csv(file_name, index = False)