### Scrap Zillow Data 
#### Features
* Zillow_ID 
* Price
* SQFT
* Bedrooms
* Bathrooms
* Street_Address 
* City
* State
* Zip_Code
* Days_on_Market
* Type
* Laundry
* Heating
* Cooling 
* Pets
* Parking
* Neighbourhood



In [269]:
import re as re
import time
import zipcode
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

import time
import pandas as pd
from bs4 import BeautifulSoup

#### Automatically Find Zipcodes of Nearby place

In [290]:
#zipcode.islike: list of Zip objects that begin with given prefix.
def find_all_zipcode(zip_items):
    #single sting
    if type(zip_items) == str:
        zipcode_obj = zipcode.islike(zip_items)
        output = re.findall('\d+', str(zipcode_obj))
    elif type(zip_items) == list:
        output = [n for each in zip_items for n in re.findall('\d+', str(zipcode.islike(each)))]       
    else:
        raise ValueError("input 'zip_items' must be of type str or list")
    return output

#### Open Chrome Driver

In [377]:
def open_driver(driverpath,url):
    global driver
    driver = webdriver.Chrome(driverpath)
    driver.get(url) #'https://www.zillow.com/homes/for_rent/'
    return driver

#### Open the Zillow Rent Page, Input the Zipcode
#### Scrap the page sources and Keep click the next page
* Features : Zillow_ID, Price, SQFT, Bedrooms, Bathrooms, Street_Address, City, State, Zip_Code, Days_on_Market

In [378]:
def input_grab(driver,search_term):
    wait = WebDriverWait(driver, 10)
    
    actions = wait.until(EC.presence_of_element_located((By.ID, "citystatezip")))
    button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "zsg-icon-searchglass")))
    actions.clear()
    time.sleep(3)
    actions.send_keys(search_term)
    time.sleep(3)
    button.click()
    time.sleep(3)  
    
    output = []
    while True:
    # grab the data
        output.append(driver.page_source)
        # click next link
        try:             
            element = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'zsg-pagination-next')))
            element.click()
            time.sleep(3)
        except TimeoutException:
            break
    return output

####  Split the raw HTML into segments, one for each listing.

In [379]:
def get_listings(list_obj):
    
    output = []
    for i in list_obj:
        htmlSplit = i.split('" id="zpid_')[1:]
        output += htmlSplit
    print 
    print str(len(output)) , " results scraped\n***"
    return output

In [8]:
find_all_zipcode('0703')

['07030', '07031', '07032', '07033', '07034', '07035', '07036', '07039']

In [380]:
driver = open_driver('/Users/Lucinda/Downloads/chromedriver', 'https://www.zillow.com/homes/for_rent/')

In [366]:
# It takes some minutes
raw_data = input_grab(driver, '10460')

In [367]:
listings = get_listings(raw_data)


80  results scraped
***


#### Property detail information
* As the webpage that shows different houses do not contain the specific property info, let's click the url of each house.
* Scrap Type, Laundry, Heating, Cooling, Pets, Parking

In [368]:
def scrap_details(url):   
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "hdp-fact-ataglance-heading")))#Feature
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "hdp-fact-ataglance-value")))#value
    
    features = driver.find_elements_by_class_name('hdp-fact-ataglance-heading')
    values = driver.find_elements_by_class_name('hdp-fact-ataglance-value') 
    soup_n = BeautifulSoup(driver.page_source, 'lxml')
    neighbourhood = soup_n.find('h2',{'data-module': "neighborhood"}).get_text()
    
    return neighbourhood, values


In [381]:
Street_Address = []
City = []
State = []
Zip_Code = []
Price = []
SQFT = []
Bedrooms = []
Bathrooms = []
Days_on_Market = []
URL = []
Zillow_ID = []
Neighborhood = []
Type = []
Laundry = []
Heating = []
Cooling = []
Pets = []
Parking = []
House_URL = []
j = 0
for i in  range(len(listings)): 
    soup = BeautifulSoup(listings[i], "lxml")    
    try:
        Street_Address.append(soup.find('span', {"itemprop" : "streetAddress"}).get_text())
    except AttributeError:
        Street_Address.append("NA")            
    try:
        City.append(soup.find('span', {"itemprop": "addressLocality"}).get_text())
    except AttributeError:
        City.append("NA")        
    try:
        State.append(soup.find('span', {"itemprop": "addressRegion"}).get_text())
    except AttributeError:
        State.append("NA")     
    try:
        Zip_Code.append(soup.find('span', {"itemprop": "postalCode"}).get_text())
    except AttributeError:
        Zip_Code.append("NA")
    try:
        Price.append(
            int(soup.find('span', {"class": "zsg-photo-card-price"}).get_text().split('/')[0].replace('$', '').replace(',', '')))
    except AttributeError:
        Price.append("NA")
    
    try: 
        Bedrooms.append([int(s) for s in soup.find('span', {"class": "zsg-photo-card-info"}).get_text().replace(',', '').split() if s.isdigit()][0])
    except (ValueError, IndexError,AttributeError):
        Bedrooms.append("NA")
    
    try: 
        Bathrooms.append([int(s) for s in soup.find('span', {"class": "zsg-photo-card-info"}).get_text().replace(',', '').split() if s.isdigit()][1])

    except (ValueError, IndexError,AttributeError):
        Bathrooms.append("NA")
    
    try:   
        SQFT.append([int(s) for s in soup.find('span', {"class": "zsg-photo-card-info"}).get_text().replace(',', '').split() if s.isdigit()][2])
    except (ValueError, IndexError,AttributeError):
        SQFT.append("NA")
        
    try:   
        Days_on_Market.append(soup.find('span', {"class": "zsg-photo-card-notification"}).get_text())
    except (ValueError, IndexError,AttributeError):
        Days_on_Market.append("NA")
        
    try:
        zpid = soup.find('p').text.split('"')[0]
        Zillow_ID.append(zpid)
    except (ValueError, IndexError,AttributeError):
        Zillow_ID.append("NA")
        
    #open each page of house on market
    url1 = 'https://www.zillow.com/homes/for_rent/Hoboken-NJ/house,condo,apartment_duplex,mobile,townhouse_type/'
   
    url3 = '_zpid/25146_rid/40.753662,-74.000173,40.735779,-74.057679_rect/13_zm/'
    url_full = url1 + zpid + url3
    try:
        values = scrap_details(url_full)[1]
        neighbourhood = scrap_details(url_full)[0].split(':')[1].strip()
        
    except(ValueError, IndexError,AttributeError,TimeoutException):
        pass
    
    try:
        Type.append(values[0].text)
    except (ValueError, IndexError,AttributeError,TimeoutException,StaleElementReferenceException):
        Type.append("NA")
    try:    
        Laundry.append(values[1].text)
    except (ValueError, IndexError,AttributeError,TimeoutException,StaleElementReferenceException):
        Laundry.append("NA")
    try:
        Heating.append(values[2].text)
    except (ValueError, IndexError,AttributeError,TimeoutException,StaleElementReferenceException):
        Heating.append("NA")
    try:
        Cooling.append(values[3].text)
    except (ValueError, IndexError,AttributeError,TimeoutException,StaleElementReferenceException):
        Cooling.append("NA")
    try:
        Pets.append(values[4].text)
    except (ValueError, IndexError,AttributeError,TimeoutException,StaleElementReferenceException):
        Pets.append("NA")
    try:
        Parking.append(values[5].text)
    except (ValueError, IndexError,AttributeError,TimeoutException,StaleElementReferenceException):
        Parking.append("NA")
    try:
        Neighborhood.append(neighbourhood)
    except (ValueError, IndexError,AttributeError,TimeoutException,StaleElementReferenceException):
        Neighborhood.append("NA")

        

columns={'Zillow_ID': Zillow_ID,'streetAddress':Street_Address, 'City': City, 'State':State, 'Zip_Code': Zip_Code, 'Monlthly Rental': Price,
        'SQFT' : SQFT, 'Bedrooms' : Bedrooms, 'Bathrooms' : Bathrooms, ' Days_on_Market':  Days_on_Market, 'Type':Type,
        'Laundry':Laundry,'Heating': Heating, 'Cooling':Cooling, 'Pets':Pets,'Parking':Parking }

# Create a dataframe from the columns variable
df = pd.DataFrame(columns)
#delete NA values in the Bathrooms(Maybe HTML editors use two methods to name classes in web design)
df2 = df[df.Bathrooms != 'NA']

In [386]:
df2

Unnamed: 0,Days_on_Market,Bathrooms,Bedrooms,City,Cooling,Heating,Laundry,Monlthly Rental,Parking,Pets,SQFT,State,Type,Zillow_ID,Zip_Code,streetAddress
0,1 day ago,1,1,Bronx,,,,1050,,,,NY,,29812750,10460,615 Mead St APT 1
1,1 day ago,1,1,Bronx,,,,1350,,,,NY,,2094541921,10460,789 E 183rd St
3,3 days ago,1,1,Bronx,,,,1300,,,,NY,,29798521,10460,1444 Taylor Ave
4,4 days ago,1,1,Bronx,,,,1400,,,3568,NY,,2094581769,10460,Van Buren St
5,8 days ago,1,4,Bronx,,,,2400,,,,NY,,2123568849,10460,922 Bronx Park S APT 2D
6,9 days ago,2,3,Bronx,,,,2000,,,,NY,,2094631869,10460,Southern Blvd
7,9 days ago,1,3,Bronx,,,,2350,,,,NY,,2094636261,10460,1717 Taylor Ave # 1
8,9 days ago,2,3,Bronx,,,,1990,,,1000,NY,,2104995421,10460,Vyse Ave
9,10 days ago,1,2,Bronx,,,,1625,,,18300,NY,,2125309924,10460,878 E 176th St
10,10 days ago,1,1,Bronx,,,,1150,,,600,NY,,2097852634,10460,E Tremont Ave


#### Next step: 
* explore data analysis 
* Data Clean:
        No Data can be infered as No(i.e No Parking Site)

In [287]:
df2

Unnamed: 0,Days_on_Market,Bathrooms,Bedrooms,City,Cooling,Heating,Laundry,Monlthly Rental,Parking,Pets,SQFT,State,Type,Zillow_ID,Zip_Code,streetAddress
0,38 minutes ago,1,2,Hoboken,No Data,No Data,Shared,2200,No Data,No,700,NJ,Condo,2102739440,07030,811 Park Ave APT 9
1,1 hour ago,1,2,Hoboken,No Data,Baseboard,No Data,2375,No Data,No Data,537,NJ,Apartment,88865844,07030,734 Adams St APT 4A
4,1 hour ago,2,2,Hoboken,Other,Forced air,No Data,2850,No Data,No Data,950,NJ,Apartment,52842059,07030,1115 Willow Ave APT 201
5,1 hour ago,1,1,Hoboken,No Data,No Data,No Data,1900,No Data,No Data,,NJ,Apartment,2113758106,07030,116 Washington St APT 1
6,1 hour ago,1,1,Hoboken,Other,Forced air,Shared,2500,"Carport, Garage - Attached",No,738,NJ,Condo,2094544904,07030,Monroe St
9,2 hours ago,2,3,Hoboken,Central,Forced air,In Unit,3850,On street,Cats,,NJ,Apartment,2094545623,07030,529 Madison St APT 4
11,5 hours ago,1,1,Hoboken,Central,No Data,Shared,2100,Off street,"Cats, small dogs",750,NJ,Apartment,2094547218,07030,Madison St
12,5 hours ago,2,2,Hoboken,No Data,No Data,No Data,2800,No Data,No Data,,NJ,Multi Family,2094547631,07030,85 Madison St APT 6E
13,5 hours ago,1,1,Hoboken,No Data,No Data,No Data,2450,No Data,No,775,NJ,Condo,2094548003,07030,620 Garden St
14,6 hours ago,2,2,Hoboken,"Central, Other",Forced air,Shared,2800,"Garage - Attached, Off street, On street","Cats, small dogs",1000,NJ,Apartment,2112850785,07030,(Undisclosed Address)
