# VRBO Scrape



In [1]:
# imports

import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# imports for Selenium

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


Unlike Airbnb, VRBO will show all listings for a given location; there is a button at the bottom of the page that needs to be selected to continue on to subsequent listings.

Also unlike Airbnb, VRBO doesn't let you leave your visit dates flexible: if no dates are selected, a pop-up blocks Selenium from searching. I chose dates far in the future so that the search returned the full number of listings.

Now, the VRBO website doesn't load all of its listings per page automatically. It loads them as someone scrolls through the website. So, I need to tell Selenium to scroll slowly through the website, allowing all of the listings time to load. 

## Manual Scrape Learning Process

In [2]:
# Search-results URL for Gardiner, Montana. The arrival/departure dates are set far
# in the future so that the search returns the full number of listings.
gardiner_url = 'https://www.vrbo.com/search/keywords:gardiner-montana-united-states-of-america/arrival:2024-05-13/departure:2024-05-17/minNightlyPrice/0?filterByTotalPrice=false&petIncluded=false&ssr=true&adultsCount=2&childrenCount=0'

In [9]:
# As with Airbnb, open the search results so I can hunt for the tags each listing needs

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(gardiner_url)

# Step the viewport down the page 1000 px at a time (50 steps), pausing 1 second
# after each step so the listings have time to load.
# Increase the pause if your computer is being really slow.

for y in range(1000, 51000, 1000):
    driver.execute_script(f"window.scrollTo(0, {y})")
    time.sleep(1)




Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/austinsmith/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache


In [10]:
# Locate the pagination "next" link. find_element_by_xpath is deprecated in
# Selenium 4, so use find_element with an explicit By.XPATH locator instead.
next_page = driver.find_element(By.XPATH, '//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')

  next_page = driver.find_element_by_xpath('//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')


In [12]:
# report whether Selenium considers the "next page" link enabled
print("there are more pages" if next_page.is_enabled() else "no more pages")

there are more pages


In [None]:
# capture the rendered page source and hand it to BeautifulSoup for parsing

html = driver.page_source
html_soup = BeautifulSoup(markup=html, features='html.parser')

In [None]:
# Inspecting the page shows that the information I want for each listing sits in a
# <div> with the classes "media-flex__body HitInfo__content"
# (at least for now; this is likely to change with any future website update!)

listings = html_soup.find_all(name='div', class_='media-flex__body HitInfo__content')


In [None]:
# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() would only close the current window.
driver.quit()

In [None]:
# checkpoint
# there should be 50 listings
# (Note: going to figure out pagination later)

print(len(listings))

In [None]:
# by printing just one listing, I can identify which tags I need to grab the listing info

print(listings[0].prettify())

In [3]:
# Tag spec for each piece of listing info. 'tag' names the html element to look for,
# 'class' (optional) narrows the match, and 'order' picks which match to use.
# The headline is an <h2>; the remaining fields are the first four <span> tags, in order.

search_page = {
    'name': {'tag': 'h2', 'class': 'HitInfo__headline hover-text'},
    **{
        field: {'tag': 'span', 'order': position}
        for position, field in enumerate(['header', 'details', 'n_guests', 'beds'])
    },
}

In [4]:
# this function will extract all the elements in "search_page" above from the html 

def extract_elements(listing_html, params) :
    """Pull a single value out of one listing's html.

    Parameters
    ----------
    listing_html : object with a ``find_all`` method (e.g. a BeautifulSoup tag)
        The html of one listing.
    params : dict
        'tag' (required) is the element name to search for; 'class' (optional)
        is passed to ``find_all`` as a second argument; 'order' (optional,
        default 0) is the index of the match to use; 'get' (optional) names an
        attribute to read instead of the element text.

    Returns
    -------
    The attribute value when 'get' is given, otherwise the element's text.

    Raises
    ------
    IndexError
        If there are fewer matches than 'order' requires.
    """
    tag_name = params['tag']

    # narrow by class only when the spec asks for it
    matches = (
        listing_html.find_all(tag_name, params['class'])
        if 'class' in params
        else listing_html.find_all(tag_name)
    )

    # pick the requested match (the first one unless 'order' says otherwise)
    chosen = matches[params.get('order', 0)]

    # return the text unless a specific attribute was requested
    if 'get' not in params:
        return chosen.get_text()
    return chosen.get(params['get'])

In [None]:
# do a test on the first listing, refine search elements as needed

#extract_elements(listings[0], search_page['name'])
#extract_elements(listings[0], search_page['header'])
#extract_elements(listings[0], search_page['details'])
#extract_elements(listings[0], search_page['n_guests'])
#extract_elements(listings[0], search_page['beds'])

In [5]:
# extract all of the elements with this function

def extract_page_features(soup, search_items):
    """Extract every feature named in ``search_items`` from one listing.

    Parameters
    ----------
    soup : the html of a single listing (passed straight to ``extract_elements``)
    search_items : dict
        Maps a feature name to the tag spec understood by ``extract_elements``.

    Returns
    -------
    dict
        One entry per feature name. A feature whose extraction fails is stored
        as the string 'empty' rather than aborting the whole listing.
    """
    # create a dictionary to hold the features
    features_dict = {}

    # go through each item of the search block and try to find it
    for feature in search_items :
        try:
            features_dict[feature] = extract_elements(soup, search_items[feature])

        # If it can't be extracted, place 'empty' in that field. Catch Exception
        # rather than using a bare except so that KeyboardInterrupt / SystemExit
        # still stop a long scrape.
        except Exception:
            features_dict[feature] = 'empty'

    return features_dict

In [None]:
# test function on the first listing

extract_page_features(listings[0], search_page)

To get Selenium to advance to the next page of search results, I need to identify the button that goes to the next page. 

As of the date of this code, it's under the class: "btn btn-icon ButtonIcon btn-default btn-icon-circle" with tag "a" and then the href item takes one to the next webpage of search results. 

I can use the "click" method to click on the button. 


In [None]:
# test it out ...

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(gardiner_url)

# scroll down in 1000 px steps, pausing 1 second each time so the page can load
for y in range(1000, 51000, 1000):
    driver.execute_script(f"window.scrollTo(0, {y})")
    time.sleep(1)

# find_element_by_xpath is deprecated in Selenium 4; use find_element with By.XPATH
next_page = driver.find_element(By.XPATH, '//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')

next_page.click()

In [None]:
# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() would only close the current window.
driver.quit()

In [33]:
# putting it all together

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

driver.get(gardiner_url)

# scroll to the bottom of the page in 1000 px steps, loading the whole thing
for y in range(1000, 51000, 1000):
    driver.execute_script(f"window.scrollTo(0, {y})")
    time.sleep(1)

# then parse the html on the page
html = driver.page_source
html_soup = BeautifulSoup(html, 'html.parser')

listings = html_soup.find_all('div', class_ = 'media-flex__body HitInfo__content')

# then go to the second page
# (find_element_by_xpath is deprecated in Selenium 4; use find_element with By.XPATH)
next_page = driver.find_element(By.XPATH, '//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')
next_page.click()




Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/austinsmith/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache
  next_page = driver.find_element_by_xpath('//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')


In [35]:
len(listings)

50

In [36]:
# start the running list with one feature dict per listing currently in `listings`
features_list = [extract_page_features(listing, search_page) for listing in listings]

In [38]:
# check
len(features_list)

50

In [41]:
# do this four times ... I should be putting this in a function ... but Selenium is scrolling too quickly on subsequent pages

# scroll through the page in 1000 px steps so the listings load
for y in range(1000, 51000, 1000):
    driver.execute_script(f"window.scrollTo(0, {y})")
    time.sleep(1)

html = driver.page_source
html_soup = BeautifulSoup(html, 'html.parser')

listings = html_soup.find_all('div', class_ = 'media-flex__body HitInfo__content')

# go to the third page
# (find_element_by_xpath is deprecated in Selenium 4; use find_element with By.XPATH)
next_page = driver.find_element(By.XPATH, '//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')
next_page.click()

  next_page = driver.find_element_by_xpath('//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')


In [42]:
len(listings)

50

In [43]:
# add the features of every listing currently in `listings` to the running list
features_list.extend(extract_page_features(listing, search_page) for listing in listings)
    

In [44]:
# check 
len(features_list)

100

In [45]:
# scroll through the page in 1000 px steps so the listings load
for y in range(1000, 51000, 1000):
    driver.execute_script(f"window.scrollTo(0, {y})")
    time.sleep(1)

html = driver.page_source
html_soup = BeautifulSoup(html, 'html.parser')

listings = html_soup.find_all('div', class_ = 'media-flex__body HitInfo__content')

# go to the fourth page
# (find_element_by_xpath is deprecated in Selenium 4; use find_element with By.XPATH)
next_page = driver.find_element(By.XPATH, '//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')
next_page.click()

  next_page = driver.find_element_by_xpath('//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')


In [49]:
# add the features of every listing currently in `listings` to the running list
features_list.extend(extract_page_features(listing, search_page) for listing in listings)

In [50]:
# check
len(features_list)

189

In [48]:
# scroll through the page in 1000 px steps so the listings load
for y in range(1000, 51000, 1000):
    driver.execute_script(f"window.scrollTo(0, {y})")
    time.sleep(1)

html = driver.page_source
html_soup = BeautifulSoup(html, 'html.parser')

listings = html_soup.find_all('div', class_ = 'media-flex__body HitInfo__content')

# all done: quit() ends the whole WebDriver session (browser and chromedriver
# process), whereas close() would only close the current window
driver.quit()

In [None]:
# add the features of every listing currently in `listings` to the running list
features_list.extend(extract_page_features(listing, search_page) for listing in listings)

In [None]:
len(features_list)

In [53]:
# turn the list of feature dicts into a table and write it out as a csv

gardiner_vrbo_scrape = pd.DataFrame(data=features_list)
gardiner_vrbo_scrape.to_csv(path_or_buf='gardiner_vrbo_lists.csv')

## Programmatic Scrape

The Gardiner scrape was a manual process. Here, I try to create a function to make the scrape more programmatic. 

I need to tell the function to run as long as the button that takes us to the next page isn't disabled. 

In [6]:
# VERSION 1
# Originally didn't work: the next-page button was located only once, the last
# page of results was never parsed, and the collected features were discarded.
# Those problems are fixed below.

def extract_page_listings(search_page, search_items=None) :
    """Scrape every page of a VRBO search and return the features of each listing.

    Uses the notebook-level Selenium ``driver``, which must already be open.

    Parameters
    ----------
    search_page : str
        URL of the first page of search results.
    search_items : dict, optional
        Tag spec handed to ``extract_page_features``. When omitted, the
        notebook-level ``search_page`` dict is used.

    Returns
    -------
    list of dict
        One feature dict per listing, accumulated over every page visited.
        Capture it with ``features_list = extract_page_listings(url)``.
    """
    if search_items is None:
        # The `search_page` parameter (a URL) shadows the notebook-level dict of
        # the same name, so fetch that dict from the module namespace explicitly.
        search_items = globals()['search_page']

    next_page_xpath = '//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a'

    def scroll_through_page() :
        # scroll in 1000 px steps, pausing so the listings and the button can load
        for y in range(1000, 51000, 1000) :
            driver.execute_script(f"window.scrollTo(0, {y})")
            time.sleep(1)

    def button_is_disabled(button) :
        # NOTE(review): is_enabled() may always report True for a link, so also
        # look for a `disabled` / `aria-disabled` marker on the element -- confirm
        # against the live page markup.
        return (
            not button.is_enabled()
            or button.get_attribute('disabled') is not None
            or button.get_attribute('aria-disabled') == 'true'
        )

    # navigate to the first page and scroll through it so the button can be "seen"
    driver.get(search_page)
    scroll_through_page()

    # one list for the whole search, so results from every page are kept
    features_list = []

    while True:
        # parse the html on the current page (including the last one)
        html_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # grab the listings and extract the features of each
        listings = html_soup.find_all('div', class_ = 'media-flex__body HitInfo__content')
        for listing in listings:
            features_list.append(extract_page_features(listing, search_items))

        # Look the button up again on every page: an element found before a
        # click may no longer be attached to the page after navigation.
        next_page = driver.find_element(By.XPATH, next_page_xpath)

        if button_is_disabled(next_page) :
            print("No more pages.")
            break

        # go to the next page and scroll through it slowly
        next_page.click()
        scroll_through_page()

    return features_list

In [14]:
# VERSION 2
# Originally also didn't work: the features were reset on every page and never
# returned, and the URL was passed to extract_page_features as the tag spec.
# Those problems are fixed below.
def extract_page_listings(search_page, search_items=None) :
    """Scrape every page of a VRBO search and return the features of each listing.

    Uses the notebook-level Selenium ``driver``, which must already be open.

    Parameters
    ----------
    search_page : str
        URL of the first page of search results.
    search_items : dict, optional
        Tag spec handed to ``extract_page_features``. When omitted, the
        notebook-level ``search_page`` dict is used.

    Returns
    -------
    list of dict
        One feature dict per listing, accumulated over every page visited.
        Capture it with ``features_list = extract_page_listings(url)``.
    """
    if search_items is None:
        # The `search_page` parameter (a URL) shadows the notebook-level dict of
        # the same name, so fetch that dict from the module namespace explicitly.
        search_items = globals()['search_page']

    # navigate to the first page
    driver.get(search_page)

    # one list for the whole search, so results from every page are kept
    features_list = []

    while True :

        # scroll through the page in 1000 px steps to load it
        for y in range(1000, 51000, 1000) :
            driver.execute_script(f"window.scrollTo(0, {y})")
            time.sleep(1)

        # parse the html on the page
        html = driver.page_source
        html_soup = BeautifulSoup(html, 'html.parser')

        # grab the listings
        listings = html_soup.find_all('div', class_ = 'media-flex__body HitInfo__content')

        # extract the features of each listing
        for listing in listings:
            features = extract_page_features(listing, search_items)
            features_list.append(features)

        # then go to the next page, if possible
        # (find_element_by_xpath is deprecated in Selenium 4; use By.XPATH)
        next_page = driver.find_element(By.XPATH, '//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')

        # NOTE(review): is_enabled() may always report True for a link, so also
        # look for a `disabled` / `aria-disabled` marker on the element -- confirm
        # against the live page markup.
        on_last_page = (
            not next_page.is_enabled()
            or next_page.get_attribute('disabled') is not None
            or next_page.get_attribute('aria-disabled') == 'true'
        )

        if on_last_page :
            print("No more pages.")
            break

        next_page.click()

    return features_list


In [None]:
# Start a fresh Chrome session. extract_page_listings does not take the driver as
# an argument; it reads this notebook-level `driver` variable.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# run the programmatic scrape against the Gardiner search results
extract_page_listings(gardiner_url)



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/austinsmith/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache
  next_page = driver.find_element_by_xpath('//*[@id="Application__resultsViewport"]/div[1]/section[1]/div[3]/div[2]/div/div[1]/div/nav/ul/li[3]/a')


In [34]:
# quit() ends the whole WebDriver session (browser and chromedriver process);
# close() would only close the current window.
driver.quit()

In [21]:
len(features_list)

NameError: name 'features_list' is not defined