# Airbnb Web Scrape - Selenium and BS

The code below scrapes information for Airbnb listings in and around Gardiner, MT and West Yellowstone, MT. 

In [None]:
# imports

import pandas as pd

import requests               
from bs4 import BeautifulSoup 

# imports for Selenium

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

import time


Airbnb only returns 300 listings per area searched, so in order to grab all of the listings in both Gardiner and West Yellowstone, I used the "search by map" feature to narrow down the view results to fewer than 300 listings per URL. The base search pages to scrape are listed below with the approximate number of listings for each URL. In the cleaning phase, I will need to weed out any accidental duplicates. 

For Gardiner: ~420 listings
- start with the town proper (~232 listings; includes Jardine and Corwin): https://www.airbnb.com/s/Gardiner--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&query=Gardiner%2C%20Montana%2C%20United%20States&place_id=ChIJ0fUtVMO3T1MRH5WCc2sS2a0&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&ne_lat=45.36612117935085&ne_lng=-110.39147267490137&sw_lat=44.690607481299885&sw_lng=-111.0973227992605&zoom=10&search_by_map=true

- then expand to the Tom Miner and Emigrant areas (~187 listings): https://www.airbnb.com/s/Gardiner--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&query=Gardiner%2C%20Montana%2C%20United%20States&place_id=ChIJ0fUtVMO3T1MRH5WCc2sS2a0&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&ne_lat=45.450442941099546&ne_lng=-110.67568351400985&sw_lat=45.114183525111855&sw_lng=-111.02860857618941&zoom=11&search_by_map=true 

For West Yellowstone: ~ 420 listings
- town proper (~242 listings): https://www.airbnb.com/s/West-Yellowstone--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&query=West%20Yellowstone%2C%20Montana%2C%20United%20States&place_id=ChIJAysyesC5UVMRECCQSPedY90&ne_lat=44.73728111375304&ne_lng=-110.99951584594726&sw_lat=44.550659328113674&sw_lng=-111.21683914916991&zoom=12&search_by_map=true

- Hebgen Lake (~40): https://www.airbnb.com/s/West-Yellowstone--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&query=West%20Yellowstone%2C%20Montana%2C%20United%20States&place_id=ChIJAysyesC5UVMRECCQSPedY90&ne_lat=44.859090764900564&ne_lng=-111.0874603574348&sw_lat=44.672861888097714&sw_lng=-111.30478366065745&zoom=12&search_by_map=true

- Henry's Lake (~139): https://www.airbnb.com/s/West-Yellowstone--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&query=West%20Yellowstone%2C%20Montana%2C%20United%20States&place_id=ChIJAysyesC5UVMRECCQSPedY90&ne_lat=44.71527583177368&ne_lng=-111.24139372920149&sw_lat=44.52858315630152&sw_lng=-111.45871703242415&zoom=12&search_by_map=true

In [None]:
# establish the URLs
# Each constant is the first results page of one "search by map" view; the
# ne_lat/ne_lng/sw_lat/sw_lng query parameters carry the map bounds, and the
# approximate listing counts come from the notes above.

# Gardiner: town view (~232 listings) and Tom Miner / Emigrant view (~187 listings)
url_gardiner_town = 'https://www.airbnb.com/s/Gardiner--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&query=Gardiner%2C%20Montana%2C%20United%20States&place_id=ChIJ0fUtVMO3T1MRH5WCc2sS2a0&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&ne_lat=45.36612117935085&ne_lng=-110.39147267490137&sw_lat=44.690607481299885&sw_lng=-111.0973227992605&zoom=10&search_by_map=true'
url_miner_emigrant = 'https://www.airbnb.com/s/Gardiner--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&query=Gardiner%2C%20Montana%2C%20United%20States&place_id=ChIJ0fUtVMO3T1MRH5WCc2sS2a0&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&ne_lat=45.450442941099546&ne_lng=-110.67568351400985&sw_lat=45.114183525111855&sw_lng=-111.02860857618941&zoom=11&search_by_map=true'

# West Yellowstone: town view (~242 listings), Hebgen Lake (~40), Henry's Lake (~139)
url_west_town = 'https://www.airbnb.com/s/West-Yellowstone--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&query=West%20Yellowstone%2C%20Montana%2C%20United%20States&place_id=ChIJAysyesC5UVMRECCQSPedY90&ne_lat=44.73728111375304&ne_lng=-110.99951584594726&sw_lat=44.550659328113674&sw_lng=-111.21683914916991&zoom=12&search_by_map=true'
url_hebgen = 'https://www.airbnb.com/s/West-Yellowstone--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&query=West%20Yellowstone%2C%20Montana%2C%20United%20States&place_id=ChIJAysyesC5UVMRECCQSPedY90&ne_lat=44.859090764900564&ne_lng=-111.0874603574348&sw_lat=44.672861888097714&sw_lng=-111.30478366065745&zoom=12&search_by_map=true'
url_henry = 'https://www.airbnb.com/s/West-Yellowstone--Montana--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=august&flexible_trip_dates%5B%5D=february&flexible_trip_dates%5B%5D=january&flexible_trip_dates%5B%5D=july&flexible_trip_dates%5B%5D=june&flexible_trip_dates%5B%5D=march&flexible_trip_dates%5B%5D=may&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_dates%5B%5D=september&date_picker_type=flexible_dates&flexible_trip_lengths%5B%5D=one_week&adults=2&source=structured_search_input_header&search_type=user_map_move&query=West%20Yellowstone%2C%20Montana%2C%20United%20States&place_id=ChIJAysyesC5UVMRECCQSPedY90&ne_lat=44.71527583177368&ne_lng=-111.24139372920149&sw_lat=44.52858315630152&sw_lng=-111.45871703242415&zoom=12&search_by_map=true'


The sections of code below walk through the process step-by-step as I learned how to do this scrape. The exploratory cells are commented out, but I'm keeping them for posterity. 

In [None]:
# scrape the listings page using Selenium

#driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [None]:
# navigate to the URL

#driver.get(url_gardiner_town)

In [None]:
#html = driver.page_source

In [None]:
# parse the page with soup

#html_soup = BeautifulSoup(html, 'html.parser')


In [None]:
# this class should pull 20 listings
#listings = html_soup.find_all('div', class_ = 'c1tbui0o ltlgcp dir dir-ltr')

In [None]:
#driver.close()

In [None]:
# check to make sure there are 20  

#print(len(listings))

In [None]:
# print one listing to inspect its elements

#print(listings[0].prettify())

From the listing elements printed above, we can pull out the listing title; URL; header; number of guests, rooms, beds, and baths; select features like kitchen, wifi, etc.; price; rating; number of reviews; and whether the host is a Superhost. Their tags are as follows, which I found by scrolling through the object above.

In [None]:
# IMPORTANT NOTE: ALL OF THESE IDENTIFIERS ARE LIKELY TO CHANGE!
# THEY WILL CHANGE ANYTIME AIRBNB UPDATES ITS WEBSITE

search_page = {
    # title and link are read from the 'content' attribute of <meta> tags,
    # selected by position
    'name': {'tag': 'meta', 'get': 'content', 'order': 0},
    'url': {'tag': 'meta', 'get': 'content', 'order': 2},
    'header': {'tag': 'div', 'class': 'mjnkf15 dir dir-ltr'},
    # the detail spans all share one class, so only their position differs
    **{
        detail: {'tag': 'span', 'class': 'mvk3iwl dir dir-ltr', 'order': position}
        for position, detail in enumerate((
            'guests', 'rooms', 'beds', 'baths',
            'wifi', 'kitchen', 'washer', 'free parking',
        ))
    },
    'price': {'tag': 'span', 'class': 'a8jt5op dir dir-ltr'},
    'rating': {'tag': 'span', 'class': 'r1g2zmv6 dir dir-ltr'},
    'n_reviews': {'tag': 'span', 'class': 'rapc1b3 dir dir-ltr'},
    'superhost': {'tag': 'div', 'class': 't1qa5xaj dir dir-ltr'},
}

In [None]:
# this function extracts one element (described by a single entry of "search_page" above) from a listing's html 

def extract_elements(listing_html, params):
    """Pull a single value out of one listing's parsed HTML.

    listing_html: object exposing ``find_all`` (e.g. a BeautifulSoup tag).
    params: lookup description with a required 'tag' and optional 'class',
        'order' (which match to use, default 0) and 'get' (attribute name).

    Returns the attribute named by 'get' when it is given, otherwise the
    element's text. Raises IndexError when there is no match at 'order'.
    """
    # search by tag alone, or by tag plus class when one is specified
    lookup_args = [params['tag']]
    if 'class' in params:
        lookup_args.append(params['class'])
    matches = listing_html.find_all(*lookup_args)

    # several features share the same tag/class, so pick by position
    chosen = matches[params.get('order', 0)]

    if 'get' not in params:
        return chosen.get_text()
    return chosen.get(params['get'])

In [None]:
# test on the first listing; see if it can extract all the right features
# (did a bunch of adjusting to get the correct items) 

#extract_elements(listings[0], search_page['name'])
#extract_elements(listings[0], search_page['url'])
#extract_elements(listings[0], search_page['header'])
#extract_elements(listings[0], search_page['guests'])
#extract_elements(listings[0], search_page['rooms'])
#extract_elements(listings[0], search_page['beds'])
#extract_elements(listings[0], search_page['baths'])
#extract_elements(listings[0], search_page['wifi'])
#extract_elements(listings[0], search_page['kitchen'])
#extract_elements(listings[0], search_page['washer'])
#extract_elements(listings[0], search_page['free parking'])
#extract_elements(listings[0], search_page['price'])
#extract_elements(listings[0], search_page['rating'])
#extract_elements(listings[0], search_page['n_reviews'])
#extract_elements(listings[0], search_page['superhost'])

In [None]:
# extract all of the elements with this function

def extract_page_features(soup, search_items):
    """Extract every feature described in ``search_items`` from one listing.

    soup: parsed HTML for a single listing, passed through to
        ``extract_elements``.
    search_items: dict mapping feature name -> lookup parameters.

    Returns a dict with one entry per feature name. A feature whose lookup
    fails is recorded as the string 'empty' instead of aborting the listing.
    """
    # create a dictionary to hold the features
    features_dict = {}

    # go through each item of the search block and try to find it
    for feature, params in search_items.items():
        try:
            features_dict[feature] = extract_elements(soup, params)

        # if it doesn't exist, place empty in that field.
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running scrape.
        except Exception:
            features_dict[feature] = 'empty'

    return features_dict

In [None]:
# test function on the first listing

#extract_page_features(listings[0], search_page)

Now I need to figure out how to go through each of the pages. The Airbnb website adds '&items_offset=20' to the end of the URL for the second page, then '&items_offset=40' for the third page, and so on. (There are 20 listings per page, so this makes sense.)

In [None]:
# create a list of the URLs for each page of each URL for Gardiner and for West
# 5 total

def build_urls(main_url, listings_per_page=20, pages_per_location=15):
    """Return the paginated URLs for one base search URL.

    main_url: base search URL; '&items_offset=<n>' is appended to it.
    listings_per_page: how far the offset advances from one page to the next.
    pages_per_location: number of page URLs to generate.

    The first URL uses offset 0, the second ``listings_per_page``, and so on.
    """
    offsets = (
        listings_per_page * page_index
        for page_index in range(pages_per_location)
    )
    return [main_url + f'&items_offset={offset}' for offset in offsets]

In [None]:
# Page counts are the approximate listing counts from the notes above, rounded
# up to whole pages of 20 so the last partial page is included.
gardiner_town = build_urls(url_gardiner_town, listings_per_page=20, pages_per_location=12)  # ~232 listings
# ~187 listings need 10 pages; 9 pages would stop at 180 and drop the rest
miner_emigrant = build_urls(url_miner_emigrant, listings_per_page=20, pages_per_location=10)
west_town = build_urls(url_west_town, listings_per_page=20, pages_per_location=13)  # ~242 listings
hebgen = build_urls(url_hebgen, listings_per_page=20, pages_per_location=2)  # ~40 listings
henry = build_urls(url_henry, listings_per_page=20, pages_per_location=7)  # ~139 listings

Now I need a function that will go through every page in the URL lists and scrape it and pull out the features listed above. First, though, I need to create a function that scrapes a single page (this part was done manually above).     

In [None]:
# this function will scrape the listings on one page

def get_listings(search_page):
    """Load one search-results URL in the browser and return its listing cards.

    Uses the module-level Selenium ``driver``: create it before calling this
    function and close it afterwards (this function does neither).

    search_page: URL of the results page to load.

    Returns the ``div`` elements carrying the listing-card class, as found by
    BeautifulSoup's ``find_all``.
    """
    listing_card_class = 'c1tbui0o ltlgcp dir dir-ltr'

    driver.get(search_page)

    # fixed pause so the page can finish loading before the HTML is read;
    # 10 seconds is deliberately generous
    time.sleep(10)

    # parse whatever the browser has rendered and keep only the listing cards
    page_soup = BeautifulSoup(driver.page_source, 'html.parser')
    return page_soup.find_all('div', class_=listing_card_class)

In [None]:
# test
#get_listings(url_gardiner_town)

In [None]:
# this function should scrape all of the pages in the URL list for a given place and return the features I want

def process_search_pages(url_list):
    """Scrape every URL in ``url_list`` and collect the features of each listing.

    url_list: iterable of search-results page URLs, scraped in order.

    Returns a flat list with one feature dict per listing found, built with
    the module-level ``search_page`` lookup table.
    """
    return [
        extract_page_features(card, search_page)
        for page_url in url_list
        for card in get_listings(page_url)
    ]
    

In [None]:
# test on one page

# process_search_pages(gardiner_town[1:2])

In [None]:
# process all of the pages for Gardiner's town
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    gardiner_town_scrape = process_search_pages(gardiner_town)
finally:
    # always shut the browser down, even if the scrape raises part-way through;
    # quit() ends the whole WebDriver session, while close() only closes the window
    driver.quit()

In [None]:
# turn the list of per-listing feature dicts into a table (one row per listing)
gardiner_town_listings = pd.DataFrame(gardiner_town_scrape)

In [None]:
# rinse and repeat for the other four URL lists
# give some time between scrapes so the website doesn't get mad

# Emigrant and Tom Miner
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    miner_emigrant_scrape = process_search_pages(miner_emigrant)
finally:
    # always shut the browser down, even if the scrape raises part-way through;
    # quit() ends the whole WebDriver session, while close() only closes the window
    driver.quit()

# one row per scraped listing
miner_emigrant_listings = pd.DataFrame(miner_emigrant_scrape)


In [None]:
# West town scrape
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    west_town_scrape = process_search_pages(west_town)
finally:
    # always shut the browser down, even if the scrape raises part-way through;
    # quit() ends the whole WebDriver session, while close() only closes the window
    driver.quit()

# one row per scraped listing
west_town_listings = pd.DataFrame(west_town_scrape)


In [None]:
# Hebgen Lake scrape
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    hebgen_scrape = process_search_pages(hebgen)
finally:
    # always shut the browser down, even if the scrape raises part-way through;
    # quit() ends the whole WebDriver session, while close() only closes the window
    driver.quit()

# one row per scraped listing
hebgen_listings = pd.DataFrame(hebgen_scrape)



In [None]:
# Henry's Lake scrape
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    henry_scrape = process_search_pages(henry)
finally:
    # always shut the browser down, even if the scrape raises part-way through;
    # quit() ends the whole WebDriver session, while close() only closes the window
    driver.quit()

# one row per scraped listing
henry_listings = pd.DataFrame(henry_scrape)

In [None]:
# sanity check: number of rows (listings) scraped for Henry's Lake
len(henry_listings.index)

In [None]:
# combine the dataframes
# ignore_index=True renumbers the rows 0..n-1; without it each source frame
# keeps its own 0-based index, so the combined frame (and the saved CSV)
# would contain repeated index labels
gardiner_listings = pd.concat(
    [gardiner_town_listings, miner_emigrant_listings],
    axis=0,
    ignore_index=True,
)
west_listings = pd.concat(
    [west_town_listings, hebgen_listings, henry_listings],
    axis=0,
    ignore_index=True,
)

In [None]:
# sanity check: total rows in each combined table (duplicates not yet removed)
print(len(gardiner_listings.index))
print(len(west_listings.index))

In [None]:
# save as csvs
# files are written to the current working directory; to_csv is called with
# its defaults, so the DataFrame index is written as the first column
gardiner_listings.to_csv('gardiner_listings.csv')
west_listings.to_csv('west_listings.csv')