# Airbnb Web Scrape

My goal is to scrape the details page of every Airbnb listing in and around Gardiner, MT and West Yellowstone, MT.

In [2]:
# imports for Beautiful Soup

import requests               # To get the pages
from bs4 import BeautifulSoup # and to process them

In [3]:
# imports for Selenium

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selectorlib import Extractor
import time

In [4]:
# Airbnb search-results URL for Gardiner, MT (2 adults, flexible one-week stay).
# The literal is split into its base path and one query parameter per line;
# adjacent string literals are concatenated into a single URL.
url_gardiner = (
    'https://www.airbnb.com/s/Gardiner--MT/homes'
    '?adults=2'
    '&place_id=ChIJ0fUtVMO3T1MRH5WCc2sS2a0'
    '&tab_id=home_tab'
    '&refinement_paths%5B%5D=%2Fhomes'
    '&query=Gardiner%2C%20MT'
    '&date_picker_type=flexible_dates'
    '&flexible_trip_lengths%5B%5D=one_week'
    '&source=structured_search_input_header'
    '&search_type=filter_change'
)

In [5]:
# Fetch the Gardiner search page. requests applies no timeout by default,
# so an explicit one keeps this cell from hanging forever on a stalled connection.
response = requests.get(url_gardiner, timeout=30)

In [6]:
# Display the response object; its repr shows the HTTP status code.
response

<Response [200]>

In [7]:
# Parse the raw response body into a BeautifulSoup tree using the lxml parser.
# Note: I have tried scraping with a few parser formats: lxml, html, and html.parser.

soup = BeautifulSoup(response.content, features='lxml')

In [1]:
#soup.prettify()

In [8]:
# This is the element I want to find; it holds the pertinent listing details.
# It is visible on the website when I inspect the element in the browser,
# but it does not exist in the soup object created above.
# NOTE(review): the listings are probably injected by JavaScript after the
# page loads, which requests does not execute. TODO: confirm by searching
# response.text for "itemListElement".

soup.select('[itemprop=itemListElement]')

[]

In [10]:
# Second attempt: pull just the basic top-level listing containers, which are
# identified by the class "_8ssblpx". There should be 20 of them (that's how
# many listings Airbnb shows per page).
# find_all is used rather than the deprecated findAll alias, matching the
# helper functions defined later in this notebook.

listings = soup.find_all('div', {'class': '_8ssblpx'})

In [11]:
# Number of listing containers that were matched.
len(listings)

4

## Notes for future reference

In [84]:
# extract HTML and put it into a Beautiful Soup object:

def scrape_page(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        requests applies no timeout by default, so without one a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    answer = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of silently parsing an error
    # page into a soup that contains none of the expected elements.
    answer.raise_for_status()
    return BeautifulSoup(answer.content, features='html.parser')

In [85]:
# this function grabs all of the listings on a given search page

def extract_listing(url):
    """Fetch a search page and return its matching listing <div> elements.

    The page at ``url`` is downloaded and parsed via ``scrape_page``; the
    result is every <div> whose class attribute is
    'c1tbui0o ltlgcp dir dir-ltr' (presumably the listing title containers;
    verify against the live page markup).
    """
    title_attrs = {'class': 'c1tbui0o ltlgcp dir dir-ltr'}
    return scrape_page(url).find_all('div', title_attrs)


In [87]:
# Try the helper on the Gardiner search page and display what it finds.
extract_listing(url_gardiner)

[]

In [None]:
# Extraction rules for the search page, keyed by the feature name we want
# from each listing. Every rule is a dict with:
#   'tag'   - the HTML tag to look for (required)
#   'class' - the class attribute value to match (optional)
#   'get'   - the attribute to read from the tag; when absent, the tag's
#             text is used instead (optional)

rules_search_page = {
    'url': {
        'tag': 'a',
        'get': 'href',
    },
    'name': {
        'tag': 'span',
        'class': 't16jmdcf t5nhi1p t174r01n dir dir-ltr',
        'get': 'style',
    },
    'guests': {
        'tag': 'span',
        'class': 'mvk3iwl dir dir-ltr',
    },
    # TODO (open question): how to tell the classes apart for guests, beds,
    # and baths? Rules for these are left disabled until that is resolved:
    #'beds' : {'tag': 'div', 'class': ''},
    #'baths': {'tag': 'div', 'class': ''},
    'price': {
        'tag': 'span',
        'class': '_tyxjp1',
    },
}

In [4]:
# extract all of the data elements that are included on the search page 

def extract_element_data(soup, params):
    """Extract one feature from a parsed listing according to a rule dict.

    Parameters
    ----------
    soup : object with a ``find_all`` method (e.g. a BeautifulSoup tag)
        The parsed HTML to search.
    params : dict
        The extraction rule. Recognised keys:
        'tag'   - HTML tag name to search for (required).
        'class' - class value passed to ``find_all`` as a filter (optional).
        'get'   - attribute to read from each match; when absent the
                  element's text is used (optional).
        'order' - index of the match to return (default 0). The special
                  value -1 concatenates all matches with '**__**'.

    Returns
    -------
    str or None
        The selected value, the joined values when 'order' is -1, or
        ``None`` when no element exists at the requested position (for
        example when nothing matched at all).
    """
    # 1. Find the right tags
    if 'class' in params:
        elements_found = soup.find_all(params['tag'], params['class'])
    else:
        elements_found = soup.find_all(params['tag'])

    # 2. Extract an attribute value or the text from these tags
    if 'get' in params:
        element_texts = [el.get(params['get']) for el in elements_found]
    else:
        element_texts = [el.get_text() for el in elements_found]

    # 3. Select a particular value or concatenate all of them
    tag_order = params.get('order', 0)
    if tag_order == -1:
        # Elements lacking the requested attribute yield None, which
        # str.join cannot handle, so those entries are skipped.
        return '**__**'.join(text for text in element_texts if text is not None)

    try:
        return element_texts[tag_order]
    except IndexError:
        # Nothing matched (or fewer matches than requested): report the
        # feature as missing rather than aborting the whole scrape.
        return None


In [9]:
# Build one feature dictionary per listing on the Gardiner search page.
# listing_soups is created here so that the cell also runs on a fresh kernel.
listing_soups = extract_listing(url_gardiner)

# Each listing contributes exactly one dict, holding every feature named in
# rules_search_page (appending inside the per-feature loop would add the same
# dict once per feature).
features_list = [
    {
        feature: extract_element_data(listing, rule)
        for feature, rule in rules_search_page.items()
    }
    for listing in listing_soups
]

In [10]:
# Inspect the extracted features.
features_list

[]