# Airbnb web scraping project

This project aims at creating a web scraper for Airbnb using the *BeautifulSoup* library.
As an example, we use the following Airbnb search link for a vacation stay in Bali between the 29th of Dec 2021 and the 3rd of Jan 2022: https://www.airbnb.com/s/Bali--Indonesia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&query=Bali%2C%20Indonesia&place_id=ChIJoQ8Q6NNB0S0RkOYkS7EPkSQ&checkin=2021-12-29&checkout=2022-01-03&source=structured_search_input_header&search_type=autocomplete_click

Note: you may need to change the link, since it uses fixed travel dates (Dec 29th 2021 to Jan 3rd 2022).
Moreover, Airbnb might change the CSS class names used below. If that is the case, please inspect the page's HTML to retrieve the corresponding classes.

# Getting started

Start by defining your URL

In [None]:
# Search URL to scrape: Airbnb homes in Bali, check-in 2021-12-29, check-out 2022-01-03
# (see the checkin/checkout query parameters). Replace it with your own search URL.
url = "https://www.airbnb.com/s/Bali--Indonesia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=november&flexible_trip_dates%5B%5D=october&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&query=Bali%2C%20Indonesia&place_id=ChIJoQ8Q6NNB0S0RkOYkS7EPkSQ&checkin=2021-12-29&checkout=2022-01-03&source=structured_search_input_header&search_type=autocomplete_click"

Import all of the required libraries

In [1]:
#import required libraries
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

Let's begin by making a request to retrieve the HTML code for this website. Since this is an action that you may need to perform several times throughout the project, let's encapsulate the corresponding code in a function.

In [None]:
def get_page(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error. Without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
get_page(url)

# Retrieving the data

We start by retrieving information for the first listing available on the Airbnb page. Since Airbnb can block web scraping, the code below re-requests the page (up to 20 attempts) until a listing is found.

In [None]:
# Locate the first listing on a results page (assumes the first match is a
# genuine listing and not a fake one).
def get_listing(page):
    """Return the first ``<div>`` of *page* that carries an ``itemscope`` attribute.

    The result is whatever ``page.find`` returns for that query.
    """
    listing_filter = {"itemscope": True}
    return page.find('div', attrs=listing_filter)
       
# Fetch the page repeatedly, in case Airbnb prevents scraping and serves a
# page without any listing.
def get_page_with_retry(url, max_attempts=20):
    """Fetch *url* until the parsed page contains a listing.

    Args:
        url: Address of the search results page.
        max_attempts: Maximum number of downloads before giving up.

    Returns:
        The parsed page as soon as ``get_listing`` finds a listing in it, or
        None when no attempt produced one.
    """
    for attempt in range(1, max_attempts + 1):
        page = get_page(url)
        if get_listing(page) is not None:
            return page

        # Only announce a retry when another attempt will actually happen.
        if attempt < max_attempts:
            print("   ...retrying. attempt: " + str(attempt + 1))

    print("Can't find page with first listing!")
    return None

#store page to be able to test other functions without reloading
page = get_page_with_retry(url)
if page is None:
    # get_page_with_retry returns None when every attempt failed; stop here
    # with a clear message instead of an AttributeError further down.
    raise RuntimeError("No listing could be retrieved from the search URL; "
                       "check the URL or try again later.")
#uncomment to inspect the full HTML: print(page.prettify())

#store first_listing for testing other functions
first_listing = get_listing(page)
print(first_listing.prettify())

We then write the code to retrieve the listing title

In [None]:
def get_listing_title(tag):
    """Return the listing title stored in its ``<meta itemprop="name">`` tag.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The value of the meta tag's ``content`` attribute, or None when the
        meta tag or its ``content`` attribute is missing.
    """
    title = tag.find('meta', attrs={'itemprop': 'name'})
    if title is None:
        return None
    # .get() avoids a KeyError on a meta tag without a content attribute.
    return title.get('content')
get_listing_title(first_listing)

Then we retrieve the subtitle

In [None]:
def get_listing_subtitle(tag):
    """Return the text of the listing's subtitle ``<div>``.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The subtitle text, or None when the subtitle element is not found.
    """
    subtitle = tag.find('div', {'class': '_1xzimiid'})
    # Check the element itself: reading .text on a missing element would raise.
    if subtitle is None:
        return None
    return subtitle.text
get_listing_subtitle(first_listing)

We now want to retrieve the listing information, such as the number of rooms, beds, baths and capacity

In [None]:
def get_listing_info(tag):
    """Return the text of the listing's information ``<div>``.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The information text, or None when the element is not found.
    """
    info = tag.find('div', {'class': '_3c0zz1'})
    # Check the element itself: reading .text on a missing element would raise.
    if info is None:
        return None
    return info.text
get_listing_info(first_listing)

We now move to the amenities, and extract information such as: pool, hairdryer, kitchen, etc.

In [None]:
def get_listing_ammenities(tag):
    """Return the text of the element that follows the information ``<div>``.

    The amenities are read from the next sibling of the same ``<div>`` that
    ``get_listing_info`` looks up.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The sibling's text, or None when the information element or its next
        sibling is missing.
    """
    info = tag.find('div', {'class': '_3c0zz1'})
    if info is None:
        return None
    amenities = info.next_sibling
    # The information block may be the last child, leaving no sibling to read.
    if amenities is None:
        return None
    return amenities.text
get_listing_ammenities(first_listing)

To continue, we retrieve the rating of the listing

In [None]:
def get_listing_rating(tag):
    """Return the listing's rating as a float.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The rating, or None when the rating element is missing or its text
        is not a number.
    """
    rating = tag.find('span', {'class': '_10fy1f8'})
    if rating is None:
        return None
    try:
        return float(rating.text)
    except ValueError:
        # Non-numeric text must not abort a whole scraping run; treat it
        # like a missing rating.
        return None
get_listing_rating(first_listing)

Now we retrieve the number of reviews available for each listing

In [None]:
def get_listing_reviews(tag):
    """Return the number of reviews shown in parentheses for a listing.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The review count as an int, or None when the element is missing or
        its text holds no parenthesised number.
    """
    reviews = tag.find('span', {'class': '_a7a5sx'})
    if reviews is None:
        return None
    # First number after an opening parenthesis, e.g. "(123 reviews)",
    # "(57)" or "(1,234 reviews)".
    match = re.search(r'\(\s*(\d[\d,]*)', reviews.text)
    if match is None:
        return None
    return int(match.group(1).replace(',', ''))
get_listing_reviews(first_listing)

Now we retrieve the price of the listing. Note that we remove the currency sign before the price. Depending on which country you are in, you may need to adapt this code.

In [None]:
def get_listing_price_per_night(tag):
    """Return the nightly price of a listing as a float.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The first number found in the price text, with thousands commas
        removed, or None when the element is missing or holds no number.
    """
    price = tag.find('span', {'class': '_tyxjp1'})
    if price is None:
        return None
    # Search for the number instead of dropping one leading character, so
    # multi-character currency prefixes ("Rp", "US$") and trailing text work.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', price.text)
    if match is None:
        return None
    # Commas appear as thousands separators for prices over 1000.
    return float(match.group().replace(',', ''))
get_listing_price_per_night(first_listing)

We now retrieve the total price for our stay. Again, we remove the currency sign before the price. Moreover, this code only takes into account the discounted price.

In [None]:
def get_listing_total_price(tag):
    """Return the total price of the stay as a float.

    Args:
        tag: Element of one listing, exposing a BeautifulSoup-style ``find``.

    Returns:
        The first number found in the total-price text, with thousands
        commas removed, or None when the element is missing or holds no
        number.
    """
    total = tag.find('div', {'class': '_tt122m'})
    if total is None:
        return None
    # Keep comma-grouped digits together: a plain \d+ would read "1,234" as 1.
    match = re.search(r'\d[\d,]*(?:\.\d+)?', total.text)
    if match is None:
        return None
    return float(match.group().replace(',', ''))
get_listing_total_price(first_listing)

# Looking for additional data

Since we have retrieved the information for the first listing, we will now retrieve the information for all of the listings available at our destination for our set dates.
To begin, we need to extract all of the URL pages.
If we observe the page, we see that there are 15 pages available, and that the URL follows a particular code structure. Each new page is "offset" by 20.

In [None]:
base_url = "https://airbnb.com"
def get_next_page(page):
    """Return the absolute URL of the next results page.

    Args:
        page: Parsed results page, exposing a BeautifulSoup-style ``find``.

    Returns:
        The URL behind the ``aria-label="Next"`` link resolved against
        ``base_url``, or None when the link or its ``href`` is missing.
    """
    next_link = page.find('a', {'aria-label': 'Next'})
    if next_link is None:
        return None
    href = next_link.get('href')
    if not href:
        return None
    # urljoin handles site-relative hrefs like plain concatenation did, and
    # also copes with absolute URLs or hrefs lacking a leading slash.
    return urljoin(base_url, href)
get_next_page(page)

In [None]:
def get_listing_data(listing):
    """Collect every extracted field of one listing into a dict.

    The keys are ``title``, ``subtitle``, ``info``, ``amenities``,
    ``rating``, ``reviews``, ``price_per_night`` and ``total_price``; each
    value is the result of the matching ``get_listing_*`` helper.
    """
    # (key, extractor) pairs, applied in this order.
    extractors = (
        ('title', get_listing_title),
        ('subtitle', get_listing_subtitle),
        ('info', get_listing_info),
        ('amenities', get_listing_ammenities),
        ('rating', get_listing_rating),
        ('reviews', get_listing_reviews),
        ('price_per_night', get_listing_price_per_night),
        ('total_price', get_listing_total_price),
    )
    return {field: extract(listing) for field, extract in extractors}

#test get_listing_data
get_listing_data(first_listing)

We are now able to retrieve the information for all of the listings

In [None]:
def add_listings_in_page(page, get_listing_data_func, rows):
    """Append one record per listing found on *page* to *rows*.

    Args:
        page: Parsed results page, exposing a BeautifulSoup-style ``find_all``.
        get_listing_data_func: Callable turning one listing element into a
            record.
        rows: List that receives the records; it is modified in place.
    """
    listing_filter = {"itemscope": True}
    for card in page.find_all('div', attrs=listing_filter):
        record = get_listing_data_func(card)
        rows.append(record)

#test add_listings_in_page
rows = []
add_listings_in_page(page, get_listing_data, rows)

In [None]:
def get_all_listings(start_url, get_listing_data_func):
    """Walk through every results page and collect the data of all listings.

    Args:
        start_url: URL of the first results page.
        get_listing_data_func: Callable turning one listing element into a
            record (for example ``get_listing_data``).

    Returns:
        A list with one record per listing, in page order. If a page cannot
        be retrieved, the records collected so far are returned.
    """
    rows = []

    # Start from the argument, not from the notebook-level `url` variable.
    page_url = start_url
    page_count = 0
    while page_url is not None:
        print('Loading page:' + str(page_count))
        page = get_page_with_retry(page_url)
        if page is None:
            # The retry helper gave up; keep what was collected so far
            # rather than crashing on a missing page.
            print('Stopping early: page ' + str(page_count) + ' could not be retrieved')
            break
        add_listings_in_page(page, get_listing_data_func, rows)
        page_url = get_next_page(page)
        page_count += 1
    else:
        # Reached only when the loop ended because there is no next page.
        print('Download complete')

    return rows

all_listings = get_all_listings(url, get_listing_data)

# Saving the data

We now save the data in a pandas DataFrame

In [None]:
# Build the results table; the explicit column list fixes the column order.
airbnb = pd.DataFrame(all_listings, columns=["title","subtitle","info","amenities","rating","reviews","price_per_night","total_price"])
# reset_index returns a new DataFrame, so the result has to be assigned.
airbnb = airbnb.reset_index(drop=True)
airbnb