In [1]:
# all imports
import csv
import json
import os
import time
from multiprocessing import Pool

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Search-results URL that every request in this notebook starts from.
# Adjacent string literals are concatenated, so each query parameter
# can sit on its own line.
airbnb_url = (
    "https://www.airbnb.com/s/Saudi-Arabia/homes"
    "?tab_id=home_tab"
    "&refinement_paths%5B%5D=%2Fhomes"
    "&date_picker_type=flexible_dates"
    "&adults=4"
    "&source=structured_search_input_header"
    "&search_type=autocomplete_click"
    "&monthly_start_date=2023-12-01"
    "&monthly_length=3"
    "&price_filter_input_type=0"
    "&price_filter_num_nights=49"
    "&channel=EXPLORE"
    "&checkin=2023-11-11"
    "&checkout=2023-12-30"
    "&query=Saudi%20Arabia"
    "&place_id=ChIJQSqV5z-z5xURm7YawktQYFk"
    "&flexible_trip_lengths%5B%5D=one_week"
)

In [3]:
def extract_basic_features(listing_html):
    """Collect the summary fields of one search-result listing card.

    Parameters
    ----------
    listing_html
        Parsed HTML node for a single listing card. It must provide
        ``find`` and ``select_one`` (for example a BeautifulSoup ``Tag``).

    Returns
    -------
    dict
        Keys, in insertion order: ``url``, ``name``, ``header``, ``beds``,
        ``bedrooms``, ``date_range``, ``price``, ``rating``. ``url`` is the
        ``href`` of the first ``<a>`` in the card; every other value is the
        whitespace-stripped text of the matched element. A field whose
        element is not found is ``None``.
    """

    def _text_or_none(node):
        # A failed lookup (falsy result) maps to None instead of raising.
        return node.text.strip() if node else None

    # The first anchor inside the card supplies the listing link.
    anchor = listing_html.find('a')
    features_dict = {'url': anchor.get('href') if anchor else None}

    # Title fields are located through their data-testid attribute.
    testid_lookups = (
        ('name', "span", "listing-card-name"),
        ('header', "div", "listing-card-title"),
    )
    for field, tag_name, test_id in testid_lookups:
        features_dict[field] = _text_or_none(
            listing_html.find(tag_name, {"data-testid": test_id})
        )

    # The remaining fields are matched with CSS selectors.
    # NOTE(review): these class names look auto-generated, so they presumably
    # need re-checking whenever the page markup changes.
    selector_lookups = (
        ('beds', '.g1qv1ctd .fb4nyux:nth-child(2)'),
        ('bedrooms', '.g1qv1ctd .fb4nyux:nth-child(3)'),
        ('date_range', '.g1qv1ctd .fb4nyux:nth-child(4)'),
        ('price', 'div._1jo4hgw span._tyxjp1'),
        ('rating', 'span.r1dxllyb'),
    )
    for field, selector in selector_lookups:
        features_dict[field] = _text_or_none(listing_html.select_one(selector))

    return features_dict


In [4]:
# let's finally write this function
def get_listings(search_page, timeout=30):
    """Download a search-results page and return its listing cards.

    Parameters
    ----------
    search_page : str
        URL of the search page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so that a stalled connection raises instead of
        blocking the notebook indefinitely.

    Returns
    -------
    list
        Every ``<div>`` element carrying the CSS class ``c1l1h97y``, as
        returned by ``find_all``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds ``timeout``.
    """
    response = requests.get(search_page, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # A plain string as the second argument filters on the CSS class; the
    # full class attribute seen on the cards was "c1l1h97y dir dir-ltr".
    listings = soup.find_all('div', 'c1l1h97y')

    return listings

In [5]:
# Sanity check: number of listing cards parsed from the first search page.
len(get_listings(airbnb_url))

18

In [6]:
# Request the next page by appending an items_offset query parameter and
# count the listing cards parsed from it.
# NOTE(review): the first page yielded 18 cards while the offset used here
# is 20 - confirm that no results are skipped between pages.
new_url = airbnb_url + '&items_offset=20'
len(get_listings(new_url))

18

In [7]:
# Fetch N_SEARCH_PAGES consecutive search-result pages, advancing
# items_offset by ITEMS_OFFSET_STEP per request, and pool every listing
# card into a single list.
# NOTE(review): the single-page checks each returned 18 cards while the
# offset advances by 20 - confirm results are neither skipped nor repeated.
N_SEARCH_PAGES = 50
ITEMS_OFFSET_STEP = 20

all_listings = []
for i in range(N_SEARCH_PAGES):
    offset = ITEMS_OFFSET_STEP * i
    new_url = airbnb_url + f'&items_offset={offset}'
    new_listings = get_listings(new_url)
    all_listings.extend(new_listings)

In [8]:
def process_search_pages(all_listings):
    """Extract the feature dictionary of every listing card.

    Parameters
    ----------
    all_listings : iterable
        Listing-card nodes, each one suitable as the argument of
        ``extract_basic_features``. Any iterable is accepted, not only
        sequences that support ``len`` and indexing.

    Returns
    -------
    list of dict
        One feature dictionary per listing, in iteration order. An empty
        input produces an empty list.
    """
    return [extract_basic_features(listing) for listing in all_listings]

# Build one feature dictionary per scraped listing card, then display how
# many were produced.
features_list  = process_search_pages(all_listings)
len(features_list)

900

In [10]:
# Define the CSV file path
# NOTE(review): 'suadi' looks like a misspelling of 'saudi'; the name is left
# untouched in case something else reads this exact path.
csv_file_path = 'suadi_homes.csv'

# Fail before opening the file: mode 'w' would truncate an existing CSV, and
# an empty list offers no row to take the header from.
if not features_list:
    raise ValueError('features_list is empty; nothing to write to the CSV file')

# Define the CSV header based on the keys of the first dictionary
fieldnames = list(features_list[0].keys())

# Write the data to the CSV file; newline='' leaves line endings to the
# csv module.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Header row first, then one row per feature dictionary
    csv_writer.writeheader()
    csv_writer.writerows(features_list)

print(f'The CSV file has been created at: {csv_file_path}')

The CSV file has been created at: suadi_homes.csv
