# Otodom.pl Web Scraping with BeautifulSoup

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [None]:
# Title
def get_Title(soup):
    """Return the advert title found in *soup*.

    The title is the ``.text`` of the ``<h1>`` element that ``soup.find``
    returns for ``data-cy="adPageAdTitle"``.  If the lookup raises
    ``AttributeError`` (for example because ``find`` returned ``None``),
    an empty string is returned instead.
    """
    title_selector = {"data-cy": "adPageAdTitle"}
    try:
        return soup.find("h1", attrs=title_selector).text
    except AttributeError:
        # Nothing usable was found: report an empty title.
        return ""

# Price
def get_Price(soup):
    """Return the advert price text found in *soup*.

    Reads the ``.text`` of the ``<strong>`` element that ``soup.find``
    returns for ``aria-label="Cena"``.  Any ``AttributeError`` raised
    during the lookup (e.g. ``find`` returned ``None``) yields ``""``.
    """
    try:
        price_node = soup.find("strong", attrs={"aria-label": "Cena"})
        price_text = price_node.text
    except AttributeError:
        return ""
    else:
        return price_text

# Location
def get_Location(soup):
    """Return the advert address text found in *soup*.

    Reads the ``.text`` of the ``<a>`` element that ``soup.find`` returns
    for ``aria-label="Adres"``.  Returns ``""`` when the lookup raises
    ``AttributeError`` (e.g. no element was found).
    """
    address_selector = {"aria-label": "Adres"}
    try:
        return soup.find("a", attrs=address_selector).text
    except AttributeError:
        # Missing address element: fall back to an empty string.
        return ""

# Surface
def get_Surface(soup):
    """Return the surface-area text found in *soup*.

    Reads the ``.text`` of the ``<div>`` element that ``soup.find`` returns
    for ``data-testid="table-value-area"``.  Returns ``""`` when the lookup
    raises ``AttributeError`` (e.g. no element was found).
    """
    try:
        area_node = soup.find("div", attrs={"data-testid": "table-value-area"})
        area_text = area_node.text
    except AttributeError:
        return ""
    else:
        return area_text

# Number_of_Rooms
def get_Number_of_Rooms(soup):
    """Return the room-count text found in *soup*.

    Reads the ``.text`` of the ``<div>`` element that ``soup.find`` returns
    for ``data-testid="table-value-rooms_num"``.  Returns ``""`` when the
    lookup raises ``AttributeError`` (e.g. no element was found).
    """
    rooms_selector = {"data-testid": "table-value-rooms_num"}
    try:
        return soup.find("div", attrs=rooms_selector).text
    except AttributeError:
        # Room count not present on this page.
        return ""

# Floor
def get_Floor(soup):
    """Return the floor text found in *soup*.

    Reads the ``.text`` of the ``<div>`` element that ``soup.find`` returns
    for ``data-testid="table-value-floor"``.  Returns ``""`` when the lookup
    raises ``AttributeError`` (e.g. no element was found).
    """
    try:
        floor_node = soup.find("div", attrs={"data-testid": "table-value-floor"})
        floor_text = floor_node.text
    except AttributeError:
        return ""
    else:
        return floor_text


# Finishing_Condition
def get_Finishing_Condition(soup):
    """Return the finishing-condition text found in *soup*.

    Reads the ``.text`` of the ``<div>`` element that ``soup.find`` returns
    for ``data-testid="table-value-construction_status"``.  Returns ``""``
    when the lookup raises ``AttributeError`` (e.g. no element was found).
    """
    condition_selector = {"data-testid": "table-value-construction_status"}
    try:
        return soup.find("div", attrs=condition_selector).text
    except AttributeError:
        # Finishing condition not present on this page.
        return ""


# Heating
def get_Heating(soup):
    """Return the heating text found in *soup*.

    Reads the ``.text`` of the ``<div>`` element that ``soup.find`` returns
    for ``data-testid="table-value-heating"``.  Returns ``""`` when the
    lookup raises ``AttributeError`` (e.g. no element was found).
    """
    try:
        heating_node = soup.find("div", attrs={"data-testid": "table-value-heating"})
        heating_text = heating_node.text
    except AttributeError:
        return ""
    else:
        return heating_text


# Parking_Space
def get_Parking_Space(soup):
    """Return the parking-space text found in *soup*.

    Reads the ``.text`` of the ``<div>`` element that ``soup.find`` returns
    for ``data-testid="table-value-car"``.  Returns ``""`` when the lookup
    raises ``AttributeError`` (e.g. no element was found).
    """
    parking_selector = {"data-testid": "table-value-car"}
    try:
        return soup.find("div", attrs=parking_selector).text
    except AttributeError:
        # Parking information not present on this page.
        return ""

# Balcony_Garden_Terrace
def get_Balcony_Garden_Terrace(soup):
    """Return the balcony/garden/terrace text found in *soup*.

    Reads the ``.text`` of the ``<div>`` element that ``soup.find`` returns
    for ``data-testid="table-value-outdoor"``.  Returns ``""`` when the
    lookup raises ``AttributeError`` (e.g. no element was found).
    """
    try:
        outdoor_node = soup.find("div", attrs={"data-testid": "table-value-outdoor"})
        outdoor_text = outdoor_node.text
    except AttributeError:
        return ""
    else:
        return outdoor_text






In [None]:
def _fetch_soup(url, headers, timeout=30):
    """Download *url* and return its body parsed with ``html.parser``.

    Returns ``None`` (after printing the reason) when the request fails,
    times out, or the server answers with an HTTP error status, so that a
    single bad page does not abort the whole scraping run.
    """
    try:
        # Without a timeout ``requests`` may block forever on a stalled socket.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Skipping {url}: {err}")
        return None
    return BeautifulSoup(response.content, "html.parser")


if __name__ == '__main__':
    # Script-local import: only the driver below needs it.
    from urllib.parse import urljoin

    HEADERS = {
        'User-Agent': '',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    # Site root used to turn listing hrefs into absolute URLs
    SITE_ROOT = "https://www.otodom.pl"

    # Base URL of the paginated search results; the page number is appended
    BASE_URL = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/cala-polska?viewType=listing&page="

    # Number of result pages to scrape
    num_pages = 200

    # Pause (seconds) applied after every listing request and after each page
    REQUEST_DELAY_SECONDS = 2

    # Output columns, in order; also guarantees the columns exist when
    # nothing could be scraped
    COLUMNS = [
        'Title',
        'Price',
        'Location',
        'Surface',
        'Number_of_Rooms',
        'Floor',
        'Finishing_Condition',
        'Heating',
        'Parking_Space',
        'Balcony_Garden_Terrace',
        'Link',
    ]

    # Container to store one dict per scraped listing
    all_data = []

    for page in range(1, num_pages + 1):
        URL = BASE_URL + str(page)

        # Soup object for the search-results page (None if the request failed)
        soup = _fetch_soup(URL, HEADERS)
        if soup is None:
            time.sleep(REQUEST_DELAY_SECONDS)
            continue

        # Fetch listing anchors as a list of Tag objects
        links = soup.find_all("a", attrs={'data-cy': 'listing-item-link'})

        # Keep only anchors that actually carry an href; a missing href
        # would otherwise break URL construction below
        links_list = [link.get('href') for link in links if link.get('href')]

        # Loop for extracting listing details from each link
        for link in links_list:
            listing_url = urljoin(SITE_ROOT, link)
            new_soup = _fetch_soup(listing_url, HEADERS)

            # Throttle every detail-page request, not just every results page
            time.sleep(REQUEST_DELAY_SECONDS)

            if new_soup is None:
                continue

            # Collect all listing fields into one record
            data = {
                'Title': get_Title(new_soup),
                'Price': get_Price(new_soup),
                'Location': get_Location(new_soup),
                'Surface': get_Surface(new_soup),
                'Number_of_Rooms': get_Number_of_Rooms(new_soup),
                'Floor': get_Floor(new_soup),
                'Finishing_Condition': get_Finishing_Condition(new_soup),
                'Heating': get_Heating(new_soup),
                'Parking_Space': get_Parking_Space(new_soup),
                'Balcony_Garden_Terrace': get_Balcony_Garden_Terrace(new_soup),
                'Link': listing_url
            }

            all_data.append(data)

        # Delay between consecutive results pages
        time.sleep(REQUEST_DELAY_SECONDS)

    # Create DataFrame from the collected data (explicit columns keep the
    # schema stable even when all_data is empty)
    otodom_df = pd.DataFrame(all_data, columns=COLUMNS)

    # Drop rows with missing titles.  Assign the result back instead of using
    # chained ``inplace=True``, which does not update the frame under
    # pandas Copy-on-Write.
    otodom_df['Title'] = otodom_df['Title'].replace('', np.nan)
    otodom_df = otodom_df.dropna(subset=['Title'])

    # Save the DataFrame to a CSV file
    otodom_df.to_csv("Otodom_Data.csv", header=True, index=False)


