In [1]:
# setting the website that should be scraped 
website = 'https://www.morizon.pl/mieszkania/krakow/'

In [2]:
# downloading Selenium libraries 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# downloading BeautifulSoup library 
from bs4 import BeautifulSoup

# downloading Numpy & Pandas libraries
import pandas as pd
import numpy as np

# downloading additional libraries
from requests import get
import requests, openpyxl
import re 
from tqdm import tqdm

In [3]:
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)

In [4]:
# Get the HTML source of the website
driver.get(website)
html = driver.page_source

# Replace tricky advertisement link with "Unspecified"
html = html.replace("https://www.lendi.pl/formularz/kredyty-hipoteczne?hypothecValue=539900.00&loanPurpose.what=Apartment&loanPurpose.market=SecondaryMarket&meeting_voivoidship=małopolskie&meeting_city=Kraków&utm_entry_page=https://www.mori", "Unspecified")

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the div containing the subcategories
subcategories = soup.find_all("div", class_="_3-KHL6")

# Find the links to the subcategories
links = []
for div in subcategories:
    a_tag = div.find("a", href=re.compile("/mieszkania/krakow/"))
    if a_tag:
        links.append(a_tag.get('href'))

# Remove duplicated links
links = list(dict.fromkeys(links))

# Create full links by adding the base URL to each link
base_url = "https://www.morizon.pl"
full_links = [base_url + link for link in links]

In [5]:
def substring_after(s, delim):
   return s.partition(delim)[2]

In [6]:
# Initialize empty lists
pages = []
pages2 = []

# Initialize the progress bar
pbar = tqdm(total=len(full_links))

# Find the number of pages for each subcategory
for link2 in full_links:
    # Get the HTML source of the website
    driver.get(link2)
    html = driver.page_source

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Find all pagination elements within the specified div
    pagination_divs = soup.find_all("div", class_="zp8VUe")

    # Extract page numbers from the pagination elements
    page_numbers = []
    for div in pagination_divs:
        link = div.find("a", href=re.compile(r"\?page="))
        if link:
            page_number = re.search(r'page=(\d+)', link['href'])
            if page_number:
                page_numbers.append(int(page_number.group(1)))

    # Remove the last number '2' from the list if it is present
    if page_numbers and page_numbers[-1] == 2:
        page_numbers = page_numbers[:-1]

    # Determine the maximum page number
    maxval = max(page_numbers) if page_numbers else 1

    # Create the URL for each page
    for i in range(1, maxval + 1):
        url = f"{link2}?page={i}"
        pages2.append(url)
    pages = pages2

    # Update the progress bar
    pbar.update(1)
    
# Close the progress bar
pbar.close()

100%|██████████| 18/18 [00:58<00:00,  3.28s/it]


In [7]:
# Initialize the progress bar
pbar = tqdm(total=len(pages))

# Initialize empty list to store the links
flats = []

# Base URL to prepend
base_url = "https://www.morizon.pl"

# Iterate over each page
for page in pages:
    # Get the HTML source of the website
    driver.get(page)
    html = driver.page_source
    
    # Replace advertisement link with "Unspecified"
    html = html.replace("https://www.lendi.pl/formularz/kredyty-hipoteczne?hypothecValue=539900.00&loanPurpose.what=Apartment&loanPurpose.market=SecondaryMarket&meeting_voivoidship=małopolskie&meeting_city=Kraków&utm_entry_page=https://www.mori","Unspecified")
    
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Find the links, prepend the base URL, and append them to the list
    flats += [base_url + link.get('href') for link in soup.find_all("a", href=re.compile("/oferta/sprzedaz-mieszkanie"))]
    
    # Update the progress bar
    pbar.update(1)
    
# Remove duplicated links
flats = list(dict.fromkeys(flats))

# Close the progress bar
pbar.close()

100%|██████████| 184/184 [07:28<00:00,  2.44s/it]


In [8]:
# Create an Excel workbook and a sheet
excel = openpyxl.Workbook()
sheet = excel.active

# Change the sheet name
sheet.title = 'Morizonflats'

# Add rows that will be used as column names
column_names = ['Total_Price', 'Price_per_Square_Meter', 'Total_living_area', 'Number_of_Rooms', 'Offer_Link', 'Latitude', 'Longitude', 'Developer_name', 'Walls_height', 'Investment_name', 'Floor', 'Total_number_of_floors', 'Kitchen_type', 'Number_of_bathrooms', 'Toilet_and_WC_type', 'Balcony', 'Total_area_of_Balcony', 'Total_area_of_Garden', 'Market_type', 'Available_from','Type_of_property', 'Type_of_Contract', 'Offer_ID', 'Date_of_last_update', 'Date_of_publication', 'Number_of_levels', 'Bedroom_area', 'Bathroom_area', 'Description']
sheet.append(column_names)

In [9]:
flats

['https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-przewoz-32m2-mzn2042016944',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-tadeusza-szafrana-66m2-mzn2042042364',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-lipska-69m2-mzn2037123987',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-zablocie-65m2-mzn2042984164',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-mysliwska-74m2-mzn2042949660',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-rydlowka-47m2-mzn2042956605',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-rydlowka-26m2-mzn2042987714',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-krakusa-33m2-mzn2042628088',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-heleny-53m2-mzn2042952378',
 'https://www.morizon.pl/oferta/sprzedaz-mieszkanie-krakow-podgorze-37m2-mzn2041365235',
 'https://www.morizon.pl/oferta/sp

In [15]:
option = webdriver.ChromeOptions()
option.add_argument("start-maximized")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=option)

flats = flats

In [16]:
# Inicjalizacja paska postępu
pbar = tqdm(total=len(flats))

# Lista do przechowywania danych
data = []

for link in flats:
    try:
        driver.get(link)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # Słownik do przechowywania danych o mieszkaniu
        flat_data = {}

        flat_data = {"link": link}

        # Cena całkowita i za m²
        price_info = soup.find('div', id="basic-info-price-row")
        if price_info:
            try:
                price = price_info.find('span', class_="OhoajE").text.strip()
                price = price.replace(" zł","").replace(" ","")

                price_per_m2 = price_info.find('span', class_="HXLtdQ").text.strip()
                price_per_m2 = price_per_m2.replace(" zł/m²","").replace(" ","")

                flat_data['cena'] = price
                flat_data['cena za m2'] = price_per_m2
            except AttributeError:
                flat_data['cena'] = "N/A"
                flat_data['cena za m2'] = "N/A"
        

        # Liczba pokoi
        rooms = soup.find('span', class_="bliqZo")
        if rooms:
            rooms = rooms.text.strip()
            rooms = rooms.replace("• ","")
            rooms = rooms.replace("pokój","")
            rooms = rooms.replace("pokoje","")
            flat_data['liczba pokoi'] = rooms

        # Piętro
        floor = soup.find('span', text=lambda x: x and 'piętro' in x)
        if floor:
            flat_data['piętro'] = floor.text.strip()

        # Adres/Lokalizacja
        address = soup.find('h2', class_="OLa28v")
        if address:
            flat_data['adres'] = ' '.join([span.text for span in address.find_all('span')])

        # Opis
        title = soup.find('h1', class_="YN-1ia")
        if title:
            flat_data['tytuł'] = title.text.strip()

        description = soup.find('div', class_="rq0BeK sswXuP")
        if description:
            flat_data['opis'] = description.text.strip()

        features = soup.find_all('div', class_="zyVm89")
        for feature in features:
            key_div = feature.find('div', class_="_0p9fli FClu-8")
            value_div = feature.find('div', class_="_0p9fli EEGlsn")

            if key_div and value_div:
                key = key_div.text.strip()
                value = value_div.text.strip()
                flat_data[key] = value

        features = soup.find_all('div', class_="ldBVNM")
        for feature in features:
            key_div = feature.find('div', class_="_0p9fli FClu-8")
            value_div = feature.find('div', class_="_0p9fli EEGlsn")

            if key_div and value_div:
                key = key_div.text.strip()
                value = value_div.text.strip()
                flat_data[key] = value

        # Aktualizacja danych
        data.append(flat_data)
        
    except Exception as e:
        print(f"Błąd przy przetwarzaniu linku: {link}")
        print(str(e))
        # Możesz tutaj dodać logikę, co zrobić w przypadku błędu (np. dodać pusty słownik danych)

    finally:
        pbar.update(1)
        

pbar.close()
driver.quit()

# Zapis danych do pliku Excel
df = pd.DataFrame(data)

df["Pow. całkowita"] = df["Pow. całkowita"].str.replace(" m²", "", regex=False)
df["Pow. całkowita"] = df["Pow. całkowita"].str.replace(",", ".", regex=False)

# Convert to int
columns_to_int = ["liczba pokoi", "Liczba pięter", "Rok budowy", "Liczba odsłon", "Liczba Punktów Podbić"]
for column in columns_to_int:
    df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')

# Convert to float
columns_to_float = ["cena", "cena za m2", "Pow. całkowita"]
for column in columns_to_float:
    df[column] = pd.to_numeric(df[column], errors='coerce')


  3%|▎         | 187/5766 [07:52<3:54:42,  2.52s/it]

  0%|          | 1/5766 [00:02<4:27:17,  2.78s/it][A
  0%|          | 2/5766 [00:03<2:55:30,  1.83s/it][A
  0%|          | 3/5766 [00:05<2:42:39,  1.69s/it][A
  0%|          | 4/5766 [00:06<2:20:40,  1.46s/it][A
  0%|          | 5/5766 [00:07<2:06:11,  1.31s/it][A
  0%|          | 6/5766 [00:08<2:06:41,  1.32s/it][A
  0%|          | 7/5766 [00:10<2:20:37,  1.47s/it][A
  0%|          | 8/5766 [00:11<2:13:46,  1.39s/it][A
  0%|          | 9/5766 [00:13<2:15:16,  1.41s/it][A
  0%|          | 10/5766 [00:14<2:12:25,  1.38s/it][A
  0%|          | 11/5766 [00:16<2:11:24,  1.37s/it][A
  0%|          | 12/5766 [00:27<6:52:01,  4.30s/it][A
  0%|          | 13/5766 [00:28<5:35:23,  3.50s/it][A
  0%|          | 14/5766 [00:30<4:40:09,  2.92s/it][A
  0%|          | 15/5766 [00:32<4:30:27,  2.82s/it][A
  0%|          | 16/5766 [00:34<3:48:34,  2.39s/it][A
  0%|          | 17/5766 [00:35<3:22:18,  2.11s/it][A
  0%|          | 18

In [18]:
df

Unnamed: 0,link,cena,cena za m2,liczba pokoi,adres,tytuł,opis,Pow. całkowita,Piętro,Liczba pięter,...,Cena do negocjacji,Pow. łazienki,Pow. ogródka,Zapotrzebowanie energetyczne,Urządzony ogród,Typ nieruchomości,Pow. loggii,piętro,Lokalizacja garażu,Dach
0,https://www.morizon.pl/oferta/sprzedaz-mieszka...,534270.0,16500.0,1,"Kraków, Kraków-Podgórze, Podgórze, Przewóz","Kawalerka ul.Przewoz 32,38 m2","Do sprzedania kawalerka o powierzchni 32,38 m...",32.38,3/3,3,...,,,,,,,,,,
1,https://www.morizon.pl/oferta/sprzedaz-mieszka...,1170000.0,17586.0,3,"Kraków, Kraków-Podgórze, Tadeusza Szafrana","Apartament,3 pokoje,66.53 m2,ul.Szafrana 5D,ENG","Na sprzedaż słoneczny, klimatyzowany trzypokoj...",66.53,4/6,6,...,,,,,,,,,,
2,https://www.morizon.pl/oferta/sprzedaz-mieszka...,859401.0,12300.0,4,"Kraków, Podgórze, ul. Siemienowicza, ul. Li...",Opis nieruchomości,Najważniejsze atuty \n✅ Nowoczesna bryła budyn...,69.87,1/5,5,...,,,,,,,,,,
3,https://www.morizon.pl/oferta/sprzedaz-mieszka...,1400000.0,21519.0,3,"Kraków, Podgórze, Zabłocie, Zabłocie",Poczuj unikalny klimat Zabłocia,Klimatyczne 3-pokojowe mieszkanie o powierzchn...,65.06,2/7,7,...,,,,,,,,,,
4,https://www.morizon.pl/oferta/sprzedaz-mieszka...,908000.0,12262.0,4,"Kraków, Myśliwska","Apartament 74,05 m2 > Myśliwska","M 19B74,05 m24 pokojeI piętroParametry mieszka...",74.05,1/4,4,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5761,https://www.morizon.pl/oferta/sprzedaz-mieszka...,633000.0,8792.0,3,"Kraków, Opatkowice, Libertowska",Okolice Krakowa / 3 pokoje z ogrodem / kredyt 2 %,Nowoczesna architektura / mieszkanie z ogrodem...,72.00,,1,...,,,,,,,,,,
5762,https://www.morizon.pl/oferta/sprzedaz-mieszka...,298900.0,4396.0,3,"Kraków M., Kraków, Swoszowice","SWOSZOWICE - BOROWINOWA - 3 POKOJE TBS, 68m2","Sprzedamy ustawne, ŁADNE 3-pokojowe mieszkani...",68.00,,3,...,,5 m²,,,,,,,,
5763,https://www.morizon.pl/oferta/sprzedaz-mieszka...,765000.0,10234.0,4,"Kraków, Opatkowice, Leona Petrażyckiego",Przestronne 4pokojowe mieszkanie kameralne osi...,Do sprzedania przestronne 4 pokojowe mieszkani...,74.75,1/3,3,...,,,,,,,,,,
5764,https://www.morizon.pl/oferta/sprzedaz-mieszka...,830000.0,10050.0,,"Kraków, Swoszowice","Pięć pokoi 107m2, balkon, ul. Petrażyckiego",Klimatyczne mieszkanie dwupoziomowe o całkowit...,82.59,1/1,1,...,,,,,,,,,,


In [17]:
df.to_excel('df.xlsx')