In [1]:
# Entry point of the scrape: the listing page that the later cells load first
website = "https://www.morizon.pl/mieszkania/krakow/"

In [2]:
# importing Selenium libraries
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# importing BeautifulSoup library
from bs4 import BeautifulSoup

# importing Numpy & Pandas libraries
import pandas as pd
import numpy as np

# importing additional libraries
from requests import get
import requests, openpyxl
import re 
from tqdm import tqdm

In [3]:
# Browser options: the only argument passed to Chrome is "start-maximized"
options = Options()
options.add_argument("start-maximized")

# The value returned by ChromeDriverManager().install() is handed to Service,
# and that service is used to start the Chrome session.
# NOTE(review): no headless argument is added here, so the browser is
# presumably started with a visible window -- confirm this is intended.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

In [4]:
# Load the start page in the browser and take the rendered markup
driver.get(website)
html = driver.page_source

# Swap the hard-coded advertisement URL prefix for the text "Unspecified"
# before the markup is parsed
ad_link_prefix = "https://www.lendi.pl/formularz/kredyty-hipoteczne?hypothecValue=539900.00&loanPurpose.what=Apartment&loanPurpose.market=SecondaryMarket&meeting_voivoidship=małopolskie&meeting_city=Kraków&utm_entry_page=https://www.mori"
html = html.replace(ad_link_prefix, "Unspecified")

soup = BeautifulSoup(html, 'html.parser')

# Container <div id="locationListChildren"> that holds the subcategory anchors
subcategories = soup.find("div", {"id": "locationListChildren"})

# href of every anchor in the container whose href matches "/mieszkania/"
subcategory_anchors = subcategories.find_all("a", href=re.compile("/mieszkania/"))
links = [anchor.get('href') for anchor in subcategory_anchors]

# Drop repeated hrefs while keeping the first-seen order
links = list(dict.fromkeys(links))

# Prefix every href with the site root to obtain the URLs visited later
base_url = "https://www.morizon.pl"
full_links = []
for href in links:
    full_links.append(base_url + href)

In [5]:
def substring_after(s, delim):
    """Return the part of ``s`` that follows the first occurrence of ``delim``.

    If ``delim`` does not occur in ``s``, the empty string is returned.
    """
    _head, _separator, tail = s.partition(delim)
    return tail

In [6]:
# Build the list of result-page URLs (`pages`) for every subcategory in
# `full_links`: one "<subcategory>?page=<n>" URL for n = 1 .. highest page
# number advertised by the subcategory's pagination bar.
pages = []

# A page number is the run of digits right after "page=" in a pagination href.
# Hrefs without such a parameter are ignored instead of being fed to int().
page_number_re = re.compile(r"page=(\d+)")

# Iterating over tqdm(...) advances the bar once per subcategory and closes it
# when the loop ends.
for link2 in tqdm(full_links):
    # Get the HTML source of the subcategory and parse it
    driver.get(link2)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Find the pagination element (absent when the subcategory has one page)
    pagination = soup.find("ul", {"class": "nav nav-pills mz-pagination-number"})

    # Collect every page number the pagination links mention
    page_numbers = []
    if pagination:
        for anchor in pagination.find_all("a", href=re.compile("/mieszkania/")):
            match = page_number_re.search(anchor.get('href'))
            if match:
                page_numbers.append(int(match.group(1)))

    # No pagination, or no usable page number in it -> a single page
    maxval = max(page_numbers, default=1)

    # Create the URL for each page of this subcategory
    pages.extend(f"{link2}?page={number}" for number in range(1, maxval + 1))

# Second name for the same list, kept so code that still reads `pages2` works
pages2 = pages

100%|██████████| 18/18 [00:45<00:00,  2.55s/it]


In [11]:
# Collect the offer links found on every result page in `pages`.

# Only hrefs that START with the site's offer path are accepted (scheme and
# "//" are optional, dots are escaped). The previous approach searched for the
# offer path anywhere in the href and relied on blanking out one hard-coded
# advertisement URL (with one specific hypothecValue) beforehand, so any other
# link that merely embedded an offer URL would have been collected as a flat.
offer_href_re = re.compile(
    r"^(?:(?:https?:)?//)?www\.morizon\.pl/oferta/sprzedaz-mieszkanie"
)

# Initialize empty list to store the links
flats = []

# Iterating over tqdm(...) advances the bar once per page and closes it when
# the loop ends.
for page in tqdm(pages):
    # Get the HTML source of the result page and parse it
    driver.get(page)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Append the href of every anchor that points at an offer
    flats.extend(anchor.get('href') for anchor in soup.find_all("a", href=offer_href_re))

# Remove duplicated links while keeping the first-seen order
flats = list(dict.fromkeys(flats))

100%|██████████| 282/282 [10:04<00:00,  2.14s/it]


In [12]:
# New workbook; its active sheet is the one the scraped rows are appended to
excel = openpyxl.Workbook()
sheet = excel.active
sheet.title = 'Morizonflats'

# Header row, one entry per value appended for each flat.
# NOTE(review): the scraping loop fills 'Bedroom_area' from the
# 'Liczba sypialni:' table row and 'Type_of_property' from 'Forma własności:'
# -- confirm these headers say what is intended.
column_names = [
    'Total_Price',
    'Price_per_Square_Meter',
    'Total_living_area',
    'Number_of_Rooms',
    'Offer_Link',
    'Latitude',
    'Longitude',
    'Developer_name',
    'Walls_height',
    'Investment_name',
    'Floor',
    'Total_number_of_floors',
    'Kitchen_type',
    'Number_of_bathrooms',
    'Toilet_and_WC_type',
    'Balcony',
    'Total_area_of_Balcony',
    'Total_area_of_Garden',
    'Market_type',
    'Available_from',
    'Type_of_property',
    'Type_of_Contract',
    'Offer_ID',
    'Date_of_last_update',
    'Date_of_publication',
    'Number_of_levels',
    'Bedroom_area',
    'Bathroom_area',
    'Description',
]
sheet.append(column_names)

In [16]:
# Visit every offer in `flats`, extract its data and append one row per flat
# to `sheet`; the workbook is written to 'Morizon_flats.xlsx' at the end.

# Caption words blanked out of the headline figures (price, price per m²,
# living area, rooms). They are removed from the extracted text only -- never
# from the page HTML -- so table labels such as "Powierzchnia balkonu:" and
# the description text stay intact.
PARAM_LABELS = ("Cena", "za m²", "Powierzchnia", "Pokoje")

# Labels of the details-table rows written to the sheet, in sheet-column order
# (columns 8-28, 'Developer_name' .. 'Bathroom_area').
TABLE_LABELS = [
    "Deweloper:",
    "Wysokość wnętrza:",
    "Inwestycja:",
    "Piętro:",
    "Liczba pięter:",
    "Typ kuchni:",
    "Liczba łazienek:",
    "Czy łazienka z WC:",
    "Balkon:",
    "Powierzchnia balkonu:",
    "Powierzchnia ogródka:",
    "Rynek:",
    "Dostępne od:",
    "Forma własności:",
    "Rodzaj umowy:",
    "Numer oferty:",
    "Zaktualizowano:",
    "Opublikowano:",
    "Liczba poziomów mieszkania:",
    "Liczba sypialni:",
    "Powierzchnia łazienki:",
]


def _param_text(soup, css_class, unit=None):
    """Return the cleaned text of the ``<li class=css_class>`` element.

    The caption words in ``PARAM_LABELS`` and, if given, ``unit`` are replaced
    by spaces, then whitespace is collapsed. Returns "N/A" when the element is
    missing or nothing is left after cleaning.
    """
    element = soup.find('li', attrs={'class': css_class})
    if element is None:
        return "N/A"
    text = element.text
    for token in PARAM_LABELS:
        text = text.replace(token, " ")
    if unit is not None:
        text = text.replace(unit, " ")
    return " ".join(text.split()) or "N/A"


def _table_fields(soup):
    """Return ``{label: value}`` read from the first ``<table>`` of the page.

    The first table column is used as label and the second as value. Rows with
    a missing label or value are skipped; if a label occurs more than once the
    first value is kept.

    Raises:
        LookupError: the page contains no ``<table>`` element.
    """
    from io import StringIO

    tables = soup.find_all('table')
    if not tables:
        raise LookupError("no <table> element on the page")

    # read_html is given a file-like object: passing the markup as a plain
    # string is deprecated in recent pandas versions.
    details = pd.read_html(StringIO(str(tables[0])))[0]

    fields = {}
    for label, value in zip(details.iloc[:, 0], details.iloc[:, 1]):
        if pd.isna(label) or pd.isna(value):
            continue
        # numpy scalars are converted to plain Python values before they are
        # handed to the spreadsheet
        if hasattr(value, "item"):
            value = value.item()
        fields.setdefault(label, value)
    return fields


try:
    # Iterating over tqdm(...) advances the bar once per flat and closes it
    # when the loop ends.
    for link in tqdm(flats):
        # Open the offer page and parse the rendered markup
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Headline figures; each one is read and cleaned exactly once
        price = _param_text(soup, 'paramIconPrice', unit="zł")
        pricem2 = _param_text(soup, 'paramIconPriceM2', unit="zł")
        livingarea = _param_text(soup, 'paramIconLivingArea', unit="m²")
        rooms = _param_text(soup, 'paramIconNumberOfRooms')

        # Description text ("N/A" when the element is missing or empty)
        description_element = soup.find("div", {"class": "description"})
        opis = description_element.text.strip() if description_element is not None else ""
        opis = opis or "N/A"

        # Coordinates stored as attributes of the map container
        map_div = soup.find("div", class_="GoogleMap")
        lat = map_div.get("data-lat", "N/A") if map_div is not None else "N/A"
        lng = map_div.get("data-lng", "N/A") if map_div is not None else "N/A"

        try:
            # Details table -> sheet columns; labels missing from the table
            # are filled with "N/A"
            fields = _table_fields(soup)
            row = [price, pricem2, livingarea, rooms, link, lat, lng]
            row += [fields.get(label, "N/A") for label in TABLE_LABELS]
            row.append(opis)

            # Append all scraped values to the Excel sheet
            sheet.append(row)
        except Exception as exc:
            # As before, an offer whose table cannot be read is skipped; the
            # link and the reason are reported so the gap can be investigated.
            print(f"n/a: {link} ({exc!r})")
finally:
    # Save the Excel file -- also when the run is interrupted, so rows
    # scraped so far are not lost
    excel.save('Morizon_flats.xlsx')


  8%|▊         | 706/8946 [22:30<4:22:45,  1.91s/it]

  0%|          | 1/8946 [00:02<5:19:22,  2.14s/it][A
  0%|          | 2/8946 [00:04<5:31:30,  2.22s/it][A
  0%|          | 3/8946 [00:06<5:10:53,  2.09s/it][A
  0%|          | 4/8946 [00:08<4:59:58,  2.01s/it][A
  0%|          | 5/8946 [00:10<4:52:41,  1.96s/it][A
  0%|          | 6/8946 [00:12<5:04:58,  2.05s/it][A
  0%|          | 7/8946 [00:14<5:24:18,  2.18s/it][A
  0%|          | 8/8946 [00:16<5:09:54,  2.08s/it][A
  0%|          | 9/8946 [00:18<5:20:04,  2.15s/it][A
  0%|          | 10/8946 [00:21<5:36:47,  2.26s/it][A
  0%|          | 11/8946 [00:23<5:25:00,  2.18s/it][A
  0%|          | 12/8946 [00:27<6:33:23,  2.64s/it][A
  0%|          | 13/8946 [00:30<6:48:52,  2.75s/it][A
  0%|          | 14/8946 [00:32<6:50:18,  2.76s/it][A
  0%|          | 15/8946 [00:37<7:49:49,  3.16s/it][A
  0%|          | 16/8946 [00:41<8:49:33,  3.56s/it][A
  0%|          | 17/8946 [00:43<7:59:31,  3.22s/it][A
  0%|          | 18

n/a



 95%|█████████▌| 8540/8946 [5:02:29<17:50,  2.64s/it][A
 95%|█████████▌| 8541/8946 [5:02:32<18:24,  2.73s/it][A
 95%|█████████▌| 8542/8946 [5:02:35<18:37,  2.77s/it][A
 95%|█████████▌| 8543/8946 [5:02:37<17:17,  2.58s/it][A
 96%|█████████▌| 8544/8946 [5:02:39<16:09,  2.41s/it][A
 96%|█████████▌| 8545/8946 [5:02:41<15:33,  2.33s/it][A
 96%|█████████▌| 8546/8946 [5:02:43<15:50,  2.38s/it][A
 96%|█████████▌| 8547/8946 [5:02:46<16:08,  2.43s/it][A
 96%|█████████▌| 8548/8946 [5:02:49<17:56,  2.70s/it][A
 96%|█████████▌| 8549/8946 [5:02:52<18:30,  2.80s/it][A
 96%|█████████▌| 8550/8946 [5:02:55<19:01,  2.88s/it][A
 96%|█████████▌| 8551/8946 [5:02:59<19:55,  3.03s/it][A
 96%|█████████▌| 8552/8946 [5:03:01<17:57,  2.73s/it][A
 96%|█████████▌| 8553/8946 [5:03:03<17:48,  2.72s/it][A
 96%|█████████▌| 8554/8946 [5:03:06<17:41,  2.71s/it][A
 96%|█████████▌| 8555/8946 [5:03:08<16:15,  2.49s/it][A
 96%|█████████▌| 8556/8946 [5:03:10<14:58,  2.30s/it][A
 96%|█████████▌| 8557/8946 [5:

In [17]:
# Write the workbook to disk under its final file name
final_workbook_path = 'Morizon_flats_final.xlsx'
excel.save(final_workbook_path)