In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

In [2]:
# Define target url. Search for houses in Copenhagen and surroundings, boligtype = Rent
url = "https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/"
response = requests.get(url)

In [3]:
response.status_code

200

In [4]:
soup = BeautifulSoup(response.text,'html.parser')

# Get pagination links 

In [5]:
# There are 153 pages, starting from index page 1
pagination_html = soup.find('div',{'class':'css-95dbty'}).find_all('button',{'class':'temporaryButtonClassname'})
for i in range(len(pagination_html)):
    print(pagination_html[i].get_text())

1
2
3
4
5
111


In [6]:
# We can see that when we choose the 2nd page then the offset attribute in the url changes 2nd page -> offset = 18 (or [2-1]*18,
# 3rd page -> offset = 36 (or [3-1]*18) etc)

In [7]:
links_of_pages = []
for i in range(1,int(pagination_html[-1].get_text())+1):
    if i == 1:
        links_of_pages.append('https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/')
    else:
        links_of_pages.append('https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset={}'.format((i-1)*18))

In [8]:
links_of_pages

['https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=18',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=36',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=54',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=72',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=90',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=108',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=126',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=144',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=162',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=180',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?offset=198',
 'https://www.boligportal.dk/en/rental-properties/k%C3%B8benhavn/?of

# Get links of ads in first page 

In [10]:
# first page contains 18 ads 
first_page_ads = soup.find_all('div',{'class':'css-7r8xmo'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-nkly31'})
len(first_page_ads)

IndexError: list index out of range

In [10]:
soup.find_all('div',{'class':'css-7r8xmo'})[1].find_all('div',{'class':'temporaryFlexColumnClassName css-nkly31'})[0].find('div',{'class':'css-m2wmpr'}).find('a')['href']

'/en/rental-apartments/k%C3%B8benhavn/100m2-4-room-id-5216044'

In [11]:
first_page_links = []
for i in range(len(first_page_ads)):
    first_page_links.append('https://www.boligportal.dk{}'.format(first_page_ads[i].find('div',{'class':'css-m2wmpr'}).find('a')['href']))

In [12]:
first_page_links

['https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/100m2-4-room-id-5216044',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/84m2-3-room-id-5242890',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/115m2-5-room-id-5430086',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/58m2-2-room-id-5414477',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/92m2-4-room-id-5344440',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/155m2-4-room-id-5276450',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/55m2-2-room-id-5430114',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/63m2-2-room-id-5430113',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/103m2-4-room-id-5047279',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/63m2-2-room-id-5422366',
 'https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/60m2-2-room-id-5260320',
 'http

# Lets get the links of all ads across all pages 

In [13]:
# So, now we have to iterate through links_of_pages list, connect to each link and get its links.

In [14]:
all_ads_links = []
for link in tqdm(links_of_pages):
    url = link
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'html.parser')
    page_ads = soup.find_all('div',{'class':'css-7r8xmo'})[1].find_all('div',{'class':'temporaryFlexColumnClassName css-nkly31'})
    for i in range(len(page_ads)):
        all_ads_links.append('https://www.boligportal.dk{}'.format(page_ads[i].find('div',{'class':'css-m2wmpr'}).find('a')['href']))

100%|████████████████████████████████████████████████████████████████████████████████| 152/152 [01:26<00:00,  1.75it/s]


In [15]:
len(all_ads_links)

2732

# Get data

In [16]:
# Now I have to connect to each link from above and get the data that I need. 
# Store them in a dictionary and append them in a list . 
# Should look like this [{home1:id},{home2:id}, etc]

In [17]:
#########

In [18]:
url = "https://www.boligportal.dk/en/rental-apartments/k%C3%B8benhavn/96m2-2-room-id-5427607"
response = requests.get(url)
soup = BeautifulSoup(response.text,'html.parser')

In [19]:
# adress
soup.find_all('div',{'class':'css-1gjufnd'})[0].find('div',{'class':'css-o9y6d5'}).get_text()

'Nordmarks Alle, 2620 København, Albertslund  - 2nd floor'

In [20]:
# About property 
about_property = soup.find_all('section',{'class':'css-6tqm96'})[0]

In [21]:
# Property tpy
about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[0].find('div',{'class':'css-6tqm96'}).get_text()

'Property type'

In [22]:
about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[0].find('div',{'class':'css-5fqrc1'}).get_text()

'Apartment'

In [23]:
lst1 = []
lst2 = []
for i in range(len(about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'}))):
    lst1.append(about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-6tqm96'}).get_text())
    lst2.append(about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-5fqrc1'}).get_text())
    new = dict(zip(lst1,lst2))

In [24]:
new

{'Property type': 'Apartment',
 'Size': '96 m²',
 'Rooms': '2',
 'Floor': '2nd ',
 'Furnished': 'No',
 'Shareable': 'No',
 'Pets allowed': 'Yes',
 'Elevator': 'Yes',
 'Senior friendly': 'Not specified',
 'Students only': 'Not specified',
 'Balcony': 'Yes',
 'Parking': 'Not specified',
 'Dishwasher': 'Not specified',
 'Washing machine': 'Not specified',
 'Electric charging station': 'No',
 'Dryer': 'Not specified',
 'Energy rating': '-'}

In [25]:
# About rental 
about_rental = soup.find_all('section',{'class':'css-6tqm96'})[1]

In [26]:
about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[0].find('div',{'class':'css-6tqm96'}).get_text()

'Rental period'

In [27]:
about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[0].find('div',{'class':'css-5fqrc1'}).get_text()

'Unlimited'

In [28]:
lst2 = []
lst3 = []
for i in range(len(about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'}))):
    lst2.append(about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-6tqm96'}).get_text())
    lst3.append(about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-5fqrc1'}).get_text())
    new1 = dict(zip(lst2,lst3))

In [29]:
new1

{'Rental period': 'Unlimited',
 'Available from': 'As soon as possible',
 'Monthly net rent': '12.400 kr.',
 'Deposit': '37.200 kr.',
 'Prepaid rent': '12.400 kr.',
 'Move-in price': '62.000 kr.',
 'Creation Date': '21/03/2024',
 'Listing-id': '5427607'}

In [30]:
new.update(new1)

In [31]:
new['Adress'] = soup.find_all('div',{'class':'css-1gjufnd'})[0].find('div',{'class':'css-o9y6d5'}).get_text()

In [32]:
new

{'Property type': 'Apartment',
 'Size': '96 m²',
 'Rooms': '2',
 'Floor': '2nd ',
 'Furnished': 'No',
 'Shareable': 'No',
 'Pets allowed': 'Yes',
 'Elevator': 'Yes',
 'Senior friendly': 'Not specified',
 'Students only': 'Not specified',
 'Balcony': 'Yes',
 'Parking': 'Not specified',
 'Dishwasher': 'Not specified',
 'Washing machine': 'Not specified',
 'Electric charging station': 'No',
 'Dryer': 'Not specified',
 'Energy rating': '-',
 'Rental period': 'Unlimited',
 'Available from': 'As soon as possible',
 'Monthly net rent': '12.400 kr.',
 'Deposit': '37.200 kr.',
 'Prepaid rent': '12.400 kr.',
 'Move-in price': '62.000 kr.',
 'Creation Date': '21/03/2024',
 'Listing-id': '5427607',
 'Adress': 'Nordmarks Alle, 2620 København, Albertslund  - 2nd floor'}

In [33]:
# Now I have to connect to each link from above and get the data that I need. 
# Store them in a dictionary and append them in a list . 
# Should look like this [{home1:id},{home2:id}, etc]

In [34]:
def get_data(link):
    prop_key = []
    prop_value = []
    rental_key = []
    rental_value = []
    properties = {}
    rental_features = {}
    
    url = link
    response = requests.get(url)
    soup = BeautifulSoup(response.text,'html.parser')
    about_property = soup.find_all('section',{'class':'css-6tqm96'})[0]
    about_rental = soup.find_all('section',{'class':'css-6tqm96'})[1]

    for i in range(len(about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'}))):
        prop_key.append(about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-6tqm96'}).get_text())
        prop_value.append(about_property.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-5fqrc1'}).get_text())
        properties = dict(zip(prop_key,prop_value))

    for i in range(len(about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'}))):
        rental_key.append(about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-6tqm96'}).get_text())
        rental_value.append(about_rental.find_all('div',{'class':'css-1f8ckkp'})[0].find_all('div',{'class':'temporaryFlexColumnClassName css-1ksgrzt'})[i].find('div',{'class':'css-5fqrc1'}).get_text())
        rental_features = dict(zip(rental_key,rental_value))
    properties.update(rental_features)
    properties['Adress'] = soup.find_all('div',{'class':'css-1gjufnd'})[0].find('div',{'class':'css-o9y6d5'}).get_text()
    return properties

In [35]:
data = [] 
for link in tqdm(all_ads_links):
    data.append(get_data(link))

100%|██████████████████████████████████████████████████████████████████████████████| 2732/2732 [19:07<00:00,  2.38it/s]


In [36]:
data

[{'Property type': 'Apartment',
  'Size': '100 m²',
  'Rooms': '4',
  'Floor': '2nd ',
  'Furnished': 'No',
  'Shareable': 'No',
  'Pets allowed': 'No',
  'Elevator': 'No',
  'Senior friendly': 'Not specified',
  'Students only': 'Not specified',
  'Balcony': 'Yes',
  'Parking': 'No',
  'Dishwasher': 'Not specified',
  'Washing machine': 'Not specified',
  'Electric charging station': 'No',
  'Dryer': 'Not specified',
  'Energy rating': '-',
  'Rental period': 'Unlimited',
  'Available from': '1 July 2024',
  'Monthly net rent': '18.500 kr.',
  'Utilities': '850 kr.',
  'Deposit': '55.500 kr.',
  'Prepaid rent': '55.500 kr.',
  'Move-in price': '130.350 kr.',
  'Creation Date': '04/04/2024',
  'Listing-id': '5216044',
  'Adress': 'Ejgårdsvej, 2920 København, Charlottenlund  - 2nd floor'},
 {'Property type': 'Apartment',
  'Size': '84 m²',
  'Rooms': '3',
  'Floor': '3rd ',
  'Furnished': 'Yes',
  'Shareable': 'No',
  'Pets allowed': 'No',
  'Elevator': 'No',
  'Senior friendly': 'No',


# Load data to a dataframe

In [37]:
df = pd.DataFrame(data)

In [38]:
df

Unnamed: 0,Property type,Size,Rooms,Floor,Furnished,Shareable,Pets allowed,Elevator,Senior friendly,Students only,...,Available from,Monthly net rent,Utilities,Deposit,Prepaid rent,Move-in price,Creation Date,Listing-id,Adress,Housing deposit
0,Apartment,100 m²,4,2nd,No,No,No,No,Not specified,Not specified,...,1 July 2024,18.500 kr.,850 kr.,55.500 kr.,55.500 kr.,130.350 kr.,04/04/2024,5216044,"Ejgårdsvej, 2920 København, Charlottenlund - ...",
1,Apartment,84 m²,3,3rd,Yes,No,No,No,No,No,...,1 June 2024,15.000 kr.,,45.000 kr.,0 kr.,60.000 kr.,04/04/2024,5242890,"Sølvgade, 1307 København, København K - 3rd f...",
2,Apartment,115 m²,5,Ground floor,Yes,No,No,No,No,No,...,As soon as possible,36.000 kr.,3.000 kr.,108.000 kr.,108.000 kr.,255.000 kr.,04/04/2024,5430086,"Holbergsgade, 1057 København, København K - G...",
3,Apartment,58 m²,2,2nd,No,Yes,Yes,Yes,Yes,No,...,30 June 2024,10.300 kr.,700 kr.,30.900 kr.,10.300 kr.,52.200 kr.,04/04/2024,5414477,"Fjerkløvervej, 2750 København, Ballerup - 2nd...",
4,Apartment,92 m²,4,1st,No,Yes,No,Yes,No,No,...,30 June 2024,18.200 kr.,1.000 kr.,54.600 kr.,18.200 kr.,92.000 kr.,04/04/2024,5344440,"Amerikavej, 1756 København, København V - 1st...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2727,Apartment,55 m²,2,Ground floor,Yes,No,No,No,Yes,No,...,15 July 2024,14.000 kr.,0 kr.,2.000 kr.,0 kr.,16.000 kr.,27/12/2021,5218308,"Hedemannsgade, 2100 København, København Ø - ...",
2728,Room,15 m²,1,3rd,Yes,Yes,Yes,No,No,No,...,14 April 2024,8.000 kr.,0 kr.,10.000 kr.,0 kr.,18.000 kr.,17/07/2021,5091127,"Marstalsgade, 2100 København, København Ø - 3...",
2729,Apartment,125 m²,3,4th,Yes,Yes,No,No,No,No,...,As soon as possible,30.000 kr.,0 kr.,10.000 kr.,0 kr.,40.000 kr.,07/07/2021,5204260,"Arkonagade, 1726 København, København V - 4th...",
2730,Room,22 m²,1,4th,Yes,No,No,Not specified,Not specified,Not specified,...,As soon as possible,3.800 kr.,0 kr.,7.600 kr.,3.800 kr.,15.200 kr.,29/10/2020,5024726,"Jagtvej, 2100 København, København Ø - 4th floor",


In [39]:
df.to_csv('ads_data.csv',index=False)