# Hemnet web scraping
## Project goal
- Scrap apartment sold price in the past three month (scraping date: 2023 Jan 9th) </br>
- Apartments are in Stockholm city center: `Gamla stan`, `Kungsholmen`, `Norrmalm`, `Södermalm`, `Vasastan`, `Östermalm`</br>
</br>

## Strategy
- There are 34 pages and 50 items per page. In total 1666 apartments were sold in the past three month  </br>
If I scrape too aggressively I can only scrape 5 pages. Therefore I scrape 10 pages each time. 
    1. [Scrape page 1-15](#p1-15)
    2. [Scrape page 16-26](#p16-26)
    3. [Scrape page 27-34](#p27-34)
</br>

## Overview
1. [Import libraries](#lib)
2. [Define functions](#functions)
3. [Web scraping](#web_scraping)
4. [Merge data](#merge)
5. [Clean data](#data_cleaning)
6. [About the data](#data)

# 1. Import libraries <a id = "lib"></a>

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
import os
from bs4 import BeautifulSoup as bs
import pandas as pd
import unicodedata 
import numpy as np

# 2. Define functions <a id = 'functions'></a>
- function for get data
- function to clean data

In [7]:
# get data from a container
def get_element(key, html):
    element = html.select(key)
    temp = []
    if len(element) > 0:
        return element[0].text.replace('\n', '').replace('  ', '')
    else:
        return np.nan

# clean \x0
def clean_x(s):                                                
    clean_text = unicodedata.normalize("NFKD",s)
    clean_text = clean_text.replace(" ", "")
    return clean_text    

# clean \x0
def clean_x1(s):                                                
    clean_text = unicodedata.normalize("NFKD",s)
    #clean_text = clean_text.replace(" ", "")
    return clean_text    

# 3. Web scraping <a id = 'web_scraping'></a>

## 3.1 Scrape page 1-15<a id = 'p1-15'></a>

In [153]:
# creat a datafram for scraped data
data = {'address': [],'sqm_room': [],'price': [],'price_change': [],'area': [],'list_fee': [],'sold_date': [],'price_sqm': [],'extra': []}
df = pd.DataFrame(data)

# create a driver
driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")   # creat a driver
driver.get("https://www.hemnet.se/")                                   # go to the website you want

# cookie related buttom
WebDriverWait(driver, 2)                                               # wait 2 sec 
element = driver.find_element_by_css_selector("div.consent__buttons>div.consent__button-wrapper>button.hcl-button") # find the location of the bottom
element.click()   # click the bottom "Min Val"

# second page of cookie related buttoms

WebDriverWait(driver, 2)   
button1 = driver.find_element(By.CSS_SELECTOR,"div.consent-model__content-wrapper > div > ul.consent__purposes-with-details > li.qa-consent-item-analytics > div.checkbox-consent__container > div.checkbox-consent__radio-group  > fieldset.hcl-fieldset > label.hcl-radiobutton > span.hcl-radiobutton__icon")
button1.click()
WebDriverWait(driver, 0.3)
button2 = driver.find_element(By.CSS_SELECTOR,"div.consent-model__content-wrapper > div > ul.consent__purposes-with-details > li.qa-consent-item-marketing > div.checkbox-consent__container > div.checkbox-consent__radio-group  > fieldset.hcl-fieldset > label.hcl-radiobutton > span.hcl-radiobutton__icon")
button2.click()

# press the buttom 'Spara val'
WebDriverWait(driver, 0.3)
button3 = driver.find_element(By.CSS_SELECTOR,"div.consent__buttons > div:nth-child(2) > button")
button3.click()

# click Slutpricer
WebDriverWait(driver, 0.3)
Slutpricer = driver.find_element(By.CSS_SELECTOR, 'div.start-page-hero__content > ul.search-tabs > li:nth-child(2) > label.search-tabs__tab-link')
Slutpricer.click()

# put all area we want to search into search bar
areas = ['Gamla stan', 'Kungsholmen', 'Norrmalm', 'Södermalm', 'Vasastan', 'Östermalm']
for i in areas:
    # searching 
    search = driver.find_element(By.CSS_SELECTOR,"div.start-search-form > div.start-search-form__content > div.start-search-form-location > div.start-search-form-location__panes > div.autocomplete-pane > div.location-search-post > ul.token-input-list > li.input-field >input")
    search.send_keys(i)

    # click the aimed area/click_the_first_option.click()
    WebDriverWait(driver, 0.3)

    # find the suggested search result and click it
    a = "div.token-input-dropdown > ul > li.item-first.item.alt > b"
    element = WebDriverWait(driver, 10).until(lambda x: x.find_element(By.CSS_SELECTOR, a)) 
    print(element)
    element.click()

# click three month buttom
WebDriverWait(driver, 0.3)
three_m_key = "div.radio-button-group.radio-button-group--full-width > div.radio-button-group__item:nth-child(1) > label.radio-button-group__label"
button_search = driver.find_element(By.CSS_SELECTOR,three_m_key)
button_search.click()

#click the search buttom
WebDriverWait(driver, 0.3)
search_key = 'button.hcl-button.hcl-button--primary.hcl-button--full-width.js-submit-button.js-show-on-sold.qa-start-search-form-selling-price-submit-button'
button_search = driver.find_element(By.CSS_SELECTOR, search_key)
button_search.click()

# scraping data
# define keys
address_key = 'h2.sold-property-listing__heading.qa-selling-price-title'
sqm_room_key = 'div.sold-property-listing__subheading.sold-property-listing__area'
price_key = "div.sold-property-listing__price > div.sold-property-listing__subheading"
price_change_key = "div.sold-property-listing__price-change"
area_key = "div.sold-property-listing__location > div:nth-child(2)"
list_fee_key = "div.sold-property-listing__fee"
sold_date_key = "div.sold-property-listing__sold-date"
price_sqm_key = "div.sold-property-listing__price-per-m2"
extra_key = "div.hcl-labels-list.hcl-labels-list--row-direction > span"

# make soup
soup = bs(driver.page_source, "lxml")
key = 'div.clear-children.sold.result.normal.sorted-by-sale-date > ul.sold-results > li.sold-results__normal-hit'
containers = soup.select(key)


address = []
sqm_room = []
price = []
price_change = []
area = []
list_fee = []
sold_date = []
price_sqm = []
extra = []

keys = [address_key, sqm_room_key, price_key, price_change_key, area_key, list_fee_key, sold_date_key, price_sqm_key]
lists = [address, sqm_room, price, price_change, area, list_fee, sold_date, price_sqm, extra]        

for i in containers:
    for j, k in zip(keys, lists):
        k.append(get_element(j, i))
    # extra
    e = [j.text.replace('\n', '').replace('  ', '') for j in i.select(extra_key) ]
    extra.append(e)

# go to the next page p2-p5 (250 rows)
for i in range(2,6):

    WebDriverWait(driver, 3)
    next_page = f"div.pagination > div.pagination__item:nth-child({i}) > a.hcl-button.hcl-button--secondary"
    button_search = driver.find_element(By.CSS_SELECTOR, next_page)
    button_search.click()
    
    # make soup
    soup = bs(driver.page_source, "lxml")
    key = 'div.clear-children.sold.result.normal.sorted-by-sale-date > ul.sold-results > li.sold-results__normal-hit'
    containers = soup.select(key)
    
    # scrape
    for i in containers:
        for j, k in zip(keys, lists):
            k.append(get_element(j, i))
        # extra
        e = [j.text.replace('\n', '').replace('  ', '') for j in i.select(extra_key) ]
        extra.append(e)

# go to p 6-15
for i in range(10):
    WebDriverWait(driver, 3)
    next_page = f"div.pagination > div.pagination__item:nth-child(5) > a.hcl-button.hcl-button--secondary"
    button_search = driver.find_element(By.CSS_SELECTOR, next_page)
    button_search.click()
    
    # make soup
    soup = bs(driver.page_source, "lxml")
    key = 'div.clear-children.sold.result.normal.sorted-by-sale-date > ul.sold-results > li.sold-results__normal-hit'
    containers = soup.select(key)
    
    # scrape
    for i in containers:
        for j, k in zip(keys, lists):
            k.append(get_element(j, i))
        # extra
        e = [j.text.replace('\n', '').replace('  ', '') for j in i.select(extra_key) ]
        extra.append(e)
        
# save the reaults
femton_page = {'address': address, 
         'sqm_room': sqm_room, 
         'price': price, 
         'price_change': price_change, 
         'area': area, 
         'list_fee': list_fee, 
         'sold_date': sold_date, 
         'price_sqm': price_sqm, 
         'extra': extra} 
page = pd.DataFrame(femton_page)
page = pd.concat([df, page])
page.to_csv('page_15.csv')

  driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")   # creat a driver
  element = driver.find_element_by_css_selector("div.consent__buttons>div.consent__button-wrapper>button.hcl-button") # find the location of the bottom


<selenium.webdriver.remote.webelement.WebElement (session="345b9d814889eaf05afe89029920cbae", element="d4b0154c-21e2-4efe-82e2-a57484db7857")>
<selenium.webdriver.remote.webelement.WebElement (session="345b9d814889eaf05afe89029920cbae", element="c259d594-bd4f-4cb2-b0e6-4d24782fd22e")>
<selenium.webdriver.remote.webelement.WebElement (session="345b9d814889eaf05afe89029920cbae", element="fa6aa455-91eb-46cf-8c1c-282a123839ef")>
<selenium.webdriver.remote.webelement.WebElement (session="345b9d814889eaf05afe89029920cbae", element="2d3747a4-fd1b-4cda-bb2d-63233d99489f")>
<selenium.webdriver.remote.webelement.WebElement (session="345b9d814889eaf05afe89029920cbae", element="16c65d51-9199-4311-b718-dff9f30fc88e")>
<selenium.webdriver.remote.webelement.WebElement (session="345b9d814889eaf05afe89029920cbae", element="572a6166-71fd-48ea-b51b-6341a9a49b33")>


WebDriverException: Message: unknown error: cannot determine loading status
from disconnected: unable to send message to renderer
  (Session info: chrome=108.0.5359.125)
Stacktrace:
Backtrace:
	(No symbol) [0x00C0F243]
	(No symbol) [0x00B97FD1]
	(No symbol) [0x00A8D04D]
	(No symbol) [0x00A7E9F0]
	(No symbol) [0x00A7EEF7]
	(No symbol) [0x00A881A9]
	(No symbol) [0x00A93455]
	(No symbol) [0x00A96766]
	(No symbol) [0x00A7F2A1]
	(No symbol) [0x00A930DD]
	(No symbol) [0x00AECDCE]
	(No symbol) [0x00AD8386]
	(No symbol) [0x00AB163C]
	(No symbol) [0x00AB269D]
	GetHandleVerifier [0x00EA9A22+2655074]
	GetHandleVerifier [0x00E9CA24+2601828]
	GetHandleVerifier [0x00CB8C0A+619850]
	GetHandleVerifier [0x00CB7830+614768]
	(No symbol) [0x00BA05FC]
	(No symbol) [0x00BA5968]
	(No symbol) [0x00BA5A55]
	(No symbol) [0x00BB051B]
	BaseThreadInitThunk [0x76F400F9+25]
	RtlGetAppContainerNamedObjectPath [0x77DD7BBE+286]
	RtlGetAppContainerNamedObjectPath [0x77DD7B8E+238]


In [152]:
page

Unnamed: 0,address,sqm_room,price,price_change,area,list_fee,sold_date,price_sqm,extra
0,Älvsborgsgatan 3,36 m² 2 rum,Slutpris 3 495 000 kr,,"LägenhetLägenhetSödermalm - Helgalunden,Stockh...",2 698 kr/mån,Såld 9 januari 2023,97 083 kr/m²,[Hiss]
1,"Älvsborgsgatan 3A, 2 tr",106 m² 4 rum,Slutpris 11 000 000 kr,-4 %,"LägenhetLägenhetMaria,Stockholms kommun",5 691 kr/mån,Såld 9 januari 2023,103 774 kr/m²,"[Balkong, Hiss]"
2,"John Ericssonsgatan 18, 3 tr",24 m² 1 rum,Slutpris 2 815 000 kr,+10 %,"LägenhetLägenhetKungsholmen,Stockholms kommun",1 013 kr/mån,Såld 7 januari 2023,117 292 kr/m²,"[Balkong, Hiss]"
3,"Torsgatan 70, 4 tr",31 m² 2 rum,Slutpris 3 800 000 kr,,"LägenhetLägenhetVasastan,Stockholms kommun",2 210 kr/mån,Såld 6 januari 2023,122 581 kr/m²,[Hiss]
4,"Frejgatan 33, 3tr",98 m² 4 rum,Slutpris 8 700 000 kr,+2 %,"LägenhetLägenhetVasastan - Sibirien,Stockholms...",4 758 kr/mån,Såld 6 januari 2023,88 776 kr/m²,"[Balkong, Hiss]"
...,...,...,...,...,...,...,...,...,...
745,Heleneborgsgatan 8C,39 m² 1 rum,Slutpris 3 300 000 kr,+14 %,"LägenhetLägenhetStockholm – Södermalm,Stockhol...",2 151 kr/mån,Såld 11 november 2022,84 615 kr/m²,"[Uteplats, Hiss]"
746,"Ludvigsbergsgatan 13, 6tr",118 m² 4 rum,Slutpris 12 650 000 kr,+8 %,"LägenhetLägenhetSödermalm Högalid,Stockholms k...",4 829 kr/mån,Såld 11 november 2022,107 203 kr/m²,"[Balkong, Hiss]"
747,Fatburs Brunnsgata 18,61 m² 2 rum,Slutpris 4 500 000 kr,-9 %,"LägenhetLägenhetSödermalm,Stockholms kommun",3 334 kr/mån,Såld 11 november 2022,73 770 kr/m²,"[Balkong, Hiss]"
748,"Roslagsgatan 41, 3tr",34 m² 1 rum,Slutpris 3 600 000 kr,+7 %,"LägenhetLägenhetVasastan - Sibirien,Stockholms...",2 202 kr/mån,Såld 11 november 2022,105 882 kr/m²,[]


## 3.2 Scrape p16-26<a id = 'p16-26'></a>

In [160]:
page16 = "https://www.hemnet.se/salda/bostader?location_ids%5B%5D=473362&location_ids%5B%5D=925968&location_ids%5B%5D=925969&location_ids%5B%5D=898472&location_ids%5B%5D=925970&location_ids%5B%5D=473448&page=16&sold_age=3m"
driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")
driver.get(page16)

# cookie related buttom
WebDriverWait(driver, 2)                                               # wait 2 sec 
element = driver.find_element_by_css_selector("div.consent__buttons>div.consent__button-wrapper>button.hcl-button") # find the location of the bottom
element.click()   # click the bottom "Min Val"

# second page of cookie related buttoms

WebDriverWait(driver, 2)   
button1 = driver.find_element(By.CSS_SELECTOR,"div.consent-model__content-wrapper > div > ul.consent__purposes-with-details > li.qa-consent-item-analytics > div.checkbox-consent__container > div.checkbox-consent__radio-group  > fieldset.hcl-fieldset > label.hcl-radiobutton > span.hcl-radiobutton__icon")
button1.click()
WebDriverWait(driver, 0.3)
button2 = driver.find_element(By.CSS_SELECTOR,"div.consent-model__content-wrapper > div > ul.consent__purposes-with-details > li.qa-consent-item-marketing > div.checkbox-consent__container > div.checkbox-consent__radio-group  > fieldset.hcl-fieldset > label.hcl-radiobutton > span.hcl-radiobutton__icon")
button2.click()

# press the buttom 'Spara val'
WebDriverWait(driver, 0.3)
button3 = driver.find_element(By.CSS_SELECTOR,"div.consent__buttons > div:nth-child(2) > button")
button3.click()

# scrape page 16
# make soup
soup = bs(driver.page_source, "lxml")
key = 'div.clear-children.sold.result.normal.sorted-by-sale-date > ul.sold-results > li.sold-results__normal-hit'
containers = soup.select(key)
address = []
sqm_room = []
price = []
price_change = []
area = []
list_fee = []
sold_date = []
price_sqm = []
extra = []

keys = [address_key, sqm_room_key, price_key, price_change_key, area_key, list_fee_key, sold_date_key, price_sqm_key]
lists = [address, sqm_room, price, price_change, area, list_fee, sold_date, price_sqm, extra]        

for i in containers:
    for j, k in zip(keys, lists):
        k.append(get_element(j, i))
    # extra
    e = [j.text.replace('\n', '').replace('  ', '') for j in i.select(extra_key) ]
    extra.append(e)
    
# scrape pages 17- 26    
for i in range(10):
    WebDriverWait(driver, 3)
    next_page = f"div.pagination > div.pagination__item:nth-child(5) > a.hcl-button.hcl-button--secondary"
    button_search = driver.find_element(By.CSS_SELECTOR, next_page)
    button_search.click()
    
    # make soup
    soup = bs(driver.page_source, "lxml")
    key = 'div.clear-children.sold.result.normal.sorted-by-sale-date > ul.sold-results > li.sold-results__normal-hit'
    containers = soup.select(key)
    
    # scrape
    for i in containers:
        for j, k in zip(keys, lists):
            k.append(get_element(j, i))
        # extra
        e = [j.text.replace('\n', '').replace('  ', '') for j in i.select(extra_key) ]
        extra.append(e)

# save the reaults        
page16_26 = {'address': address, 
         'sqm_room': sqm_room, 
         'price': price, 
         'price_change': price_change, 
         'area': area, 
         'list_fee': list_fee, 
         'sold_date': sold_date, 
         'price_sqm': price_sqm, 
         'extra': extra} 
page = pd.DataFrame(page16_26)
page = pd.concat([df, page])
page.to_csv('page_16_26.csv')

  driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")
  element = driver.find_element_by_css_selector("div.consent__buttons>div.consent__button-wrapper>button.hcl-button") # find the location of the bottom


In [161]:
page

Unnamed: 0,address,sqm_room,price,price_change,area,list_fee,sold_date,price_sqm,extra
0,Sven Rinmans Gata 5,30 m² 1 rum,Slutpris 2 985 000 kr,+3 %,"LägenhetLägenhetKungsholmen,Stockholms kommun",1 195 kr/mån,Såld 11 november 2022,99 500 kr/m²,[Hiss]
1,Västmannagatan 79,48 m² 2 rum,Slutpris 5 100 000 kr,+2 %,"LägenhetLägenhetVasastan,Stockholms kommun",1 748 kr/mån,Såld 11 november 2022,106 250 kr/m²,[]
2,Alströmergatan 8 B,120 m² 4 rum,Slutpris 12 000 000 kr,-7 %,"LägenhetLägenhetKungsholmen,Stockholms kommun",5 015 kr/mån,Såld 11 november 2022,100 000 kr/m²,"[Balkong, Hiss]"
3,"Rutger Fuchsgatan 9, 2 tr","30 m² 1,5 rum",Slutpris 2 850 000 kr,+6 %,"LägenhetLägenhetKatarina,Stockholms kommun",1 861 kr/mån,Såld 10 november 2022,95 000 kr/m²,[Hiss]
4,"Götgatan 122, etage - 5+6 tr!",99 m² 5 rum,Slutpris 8 320 000 kr,+4 %,"LägenhetLägenhetKatarina,Stockholms kommun",4 254 kr/mån,Såld 10 november 2022,84 040 kr/m²,"[Balkong, Hiss]"
...,...,...,...,...,...,...,...,...,...
545,Lundagatan 50,35 m² 2 rum,Slutpris 3 750 000 kr,,"LägenhetLägenhetSödermalm - Högalid,Stockholms...",1 839 kr/mån,Såld 18 oktober 2022,107 143 kr/m²,[Balkong]
546,Atlasgatan 7,"68,2 m² 3 rum",Slutpris 7 100 000 kr,+1 %,"LägenhetLägenhetBirkastan/Vasastan,Stockholms ...",2 905 kr/mån,Såld 18 oktober 2022,104 106 kr/m²,[Hiss]
547,"Kocksgatan 40, 6 tr",39 m² 1 rum,Slutpris 3 950 000 kr,+32 %,"LägenhetLägenhetSödermalm - Katarina,Stockholm...",2 250 kr/mån,Såld 18 oktober 2022,101 282 kr/m²,"[Balkong, Hiss]"
548,"Högalidsgatan 40B, 4tr",60 m² 2 rum,Slutpris 6 000 000 kr,+9 %,"LägenhetLägenhetSödermalm Högalid,Stockholms k...",2 528 kr/mån,Såld 18 oktober 2022,100 000 kr/m²,[Hiss]


## 3.3 Scrape p27-34<a id = 'p27-34'></a>

In [162]:
page27 = "https://www.hemnet.se/salda/bostader?location_ids%5B%5D=473362&location_ids%5B%5D=925968&location_ids%5B%5D=925969&location_ids%5B%5D=898472&location_ids%5B%5D=925970&location_ids%5B%5D=473448&page=27&sold_age=3m"
driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")
driver.get(page27)

# cookie related buttom
WebDriverWait(driver, 2)                                               # wait 2 sec 
element = driver.find_element_by_css_selector("div.consent__buttons>div.consent__button-wrapper>button.hcl-button") # find the location of the bottom
element.click()   # click the bottom "Min Val"

# second page of cookie related buttoms

WebDriverWait(driver, 2)   
button1 = driver.find_element(By.CSS_SELECTOR,"div.consent-model__content-wrapper > div > ul.consent__purposes-with-details > li.qa-consent-item-analytics > div.checkbox-consent__container > div.checkbox-consent__radio-group  > fieldset.hcl-fieldset > label.hcl-radiobutton > span.hcl-radiobutton__icon")
button1.click()
WebDriverWait(driver, 0.3)
button2 = driver.find_element(By.CSS_SELECTOR,"div.consent-model__content-wrapper > div > ul.consent__purposes-with-details > li.qa-consent-item-marketing > div.checkbox-consent__container > div.checkbox-consent__radio-group  > fieldset.hcl-fieldset > label.hcl-radiobutton > span.hcl-radiobutton__icon")
button2.click()

# press the buttom 'Spara val'
WebDriverWait(driver, 0.3)
button3 = driver.find_element(By.CSS_SELECTOR,"div.consent__buttons > div:nth-child(2) > button")
button3.click()

# scrape page 16
# make soup
soup = bs(driver.page_source, "lxml")
key = 'div.clear-children.sold.result.normal.sorted-by-sale-date > ul.sold-results > li.sold-results__normal-hit'
containers = soup.select(key)
address = []
sqm_room = []
price = []
price_change = []
area = []
list_fee = []
sold_date = []
price_sqm = []
extra = []

keys = [address_key, sqm_room_key, price_key, price_change_key, area_key, list_fee_key, sold_date_key, price_sqm_key]
lists = [address, sqm_room, price, price_change, area, list_fee, sold_date, price_sqm, extra]        

for i in containers:
    for j, k in zip(keys, lists):
        k.append(get_element(j, i))
    # extra
    e = [j.text.replace('\n', '').replace('  ', '') for j in i.select(extra_key) ]
    extra.append(e)
    
# 28-34   
for i in range(7):
    WebDriverWait(driver, 3)
    next_page = f"div.pagination > div.pagination__item:nth-child(5) > a.hcl-button.hcl-button--secondary"
    button_search = driver.find_element(By.CSS_SELECTOR, next_page)
    button_search.click()
    
    # make soup
    soup = bs(driver.page_source, "lxml")
    key = 'div.clear-children.sold.result.normal.sorted-by-sale-date > ul.sold-results > li.sold-results__normal-hit'
    containers = soup.select(key)
    
    # scrape
    for i in containers:
        for j, k in zip(keys, lists):
            k.append(get_element(j, i))
        # extra
        e = [j.text.replace('\n', '').replace('  ', '') for j in i.select(extra_key) ]
        extra.append(e)
        
# save the reaults
page17_34 = {'address': address, 
         'sqm_room': sqm_room, 
         'price': price, 
         'price_change': price_change, 
         'area': area, 
         'list_fee': list_fee, 
         'sold_date': sold_date, 
         'price_sqm': price_sqm, 
         'extra': extra} 
page = pd.DataFrame(page17_34)
page = pd.concat([df, page])
page.to_csv('page_17_34.csv')

  driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")
  element = driver.find_element_by_css_selector("div.consent__buttons>div.consent__button-wrapper>button.hcl-button") # find the location of the bottom


In [163]:
page

Unnamed: 0,address,sqm_room,price,price_change,area,list_fee,sold_date,price_sqm,extra
0,"Slipgatan 11, 5tr.",37 m² 2 rum,Slutpris 3 225 000 kr,-10 %,"LägenhetLägenhetSödermalm,Stockholms kommun",2 566 kr/mån,Såld 18 oktober 2022,87 162 kr/m²,[]
1,"Rörstrandsgatan 35, Mot Gård","61 m² 2,5 rum",Slutpris 5 650 000 kr,+3 %,"LägenhetLägenhetVasastan - Birkastan,Stockholm...",1 691 kr/mån,Såld 18 oktober 2022,92 623 kr/m²,[Hiss]
2,Klippgatan 14C,64 m² 3 rum,Slutpris 6 700 000 kr,+5 %,"LägenhetLägenhetSödermalm,Stockholms kommun",3 557 kr/mån,Såld 18 oktober 2022,104 688 kr/m²,[Balkong]
3,Gästrikegatan 8,66 m² 2 rum,Slutpris 6 800 000 kr,+2 %,"LägenhetLägenhetVasastan,Stockholms kommun",4 158 kr/mån,Såld 18 oktober 2022,103 030 kr/m²,[Hiss]
4,Pontonjärgatan 32,40 m² 2 rum,Slutpris 4 350 000 kr,+9 %,LägenhetLägenhetKungsholmen - Norr Mälarstrand...,1 674 kr/mån,Såld 18 oktober 2022,108 750 kr/m²,[Hiss]
...,...,...,...,...,...,...,...,...,...
361,"Kellgrensgatan 10, 1 tr",88 m² 4 rum,Slutpris 6 450 000 kr,-1 %,"LägenhetLägenhetKungsholmen,Stockholms kommun",4 981 kr/mån,Såld 1 oktober 2022,73 295 kr/m²,"[Balkong, Hiss]"
362,Roslagsgatan 23 B,60 m² 2 rum,Slutpris 6 042 000 kr,+1 %,"LägenhetLägenhetVasastan,Stockholms kommun",2 299 kr/mån,Såld 1 oktober 2022,100 700 kr/m²,[Balkong]
363,"Sankt Eriksgatan 52A, 5tr","73 m² 2,5 rum",Slutpris 7 050 000 kr,+1 %,"LägenhetLägenhetKungsholmen,Stockholms kommun",3 197 kr/mån,Såld 1 oktober 2022,96 575 kr/m²,"[Balkong, Hiss]"
364,Gästrikegatan 14A,61 m² 3 rum,Slutpris 7 700 000 kr,+5 %,"LägenhetLägenhetVasastan,Stockholms kommun",2 926 kr/mån,Såld 1 oktober 2022,126 230 kr/m²,[]


# 4. Merge all data (page 1-16, 16-26, 17-34) <a id = 'merge'></a>

In [153]:
page_15 = pd.read_csv('page_15.csv', index_col = 'Unnamed: 0')
page_16_26 = pd.read_csv('page_16_26.csv', index_col = 'Unnamed: 0')
page_17_34 = pd.read_csv('page_17_34.csv', index_col = 'Unnamed: 0')
hemnet = pd.concat([page_15, page_16_26, page_17_34])
hemnet = hemnet.reset_index(drop = True)

# 5. Data cleaning <a id = 'data_cleaning'></a>

In [154]:
# data cleaning

hemnet[['sqm', 'rooms']] = hemnet.sqm_room.apply(clean_x).str.replace('rum', '').str.split('m2', expand = True)
hemnet.rooms = hemnet.rooms.str.replace(',', '.')


hemnet.drop(columns = ['sqm_room'], inplace = True)


hemnet.price = hemnet.price.str.replace('Slutpris', '').str.replace('kr', '').apply(clean_x)
hemnet.area = hemnet.area.str.replace('LägenhetLägenhet', '').str.replace(',Stockholms kommun', '')


# cleran list_fee
hemnet.list_fee = [i if type(i) == float else clean_x(i) for i in hemnet.list_fee.str.replace('kr/mån', '')]
hemnet.list_fee = hemnet.list_fee.astype('float')

# clean sold_date
hemnet.sold_date = hemnet.sold_date.str.replace('Såld ', '').apply(clean_x1)
months = {'januari': 1, 'december': 12, 'november': 11, 'oktober': 10}
for month, num in months.items():
    hemnet.sold_date = hemnet.sold_date.str.replace(month, str(num))
hemnet.sold_date = pd.to_datetime(hemnet.sold_date, format = "%d %m %Y")

# clean price_sqm
hemnet.price_sqm = [i if type(i) == float else clean_x(i).replace('kr/m2', '') for i in hemnet.price_sqm]

# clean extra spaces
extra = hemnet.extra.str.replace("[", '', regex = False).str.replace("]", '', regex = False).str.replace("'", '', regex = False).str.replace(" ", '', regex = False).str.split(',')
extra_space = list(set([j for i in extra for j in i]))[1:]
temp = {}
for num, i in enumerate(extra):
    extra_space_dic = {'Hiss': 0, 'Uteplats': 0, 'Nyproduktion': 0, 'Balkong': 0} 
    for j in i:
        if j in extra_space_dic.keys():
            extra_space_dic[j] = 1
        temp[num] = extra_space_dic
hemnet[['Hiss', 'Uteplats', 'Nyproduktion', 'Balkong']] = pd.DataFrame(temp).T
hemnet.drop(columns = ['extra'], inplace = True)

# clean price_change
hemnet.price_change = [i if type(i) == float else clean_x(i) for i in hemnet.price_change.str.replace('%', '').str.replace('±0', '0')]
hemnet.price_change = hemnet.price_change.astype(float)/100

# save the data as csv
hemnet.to_csv('hemnet.csv')

# 6. About the data <a id = 'data'></a>
### columns
`address`: address of the sold apartment </br>
`price`: price of the apartment</br>
`price_change`: price change of the apartment</br>
`area`: area of the apartment</br>
`list_fee (kr/mån)`: how much to pay per month</br>
`sold_date`: sold date</br>
`price_sqm`: price per sqm</br>
`sqm`: sqm</br>
`rooms`: how many rooms</br>
`Hiss`: elevator</br>
`Uteplats`: Patio</br>
`Nyproduktion`: newly built</br>
`Balkong`: balcony</br>

In [172]:
hemnet

Unnamed: 0,address,price,price_change,area,list_fee,sold_date,price_sqm,sqm,rooms,Hiss,Uteplats,Nyproduktion,Balkong
0,Älvsborgsgatan 3,3495000,,Södermalm - Helgalunden,2698.0,2023-01-09,97083,36,2,1,0,0,0
1,"Älvsborgsgatan 3A, 2 tr",11000000,-0.04,Maria,5691.0,2023-01-09,103774,106,4,1,0,0,1
2,"John Ericssonsgatan 18, 3 tr",2815000,0.10,Kungsholmen,1013.0,2023-01-07,117292,24,1,1,0,0,1
3,"Torsgatan 70, 4 tr",3800000,,Vasastan,2210.0,2023-01-06,122581,31,2,1,0,0,0
4,"Frejgatan 33, 3tr",8700000,0.02,Vasastan - Sibirien,4758.0,2023-01-06,88776,98,4,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1661,"Kellgrensgatan 10, 1 tr",6450000,-0.01,Kungsholmen,4981.0,2022-10-01,73295,88,4,1,0,0,1
1662,Roslagsgatan 23 B,6042000,0.01,Vasastan,2299.0,2022-10-01,100700,60,2,0,0,0,1
1663,"Sankt Eriksgatan 52A, 5tr",7050000,0.01,Kungsholmen,3197.0,2022-10-01,96575,73,2.5,1,0,0,1
1664,Gästrikegatan 14A,7700000,0.05,Vasastan,2926.0,2022-10-01,126230,61,3,0,0,0,0
