In [1]:
from bs4 import BeautifulSoup
import cloudscraper
import pandas as pd
import re
import json

In [2]:
def get_house_links(soup):
    """Takes the soup of a page and returns a list of links to each house"""
    a_tags = soup.find_all('a',class_='sold-property-link js-sold-property-card-link')
    return [tag.get('href') for tag in a_tags]

def get_houses_info(links:list[str]):
    """Takes the link to a house and returns a dictionary with the house properties"""
    scraper = cloudscraper.create_scraper()  # returns a CloudScraper instance
    
    house_properties = []
    
    for link in links:
        html = scraper.get(link).content
        soup = BeautifulSoup(html, 'html.parser')
        
        data_raw = json.loads(soup.find('div', class_='sold-property__map js-listing-map-sold')['data-initial-data'])
        
        data_parsed = {
            "adress": f"{data_raw['listing']['streetAddress']}, {data_raw['listing']['area']}, {data_raw['listing']['municipality']['fullName']}",
            "longitude": f"{data_raw['listing']['coordinate'][0]}",
            "latitude": f"{data_raw['listing']['coordinate'][1]}",
            "living_area": f"{data_raw['listing']['livingArea']}",
            "land_area": f"{data_raw['listing']['landArea']}",
            "supplemental_area": f"{data_raw['listing']['supplementalArea']}",
            "patio":'1' if 'Uteplats' in [feature['text'] for feature in data_raw['listing']['labels']] else '0',
            "balcony":'1' if 'Balkong' in [feature['text'] for feature in data_raw['listing']['labels']] else '0',
            "number_of_rooms": f"{data_raw['listing']['numberOfRooms']}",
            "build_year": re.sub(r"^\s+|\s+$", "", soup.find("dl", class_="sold-property__attributes").find("dt", text="Byggår").find_next_sibling('dd').text) if soup.find("dl", class_="sold-property__attributes").find("dt", text="Byggår") is not None else 'None',
            "operating cost":' '.join(re.findall(r'\d+', soup.find("dl", class_="sold-property__attributes").find("dt", text="Driftskostnad").find_next_sibling('dd').text)) if soup.find("dl", class_="sold-property__attributes").find("dt", text="Driftskostnad") is not None else 'None',
            "sold_price": f"{data_raw['listing']['soldAt']}"
        }
        
        house_properties.append(data_parsed)
        
    return house_properties
    


In [3]:
scraper = cloudscraper.create_scraper()  # returns a CloudScraper instance
html = scraper.get("https://www.hemnet.se/salda/bostader?item_types%5B%5D=villa&page=1&sold_age=12m").content
soup = BeautifulSoup(html, 'html.parser')

links = get_house_links(soup)

data = get_houses_info(links)

In [4]:
data

[{'adress': 'Lännavägen 49, Hörningsnäs, Huddinge',
  'longitude': '59.230648',
  'latitude': '17.99937',
  'living_area': '122.0',
  'land_area': '1184.0',
  'supplemental_area': '133.0',
  'patio': '0',
  'balcony': '0',
  'number_of_rooms': '9.0',
  'build_year': '1951',
  'operating cost': '50 850',
  'sold_price': '1671836400'},
 {'adress': 'Pråmgatan 7C, Älvsborg, Göteborgs',
  'longitude': '57.6778374',
  'latitude': '11.8780241',
  'living_area': '175.0',
  'land_area': '951.8',
  'supplemental_area': '66.0',
  'patio': '0',
  'balcony': '0',
  'number_of_rooms': '6.0',
  'build_year': '1985',
  'operating cost': '66 016',
  'sold_price': '1671800400'},
 {'adress': 'Irsta kyrkby 7, Irsta, Västerås',
  'longitude': '59.59839',
  'latitude': '16.7031326',
  'living_area': '754.0',
  'land_area': '9513.0',
  'supplemental_area': '170.0',
  'patio': '0',
  'balcony': '0',
  'number_of_rooms': '8.0',
  'build_year': '2001',
  'operating cost': '127 971',
  'sold_price': '1671789600'